Amazon Glue examples using SDK for Java 2.x - Amazon SDK for Java 2.x
Services or capabilities described in Amazon Web Services documentation might vary by Region. To see the differences applicable to the China Regions, see Getting Started with Amazon Web Services in China (PDF).

Amazon Glue examples using SDK for Java 2.x

The following code examples show you how to perform actions and implement common scenarios by using the Amazon SDK for Java 2.x with Amazon Glue.

Actions are code excerpts from larger programs and must be run in context. While actions show you how to call individual service functions, you can see actions in context in their related scenarios and cross-service examples.

Scenarios are code examples that show you how to accomplish a specific task by calling multiple functions within the same service.

Each example includes a link to GitHub, where you can find instructions on how to set up and run the code in context.

Get started

The following code examples show how to get started using Amazon Glue.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

package com.example.glue; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.ListJobsRequest; import software.amazon.awssdk.services.glue.model.ListJobsResponse; import java.util.List; public class HelloGlue { public static void main(String[] args) { GlueClient glueClient = GlueClient.builder() .region(Region.US_EAST_1) .build(); listJobs(glueClient); } public static void listJobs(GlueClient glueClient) { ListJobsRequest request = ListJobsRequest.builder() .maxResults(10) .build(); ListJobsResponse response = glueClient.listJobs(request); List<String> jobList = response.jobNames(); jobList.forEach(job -> { System.out.println("Job Name: " + job); }); } }
  • For API details, see ListJobs in Amazon SDK for Java 2.x API Reference.

Actions

The following code example shows how to use CreateCrawler.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.CreateCrawlerRequest; import software.amazon.awssdk.services.glue.model.CrawlerTargets; import software.amazon.awssdk.services.glue.model.GlueException; import software.amazon.awssdk.services.glue.model.S3Target; import java.util.ArrayList; import java.util.List; /** * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class CreateCrawler { public static void main(String[] args) { final String usage = """ Usage: <IAM> <s3Path> <cron> <dbName> <crawlerName> Where: IAM - The ARN of the IAM role that has AWS Glue and S3 permissions.\s s3Path - The Amazon Simple Storage Service (Amazon S3) target that contains data (for example, CSV data). cron - A cron expression used to specify the schedule (i.e., cron(15 12 * * ? *). dbName - The database name.\s crawlerName - The name of the crawler.\s """; if (args.length != 5) { System.out.println(usage); System.exit(1); } String iam = args[0]; String s3Path = args[1]; String cron = args[2]; String dbName = args[3]; String crawlerName = args[4]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); createGlueCrawler(glueClient, iam, s3Path, cron, dbName, crawlerName); glueClient.close(); } public static void createGlueCrawler(GlueClient glueClient, String iam, String s3Path, String cron, String dbName, String crawlerName) { try { S3Target s3Target = S3Target.builder() .path(s3Path) .build(); // Add the S3Target to a list. List<S3Target> targetList = new ArrayList<>(); targetList.add(s3Target); CrawlerTargets targets = CrawlerTargets.builder() .s3Targets(targetList) .build(); CreateCrawlerRequest crawlerRequest = CreateCrawlerRequest.builder() .databaseName(dbName) .name(crawlerName) .description("Created by the AWS Glue Java API") .targets(targets) .role(iam) .schedule(cron) .build(); glueClient.createCrawler(crawlerRequest); System.out.println(crawlerName + " was successfully created"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }
  • For API details, see CreateCrawler in Amazon SDK for Java 2.x API Reference.

The following code example shows how to use GetCrawler.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.GetCrawlerRequest; import software.amazon.awssdk.services.glue.model.GetCrawlerResponse; import software.amazon.awssdk.services.glue.model.GlueException; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.FormatStyle; import java.util.Locale; /** * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class GetCrawler { public static void main(String[] args) { final String usage = """ Usage: <crawlerName> Where: crawlerName - The name of the crawler.\s """; if (args.length != 1) { System.out.println(usage); System.exit(1); } String crawlerName = args[0]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); getSpecificCrawler(glueClient, crawlerName); glueClient.close(); } public static void getSpecificCrawler(GlueClient glueClient, String crawlerName) { try { GetCrawlerRequest crawlerRequest = GetCrawlerRequest.builder() .name(crawlerName) .build(); GetCrawlerResponse response = glueClient.getCrawler(crawlerRequest); Instant createDate = response.crawler().creationTime(); // Convert the Instant to readable date DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.SHORT) .withLocale(Locale.US) .withZone(ZoneId.systemDefault()); formatter.format(createDate); System.out.println("The create date of the Crawler is " + createDate); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }
  • For API details, see GetCrawler in Amazon SDK for Java 2.x API Reference.

The following code example shows how to use GetDatabase.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.GetDatabaseRequest; import software.amazon.awssdk.services.glue.model.GetDatabaseResponse; import software.amazon.awssdk.services.glue.model.GlueException; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.FormatStyle; import java.util.Locale; /** * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class GetDatabase { public static void main(String[] args) { final String usage = """ Usage: <databaseName> Where: databaseName - The name of the database.\s """; if (args.length != 1) { System.out.println(usage); System.exit(1); } String databaseName = args[0]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); getSpecificDatabase(glueClient, databaseName); glueClient.close(); } public static void getSpecificDatabase(GlueClient glueClient, String databaseName) { try { GetDatabaseRequest databasesRequest = GetDatabaseRequest.builder() .name(databaseName) .build(); GetDatabaseResponse response = glueClient.getDatabase(databasesRequest); Instant createDate = response.database().createTime(); // Convert the Instant to readable date. DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.SHORT) .withLocale(Locale.US) .withZone(ZoneId.systemDefault()); formatter.format(createDate); System.out.println("The create date of the database is " + createDate); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }
  • For API details, see GetDatabase in Amazon SDK for Java 2.x API Reference.

The following code example shows how to use GetTables.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.GetTableRequest; import software.amazon.awssdk.services.glue.model.GetTableResponse; import software.amazon.awssdk.services.glue.model.GlueException; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.FormatStyle; import java.util.Locale; /** * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class GetTable { public static void main(String[] args) { final String usage = """ Usage: <dbName> <tableName> Where: dbName - The database name.\s tableName - The name of the table.\s """; if (args.length != 2) { System.out.println(usage); System.exit(1); } String dbName = args[0]; String tableName = args[1]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); getGlueTable(glueClient, dbName, tableName); glueClient.close(); } public static void getGlueTable(GlueClient glueClient, String dbName, String tableName) { try { GetTableRequest tableRequest = GetTableRequest.builder() .databaseName(dbName) .name(tableName) .build(); GetTableResponse tableResponse = glueClient.getTable(tableRequest); Instant createDate = tableResponse.table().createTime(); // Convert the Instant to readable date. DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.SHORT) .withLocale(Locale.US) .withZone(ZoneId.systemDefault()); formatter.format(createDate); System.out.println("The create date of the table is " + createDate); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }
  • For API details, see GetTables in Amazon SDK for Java 2.x API Reference.

The following code example shows how to use StartCrawler.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.glue.GlueClient; import software.amazon.awssdk.services.glue.model.GlueException; import software.amazon.awssdk.services.glue.model.StartCrawlerRequest; /** * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class StartCrawler { public static void main(String[] args) { final String usage = """ Usage: <crawlerName> Where: crawlerName - The name of the crawler.\s """; if (args.length != 1) { System.out.println(usage); System.exit(1); } String crawlerName = args[0]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); startSpecificCrawler(glueClient, crawlerName); glueClient.close(); } public static void startSpecificCrawler(GlueClient glueClient, String crawlerName) { try { StartCrawlerRequest crawlerRequest = StartCrawlerRequest.builder() .name(crawlerName) .build(); glueClient.startCrawler(crawlerRequest); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }
  • For API details, see StartCrawler in Amazon SDK for Java 2.x API Reference.

Scenarios

The following code example shows how to:

  • Create a crawler that crawls a public Amazon S3 bucket and generates a database of CSV-formatted metadata.

  • List information about databases and tables in your Amazon Glue Data Catalog.

  • Create a job to extract CSV data from the S3 bucket, transform the data, and load JSON-formatted output into another S3 bucket.

  • List information about job runs, view transformed data, and clean up resources.

For more information, see Tutorial: Getting started with Amazon Glue Studio.

SDK for Java 2.x
Note

There's more on GitHub. Find the complete example and learn how to set up and run in the Amazon Code Examples Repository.

/** * * Before running this Java V2 code example, set up your development * environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html * * To set up the resources, see this documentation topic: * * https://docs.aws.amazon.com/glue/latest/ug/tutorial-add-crawler.html * * This example performs the following tasks: * * 1. Create a database. * 2. Create a crawler. * 3. Get a crawler. * 4. Start a crawler. * 5. Get a database. * 6. Get tables. * 7. Create a job. * 8. Start a job run. * 9. List all jobs. * 10. Get job runs. * 11. Delete a job. * 12. Delete a database. * 13. Delete a crawler. */ public class GlueScenario { public static final String DASHES = new String(new char[80]).replace("\0", "-"); public static void main(String[] args) throws InterruptedException { final String usage = """ Usage: <iam> <s3Path> <cron> <dbName> <crawlerName> <jobName>\s Where: iam - The ARN of the IAM role that has AWS Glue and S3 permissions.\s s3Path - The Amazon Simple Storage Service (Amazon S3) target that contains data (for example, CSV data). cron - A cron expression used to specify the schedule (i.e., cron(15 12 * * ? *). dbName - The database name.\s crawlerName - The name of the crawler.\s jobName - The name you assign to this job definition. scriptLocation - The Amazon S3 path to a script that runs a job. locationUri - The location of the database bucketNameSc - The Amazon S3 bucket name used when creating a job """; if (args.length != 9) { System.out.println(usage); System.exit(1); } String iam = args[0]; String s3Path = args[1]; String cron = args[2]; String dbName = args[3]; String crawlerName = args[4]; String jobName = args[5]; String scriptLocation = args[6]; String locationUri = args[7]; String bucketNameSc = args[8]; Region region = Region.US_EAST_1; GlueClient glueClient = GlueClient.builder() .region(region) .build(); System.out.println(DASHES); System.out.println("Welcome to the AWS Glue scenario."); System.out.println(DASHES); System.out.println(DASHES); System.out.println("1. Create a database."); createDatabase(glueClient, dbName, locationUri); System.out.println(DASHES); System.out.println(DASHES); System.out.println("2. Create a crawler."); createGlueCrawler(glueClient, iam, s3Path, cron, dbName, crawlerName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("3. Get a crawler."); getSpecificCrawler(glueClient, crawlerName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("4. Start a crawler."); startSpecificCrawler(glueClient, crawlerName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("5. Get a database."); getSpecificDatabase(glueClient, dbName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("*** Wait 5 min for the tables to become available"); TimeUnit.MINUTES.sleep(5); System.out.println("6. Get tables."); String myTableName = getGlueTables(glueClient, dbName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("7. Create a job."); createJob(glueClient, jobName, iam, scriptLocation); System.out.println(DASHES); System.out.println(DASHES); System.out.println("8. Start a Job run."); startJob(glueClient, jobName, dbName, myTableName, bucketNameSc); System.out.println(DASHES); System.out.println(DASHES); System.out.println("9. List all jobs."); getAllJobs(glueClient); System.out.println(DASHES); System.out.println(DASHES); System.out.println("10. Get job runs."); getJobRuns(glueClient, jobName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("11. Delete a job."); deleteJob(glueClient, jobName); System.out.println("*** Wait 5 MIN for the " + crawlerName + " to stop"); TimeUnit.MINUTES.sleep(5); System.out.println(DASHES); System.out.println(DASHES); System.out.println("12. Delete a database."); deleteDatabase(glueClient, dbName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("Delete a crawler."); deleteSpecificCrawler(glueClient, crawlerName); System.out.println(DASHES); System.out.println(DASHES); System.out.println("Successfully completed the AWS Glue Scenario"); System.out.println(DASHES); } public static void createDatabase(GlueClient glueClient, String dbName, String locationUri) { try { DatabaseInput input = DatabaseInput.builder() .description("Built with the AWS SDK for Java V2") .name(dbName) .locationUri(locationUri) .build(); CreateDatabaseRequest request = CreateDatabaseRequest.builder() .databaseInput(input) .build(); glueClient.createDatabase(request); System.out.println(dbName + " was successfully created"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void createGlueCrawler(GlueClient glueClient, String iam, String s3Path, String cron, String dbName, String crawlerName) { try { S3Target s3Target = S3Target.builder() .path(s3Path) .build(); List<S3Target> targetList = new ArrayList<>(); targetList.add(s3Target); CrawlerTargets targets = CrawlerTargets.builder() .s3Targets(targetList) .build(); CreateCrawlerRequest crawlerRequest = CreateCrawlerRequest.builder() .databaseName(dbName) .name(crawlerName) .description("Created by the AWS Glue Java API") .targets(targets) .role(iam) .schedule(cron) .build(); glueClient.createCrawler(crawlerRequest); System.out.println(crawlerName + " was successfully created"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void getSpecificCrawler(GlueClient glueClient, String crawlerName) { try { GetCrawlerRequest crawlerRequest = GetCrawlerRequest.builder() .name(crawlerName) .build(); boolean ready = false; while (!ready) { GetCrawlerResponse response = glueClient.getCrawler(crawlerRequest); String status = response.crawler().stateAsString(); if (status.compareTo("READY") == 0) { ready = true; } Thread.sleep(3000); } System.out.println("The crawler is now ready"); } catch (GlueException | InterruptedException e) { System.err.println(e.getMessage()); System.exit(1); } } public static void startSpecificCrawler(GlueClient glueClient, String crawlerName) { try { StartCrawlerRequest crawlerRequest = StartCrawlerRequest.builder() .name(crawlerName) .build(); glueClient.startCrawler(crawlerRequest); System.out.println(crawlerName + " was successfully started!"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void getSpecificDatabase(GlueClient glueClient, String databaseName) { try { GetDatabaseRequest databasesRequest = GetDatabaseRequest.builder() .name(databaseName) .build(); GetDatabaseResponse response = glueClient.getDatabase(databasesRequest); Instant createDate = response.database().createTime(); // Convert the Instant to readable date. DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.SHORT) .withLocale(Locale.US) .withZone(ZoneId.systemDefault()); formatter.format(createDate); System.out.println("The create date of the database is " + createDate); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static String getGlueTables(GlueClient glueClient, String dbName) { String myTableName = ""; try { GetTablesRequest tableRequest = GetTablesRequest.builder() .databaseName(dbName) .build(); GetTablesResponse response = glueClient.getTables(tableRequest); List<Table> tables = response.tableList(); if (tables.isEmpty()) { System.out.println("No tables were returned"); } else { for (Table table : tables) { myTableName = table.name(); System.out.println("Table name is: " + myTableName); } } } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } return myTableName; } public static void startJob(GlueClient glueClient, String jobName, String inputDatabase, String inputTable, String outBucket) { try { Map<String, String> myMap = new HashMap<>(); myMap.put("--input_database", inputDatabase); myMap.put("--input_table", inputTable); myMap.put("--output_bucket_url", outBucket); StartJobRunRequest runRequest = StartJobRunRequest.builder() .workerType(WorkerType.G_1_X) .numberOfWorkers(10) .arguments(myMap) .jobName(jobName) .build(); StartJobRunResponse response = glueClient.startJobRun(runRequest); System.out.println("The request Id of the job is " + response.responseMetadata().requestId()); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void createJob(GlueClient glueClient, String jobName, String iam, String scriptLocation) { try { JobCommand command = JobCommand.builder() .pythonVersion("3") .name("glueetl") .scriptLocation(scriptLocation) .build(); CreateJobRequest jobRequest = CreateJobRequest.builder() .description("A Job created by using the AWS SDK for Java V2") .glueVersion("2.0") .workerType(WorkerType.G_1_X) .numberOfWorkers(10) .name(jobName) .role(iam) .command(command) .build(); glueClient.createJob(jobRequest); System.out.println(jobName + " was successfully created."); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void getAllJobs(GlueClient glueClient) { try { GetJobsRequest jobsRequest = GetJobsRequest.builder() .maxResults(10) .build(); GetJobsResponse jobsResponse = glueClient.getJobs(jobsRequest); List<Job> jobs = jobsResponse.jobs(); for (Job job : jobs) { System.out.println("Job name is : " + job.name()); System.out.println("The job worker type is : " + job.workerType().name()); } } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void getJobRuns(GlueClient glueClient, String jobName) { try { GetJobRunsRequest runsRequest = GetJobRunsRequest.builder() .jobName(jobName) .maxResults(20) .build(); boolean jobDone = false; while (!jobDone) { GetJobRunsResponse response = glueClient.getJobRuns(runsRequest); List<JobRun> jobRuns = response.jobRuns(); for (JobRun jobRun : jobRuns) { String jobState = jobRun.jobRunState().name(); if (jobState.compareTo("SUCCEEDED") == 0) { System.out.println(jobName + " has succeeded"); jobDone = true; } else if (jobState.compareTo("STOPPED") == 0) { System.out.println("Job run has stopped"); jobDone = true; } else if (jobState.compareTo("FAILED") == 0) { System.out.println("Job run has failed"); jobDone = true; } else if (jobState.compareTo("TIMEOUT") == 0) { System.out.println("Job run has timed out"); jobDone = true; } else { System.out.println("*** Job run state is " + jobRun.jobRunState().name()); System.out.println("Job run Id is " + jobRun.id()); System.out.println("The Glue version is " + jobRun.glueVersion()); } TimeUnit.SECONDS.sleep(5); } } } catch (GlueException | InterruptedException e) { System.err.println(e.getMessage()); System.exit(1); } } public static void deleteJob(GlueClient glueClient, String jobName) { try { DeleteJobRequest jobRequest = DeleteJobRequest.builder() .jobName(jobName) .build(); glueClient.deleteJob(jobRequest); System.out.println(jobName + " was successfully deleted"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void deleteDatabase(GlueClient glueClient, String databaseName) { try { DeleteDatabaseRequest request = DeleteDatabaseRequest.builder() .name(databaseName) .build(); glueClient.deleteDatabase(request); System.out.println(databaseName + " was successfully deleted"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } public static void deleteSpecificCrawler(GlueClient glueClient, String crawlerName) { try { DeleteCrawlerRequest deleteCrawlerRequest = DeleteCrawlerRequest.builder() .name(crawlerName) .build(); glueClient.deleteCrawler(deleteCrawlerRequest); System.out.println(crawlerName + " was deleted"); } catch (GlueException e) { System.err.println(e.awsErrorDetails().errorMessage()); System.exit(1); } } }