Scala 脚本示例 - 流式处理 ETL - AWS Glue
AWS 文档中描述的 AWS 服务或功能可能因区域而异。要查看适用于中国区域的差异,请参阅中国的 AWS 服务入门

如果我们为英文版本指南提供翻译,那么如果存在任何冲突,将以英文版本指南为准。在提供翻译时使用机器翻译。

Scala 脚本示例 - 流式处理 ETL

以下示例脚本连接到 Amazon Kinesis Data Streams,使用来自 数据目录 的 schema 解析数据流,将流联接到 Amazon S3 上的静态数据集,并以 parquet 格式将联接结果输出到 Amazon S3。

// This script connects to an Amazon Kinesis stream, uses a schema from the data catalog to parse the stream, // joins the stream to a static dataset on Amazon S3, and outputs the joined results to Amazon S3 in parquet format. import com.amazonaws.services.glue.GlueContext import com.amazonaws.services.glue.util.GlueArgParser import com.amazonaws.services.glue.util.Job import java.util.Calendar import org.apache.spark.SparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.Row import org.apache.spark.sql.SaveMode import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.from_json import org.apache.spark.sql.streaming.Trigger import scala.collection.JavaConverters._ object streamJoiner { def main(sysArgs: Array[String]) { val spark: SparkContext = new SparkContext() val glueContext: GlueContext = new GlueContext(spark) val sparkSession: SparkSession = glueContext.getSparkSession import sparkSession.implicits._ // @params: [JOB_NAME] val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray) Job.init(args("JOB_NAME"), glueContext, args.asJava) val staticData = sparkSession.read // read() returns type DataFrameReader .format("csv") .option("header", "true") .load("s3://awsexamplebucket-streaming-demo2/inputs/productsStatic.csv") // load() returns a DataFrame val datasource0 = sparkSession.readStream // readstream() returns type DataStreamReader .format("kinesis") .option("streamName", "stream-join-demo") .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com") .option("startingPosition", "TRIM_HORIZON") .load // load() returns a DataFrame val selectfields1 = datasource0.select(from_json($"data".cast("string"), glueContext.getCatalogSchemaAsSparkSchema("stream-demos", "stream-join-demo2")) as "data").select("data.*") val datasink2 = selectfields1.writeStream.foreachBatch { (dataFrame: Dataset[Row], batchId: Long) => { //foreachBatch() returns type DataStreamWriter val joined = dataFrame.join(staticData, "product_id") val year: Int = Calendar.getInstance().get(Calendar.YEAR) val month :Int = Calendar.getInstance().get(Calendar.MONTH) + 1 val day: Int = Calendar.getInstance().get(Calendar.DATE) val hour: Int = Calendar.getInstance().get(Calendar.HOUR_OF_DAY) if (dataFrame.count() > 0) { joined.write // joined.write returns type DataFrameWriter .mode(SaveMode.Append) .format("parquet") .option("quote", " ") .save("s3://awsexamplebucket-streaming-demo2/output/" + "/year=" + "%04d".format(year) + "/month=" + "%02d".format(month) + "/day=" + "%02d".format(day) + "/hour=" + "%02d".format(hour) + "/") } } } // end foreachBatch() .trigger(Trigger.ProcessingTime("100 seconds")) .option("checkpointLocation", "s3://awsexamplebucket-streaming-demo2/checkpoint/") .start().awaitTermination() // start() returns type StreamingQuery Job.commit() } }