use of com.google.cloud.teleport.v2.cdc.sources.DataStreamIO in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToSQL method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to SQL DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to SQL Database via jdbc
*/
Pipeline pipeline = Pipeline.create(options);
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
validateOptions(options, dataSourceConfiguration);
Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getGcsPubSubSubscription(), options.getRfcStartDateTime()).withLowercaseSourceColumns().withHashColumnValue("_metadata_row_id", "rowid"));
/*
* Stage 2: Write JSON Strings to SQL Insert Strings
* a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
* Stage 3) Filter stale rows using stateful PK transform
*/
PCollection<DmlInfo> dmlStatements = datastreamJsonRecords.apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap)).apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply("Write to SQL", CdcJdbcIO.<DmlInfo>write().withDataSourceConfiguration(dataSourceConfiguration).withStatementFormatter(new CdcJdbcIO.StatementFormatter<DmlInfo>() {
public String formatStatement(DmlInfo element) {
return element.getDmlSql();
}
}));
// Execute the pipeline and return the result.
return pipeline.run();
}
use of com.google.cloud.teleport.v2.cdc.sources.DataStreamIO in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToSpanner method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to Cloud Spanner
* 3) Write Failures to GCS Dead Letter Queue
*/
Pipeline pipeline = Pipeline.create(options);
DeadLetterQueueManager dlqManager = buildDlqManager(options);
/*
* Stage 1: Ingest/Normalize Data to FailsafeElement with JSON Strings and
* read Cloud Spanner information schema.
* a) Prepare spanner config and process information schema
* b) Read DataStream data from GCS into JSON String FailsafeElements
* c) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
* d) Flatten DataStream and DLQ Streams
*/
// Prepare Spanner config
SpannerConfig spannerConfig = ExposedSpannerConfig.create().withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost())).withInstanceId(ValueProvider.StaticValueProvider.of(options.getInstanceId())).withDatabaseId(ValueProvider.StaticValueProvider.of(options.getDatabaseId()));
/* Process information schema
* 1) Read information schema from destination Cloud Spanner database
* 2) Check if shadow tables are present and create if necessary
* 3) Return new information schema
*/
PCollection<Ddl> ddl = pipeline.apply("Process Information Schema", new ProcessInformationSchema(spannerConfig, options.getShouldCreateShadowTables(), options.getShadowTablePrefix(), options.getDatastreamSourceType()));
PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getGcsPubSubSubscription(), options.getRfcStartDateTime()).withFileReadConcurrency(options.getFileReadConcurrency()));
// Elements sent to the Dead Letter Queue are to be reconsumed.
// A DLQManager is to be created using PipelineOptions, and it is in charge
// of building pieces of the DLQ.
PCollectionTuple reconsumedElements = dlqManager.getReconsumerDataTransform(pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
PCollection<FailsafeElement<String, String>> dlqJsonRecords = reconsumedElements.get(DeadLetterQueueManager.RETRYABLE_ERRORS).setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
PCollection<FailsafeElement<String, String>> jsonRecords = PCollectionList.of(datastreamJsonRecords).and(dlqJsonRecords).apply(Flatten.pCollections()).apply("Reshuffle", Reshuffle.viaRandomKey());
/*
* Stage 2: Write records to Cloud Spanner
*/
SpannerTransactionWriter.Result spannerWriteResults = jsonRecords.apply("Write events to Cloud Spanner", new SpannerTransactionWriter(spannerConfig, ddlView, options.getShadowTablePrefix(), options.getDatastreamSourceType()));
/*
* Stage 3: Write failures to GCS Dead Letter Queue
* a) Retryable errors are written to retry GCS Dead letter queue
* b) Severe errors are written to severe GCS Dead letter queue
*/
spannerWriteResults.retryableErrors().apply("DLQ: Write retryable Failures to GCS", MapElements.via(new StringDeadLetterQueueSanitizer())).setCoder(StringUtf8Coder.of()).apply("Write To DLQ", DLQWriteTransform.WriteDLQ.newBuilder().withDlqDirectory(dlqManager.getRetryDlqDirectoryWithDateTime()).withTmpDirectory(dlqManager.getRetryDlqDirectory() + "tmp/").build());
PCollection<FailsafeElement<String, String>> dlqErrorRecords = reconsumedElements.get(DeadLetterQueueManager.PERMANENT_ERRORS).setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
PCollection<FailsafeElement<String, String>> permanentErrors = PCollectionList.of(dlqErrorRecords).and(spannerWriteResults.permanentErrors()).apply(Flatten.pCollections()).apply("Reshuffle", Reshuffle.viaRandomKey());
permanentErrors.apply("DLQ: Write Severe errors to GCS", MapElements.via(new StringDeadLetterQueueSanitizer())).setCoder(StringUtf8Coder.of()).apply("Write To DLQ", DLQWriteTransform.WriteDLQ.newBuilder().withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime()).withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/").build());
// Execute the pipeline and return the result.
return pipeline.run();
}
use of com.google.cloud.teleport.v2.cdc.sources.DataStreamIO in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToMongoDB method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Push the data to MongoDB
*/
Pipeline pipeline = Pipeline.create(options);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getInputSubscription(), options.getRfcStartDateTime()).withFileReadConcurrency(options.getFileReadConcurrency()));
PCollection<FailsafeElement<String, String>> jsonRecords = PCollectionList.of(datastreamJsonRecords).apply(Flatten.pCollections());
/**
* Does below steps:
* 1. Converts JSON to BSON documents.
* 2. Removes the metadata fileds.
* 3. Inserts the data into MongoDB collections.
*/
jsonRecords.apply("jsonToDocuments", MapElements.via(new SimpleFunction<FailsafeElement<String, String>, Document>() {
@Override
public Document apply(FailsafeElement<String, String> jsonString) {
String s = jsonString.getOriginalPayload();
Document doc = Document.parse(s);
return removeTableRowFields(doc, MAPPER_IGNORE_FIELDS);
}
})).apply("Write To MongoDB", MongoDbIO.write().withUri(options.getMongoDBUri()).withDatabase(options.getDatabase()).withCollection(options.getCollection()));
// Execute the pipeline and return the result.
return pipeline.run();
}
use of com.google.cloud.teleport.v2.cdc.sources.DataStreamIO in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToPostgres method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to Postgres DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to Postgres
*/
Pipeline pipeline = Pipeline.create(options);
String jdbcDriverConnectionString = String.format("jdbc:postgresql://%s:%s/%s", options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString).withUsername(options.getDatabaseUser()).withPassword(options.getDatabasePassword()).withMaxIdleConnections(new Integer(0));
validateOptions(options, dataSourceConfiguration);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getGcsPubSubSubscription(), options.getRfcStartDateTime()).withLowercaseSourceColumns().withHashColumnValue("_metadata_row_id", "rowid"));
/*
* Stage 2: Write JSON Strings to Postgres Insert Strings
* a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
* Stage 3) Filter stale rows using stateful PK transform
*/
PCollection<DmlInfo> dmlStatements = datastreamJsonRecords.apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration)).apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply("Write to Postgres", CdcJdbcIO.<DmlInfo>write().withDataSourceConfiguration(dataSourceConfiguration).withStatementFormatter(new CdcJdbcIO.StatementFormatter<DmlInfo>() {
public String formatStatement(DmlInfo element) {
return element.getDmlSql();
}
}));
// Execute the pipeline and return the result.
return pipeline.run();
}
use of com.google.cloud.teleport.v2.cdc.sources.DataStreamIO in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToBigQuery method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to TableRow Collection
* - Optionally apply a UDF
* 3) BigQuery Output of TableRow Data
* a) Map New Columns & Write to Staging Tables
* b) Map New Columns & Merge Staging to Target Table
* 4) Write Failures to GCS Dead Letter Queue
*/
Pipeline pipeline = Pipeline.create(options);
DeadLetterQueueManager dlqManager = buildDlqManager(options);
String bigqueryProjectId = getBigQueryProjectId(options);
String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
String tempDlqDir = dlqManager.getRetryDlqDirectory() + "tmp/";
InputUDFToTableRow<String> failsafeTableRowTransformer = new InputUDFToTableRow<String>(options.getJavascriptTextTransformGcsPath(), options.getJavascriptTextTransformFunctionName(), options.getPythonTextTransformGcsPath(), options.getPythonTextTransformFunctionName(), options.getRuntimeRetries(), FAILSAFE_ELEMENT_CODER);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
* b) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
* (dlqJsonRecords)
* c) Flatten DataStream and DLQ Streams (jsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getGcsPubSubSubscription(), options.getRfcStartDateTime()).withFileReadConcurrency(options.getFileReadConcurrency()));
// Elements sent to the Dead Letter Queue are to be reconsumed.
// A DLQManager is to be created using PipelineOptions, and it is in charge
// of building pieces of the DLQ.
PCollection<FailsafeElement<String, String>> dlqJsonRecords = pipeline.apply("DLQ Consumer/reader", dlqManager.dlqReconsumer(options.getDlqRetryMinutes())).apply("DLQ Consumer/cleaner", ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {
@ProcessElement
public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
receiver.output(FailsafeElement.of(input, input));
}
})).setCoder(FAILSAFE_ELEMENT_CODER);
PCollection<FailsafeElement<String, String>> jsonRecords = PCollectionList.of(datastreamJsonRecords).and(dlqJsonRecords).apply("Merge Datastream & DLQ", Flatten.pCollections());
/*
* Stage 2: Write JSON Strings to TableRow PCollectionTuple
* a) Optionally apply a Javascript or Python UDF
* b) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
*/
PCollectionTuple tableRowRecords = jsonRecords.apply("UDF to TableRow/udf", failsafeTableRowTransformer);
PCollection<TableRow> shuffledTableRows = tableRowRecords.get(failsafeTableRowTransformer.transformOut).apply("UDF to TableRow/ReShuffle", Reshuffle.<TableRow>viaRandomKey().withNumBuckets(100));
/*
* Stage 3: BigQuery Output of TableRow Data
* a) Map New Columns & Write to Staging Tables (writeResult)
* b) Map New Columns & Merge Staging to Target Table (null)
*
* failsafe: writeResult.getFailedInsertsWithErr()
*/
// TODO(beam 2.23): InsertRetryPolicy should be CDC compliant
Set<String> fieldsToIgnore = getFieldsToIgnore(options.getIgnoreFields());
WriteResult writeResult = shuffledTableRows.apply("Map to Staging Tables", new DataStreamMapper(options.as(GcpOptions.class), options.getOutputProjectId(), options.getOutputStagingDatasetTemplate(), options.getOutputStagingTableNameTemplate()).withDataStreamRootUrl(options.getDataStreamRootUrl()).withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA).withDayPartitioning(true).withIgnoreFields(fieldsToIgnore)).apply("Write Successful Records", BigQueryIO.<KV<TableId, TableRow>>write().to(new BigQueryDynamicConverters().bigQueryDynamicDestination()).withFormatFunction(element -> removeTableRowFields(element.getValue(), fieldsToIgnore)).withFormatRecordOnFailureFunction(element -> element.getValue()).withoutValidation().ignoreInsertIds().withCreateDisposition(CreateDisposition.CREATE_NEVER).withWriteDisposition(WriteDisposition.WRITE_APPEND).withExtendedErrorInfo().withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS).withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
if (options.getApplyMerge()) {
shuffledTableRows.apply("Map To Replica Tables", new DataStreamMapper(options.as(GcpOptions.class), options.getOutputProjectId(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate()).withDataStreamRootUrl(options.getDataStreamRootUrl()).withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA).withIgnoreFields(fieldsToIgnore)).apply("BigQuery Merge/Build MergeInfo", new MergeInfoMapper(bigqueryProjectId, options.getOutputStagingDatasetTemplate(), options.getOutputStagingTableNameTemplate(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate())).apply("BigQuery Merge/Merge into Replica Tables", BigQueryMerger.of(MergeConfiguration.bigQueryConfiguration().withMergeWindowDuration(Duration.standardMinutes(options.getMergeFrequencyMinutes()))));
}
/*
* Stage 4: Write Failures to GCS Dead Letter Queue
*/
PCollection<String> udfDlqJson = PCollectionList.of(tableRowRecords.get(failsafeTableRowTransformer.udfDeadletterOut)).and(tableRowRecords.get(failsafeTableRowTransformer.transformDeadletterOut)).apply("UDF Failures/Flatten", Flatten.pCollections()).apply("UDF Failures/Sanitize", MapElements.via(new StringDeadLetterQueueSanitizer()));
PCollection<String> bqWriteDlqJson = writeResult.getFailedInsertsWithErr().apply("BigQuery Failures", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
PCollectionList.of(udfDlqJson).and(bqWriteDlqJson).apply("Write To DLQ/Flatten", Flatten.pCollections()).apply("Write To DLQ/Writer", DLQWriteTransform.WriteDLQ.newBuilder().withDlqDirectory(dlqDirectory).withTmpDirectory(tempDlqDir).build());
// Execute the pipeline and return the result.
return pipeline.run();
}
Aggregations