Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToSpanner, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to Cloud Spanner
* 3) Write Failures to GCS Dead Letter Queue
*/
Pipeline pipeline = Pipeline.create(options);
DeadLetterQueueManager dlqManager = buildDlqManager(options);
/*
* Stage 1: Ingest/Normalize Data to FailsafeElement with JSON Strings and
* read Cloud Spanner information schema.
* a) Prepare spanner config and process information schema
* b) Read DataStream data from GCS into JSON String FailsafeElements
* c) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
* d) Flatten DataStream and DLQ Streams
*/
// Prepare Spanner config
SpannerConfig spannerConfig =
    ExposedSpannerConfig.create()
        .withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost()))
        .withInstanceId(ValueProvider.StaticValueProvider.of(options.getInstanceId()))
        .withDatabaseId(ValueProvider.StaticValueProvider.of(options.getDatabaseId()));
/* Process information schema
* 1) Read information schema from destination Cloud Spanner database
* 2) Check if shadow tables are present and create if necessary
* 3) Return new information schema
*/
PCollection<Ddl> ddl =
    pipeline.apply(
        "Process Information Schema",
        new ProcessInformationSchema(
            spannerConfig,
            options.getShouldCreateShadowTables(),
            options.getShadowTablePrefix(),
            options.getDatastreamSourceType()));
PCollectionView<Ddl> ddlView = ddl.apply("Cloud Spanner DDL as view", View.asSingleton());
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withFileReadConcurrency(options.getFileReadConcurrency()));
// Elements sent to the Dead Letter Queue are reconsumed here.
// The DLQManager, built from the pipeline options, is responsible for
// assembling the pieces of the DLQ.
PCollectionTuple reconsumedElements =
    dlqManager.getReconsumerDataTransform(
        pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
PCollection<FailsafeElement<String, String>> dlqJsonRecords =
    reconsumedElements
        .get(DeadLetterQueueManager.RETRYABLE_ERRORS)
        .setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
PCollection<FailsafeElement<String, String>> jsonRecords =
    PCollectionList.of(datastreamJsonRecords)
        .and(dlqJsonRecords)
        .apply(Flatten.pCollections())
        .apply("Reshuffle", Reshuffle.viaRandomKey());
/*
* Stage 2: Write records to Cloud Spanner
*/
SpannerTransactionWriter.Result spannerWriteResults =
    jsonRecords.apply(
        "Write events to Cloud Spanner",
        new SpannerTransactionWriter(
            spannerConfig,
            ddlView,
            options.getShadowTablePrefix(),
            options.getDatastreamSourceType()));
/*
* Stage 3: Write failures to GCS Dead Letter Queue
* a) Retryable errors are written to retry GCS Dead letter queue
* b) Severe errors are written to severe GCS Dead letter queue
*/
spannerWriteResults
    .retryableErrors()
    .apply(
        "DLQ: Write retryable Failures to GCS",
        MapElements.via(new StringDeadLetterQueueSanitizer()))
    .setCoder(StringUtf8Coder.of())
    .apply(
        "Write To DLQ",
        DLQWriteTransform.WriteDLQ.newBuilder()
            .withDlqDirectory(dlqManager.getRetryDlqDirectoryWithDateTime())
            .withTmpDirectory(dlqManager.getRetryDlqDirectory() + "tmp/")
            .setIncludePaneInfo(true)
            .build());
PCollection<FailsafeElement<String, String>> dlqErrorRecords =
    reconsumedElements
        .get(DeadLetterQueueManager.PERMANENT_ERRORS)
        .setCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
PCollection<FailsafeElement<String, String>> permanentErrors =
    PCollectionList.of(dlqErrorRecords)
        .and(spannerWriteResults.permanentErrors())
        .apply(Flatten.pCollections())
        .apply("Reshuffle", Reshuffle.viaRandomKey());
permanentErrors
    .apply(
        "DLQ: Write Severe errors to GCS",
        MapElements.via(new StringDeadLetterQueueSanitizer()))
    .setCoder(StringUtf8Coder.of())
    .apply(
        "Write To DLQ",
        DLQWriteTransform.WriteDLQ.newBuilder()
            .withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime())
            .withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/")
            .setIncludePaneInfo(true)
            .build());
// Execute the pipeline and return the result.
return pipeline.run();
}
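The buildDlqManager helper referenced above is defined elsewhere in the class and not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming a DeadLetterQueueManager.create(String) factory and a getDeadLetterQueueDirectory() pipeline option (both are assumptions, not the template's confirmed API).
// Sketch only: DeadLetterQueueManager.create(String) and
// options.getDeadLetterQueueDirectory() are assumed signatures.
private static DeadLetterQueueManager buildDlqManager(Options options) {
  // Default to a dlq/ folder under the job's temp location when no explicit
  // dead-letter directory has been configured (illustrative default).
  String tempLocation = options.getTempLocation();
  if (!tempLocation.endsWith("/")) {
    tempLocation = tempLocation + "/";
  }
  String dlqDirectory =
      options.getDeadLetterQueueDirectory().isEmpty()
          ? tempLocation + "dlq/"
          : options.getDeadLetterQueueDirectory();
  return DeadLetterQueueManager.create(dlqDirectory);
}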
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToBigQuery, method main.
/**
* Main entry point for executing the pipeline.
*
* @param args The command-line arguments to the pipeline.
*/
public static void main(String[] args) {
LOG.info("Starting Input Files to BigQuery");
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
options.setStreaming(true);
options.setEnableStreamingEngine(true);
validateOptions(options);
run(options);
}
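The validateOptions call above is template-specific and not reproduced on this page. The following is a purely illustrative sketch of the kind of early validation such a helper might perform; the option getters used below are assumptions, not the template's documented Options interface.
// Hypothetical sketch: the getters below are assumptions.
private static void validateOptions(Options options) {
  // Fail fast before pipeline construction when required inputs are missing.
  if (options.getInputFilePattern() == null && options.getInputSubscription() == null) {
    throw new IllegalArgumentException(
        "Either an input file pattern or a Pub/Sub subscription must be provided.");
  }
  if (options.getOutputDatasetTemplate() == null
      || options.getOutputDatasetTemplate().isEmpty()) {
    throw new IllegalArgumentException("An output dataset template must be provided.");
  }
}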
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToMongoDB, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Push the data to MongoDB
*/
Pipeline pipeline = Pipeline.create(options);
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getInputSubscription(),
                options.getRfcStartDateTime())
            .withFileReadConcurrency(options.getFileReadConcurrency()));
PCollection<FailsafeElement<String, String>> jsonRecords =
    PCollectionList.of(datastreamJsonRecords).apply(Flatten.pCollections());
/**
 * Performs the following steps:
 * 1. Converts the JSON strings to BSON documents.
 * 2. Removes the metadata fields.
 * 3. Inserts the documents into MongoDB collections.
 */
jsonRecords.apply("jsonToDocuments", MapElements.via(new SimpleFunction<FailsafeElement<String, String>, Document>() {
@Override
public Document apply(FailsafeElement<String, String> jsonString) {
String s = jsonString.getOriginalPayload();
Document doc = Document.parse(s);
return removeTableRowFields(doc, MAPPER_IGNORE_FIELDS);
}
})).apply("Write To MongoDB", MongoDbIO.write().withUri(options.getMongoDBUri()).withDatabase(options.getDatabase()).withCollection(options.getCollection()));
// Execute the pipeline and return the result.
return pipeline.run();
}
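The removeTableRowFields helper and the MAPPER_IGNORE_FIELDS constant are defined elsewhere in the class. Below is a minimal sketch of how such a helper could drop the Datastream metadata fields, assuming it simply removes a fixed set of key names (an assumption about the real implementation).
// Sketch only: the project's actual helper and ignore-field set may differ.
private static Document removeTableRowFields(Document doc, Set<String> ignoreFields) {
  // Copy the document so the input is left untouched, then drop every key
  // that appears in the ignore set (e.g. _metadata_* columns).
  Document cleaned = new Document(doc);
  for (String field : ignoreFields) {
    cleaned.remove(field);
  }
  return cleaned;
}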
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
The class WriteToElasticsearch, method expand.
@Override
public PDone expand(PCollection<String> jsonStrings) {
ConnectionInformation connectionInformation =
    new ConnectionInformation(options().getConnectionUrl());
ElasticsearchIO.ConnectionConfiguration config =
    ElasticsearchIO.ConnectionConfiguration.create(
        new String[] {connectionInformation.getElasticsearchURL().toString()},
        options().getIndex(),
        DOCUMENT_TYPE);
// If the username and password are not blank, use them instead of the API key.
if (StringUtils.isNotBlank(options().getElasticsearchUsername())
    && StringUtils.isNotBlank(options().getElasticsearchPassword())) {
  config =
      config
          .withUsername(options().getElasticsearchUsername())
          .withPassword(options().getElasticsearchPassword());
} else {
  config = config.withApiKey(options().getApiKey());
}
ElasticsearchIO.Write elasticsearchWriter =
    ElasticsearchIO.write()
        .withConnectionConfiguration(config)
        .withMaxBatchSize(options().getBatchSize())
        .withMaxBatchSizeBytes(options().getBatchSizeBytes());
if (Optional.ofNullable(options().getMaxRetryAttempts()).isPresent()) {
  // ElasticsearchIO.Write is immutable, so the result of withRetryConfiguration
  // must be assigned back or the retry settings are silently dropped.
  elasticsearchWriter =
      elasticsearchWriter.withRetryConfiguration(
          ElasticsearchIO.RetryConfiguration.create(
              options().getMaxRetryAttempts(), getDuration(options().getMaxRetryDuration())));
}
return jsonStrings.apply("WriteDocuments", elasticsearchWriter);
}
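The getDuration helper that converts the configured maximum retry duration into the org.joda.time.Duration expected by ElasticsearchIO.RetryConfiguration is not shown here. A minimal sketch, assuming the option arrives as an ISO-8601 duration string (an assumption; the template may accept a different format):
// Sketch only: assumes an ISO-8601 string such as "PT30S"; the real option
// format may differ.
private static org.joda.time.Duration getDuration(String maxRetryDuration) {
  java.time.Duration parsed = java.time.Duration.parse(maxRetryDuration);
  return org.joda.time.Duration.millis(parsed.toMillis());
}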
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
The class DataStreamToSQL, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to SQL DML Objects
* 3) Filter stale rows using stateful PK transform
* 4) Write DML statements to SQL Database via jdbc
*/
Pipeline pipeline = Pipeline.create(options);
CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
validateOptions(options, dataSourceConfiguration);
Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withLowercaseSourceColumns()
            .withRenameColumnValue("_metadata_row_id", "rowid")
            .withHashRowId());
/*
 * Stage 2: Write JSON Strings to SQL DML objects
 * a) Convert the JSON String FailsafeElements into KV<String, DmlInfo> statements (dmlStatements)
 * Stage 3: Filter stale rows using the stateful PK transform
 */
PCollection<KV<String, DmlInfo>> dmlStatements =
    datastreamJsonRecords
        .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
        .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
/*
* Stage 4: Write Inserts to CloudSQL
*/
dmlStatements.apply("Write to SQL", CdcJdbcIO.<KV<String, DmlInfo>>write().withDataSourceConfiguration(dataSourceConfiguration).withStatementFormatter(new CdcJdbcIO.StatementFormatter<KV<String, DmlInfo>>() {
public String formatStatement(KV<String, DmlInfo> element) {
LOG.debug("Executing SQL: {}", element.getValue().getDmlSql());
return element.getValue().getDmlSql();
}
}));
// Execute the pipeline and return the result.
return pipeline.run();
}
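The parseSchemaMap helper builds the source-to-destination schema mapping passed to CreateDml. A minimal sketch, assuming the option is a comma-separated list of colon-delimited source:target pairs (an assumption about the expected format):
// Sketch only: assumes a "sourceSchema:targetSchema,other:target2" format.
private static Map<String, String> parseSchemaMap(String schemaMapString) {
  Map<String, String> schemaMap = new HashMap<>();
  if (schemaMapString == null || schemaMapString.isEmpty()) {
    return schemaMap;
  }
  for (String pair : schemaMapString.split(",")) {
    String[] parts = pair.split(":", 2);
    if (parts.length == 2) {
      // Map each source schema name to its destination schema name.
      schemaMap.put(parts[0].trim(), parts[1].trim());
    }
  }
  return schemaMap;
}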