Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.
Class DataplexBigQueryToGcs, method transformPipeline.
/**
 * Assembles the export graph on {@code pipeline}.
 *
 * <p>For every table an {@code ExportTable-<name>} step is applied and its output is keyed by the
 * table itself. All per-table outputs are flattened into a single collection, which feeds the
 * Dataplex metadata update. The same flattened collection, reduced to (table, partition) pairs,
 * is held back with {@code Wait.on} until the metadata update signals, and is then passed to
 * {@link DeleteBigQueryDataFn}.
 *
 * @param pipeline pipeline the transforms are applied to
 * @param tables tables to export
 * @param options supplies the file format, file compression and same-partition-key settings
 * @param targetRootPath root path handed to each per-table export transform
 * @param testBqServices forwarded to each export transform via {@code withTestServices}
 * @param testBqClientFactory forwarded to {@link DeleteBigQueryDataFn} via
 *     {@code withTestBqClientFactory}
 */
@VisibleForTesting
static void transformPipeline(
    Pipeline pipeline,
    List<BigQueryTable> tables,
    DataplexBigQueryToGcsOptions options,
    String targetRootPath,
    BigQueryServices testBqServices,
    BigQueryClientFactory testBqClientFactory) {
  List<PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>>> perTableExports =
      new ArrayList<>(tables.size());

  for (BigQueryTable table : tables) {
    // One export branch per table; the element type is a (partition, String) pair.
    PCollection<KV<BigQueryTablePartition, String>> exportedFiles =
        pipeline.apply(
            String.format("ExportTable-%s", table.getTableName()),
            new BigQueryTableToGcsTransform(
                    table,
                    targetRootPath,
                    options.getFileFormat(),
                    options.getFileCompression(),
                    options.getEnforceSamePartitionKey())
                .withTestServices(testBqServices));

    // Tag every exported element with its owning table so branches can be merged.
    PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>> keyedByTable =
        exportedFiles.apply(
            String.format("AttachTableKeys-%s", table.getTableName()), WithKeys.of(table));

    perTableExports.add(keyedByTable);
  }

  PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>> exportFileResults =
      PCollectionList.of(perTableExports).apply("FlattenTableResults", Flatten.pCollections());

  PCollection<Void> metadataUpdateResults =
      exportFileResults.apply(
          "UpdateDataplexMetadata", new UpdateDataplexBigQueryToGcsExportMetadataTransform());

  // Drops the String half of the value, leaving only (table, partition).
  SerializableFunction<
          KV<BigQueryTable, KV<BigQueryTablePartition, String>>,
          KV<BigQueryTable, BigQueryTablePartition>>
      toTableAndPartition = input -> KV.of(input.getKey(), input.getValue().getKey());

  PCollection<KV<BigQueryTable, BigQueryTablePartition>> exportedPartitions =
      exportFileResults.apply(
          MapElements.into(
                  TypeDescriptors.kvs(
                      TypeDescriptor.of(BigQueryTable.class),
                      TypeDescriptor.of(BigQueryTablePartition.class)))
              .via(toTableAndPartition));

  // The delete step is gated on the metadata update so it cannot run ahead of it.
  exportedPartitions
      .apply("WaitForMetadataUpdate", Wait.on(metadataUpdateResults))
      .apply(
          "TruncateBigQueryData",
          ParDo.of(new DeleteBigQueryDataFn().withTestBqClientFactory(testBqClientFactory)));
}
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.
Class JdbcToPubsub, method run.
/**
 * Builds and launches the Jdbc-To-PubSub pipeline.
 *
 * <p>The JDBC data source is configured from the driver class name, connection URL and driver
 * jars; username, password and connection properties are added only when the corresponding
 * option is non-null. The connection URL, username and password are each passed through
 * {@code maybeDecrypt} together with the KMS encryption key option. Query results are mapped to
 * strings by {@link ResultSetToJSONString} and written to the output Pub/Sub topic.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(JdbcToPubsubOptions options) {
  Pipeline pipeline = Pipeline.create(options);
  LOG.info("Starting Jdbc-To-PubSub Pipeline.");

  // Mandatory part of the JDBC configuration.
  DynamicJdbcIO.DynamicDataSourceConfiguration jdbcConfig =
      DynamicJdbcIO.DynamicDataSourceConfiguration.create(
              options.getDriverClassName(),
              maybeDecrypt(options.getConnectionUrl(), options.getKMSEncryptionKey()))
          .withDriverJars(options.getDriverJars());

  // Optional parts: applied only when the option was supplied.
  if (options.getUsername() != null) {
    jdbcConfig =
        jdbcConfig.withUsername(
            maybeDecrypt(options.getUsername(), options.getKMSEncryptionKey()));
  }
  if (options.getPassword() != null) {
    jdbcConfig =
        jdbcConfig.withPassword(
            maybeDecrypt(options.getPassword(), options.getKMSEncryptionKey()));
  }
  if (options.getConnectionProperties() != null) {
    jdbcConfig = jdbcConfig.withConnectionProperties(options.getConnectionProperties());
  }

  // Step 1: read rows from JDBC as strings. Step 2: publish them to the Pub/Sub topic.
  pipeline
      .apply(
          "readFromJdbc",
          DynamicJdbcIO.<String>read()
              .withDataSourceConfiguration(jdbcConfig)
              .withQuery(options.getQuery())
              .withCoder(StringUtf8Coder.of())
              .withRowMapper(new ResultSetToJSONString()))
      .apply("writeSuccessMessages", PubsubIO.writeStrings().to(options.getOutputTopic()));

  return pipeline.run();
}
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.
Class PubsubToJdbc, method run.
/**
 * Builds and launches the Pubsub-to-Jdbc pipeline.
 *
 * <p>Messages are read as strings from the input subscription and written through
 * {@code DynamicJdbcIO} using the configured statement. The elements emitted by the JDBC write
 * step are {@code FailsafeElement}s, which are forwarded to the dead-letter topic. Username,
 * password and connection properties are added to the data source configuration only when the
 * corresponding option is non-null.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(PubsubToJdbcOptions options) {
  Pipeline pipeline = Pipeline.create(options);
  LOG.info("Starting Pubsub-to-Jdbc Pipeline.");

  // Step 1: read string payloads from the Pub/Sub subscription.
  PCollection<String> messages =
      pipeline.apply(
          "readFromPubSubSubscription",
          PubsubIO.readStrings().fromSubscription(options.getInputSubscription()));

  // Mandatory part of the JDBC configuration.
  DynamicJdbcIO.DynamicDataSourceConfiguration jdbcConfig =
      DynamicJdbcIO.DynamicDataSourceConfiguration.create(
              options.getDriverClassName(),
              maybeDecrypt(options.getConnectionUrl(), options.getKMSEncryptionKey()))
          .withDriverJars(options.getDriverJars());

  // Optional parts: applied only when the option was supplied.
  if (options.getUsername() != null) {
    jdbcConfig =
        jdbcConfig.withUsername(
            maybeDecrypt(options.getUsername(), options.getKMSEncryptionKey()));
  }
  if (options.getPassword() != null) {
    jdbcConfig =
        jdbcConfig.withPassword(
            maybeDecrypt(options.getPassword(), options.getKMSEncryptionKey()));
  }
  if (options.getConnectionProperties() != null) {
    jdbcConfig = jdbcConfig.withConnectionProperties(options.getConnectionProperties());
  }

  // Step 2: write to JDBC. Step 3: route whatever the write step emits to the dead-letter topic.
  messages
      .apply(
          "writeToJdbc",
          DynamicJdbcIO.<String>write()
              .withDataSourceConfiguration(jdbcConfig)
              .withStatement(options.getStatement())
              .withPreparedStatementSetter(
                  new MapJsonStringToQuery(getKeyOrder(options.getStatement()))))
      .setCoder(FAILSAFE_ELEMENT_CODER)
      .apply(
          "WriteFailedRecords",
          ErrorConverters.WriteStringMessageErrorsToPubSub.newBuilder()
              .setErrorRecordsTopic(options.getOutputDeadletterTopic())
              .build());

  return pipeline.run();
}
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.
Class SpannerChangeStreamsToBigQuery, method run.
/**
 * Runs the pipeline with the supplied options.
 *
 * <p>The ignore-fields option is interpreted as a comma-separated list. Each entry is trimmed
 * and blank entries are discarded, so values such as {@code "a, b"} or an empty option do not
 * produce unmatched or empty field names.
 *
 * <p>An empty start timestamp option means "now"; an empty end timestamp option means
 * {@code Timestamp.MAX_VALUE}.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(SpannerChangeStreamsToBigQueryOptions options) {
  setOptions(options);
  validateOptions(options);

  /*
   * Stages:
   * 1) Read DataChangeRecord from change stream.
   * 2) Create FailsafeElement of Mod JSON and merge from:
   *    - DataChangeRecord.
   *    - GCS Dead letter queue.
   * 3) Convert Mod JSON into TableRow by reading from Spanner at commit timestamp.
   * 4) Append TableRow to BigQuery.
   * 5) Write Failures from 2), 3) and 4) to GCS dead letter queue.
   */
  Pipeline pipeline = Pipeline.create(options);
  DeadLetterQueueManager dlqManager = buildDlqManager(options);
  String spannerProjectId = getSpannerProjectId(options);
  String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
  String tempDlqDirectory = dlqManager.getRetryDlqDirectory() + "tmp/";

  // Retrieve and parse the startTimestamp and endTimestamp.
  Timestamp startTimestamp =
      options.getStartTimestamp().isEmpty()
          ? Timestamp.now()
          : Timestamp.parseTimestamp(options.getStartTimestamp());
  Timestamp endTimestamp =
      options.getEndTimestamp().isEmpty()
          ? Timestamp.MAX_VALUE
          : Timestamp.parseTimestamp(options.getEndTimestamp());

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost()))
          .withProjectId(spannerProjectId)
          .withInstanceId(options.getSpannerInstanceId())
          .withDatabaseId(options.getSpannerDatabase())
          .withRpcPriority(options.getSpannerRpcPriority());

  SpannerIO.ReadChangeStream readChangeStream =
      SpannerIO.readChangeStream()
          .withSpannerConfig(spannerConfig)
          .withMetadataInstance(options.getSpannerMetadataInstanceId())
          .withMetadataDatabase(options.getSpannerMetadataDatabase())
          .withChangeStreamName(options.getSpannerChangeStreamName())
          .withInclusiveStartAt(startTimestamp)
          .withInclusiveEndAt(endTimestamp)
          .withRpcPriority(options.getSpannerRpcPriority());

  // The metadata table name is optional; only set it when one was provided.
  String spannerMetadataTableName = options.getSpannerMetadataTableName();
  if (spannerMetadataTableName != null) {
    readChangeStream = readChangeStream.withMetadataTable(spannerMetadataTableName);
  }

  PCollection<DataChangeRecord> dataChangeRecord =
      pipeline
          .apply("Read from Spanner Change Streams", readChangeStream)
          .apply("Reshuffle DataChangeRecord", Reshuffle.viaRandomKey());

  // Each Mod JSON string is wrapped as both the original and the current payload.
  PCollection<FailsafeElement<String, String>> sourceFailsafeModJson =
      dataChangeRecord
          .apply("DataChangeRecord To Mod JSON", ParDo.of(new DataChangeRecordToModJsonFn()))
          .apply(
              "Wrap Mod JSON In FailsafeElement",
              ParDo.of(
                  new DoFn<String, FailsafeElement<String, String>>() {
                    @ProcessElement
                    public void process(
                        @Element String input,
                        OutputReceiver<FailsafeElement<String, String>> receiver) {
                      receiver.output(FailsafeElement.of(input, input));
                    }
                  }))
          .setCoder(FAILSAFE_ELEMENT_CODER);

  // Re-read previously failed elements from the dead letter queue.
  PCollectionTuple dlqModJson =
      dlqManager.getReconsumerDataTransform(
          pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
  PCollection<FailsafeElement<String, String>> retryableDlqFailsafeModJson =
      dlqModJson.get(DeadLetterQueueManager.RETRYABLE_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);

  PCollection<FailsafeElement<String, String>> failsafeModJson =
      PCollectionList.of(sourceFailsafeModJson)
          .and(retryableDlqFailsafeModJson)
          .apply("Merge Source And DLQ Mod JSON", Flatten.pCollections());

  // Parse the comma-separated ignore list. String.split yields [""] for an empty option and
  // keeps surrounding whitespace, so trim each entry and skip blank ones.
  ImmutableSet.Builder<String> ignoreFieldsBuilder = ImmutableSet.builder();
  for (String ignoreField : options.getIgnoreFields().split(",")) {
    String trimmedIgnoreField = ignoreField.trim();
    if (!trimmedIgnoreField.isEmpty()) {
      ignoreFieldsBuilder.add(trimmedIgnoreField);
    }
  }
  ImmutableSet<String> ignoreFields = ignoreFieldsBuilder.build();

  FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions
      failsafeModJsonToTableRowOptions =
          FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions.builder()
              .setSpannerConfig(spannerConfig)
              .setSpannerChangeStream(options.getSpannerChangeStreamName())
              .setIgnoreFields(ignoreFields)
              .setCoder(FAILSAFE_ELEMENT_CODER)
              .build();
  FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow failsafeModJsonToTableRow =
      new FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow(
          failsafeModJsonToTableRowOptions);

  PCollectionTuple tableRowTuple =
      failsafeModJson.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);

  BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions
      bigQueryDynamicDestinationsOptions =
          BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions.builder()
              .setSpannerConfig(spannerConfig)
              .setChangeStreamName(options.getSpannerChangeStreamName())
              .setIgnoreFields(ignoreFields)
              .setBigQueryProject(getBigQueryProjectId(options))
              .setBigQueryDataset(options.getBigQueryDataset())
              .setBigQueryTableTemplate(options.getBigQueryChangelogTableNameTemplate())
              .build();

  WriteResult writeResult =
      tableRowTuple
          .get(failsafeModJsonToTableRow.transformOut)
          .apply(
              "Write To BigQuery",
              BigQueryIO.<TableRow>write()
                  .to(BigQueryDynamicDestinations.of(bigQueryDynamicDestinationsOptions))
                  .withFormatFunction(element -> removeIntermediateMetadataFields(element))
                  .withFormatRecordOnFailureFunction(element -> element)
                  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                  .withWriteDisposition(Write.WriteDisposition.WRITE_APPEND)
                  .withExtendedErrorInfo()
                  .withMethod(Write.Method.STREAMING_INSERTS)
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));

  // Failures from the TableRow conversion and from the BigQuery write share one retry DLQ.
  PCollection<String> transformDlqJson =
      tableRowTuple
          .get(failsafeModJsonToTableRow.transformDeadLetterOut)
          .apply(
              "Failed Mod JSON During Table Row Transformation",
              MapElements.via(new StringDeadLetterQueueSanitizer()));
  PCollection<String> bqWriteDlqJson =
      writeResult
          .getFailedInsertsWithErr()
          .apply(
              "Failed Mod JSON During BigQuery Writes",
              MapElements.via(new BigQueryDeadLetterQueueSanitizer()));

  PCollectionList.of(transformDlqJson)
      .and(bqWriteDlqJson)
      .apply("Merge Failed Mod JSON From Transform And BigQuery", Flatten.pCollections())
      .apply(
          "Write Failed Mod JSON To DLQ",
          DLQWriteTransform.WriteDLQ.newBuilder()
              .withDlqDirectory(dlqDirectory)
              .withTmpDirectory(tempDlqDirectory)
              .setIncludePaneInfo(true)
              .build());

  // Elements the DLQ reconsumer tagged as permanent errors go to the severe DLQ directory.
  PCollection<FailsafeElement<String, String>> nonRetryableDlqModJsonFailsafe =
      dlqModJson.get(DeadLetterQueueManager.PERMANENT_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);
  nonRetryableDlqModJsonFailsafe
      .apply(
          "Write Mod JSON With Non-retryable Error To DLQ",
          MapElements.via(new StringDeadLetterQueueSanitizer()))
      .setCoder(StringUtf8Coder.of())
      .apply(
          DLQWriteTransform.WriteDLQ.newBuilder()
              .withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime())
              .withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/")
              .setIncludePaneInfo(true)
              .build());

  return pipeline.run();
}
Use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in the DataflowTemplates project by GoogleCloudPlatform.
Class DeleteBigQueryDataFnTest, method testTransform_withDeleteSourceDataDisabled_doesntTruncateData.
/**
 * With {@code deleteSourceData} set to false, {@link DeleteBigQueryDataFn} must emit no output
 * and leave no unverified interactions on the BigQuery mock, for both a partitioned table
 * element and an element whose partition is null.
 */
@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataDisabled_doesntTruncateData() {
  Options options = TestPipeline.testingPipelineOptions().as(Options.class);
  options.setDeleteSourceData(false);

  BigQueryTable partitionedTable =
      table.toBuilder()
          .setPartitions(Collections.singletonList(partition))
          .setPartitioningColumn("column-name-doesnt-matter")
          .build();
  DeleteBigQueryDataFn fn = new DeleteBigQueryDataFn().withTestBqClientFactory(() -> bqMock);

  // Two inputs: one (table, partition) pair and one table with a null partition.
  PCollection<KV<BigQueryTable, BigQueryTablePartition>> input =
      testPipeline.apply(
          "CreateInput",
          Create.of(
                  KV.of(partitionedTable, partition),
                  KV.of(table, (BigQueryTablePartition) null))
              .withCoder(fnCoder));
  PCollection<Void> actual = input.apply("TestDeleteBigQueryDataFn", ParDo.of(fn));

  PAssert.that(actual).empty();
  testPipeline.run(options);

  verifyNoMoreInteractions(bqMock);
}
Aggregations