use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
the class PubSubCdcToBigQuery method run.
/**
 * Assembles the Pub/Sub CDC to BigQuery pipeline from the supplied options and starts it.
 *
 * <p>This method returns as soon as {@code pipeline.run()} returns; it does not wait for the
 * pipeline to finish. Call {@code result.waitUntilFinish()} on the returned object when blocking
 * programmatic execution is required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);
  DeadLetterQueueManager dlqManager = buildDlqManager(options);

  // When a dead-letter directory is configured, failures are written to GCS files and re-read
  // from the retry directory; otherwise they are written to BigQuery dead-letter tables.
  final boolean gcsDlqEnabled = options.getDeadLetterQueueDirectory() != null;
  final String datedDlqDirectory =
      gcsDlqEnabled ? dlqManager.getRetryDlqDirectory() + "YYYY/MM/DD/HH/mm/" : null;

  CoderRegistry registry = pipeline.getCoderRegistry();
  registry.registerCoderForType(CODER.getEncodedTypeDescriptor(), CODER);
  registry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  InputUDFToTableRow<String> udfToTableRow =
      new InputUDFToTableRow<String>(
          options.getJavascriptTextTransformGcsPath(),
          options.getJavascriptTextTransformFunctionName(),
          options.getPythonTextTransformGcsPath(),
          options.getPythonTextTransformFunctionName(),
          options.getRuntimeRetries(),
          FAILSAFE_ELEMENT_CODER);

  BigQueryTableConfigManager tableConfig =
      new BigQueryTableConfigManager(
          (String) options.as(GcpOptions.class).getProject(),
          (String) options.getOutputDatasetTemplate(),
          (String) options.getOutputTableNameTemplate(),
          (String) options.getOutputTableSpec());

  /*
   * Pipeline outline:
   *   1) Read messages from Pub/Sub (plus retried DLQ records when a DLQ directory is set)
   *   2) Apply the UDF and convert the result to TableRow objects
   *   3) Resolve the destination table (optionally auto-mapping) and write to BigQuery
   *   4) Route failed records to GCS files or to a BigQuery dead-letter table
   */

  /*
   * Step #1: Read messages in from Pub/Sub
   */
  PCollection<PubsubMessage> pubsubMessages =
      pipeline.apply(
          "ReadPubSubSubscription",
          PubsubIO.readMessagesWithAttributes().fromSubscription(options.getInputSubscription()));

  // Both configurations convert the Pub/Sub messages first; the DLQ-enabled one additionally
  // merges in the records re-read from the retry directory.
  PCollection<FailsafeElement<String, String>> pubsubRecords =
      pubsubMessages.apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()));

  PCollection<FailsafeElement<String, String>> jsonRecords = pubsubRecords;
  if (gcsDlqEnabled) {
    PCollection<FailsafeElement<String, String>> retriedRecords =
        pipeline
            .apply(dlqManager.dlqReconsumer())
            .apply(
                ParDo.of(
                    new DoFn<String, FailsafeElement<String, String>>() {
                      @ProcessElement
                      public void process(
                          @Element String line,
                          OutputReceiver<FailsafeElement<String, String>> out) {
                        // The re-read line serves as both original payload and current payload.
                        out.output(FailsafeElement.of(line, line));
                      }
                    }))
            .setCoder(FAILSAFE_ELEMENT_CODER);

    jsonRecords =
        PCollectionList.of(pubsubRecords).and(retriedRecords).apply(Flatten.pCollections());
  }

  /*
   * Step #2: Redistribute the records and transform them into TableRows via the UDF
   */
  PCollectionTuple convertedTableRows =
      jsonRecords
          .apply(
              Reshuffle.<FailsafeElement<String, String>>viaRandomKey()
                  .withNumBuckets(options.getThreadCount()))
          .apply("ApplyUdfAndConvertToTableRow", udfToTableRow);

  /*
   * Step #3: Write the successful records out to BigQuery
   *   Either extract table destination only
   *   or extract table destination and auto-map new columns
   */
  PCollection<KV<TableId, TableRow>> tableEvents;
  if (options.getAutoMapTables()) {
    tableEvents =
        convertedTableRows
            .get(udfToTableRow.transformOut)
            .apply(
                "Map Data to BigQuery Tables",
                new BigQueryMappers(tableConfig.getProjectId())
                    .buildBigQueryTableMapper(
                        tableConfig.getDatasetTemplate(), tableConfig.getTableTemplate())
                    .withDefaultSchemaFromGCS(options.getSchemaFilePath()));
  } else {
    tableEvents =
        convertedTableRows
            .get(udfToTableRow.transformOut)
            .apply(
                "ExtractBigQueryTableDestination",
                BigQueryDynamicConverters.extractTableRowDestination(
                    tableConfig.getProjectId(),
                    tableConfig.getDatasetTemplate(),
                    tableConfig.getTableTemplate()));
  }

  /*
   * Step #3: Cont.
   *    - Write rows out to BigQuery
   */
  // TODO(https://github.com/apache/beam/pull/12004): Switch out alwaysRetry
  WriteResult writeResult =
      tableEvents.apply(
          "WriteSuccessfulRecords",
          BigQueryIO.<KV<TableId, TableRow>>write()
              .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
              .withFormatFunction(event -> event.getValue())
              .withoutValidation()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
              .withExtendedErrorInfo()
              .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
              .withFailedInsertRetryPolicy(InsertRetryPolicy.alwaysRetry()));

  // TODO: Cover tableRowRecords.get(TRANSFORM_DEADLETTER_OUT) error values
  if (gcsDlqEnabled) {
    final String windowStepName = "Creating " + options.getWindowDuration() + " Window";

    // Failed BigQuery inserts: sanitize, window, and write as sharded JSON files.
    writeResult
        .getFailedInsertsWithErr()
        .apply(
            "DLQ: Write Insert Failures to GCS",
            MapElements.via(new BigQueryDeadLetterQueueSanitizer()))
        .apply(
            windowStepName,
            Window.into(
                FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
        .apply(
            "DLQ: Write File(s)",
            TextIO.write()
                .withWindowedWrites()
                .withNumShards(20)
                .to(
                    new WindowedFilenamePolicy(
                        datedDlqDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
                .withTempDirectory(
                    FileBasedSink.convertToFileResourceIfPossible(
                        options.getDeadLetterQueueDirectory())));

    // UDF and TableRow-conversion failures: flatten, window, sanitize, and write the same way.
    PCollection<FailsafeElement<String, String>> windowedTransformFailures =
        PCollectionList.of(
                ImmutableList.of(
                    convertedTableRows.get(udfToTableRow.udfDeadletterOut),
                    convertedTableRows.get(udfToTableRow.transformDeadletterOut)))
            .apply("Flatten", Flatten.pCollections())
            .apply(
                windowStepName,
                Window.into(
                    FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))));

    PCollection<String> sanitizedTransformFailures =
        windowedTransformFailures
            .apply("Sanitize records", MapElements.via(new StringDeadLetterQueueSanitizer()))
            .setCoder(StringUtf8Coder.of());

    sanitizedTransformFailures.apply(
        "DLQ: Write File(s)",
        TextIO.write()
            .withWindowedWrites()
            .withNumShards(20)
            .to(new WindowedFilenamePolicy(datedDlqDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
            .withTempDirectory(
                FileBasedSink.convertToFileResourceIfPossible(datedDlqDirectory + "tmp/")));
  } else {
    PCollection<FailsafeElement<String, String>> failedInserts =
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "WrapInsertionErrors",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(
                        (BigQueryInsertError insertError) ->
                            BigQueryConverters.wrapBigQueryInsertError(insertError)))
            .setCoder(FAILSAFE_ELEMENT_CODER);

    /*
     * Step #4: Write records that failed table row transformation
     * or conversion out to BigQuery deadletter table.
     */
    PCollectionList.of(
            ImmutableList.of(
                convertedTableRows.get(udfToTableRow.udfDeadletterOut),
                convertedTableRows.get(udfToTableRow.transformDeadletterOut)))
        .apply("Flatten", Flatten.pCollections())
        .apply(
            "WriteFailedRecords",
            ErrorConverters.WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    BigQueryConverters.maybeUseDefaultDeadletterTable(
                        options.getOutputDeadletterTable(),
                        tableConfig.getOutputTableSpec(),
                        DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                .build());

    // 5) Insert records that failed insert into deadletter table
    failedInserts.apply(
        "WriteFailedRecords",
        ErrorConverters.WriteStringMessageErrors.newBuilder()
            .setErrorRecordsTable(
                BigQueryConverters.maybeUseDefaultDeadletterTable(
                    options.getOutputDeadletterTable(),
                    tableConfig.getOutputTableSpec(),
                    DEFAULT_DEADLETTER_TABLE_SUFFIX))
            .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
            .build());
  }

  return pipeline.run();
}
use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
the class CsvConvertersTest method testLineToFailsafeJsonNoHeadersUdfDeadletter.
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * using a Javascript Udf when no headers are supplied. Udf processing is handled by {@link
 * JavascriptTextTransformer}. The record is expected on the deadletter tag, with nothing emitted
 * on the regular output tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              // The schema path is explicitly null for this test. An earlier
              // setJsonSchemaPath(options.getJsonSchemaPath()) call was always overwritten by
              // this one and has been dropped as dead code.
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  // Nothing may reach the regular output.
  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();

  // The first deadletter element must carry the untouched input line as its payload.
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToBigQuery method run.
/**
 * Builds the DataStream to BigQuery pipeline from the supplied options and runs it.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
  /*
   * Stages:
   *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
   *   2) Write JSON Strings to TableRow Collection
   *       - Optionally apply a UDF
   *   3) BigQuery Output of TableRow Data
   *     a) Map New Columns & Write to Staging Tables
   *     b) Map New Columns & Merge Staging to Target Table
   *   4) Write Failures to GCS Dead Letter Queue
   */
  Pipeline pipeline = Pipeline.create(options);
  DeadLetterQueueManager dlqManager = buildDlqManager(options);
  String bigqueryProjectId = getBigQueryProjectId(options);
  String datedDlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
  String dlqTempDirectory = dlqManager.getRetryDlqDirectory() + "tmp/";

  InputUDFToTableRow<String> udfToTableRow =
      new InputUDFToTableRow<String>(
          options.getJavascriptTextTransformGcsPath(),
          options.getJavascriptTextTransformFunctionName(),
          options.getPythonTextTransformGcsPath(),
          options.getPythonTextTransformFunctionName(),
          options.getRuntimeRetries(),
          FAILSAFE_ELEMENT_CODER);

  StatefulRowCleaner rowCleaner = StatefulRowCleaner.of();

  /*
   * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
   *   a) Read DataStream data from GCS into JSON String FailsafeElements
   *   b) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
   *   c) Flatten DataStream and DLQ Streams (jsonRecords)
   */
  PCollection<FailsafeElement<String, String>> streamRecords =
      pipeline.apply(
          new DataStreamIO(
                  options.getStreamName(),
                  options.getInputFilePattern(),
                  options.getInputFileFormat(),
                  options.getGcsPubSubSubscription(),
                  options.getRfcStartDateTime())
              .withFileReadConcurrency(options.getFileReadConcurrency()));

  // Elements sent to the Dead Letter Queue are to be reconsumed.
  // The DLQ manager, built from the pipeline options, supplies the reader transform.
  PCollection<FailsafeElement<String, String>> retriedRecords =
      pipeline
          .apply("DLQ Consumer/reader", dlqManager.dlqReconsumer(options.getDlqRetryMinutes()))
          .apply(
              "DLQ Consumer/cleaner",
              ParDo.of(
                  new DoFn<String, FailsafeElement<String, String>>() {
                    @ProcessElement
                    public void process(
                        @Element String line,
                        OutputReceiver<FailsafeElement<String, String>> out) {
                      // The re-read line serves as both original payload and current payload.
                      out.output(FailsafeElement.of(line, line));
                    }
                  }))
          .setCoder(FAILSAFE_ELEMENT_CODER);

  PCollection<FailsafeElement<String, String>> jsonRecords =
      PCollectionList.of(streamRecords)
          .and(retriedRecords)
          .apply("Merge Datastream & DLQ", Flatten.pCollections());

  /*
   * Stage 2: Write JSON Strings to TableRow PCollectionTuple
   *   a) Optionally apply a Javascript or Python UDF
   *   b) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
   */
  PCollectionTuple tableRowRecords = jsonRecords.apply("UDF to TableRow/udf", udfToTableRow);

  PCollectionTuple cleanedRows =
      tableRowRecords
          .get(udfToTableRow.transformOut)
          .apply("UDF to TableRow/Oracle Cleaner", rowCleaner);

  PCollection<TableRow> shuffledTableRows =
      cleanedRows
          .get(rowCleaner.successTag)
          .apply(
              "UDF to TableRow/ReShuffle",
              Reshuffle.<TableRow>viaRandomKey().withNumBuckets(100));

  /*
   * Stage 3: BigQuery Output of TableRow Data
   *   a) Map New Columns & Write to Staging Tables (writeResult)
   *   b) Map New Columns & Merge Staging to Target Table (result not kept)
   *
   *   failsafe: writeResult.getFailedInsertsWithErr()
   */
  // TODO(beam 2.23): InsertRetryPolicy should be CDC compliant
  Set<String> ignoredFields = getFieldsToIgnore(options.getIgnoreFields());

  PCollection<KV<TableId, TableRow>> stagingRows =
      shuffledTableRows.apply(
          "Map to Staging Tables",
          new DataStreamMapper(
                  options.as(GcpOptions.class),
                  options.getOutputProjectId(),
                  options.getOutputStagingDatasetTemplate(),
                  options.getOutputStagingTableNameTemplate())
              .withDataStreamRootUrl(options.getDataStreamRootUrl())
              .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
              .withDayPartitioning(true)
              .withIgnoreFields(ignoredFields));

  WriteResult writeResult =
      stagingRows.apply(
          "Write Successful Records",
          BigQueryIO.<KV<TableId, TableRow>>write()
              .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
              // Ignored fields are stripped from the row that is sent to BigQuery ...
              .withFormatFunction(row -> removeTableRowFields(row.getValue(), ignoredFields))
              // ... while the failure record uses the row value unchanged.
              .withFormatRecordOnFailureFunction(row -> row.getValue())
              .withoutValidation()
              .ignoreInsertIds()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
              .withExtendedErrorInfo()
              .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));

  if (options.getApplyMerge()) {
    shuffledTableRows
        .apply(
            "Map To Replica Tables",
            new DataStreamMapper(
                    options.as(GcpOptions.class),
                    options.getOutputProjectId(),
                    options.getOutputDatasetTemplate(),
                    options.getOutputTableNameTemplate())
                .withDataStreamRootUrl(options.getDataStreamRootUrl())
                .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
                .withIgnoreFields(ignoredFields))
        .apply(
            "BigQuery Merge/Build MergeInfo",
            new MergeInfoMapper(
                bigqueryProjectId,
                options.getOutputStagingDatasetTemplate(),
                options.getOutputStagingTableNameTemplate(),
                options.getOutputDatasetTemplate(),
                options.getOutputTableNameTemplate()))
        .apply(
            "BigQuery Merge/Merge into Replica Tables",
            BigQueryMerger.of(
                MergeConfiguration.bigQueryConfiguration()
                    .withMergeWindowDuration(
                        Duration.standardMinutes(options.getMergeFrequencyMinutes()))));
  }

  /*
   * Stage 4: Write Failures to GCS Dead Letter Queue
   */
  PCollection<String> transformFailuresJson =
      PCollectionList.of(tableRowRecords.get(udfToTableRow.udfDeadletterOut))
          .and(tableRowRecords.get(udfToTableRow.transformDeadletterOut))
          .apply("Transform Failures/Flatten", Flatten.pCollections())
          .apply(
              "Transform Failures/Sanitize",
              MapElements.via(new StringDeadLetterQueueSanitizer()));

  PCollection<String> cleanerFailuresJson =
      cleanedRows
          .get(rowCleaner.failureTag)
          .apply(
              "Transform Failures/Oracle Cleaner Failures",
              MapElements.via(new RowCleanerDeadLetterQueueSanitizer()));

  PCollection<String> insertFailuresJson =
      writeResult
          .getFailedInsertsWithErr()
          .apply("BigQuery Failures", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));

  PCollectionList.of(transformFailuresJson)
      .and(cleanerFailuresJson)
      .and(insertFailuresJson)
      .apply("Write To DLQ/Flatten", Flatten.pCollections())
      .apply(
          "Write To DLQ/Writer",
          DLQWriteTransform.WriteDLQ.newBuilder()
              .withDlqDirectory(datedDlqDirectory)
              .withTmpDirectory(dlqTempDirectory)
              .setIncludePaneInfo(true)
              .build());

  // Execute the pipeline and return the result.
  return pipeline.run();
}
use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcsTest method testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException.
/**
 * Verifies that building the pipeline fails when {@code writeDisposition = FAIL}, {@code
 * enforceSamePartitionKey = true}, and one of the target files already exists for a partitioned
 * table.
 *
 * <p>This is a special case because depending on the {@code enforceSamePartitionKey} param the
 * generated file path can be different (for partitioned tables only!), so this verifies that
 * {@link com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter
 * DataplexBigQueryToGcsFilter} can find such files correctly.
 */
@Test
public void testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException()
    throws Exception {
  options.setFileFormat(FileFormatOptions.PARQUET);
  options.setWriteDisposition(WriteDispositionOptions.FAIL);
  options.setEnforceSamePartitionKey(true);

  // Pre-create the file that the export of partition p2 would write.
  writeOutputFile("partitioned_table/ts=p2", "output-partitioned_table-p2.parquet", "Test data");

  // Answer the table-listing and partition-listing queries with canned rows; any other query
  // makes the mocked result iterate over null.
  when(bqMock.query(any()))
      .then(
          invocation -> {
            QueryJobConfiguration queryConfig =
                (QueryJobConfiguration) invocation.getArguments()[0];
            String sql = queryConfig.getQuery();

            Iterable<FieldValueList> rows;
            if (TABLE_QUERY_PATTERN.matcher(sql).find()) {
              rows = Collections.singletonList(fields("partitioned_table", "0", "ts"));
            } else if (PARTITION_QUERY_PATTERN.matcher(sql).find()) {
              rows = Arrays.asList(fields("p1", "0"), fields("p2", "0"));
            } else {
              rows = null;
            }

            when(tableResultMock.iterateAll()).thenReturn(rows);
            return tableResultMock;
          });

  final String expectedMessage =
      "Target File partitioned_table/ts=p2/output-partitioned_table-p2.parquet exists for partitioned_table$p2.";

  try {
    DataplexBigQueryToGcs.buildPipeline(
        options, metadataLoader, outDir.getAbsolutePath(), DatasetId.of(PROJECT, DATASET));
    fail("Expected a WriteDispositionException");
  } catch (Exception e) {
    // The WriteDispositionException is expected two levels down the cause chain.
    assertThat(e).hasCauseThat().hasCauseThat().isInstanceOf(WriteDispositionException.class);
    assertThat(e).hasCauseThat().hasCauseThat().hasMessageThat().contains(expectedMessage);
  }
}
use of com.google.cloud.teleport.v2.templates.spanner.ddl.Table in project DataflowTemplates by GoogleCloudPlatform.
the class SpannerChangeStreamsToGcsTest method testWriteToGCSText.
/**
 * Integration test: writes two rows to a table tracked by a change stream, runs the pipeline with
 * the TEXT output format, and verifies the text files it produced.
 *
 * <p>Run with: mvn -Dexcluded.spanner.tests="" -Dtest=SpannerChangeStreamsToGcsTest test
 */
@Test
@Category(IntegrationTest.class)
public void testWriteToGCSText() throws Exception {
  // Create a test database.
  String testDatabase = generateDatabaseName();
  fakeDir = tmpDir.newFolder("output").getAbsolutePath();
  fakeTempLocation = tmpDir.newFolder("temporaryLocation").getAbsolutePath();

  spannerServer.dropDatabase(testDatabase);

  // Create a table and a change stream that tracks it.
  List<String> statements = new ArrayList<>();
  final String createTable =
      "CREATE TABLE "
          + TEST_TABLE
          + " ("
          + "user_id INT64 NOT NULL,"
          + "name STRING(MAX) "
          + ") PRIMARY KEY(user_id)";
  // Reference TEST_TABLE rather than a hard-coded table name so the stream always tracks the
  // table created above.
  final String createChangeStream =
      "CREATE CHANGE STREAM " + TEST_CHANGE_STREAM + " FOR " + TEST_TABLE;
  statements.add(createTable);
  statements.add(createChangeStream);
  spannerServer.createDatabase(testDatabase, statements);

  // From here on the database exists; always drop it, even if the pipeline or an assertion
  // fails, so failed runs do not leak test databases.
  try {
    Timestamp startTimestamp = Timestamp.now();

    // Insert two rows in a single write so that change records are generated for the table.
    List<Mutation> mutations = new ArrayList<>();
    mutations.add(
        Mutation.newInsertBuilder(TEST_TABLE)
            .set("user_id")
            .to(1)
            .set("name")
            .to("Name1")
            .build());
    mutations.add(
        Mutation.newInsertBuilder(TEST_TABLE)
            .set("user_id")
            .to(2)
            .set("name")
            .to("Name2")
            .build());
    spannerServer.getDbClient(testDatabase).write(mutations);

    Timestamp endTimestamp = Timestamp.now();

    SpannerChangeStreamsToGcsOptions options =
        PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
    options.setSpannerProjectId(TEST_PROJECT);
    options.setSpannerInstanceId(TEST_INSTANCE);
    options.setSpannerDatabase(testDatabase);
    options.setSpannerMetadataInstanceId(TEST_INSTANCE);
    options.setSpannerMetadataDatabase(testDatabase);
    options.setSpannerChangeStreamName(TEST_CHANGE_STREAM);

    options.setStartTimestamp(startTimestamp.toString());
    options.setEndTimestamp(endTimestamp.toString());
    List<String> experiments = new ArrayList<>();
    options.setExperiments(experiments);

    options.setOutputFileFormat(FileFormat.TEXT);
    options.setGcsOutputDirectory(fakeDir);
    options.setOutputFilenamePrefix(TEXT_FILENAME_PREFIX);
    options.setNumShards(NUM_SHARDS);
    options.setTempLocation(fakeTempLocation);

    // Run the pipeline.
    PipelineResult result = run(options);
    result.waitUntilFinish();

    // Read the text output files back and verify the data change records they contain.
    PCollection<String> dataChangeRecords =
        pipeline.apply("readRecords", TextIO.read().from(fakeDir + "/text-output-*.txt"));
    PAssert.that(dataChangeRecords).satisfies(new VerifyDataChangeRecordText());
    pipeline.run();
  } finally {
    // Drop the database.
    spannerServer.dropDatabase(testDatabase);
  }
}
Aggregations