use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamToBigQuery method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
/*
* Stages:
* 1) Ingest and Normalize Data to FailsafeElement with JSON Strings
* 2) Write JSON Strings to TableRow Collection
* - Optionally apply a UDF
* 3) BigQuery Output of TableRow Data
* a) Map New Columns & Write to Staging Tables
* b) Map New Columns & Merge Staging to Target Table
* 4) Write Failures to GCS Dead Letter Queue
*/
Pipeline pipeline = Pipeline.create(options);
DeadLetterQueueManager dlqManager = buildDlqManager(options);
String bigqueryProjectId = getBigQueryProjectId(options);
String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
String tempDlqDir = dlqManager.getRetryDlqDirectory() + "tmp/";
InputUDFToTableRow<String> failsafeTableRowTransformer =
    new InputUDFToTableRow<String>(
        options.getJavascriptTextTransformGcsPath(),
        options.getJavascriptTextTransformFunctionName(),
        options.getPythonTextTransformGcsPath(),
        options.getPythonTextTransformFunctionName(),
        options.getRuntimeRetries(),
        FAILSAFE_ELEMENT_CODER);
StatefulRowCleaner statefulCleaner = StatefulRowCleaner.of();
/*
* Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
* a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
* b) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
* (dlqJsonRecords)
* c) Flatten DataStream and DLQ Streams (jsonRecords)
*/
PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
    pipeline.apply(
        new DataStreamIO(
                options.getStreamName(),
                options.getInputFilePattern(),
                options.getInputFileFormat(),
                options.getGcsPubSubSubscription(),
                options.getRfcStartDateTime())
            .withFileReadConcurrency(options.getFileReadConcurrency()));
// Elements previously sent to the Dead Letter Queue are reconsumed here.
// The DLQManager, created from the pipeline options above, is in charge of
// building the pieces of the DLQ.
PCollection<FailsafeElement<String, String>> dlqJsonRecords =
    pipeline
        .apply("DLQ Consumer/reader", dlqManager.dlqReconsumer(options.getDlqRetryMinutes()))
        .apply(
            "DLQ Consumer/cleaner",
            ParDo.of(
                new DoFn<String, FailsafeElement<String, String>>() {
                  @ProcessElement
                  public void process(
                      @Element String input,
                      OutputReceiver<FailsafeElement<String, String>> receiver) {
                    receiver.output(FailsafeElement.of(input, input));
                  }
                }))
        .setCoder(FAILSAFE_ELEMENT_CODER);
PCollection<FailsafeElement<String, String>> jsonRecords =
    PCollectionList.of(datastreamJsonRecords)
        .and(dlqJsonRecords)
        .apply("Merge Datastream & DLQ", Flatten.pCollections());
/*
* Stage 2: Write JSON Strings to TableRow PCollectionTuple
* a) Optionally apply a Javascript or Python UDF
* b) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
*/
PCollectionTuple tableRowRecords = jsonRecords.apply("UDF to TableRow/udf", failsafeTableRowTransformer);
PCollectionTuple cleanedRows =
    tableRowRecords
        .get(failsafeTableRowTransformer.transformOut)
        .apply("UDF to TableRow/Oracle Cleaner", statefulCleaner);
PCollection<TableRow> shuffledTableRows =
    cleanedRows
        .get(statefulCleaner.successTag)
        .apply(
            "UDF to TableRow/ReShuffle",
            Reshuffle.<TableRow>viaRandomKey().withNumBuckets(100));
/*
* Stage 3: BigQuery Output of TableRow Data
* a) Map New Columns & Write to Staging Tables (writeResult)
* b) Map New Columns & Merge Staging to Target Table (null)
*
* failsafe: writeResult.getFailedInsertsWithErr()
*/
// TODO(beam 2.23): InsertRetryPolicy should be CDC compliant
Set<String> fieldsToIgnore = getFieldsToIgnore(options.getIgnoreFields());
WriteResult writeResult =
    shuffledTableRows
        .apply(
            "Map to Staging Tables",
            new DataStreamMapper(
                    options.as(GcpOptions.class),
                    options.getOutputProjectId(),
                    options.getOutputStagingDatasetTemplate(),
                    options.getOutputStagingTableNameTemplate())
                .withDataStreamRootUrl(options.getDataStreamRootUrl())
                .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
                .withDayPartitioning(true)
                .withIgnoreFields(fieldsToIgnore))
        .apply(
            "Write Successful Records",
            BigQueryIO.<KV<TableId, TableRow>>write()
                .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
                .withFormatFunction(
                    element -> removeTableRowFields(element.getValue(), fieldsToIgnore))
                .withFormatRecordOnFailureFunction(element -> element.getValue())
                .withoutValidation()
                .ignoreInsertIds()
                .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                .withExtendedErrorInfo()
                .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
if (options.getApplyMerge()) {
shuffledTableRows.apply("Map To Replica Tables", new DataStreamMapper(options.as(GcpOptions.class), options.getOutputProjectId(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate()).withDataStreamRootUrl(options.getDataStreamRootUrl()).withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA).withIgnoreFields(fieldsToIgnore)).apply("BigQuery Merge/Build MergeInfo", new MergeInfoMapper(bigqueryProjectId, options.getOutputStagingDatasetTemplate(), options.getOutputStagingTableNameTemplate(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate())).apply("BigQuery Merge/Merge into Replica Tables", BigQueryMerger.of(MergeConfiguration.bigQueryConfiguration().withMergeWindowDuration(Duration.standardMinutes(options.getMergeFrequencyMinutes()))));
}
/*
* Stage 4: Write Failures to GCS Dead Letter Queue
*/
PCollection<String> udfDlqJson =
    PCollectionList.of(tableRowRecords.get(failsafeTableRowTransformer.udfDeadletterOut))
        .and(tableRowRecords.get(failsafeTableRowTransformer.transformDeadletterOut))
        .apply("Transform Failures/Flatten", Flatten.pCollections())
        .apply("Transform Failures/Sanitize", MapElements.via(new StringDeadLetterQueueSanitizer()));
PCollection<String> rowCleanerJson =
    cleanedRows
        .get(statefulCleaner.failureTag)
        .apply(
            "Transform Failures/Oracle Cleaner Failures",
            MapElements.via(new RowCleanerDeadLetterQueueSanitizer()));
PCollection<String> bqWriteDlqJson =
    writeResult
        .getFailedInsertsWithErr()
        .apply("BigQuery Failures", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
PCollectionList.of(udfDlqJson)
    .and(rowCleanerJson)
    .and(bqWriteDlqJson)
    .apply("Write To DLQ/Flatten", Flatten.pCollections())
    .apply(
        "Write To DLQ/Writer",
        DLQWriteTransform.WriteDLQ.newBuilder()
            .withDlqDirectory(dlqDirectory)
            .withTmpDirectory(tempDlqDir)
            .setIncludePaneInfo(true)
            .build());
// Execute the pipeline and return the result.
return pipeline.run();
}
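For reference, run(Options) is normally driven from a standard Beam entry point that parses command-line flags into the template's Options interface. The snippet below is a minimal sketch of that pattern, not the template's actual main method; it assumes Options extends PipelineOptions and StreamingOptions.
public static void main(String[] args) {
  // Parse and validate command-line flags into the template's Options interface.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  // DataStream replication is continuous, so the pipeline is run in streaming mode
  // (assumes Options extends StreamingOptions).
  options.setStreaming(true);
  run(options);
}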
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcsTest method testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException.
/**
* Tests that the pipeline throws an exception if {@code writeDisposition = FAIL}, {@code
* enforceSamePartitionKey = true}, and one of the target files exists, when processing a
* partitioned table.
*
* <p>This is a special case because depending on the {@code enforceSamePartitionKey} param the
* generated file path can be different (for partitioned tables only!), so this verifies that
* {@link com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter
* DataplexBigQueryToGcsFilter} can find such files correctly.
*/
@Test
public void testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException() throws Exception {
options.setFileFormat(FileFormatOptions.PARQUET);
options.setWriteDisposition(WriteDispositionOptions.FAIL);
options.setEnforceSamePartitionKey(true);
writeOutputFile("partitioned_table/ts=p2", "output-partitioned_table-p2.parquet", "Test data");
when(bqMock.query(any())).then(invocation -> {
Iterable<FieldValueList> result = null;
QueryJobConfiguration q = (QueryJobConfiguration) invocation.getArguments()[0];
if (TABLE_QUERY_PATTERN.matcher(q.getQuery()).find()) {
result = Collections.singletonList(fields("partitioned_table", "0", "ts"));
} else if (PARTITION_QUERY_PATTERN.matcher(q.getQuery()).find()) {
result = Arrays.asList(fields("p1", "0"), fields("p2", "0"));
}
when(tableResultMock.iterateAll()).thenReturn(result);
return tableResultMock;
});
try {
DataplexBigQueryToGcs.buildPipeline(options, metadataLoader, outDir.getAbsolutePath(), DatasetId.of(PROJECT, DATASET));
fail("Expected a WriteDispositionException");
} catch (Exception e) {
assertThat(e).hasCauseThat().hasCauseThat().isInstanceOf(WriteDispositionException.class);
assertThat(e).hasCauseThat().hasCauseThat().hasMessageThat().contains("Target File partitioned_table/ts=p2/output-partitioned_table-p2.parquet exists for" + " partitioned_table$p2.");
}
}
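The try/fail/catch block above could also be written with JUnit's assertThrows (available since JUnit 4.13, assuming a static import of org.junit.Assert.assertThrows); a sketch of that alternative, keeping the same Truth assertions on the cause chain:
Exception e =
    assertThrows(
        Exception.class,
        () ->
            DataplexBigQueryToGcs.buildPipeline(
                options, metadataLoader, outDir.getAbsolutePath(), DatasetId.of(PROJECT, DATASET)));
assertThat(e).hasCauseThat().hasCauseThat().isInstanceOf(WriteDispositionException.class);
assertThat(e)
    .hasCauseThat()
    .hasCauseThat()
    .hasMessageThat()
    .contains("Target File partitioned_table/ts=p2/output-partitioned_table-p2.parquet exists for partitioned_table$p2.");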
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversionTest method testAssetWithEntityJsonToParquetFailOnExistingFilesE2E.
/**
* Tests JSON to Parquet conversion for an asset with an entity when one of the files already exists
* and the existing file behaviour is FAIL.
*/
@Test(expected = RuntimeException.class)
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToParquetFailOnExistingFilesE2E() throws IOException {
// setup Dataplex client to return entity 2
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
// setup options to fail on existing files
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setWriteDisposition(WriteDispositionOptions.FAIL);
// simulate the 1.json -> 1.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/1.parquet");
// simulate the 2.json -> 2.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/2.parquet");
// run the pipeline, the job should fail because 1.parquet already exists
DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider).waitUntilFinish();
}
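An equivalent formulation that keeps a handle on the failure, instead of relying on the @Test(expected = RuntimeException.class) annotation, could look like the sketch below (assumes org.junit.Assert.assertThrows is available; the exact exception nesting depends on the runner, so no specific cause type is asserted):
RuntimeException e =
    assertThrows(
        RuntimeException.class,
        () ->
            DataplexFileFormatConversion.run(
                    mainPipeline, options, dataplex,
                    DataplexFileFormatConversionTest::outputPathProvider)
                .waitUntilFinish());
// The cause chain can then be inspected, e.g. to check that the message mentions the
// pre-existing entity2/1.parquet file.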
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversionTest method testAssetWithEntityJsonToParquetSkipExistingFilesE2E.
/**
* Tests JSON to Parquet conversion for an asset with an entity when one of the files already exists
* and the existing file behaviour is SKIP.
*/
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToParquetSkipExistingFilesE2E() throws IOException {
// setup Dataplex client to return entity 2
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
// setup options to skip existing files
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setWriteDisposition(WriteDispositionOptions.SKIP);
// simulate the 1.json -> 1.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/1.parquet");
// run the pipeline, only 2.json -> 2.parquet conversion should happen
DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
// read the conversion results
PCollection<GenericRecord> readParquetFile =
    readPipeline.apply(
        "ReadParquetFile",
        ParquetConverters.ReadParquetFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
// expect old 1.parquet (from entity2.existing) and newly converted 2.parquet (from entity2)
ImmutableList.Builder<GenericRecord> expected = ImmutableList.builder();
Record record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "abc.existing");
record.put("Number", 1);
expected.add(record);
record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "def");
record.put("Number", 2);
expected.add(record);
record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "ghi");
record.put("Number", 3);
expected.add(record);
PAssert.that(readParquetFile).containsInAnyOrder(expected.build());
readPipeline.run();
}
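The repeated record construction above could be collapsed into a small helper; the method below is a hypothetical sketch, not part of the test class, and assumes Record refers to org.apache.avro.generic.GenericData.Record as used above.
private static Record expectedRecord(String word, int number) {
  // Hypothetical helper for building one expected record against EXPECTED_AVRO_SCHEMA.
  Record record = new Record(EXPECTED_AVRO_SCHEMA);
  record.put("Word", word);
  record.put("Number", number);
  return record;
}
// Usage: expected.add(expectedRecord("abc.existing", 1)); and likewise for "def"/2 and "ghi"/3.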
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversionTest method testAssetWithEntityJsonToGzippedParquetE2E.
/**
* Tests JSON to Parquet conversion for an asset with an entity using non-default compression.
*/
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToGzippedParquetE2E() throws IOException {
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setOutputFileCompression(DataplexCompression.GZIP);
DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
PCollection<GenericRecord> readParquetFile =
    readPipeline.apply(
        "ReadParquetFile",
        ParquetConverters.ReadParquetFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
PAssert.that(readParquetFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
readPipeline.run();
}
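Outside of the test harness, these same options would normally be populated from command-line flags rather than setters; Beam derives the flag names from the option getters (for example, getOutputFileCompression becomes --outputFileCompression). The entry point below is an assumed sketch of that wiring, not the template's actual main method.
public static void main(String[] args) {
  // e.g. --inputAssetOrEntitiesList=... --outputFileFormat=PARQUET
  //      --outputAsset=... --outputFileCompression=GZIP
  FileFormatConversionOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(FileFormatConversionOptions.class);
  // The real template then constructs the DataplexClient and output path provider before
  // calling DataplexFileFormatConversion.run(...); omitted in this sketch.
}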