
Example 1 with StatefulRowCleaner

Use of com.google.cloud.teleport.v2.transforms.StatefulRowCleaner in the DataflowTemplates project by GoogleCloudPlatform.

From the class DataStreamToBigQuery, method run:

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to TableRow Collection
     *       - Optionally apply a UDF
     *   3) BigQuery Output of TableRow Data
     *     a) Map New Columns & Write to Staging Tables
     *     b) Map New Columns & Merge Staging to Target Table
     *   4) Write Failures to GCS Dead Letter Queue
     */
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String bigqueryProjectId = getBigQueryProjectId(options);
    String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
    String tempDlqDir = dlqManager.getRetryDlqDirectory() + "tmp/";
    InputUDFToTableRow<String> failsafeTableRowTransformer =
        new InputUDFToTableRow<String>(
            options.getJavascriptTextTransformGcsPath(),
            options.getJavascriptTextTransformFunctionName(),
            options.getPythonTextTransformGcsPath(),
            options.getPythonTextTransformFunctionName(),
            options.getRuntimeRetries(),
            FAILSAFE_ELEMENT_CODER);
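    // The stateful cleaner is applied below as the "UDF to TableRow/Oracle Cleaner"
    // step; rows it cannot repair come out on statefulCleaner.failureTag and are
    // routed to the Dead Letter Queue in Stage 4.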
    StatefulRowCleaner statefulCleaner = StatefulRowCleaner.of();
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     *   b) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
     *     (dlqJsonRecords)
     *   c) Flatten DataStream and DLQ Streams (jsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withFileReadConcurrency(options.getFileReadConcurrency()));
    // Elements previously written to the Dead Letter Queue are reconsumed here.
    // The DeadLetterQueueManager built from the pipeline options is responsible
    // for constructing the DLQ paths and readers.
    PCollection<FailsafeElement<String, String>> dlqJsonRecords =
        pipeline
            .apply("DLQ Consumer/reader", dlqManager.dlqReconsumer(options.getDlqRetryMinutes()))
            .apply(
                "DLQ Consumer/cleaner",
                ParDo.of(
                    new DoFn<String, FailsafeElement<String, String>>() {
                        @ProcessElement
                        public void process(
                                @Element String input,
                                OutputReceiver<FailsafeElement<String, String>> receiver) {
                            receiver.output(FailsafeElement.of(input, input));
                        }
                    }))
            .setCoder(FAILSAFE_ELEMENT_CODER);
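    // Reconsumed DLQ records arrive as plain JSON strings, so the cleaner above
    // re-wraps each one as a FailsafeElement with the raw string as both the
    // original and the current payload.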
    PCollection<FailsafeElement<String, String>> jsonRecords =
        PCollectionList.of(datastreamJsonRecords)
            .and(dlqJsonRecords)
            .apply("Merge Datastream & DLQ", Flatten.pCollections());
    /*
     * Stage 2: Write JSON Strings to TableRow PCollectionTuple
     *   a) Optionally apply a Javascript or Python UDF
     *   b) Convert JSON String FailsafeElements to TableRows (tableRowRecords)
     */
    PCollectionTuple tableRowRecords = jsonRecords.apply("UDF to TableRow/udf", failsafeTableRowTransformer);
    PCollectionTuple cleanedRows =
        tableRowRecords
            .get(failsafeTableRowTransformer.transformOut)
            .apply("UDF to TableRow/Oracle Cleaner", statefulCleaner);
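    // Redistribute successfully cleaned rows across 100 random keys; the reshuffle
    // breaks fusion with the upstream stateful step and spreads the write load.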
    PCollection<TableRow> shuffledTableRows =
        cleanedRows
            .get(statefulCleaner.successTag)
            .apply(
                "UDF to TableRow/ReShuffle",
                Reshuffle.<TableRow>viaRandomKey().withNumBuckets(100));
    /*
     * Stage 3: BigQuery Output of TableRow Data
     *   a) Map New Columns & Write to Staging Tables (writeResult)
     *   b) Map New Columns & Merge Staging to Target Table (null)
     *
     *   failsafe: writeResult.getFailedInsertsWithErr()
     */
    // TODO(beam 2.23): InsertRetryPolicy should be CDC compliant
    Set<String> fieldsToIgnore = getFieldsToIgnore(options.getIgnoreFields());
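    // Ignored fields are stripped twice: during schema mapping in DataStreamMapper
    // and again per-row in the write's format function below.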
    WriteResult writeResult =
        shuffledTableRows
            .apply(
                "Map to Staging Tables",
                new DataStreamMapper(
                        options.as(GcpOptions.class),
                        options.getOutputProjectId(),
                        options.getOutputStagingDatasetTemplate(),
                        options.getOutputStagingTableNameTemplate())
                    .withDataStreamRootUrl(options.getDataStreamRootUrl())
                    .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
                    .withDayPartitioning(true)
                    .withIgnoreFields(fieldsToIgnore))
            .apply(
                "Write Successful Records",
                BigQueryIO.<KV<TableId, TableRow>>write()
                    .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
                    .withFormatFunction(
                        element -> removeTableRowFields(element.getValue(), fieldsToIgnore))
                    .withFormatRecordOnFailureFunction(element -> element.getValue())
                    .withoutValidation()
                    .ignoreInsertIds()
                    .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                    .withExtendedErrorInfo()
                    .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
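    // Note: CREATE_NEVER presumes the "Map to Staging Tables" step has already
    // created (and evolved) the staging tables before rows reach the
    // streaming-insert write.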
    if (options.getApplyMerge()) {
        shuffledTableRows
            .apply(
                "Map To Replica Tables",
                new DataStreamMapper(
                        options.as(GcpOptions.class),
                        options.getOutputProjectId(),
                        options.getOutputDatasetTemplate(),
                        options.getOutputTableNameTemplate())
                    .withDataStreamRootUrl(options.getDataStreamRootUrl())
                    .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
                    .withIgnoreFields(fieldsToIgnore))
            .apply(
                "BigQuery Merge/Build MergeInfo",
                new MergeInfoMapper(
                    bigqueryProjectId,
                    options.getOutputStagingDatasetTemplate(),
                    options.getOutputStagingTableNameTemplate(),
                    options.getOutputDatasetTemplate(),
                    options.getOutputTableNameTemplate()))
            .apply(
                "BigQuery Merge/Merge into Replica Tables",
                BigQueryMerger.of(
                    MergeConfiguration.bigQueryConfiguration()
                        .withMergeWindowDuration(
                            Duration.standardMinutes(options.getMergeFrequencyMinutes()))));
    }
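    // When merge is enabled, staging rows are periodically folded into the
    // replica tables on the cadence set by getMergeFrequencyMinutes().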
    /*
     * Stage 4: Write Failures to GCS Dead Letter Queue
     */
    PCollection<String> udfDlqJson =
        PCollectionList.of(tableRowRecords.get(failsafeTableRowTransformer.udfDeadletterOut))
            .and(tableRowRecords.get(failsafeTableRowTransformer.transformDeadletterOut))
            .apply("Transform Failures/Flatten", Flatten.pCollections())
            .apply(
                "Transform Failures/Sanitize",
                MapElements.via(new StringDeadLetterQueueSanitizer()));
    PCollection<String> rowCleanerJson =
        cleanedRows
            .get(statefulCleaner.failureTag)
            .apply(
                "Transform Failures/Oracle Cleaner Failures",
                MapElements.via(new RowCleanerDeadLetterQueueSanitizer()));
    PCollection<String> bqWriteDlqJson =
        writeResult
            .getFailedInsertsWithErr()
            .apply("BigQuery Failures", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
    PCollectionList.of(udfDlqJson)
        .and(rowCleanerJson)
        .and(bqWriteDlqJson)
        .apply("Write To DLQ/Flatten", Flatten.pCollections())
        .apply(
            "Write To DLQ/Writer",
            DLQWriteTransform.WriteDLQ.newBuilder()
                .withDlqDirectory(dlqDirectory)
                .withTmpDirectory(tempDlqDir)
                .setIncludePaneInfo(true)
                .build());
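    // All three failure streams converge on a single DLQ writer; retriable
    // elements re-enter the pipeline through the "DLQ Consumer/reader" step above.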
    // Execute the pipeline and return the result.
    return pipeline.run();
}
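For reference, the StatefulRowCleaner pattern can be isolated from the rest of the template as in the sketch below. This is a minimal illustration, not template code: the method name cleanAndCollectFailures, the input tableRows, and the step names are hypothetical, while StatefulRowCleaner.of(), successTag, failureTag, and RowCleanerDeadLetterQueueSanitizer are used exactly as in the example above.

// Minimal sketch (hypothetical wrapper): isolates the StatefulRowCleaner
// success/failure pattern. `tableRows` is an assumed PCollection<TableRow>.
static PCollection<String> cleanAndCollectFailures(PCollection<TableRow> tableRows) {
    StatefulRowCleaner statefulCleaner = StatefulRowCleaner.of();

    // The cleaner produces tagged outputs: successTag carries cleaned rows,
    // failureTag carries rows it could not repair.
    PCollectionTuple cleaned = tableRows.apply("Oracle Cleaner", statefulCleaner);

    // Cleaned rows would continue toward the BigQuery write, as in the template.
    PCollection<TableRow> goodRows = cleaned.get(statefulCleaner.successTag);

    // Failures are sanitized into DLQ-ready JSON strings, mirroring Stage 4 above.
    return cleaned
        .get(statefulCleaner.failureTag)
        .apply(
            "Sanitize Failures",
            MapElements.via(new RowCleanerDeadLetterQueueSanitizer()));
}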
Also used: PipelineResult (org.apache.beam.sdk.PipelineResult), TableId (com.google.cloud.bigquery.TableId), InsertRetryPolicy (org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy), LoggerFactory (org.slf4j.LoggerFactory), DLQWriteTransform (com.google.cloud.teleport.v2.transforms.DLQWriteTransform), InputUDFOptions (com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFOptions), Description (org.apache.beam.sdk.options.Description), PCollectionList (org.apache.beam.sdk.values.PCollectionList), TableRow (com.google.api.services.bigquery.model.TableRow), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), BigQueryMerger (com.google.cloud.teleport.v2.cdc.merge.BigQueryMerger), Splitter (com.google.common.base.Splitter), Flatten (org.apache.beam.sdk.transforms.Flatten), MapElements (org.apache.beam.sdk.transforms.MapElements), DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager), MergeInfoMapper (com.google.cloud.teleport.v2.cdc.mappers.MergeInfoMapper), GcpOptions (org.apache.beam.sdk.extensions.gcp.options.GcpOptions), StreamingOptions (org.apache.beam.sdk.options.StreamingOptions), Set (java.util.Set), CreateDisposition (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition), RowCleanerDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.transforms.StatefulRowCleaner.RowCleanerDeadLetterQueueSanitizer), ParDo (org.apache.beam.sdk.transforms.ParDo), StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Pattern (java.util.regex.Pattern), MergeConfiguration (com.google.cloud.teleport.v2.cdc.merge.MergeConfiguration), KV (org.apache.beam.sdk.values.KV), DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), Default (org.apache.beam.sdk.options.Default), Duration (org.joda.time.Duration), StatefulRowCleaner (com.google.cloud.teleport.v2.transforms.StatefulRowCleaner), BigQueryOptions (org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions), PipelineOptionsFactory (org.apache.beam.sdk.options.PipelineOptionsFactory), StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder), HashSet (java.util.HashSet), DataStreamMapper (com.google.cloud.teleport.v2.cdc.mappers.DataStreamMapper), FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder), BigQueryDefaultSchemas (com.google.cloud.teleport.v2.cdc.mappers.BigQueryDefaultSchemas), TupleTag (org.apache.beam.sdk.values.TupleTag), Pipeline (org.apache.beam.sdk.Pipeline), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), DoFn (org.apache.beam.sdk.transforms.DoFn), Reshuffle (org.apache.beam.sdk.transforms.Reshuffle), Logger (org.slf4j.Logger), BigQueryIO (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO), DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult), PCollection (org.apache.beam.sdk.values.PCollection), WriteDisposition (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition), InputUDFToTableRow (com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow)
