
Example 36 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in the DataflowTemplates project by GoogleCloudPlatform.
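FailsafeElement pairs an element's original payload with its current (possibly transformed) payload, so that when a UDF, conversion, or BigQuery insert fails the source record can still be routed to a dead letter queue. Below is a minimal orientation sketch; FailsafeElement.of and the two getters appear in the examples on this page, while the error setter and the class name are assumptions added for illustration.

import com.google.cloud.teleport.v2.values.FailsafeElement;

/** Minimal sketch (not from the examples below) of the FailsafeElement contract. */
public class FailsafeElementSketch {

    public static void main(String[] args) {
        // of(originalPayload, currentPayload) keeps the source record next to the working copy.
        FailsafeElement<String, String> element = FailsafeElement.of("{\"id\":1}", "{\"id\":1}");
        System.out.println(element.getOriginalPayload());
        System.out.println(element.getPayload());
        // Assumption: failures are annotated before being written to a dead letter queue.
        element.setErrorMessage("UDF threw an exception");
    }
}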

From the class PubSubCdcToBigQuery, method run.

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String gcsOutputDateTimeDirectory = null;
    if (options.getDeadLetterQueueDirectory() != null) {
        gcsOutputDateTimeDirectory = dlqManager.getRetryDlqDirectory() + "YYYY/MM/DD/HH/mm/";
    }
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(CODER.getEncodedTypeDescriptor(), CODER);
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    InputUDFToTableRow<String> failsafeTableRowTransformer =
        new InputUDFToTableRow<String>(
            options.getJavascriptTextTransformGcsPath(),
            options.getJavascriptTextTransformFunctionName(),
            options.getPythonTextTransformGcsPath(),
            options.getPythonTextTransformFunctionName(),
            options.getRuntimeRetries(),
            FAILSAFE_ELEMENT_CODER);
    BigQueryTableConfigManager bqConfigManager =
        new BigQueryTableConfigManager(
            (String) options.as(GcpOptions.class).getProject(),
            (String) options.getOutputDatasetTemplate(),
            (String) options.getOutputTableNameTemplate(),
            (String) options.getOutputTableSpec());
    /*
     * Steps:
     *  1) Read messages in from Pub/Sub
     *  2) Transform the PubsubMessages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *     - Automap new objects to BigQuery if enabled
     *     - Write records to BigQuery tables
     *  4) Write failed records out to BigQuery
     */
    /*
     * Step #1: Read messages in from Pub/Sub
     */
    PCollection<PubsubMessage> messages = pipeline.apply("ReadPubSubSubscription", PubsubIO.readMessagesWithAttributes().fromSubscription(options.getInputSubscription()));
    PCollection<FailsafeElement<String, String>> jsonRecords;
    if (options.getDeadLetterQueueDirectory() != null) {
        PCollection<FailsafeElement<String, String>> failsafeMessages = messages.apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()));
        PCollection<FailsafeElement<String, String>> dlqJsonRecords = pipeline.apply(dlqManager.dlqReconsumer()).apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

            @ProcessElement
            public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
                receiver.output(FailsafeElement.of(input, input));
            }
        })).setCoder(FAILSAFE_ELEMENT_CODER);
        jsonRecords = PCollectionList.of(failsafeMessages).and(dlqJsonRecords).apply(Flatten.pCollections());
    } else {
        jsonRecords = messages.apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()));
    }
    PCollectionTuple convertedTableRows = jsonRecords.apply(Reshuffle.<FailsafeElement<String, String>>viaRandomKey().withNumBuckets(options.getThreadCount())).apply("ApplyUdfAndConvertToTableRow", failsafeTableRowTransformer);
    /*
     * Step #3: Write the successful records out to BigQuery
     *   Either extract table destination only
     *   or extract table destination and auto-map new columns
     */
    PCollection<KV<TableId, TableRow>> tableEvents;
    if (options.getAutoMapTables()) {
        tableEvents = convertedTableRows.get(failsafeTableRowTransformer.transformOut).apply("Map Data to BigQuery Tables", new BigQueryMappers(bqConfigManager.getProjectId()).buildBigQueryTableMapper(bqConfigManager.getDatasetTemplate(), bqConfigManager.getTableTemplate()).withDefaultSchemaFromGCS(options.getSchemaFilePath()));
    } else {
        tableEvents = convertedTableRows.get(failsafeTableRowTransformer.transformOut).apply("ExtractBigQueryTableDestination", BigQueryDynamicConverters.extractTableRowDestination(bqConfigManager.getProjectId(), bqConfigManager.getDatasetTemplate(), bqConfigManager.getTableTemplate()));
    }
    /*
     * Step #3: Cont.
     *    - Write rows out to BigQuery
     */
    // TODO(https://github.com/apache/beam/pull/12004): Switch out alwaysRetry
    WriteResult writeResult =
        tableEvents.apply(
            "WriteSuccessfulRecords",
            BigQueryIO.<KV<TableId, TableRow>>write()
                .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
                .withFormatFunction(element -> element.getValue())
                .withoutValidation()
                .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                .withExtendedErrorInfo()
                .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                .withFailedInsertRetryPolicy(InsertRetryPolicy.alwaysRetry()));
    // TODO: Cover tableRowRecords.get(TRANSFORM_DEADLETTER_OUT) error values
    if (options.getDeadLetterQueueDirectory() != null) {
        writeResult
            .getFailedInsertsWithErr()
            .apply("DLQ: Write Insert Failures to GCS", MapElements.via(new BigQueryDeadLetterQueueSanitizer()))
            .apply(
                "Creating " + options.getWindowDuration() + " Window",
                Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
            .apply(
                "DLQ: Write File(s)",
                TextIO.write()
                    .withWindowedWrites()
                    .withNumShards(20)
                    .to(new WindowedFilenamePolicy(gcsOutputDateTimeDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
                    .withTempDirectory(
                        FileBasedSink.convertToFileResourceIfPossible(options.getDeadLetterQueueDirectory())));
        PCollection<FailsafeElement<String, String>> transformDeadletter =
            PCollectionList.of(
                    ImmutableList.of(
                        convertedTableRows.get(failsafeTableRowTransformer.udfDeadletterOut),
                        convertedTableRows.get(failsafeTableRowTransformer.transformDeadletterOut)))
                .apply("Flatten", Flatten.pCollections())
                .apply(
                    "Creating " + options.getWindowDuration() + " Window",
                    Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))));
        PCollection<String> dlqWindowing =
            transformDeadletter
                .apply("Sanitize records", MapElements.via(new StringDeadLetterQueueSanitizer()))
                .setCoder(StringUtf8Coder.of());
        dlqWindowing.apply(
            "DLQ: Write File(s)",
            TextIO.write()
                .withWindowedWrites()
                .withNumShards(20)
                .to(new WindowedFilenamePolicy(gcsOutputDateTimeDirectory, "error", "-SSSSS-of-NNNNN", ".json"))
                .withTempDirectory(FileBasedSink.convertToFileResourceIfPossible(gcsOutputDateTimeDirectory + "tmp/")));
    } else {
        PCollection<FailsafeElement<String, String>> failedInserts =
            writeResult
                .getFailedInsertsWithErr()
                .apply(
                    "WrapInsertionErrors",
                    MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                        .via((BigQueryInsertError e) -> BigQueryConverters.wrapBigQueryInsertError(e)))
                .setCoder(FAILSAFE_ELEMENT_CODER);
        /*
         * Step #4: Write records that failed table row transformation
         * or conversion out to BigQuery deadletter table.
         */
        PCollectionList.of(
                ImmutableList.of(
                    convertedTableRows.get(failsafeTableRowTransformer.udfDeadletterOut),
                    convertedTableRows.get(failsafeTableRowTransformer.transformDeadletterOut)))
            .apply("Flatten", Flatten.pCollections())
            .apply(
                "WriteFailedRecords",
                ErrorConverters.WriteStringMessageErrors.newBuilder()
                    .setErrorRecordsTable(
                        BigQueryConverters.maybeUseDefaultDeadletterTable(
                            options.getOutputDeadletterTable(),
                            bqConfigManager.getOutputTableSpec(),
                            DEFAULT_DEADLETTER_TABLE_SUFFIX))
                    .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                    .build());
        // 5) Insert records that failed insert into deadletter table
        failedInserts.apply(
            "WriteFailedRecords",
            ErrorConverters.WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    BigQueryConverters.maybeUseDefaultDeadletterTable(
                        options.getOutputDeadletterTable(),
                        bqConfigManager.getOutputTableSpec(),
                        DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                .build());
    }
    return pipeline.run();
}
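The run method above only constructs and launches the job. A typical entry point (a sketch, not part of the snippet above; it assumes the template's Options interface and the standard PipelineOptionsFactory flow) parses the arguments and optionally blocks:

public static void main(String[] args) {
    // Parse and validate the template options, then launch the pipeline.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    PipelineResult result = run(options);
    // Uncomment to block until the (streaming) job terminates, e.g. for tests or local runs.
    // result.waitUntilFinish();
}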
Also used : TableId(com.google.cloud.bigquery.TableId) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PubSubToFailSafeElement(com.google.cloud.teleport.v2.transforms.PubSubToFailSafeElement) PipelineResult(org.apache.beam.sdk.PipelineResult) TableId(com.google.cloud.bigquery.TableId) InsertRetryPolicy(org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy) LoggerFactory(org.slf4j.LoggerFactory) InputUDFOptions(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFOptions) Description(org.apache.beam.sdk.options.Description) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) PCollectionList(org.apache.beam.sdk.values.PCollectionList) TableRow(com.google.api.services.bigquery.model.TableRow) Window(org.apache.beam.sdk.transforms.windowing.Window) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) ErrorConverters(com.google.cloud.teleport.v2.transforms.ErrorConverters) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) WindowedFilenamePolicy(com.google.cloud.teleport.v2.io.WindowedFilenamePolicy) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) BigQueryDynamicConverters(com.google.cloud.teleport.v2.transforms.BigQueryDynamicConverters) ParDo(org.apache.beam.sdk.transforms.ParDo) StringDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) DurationUtils(com.google.cloud.teleport.v2.utils.DurationUtils) KV(org.apache.beam.sdk.values.KV) Default(org.apache.beam.sdk.options.Default) BigQueryConverters(com.google.cloud.teleport.v2.transforms.BigQueryConverters) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) PubsubMessageWithAttributesCoder(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessageWithAttributesCoder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) FileBasedSink(org.apache.beam.sdk.io.FileBasedSink) FailsafeElementCoder(com.google.cloud.teleport.v2.coders.FailsafeElementCoder) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) BigQueryTableConfigManager(com.google.cloud.teleport.v2.transforms.BigQueryConverters.BigQueryTableConfigManager) BigQueryInsertError(org.apache.beam.sdk.io.gcp.bigquery.BigQueryInsertError) DoFn(org.apache.beam.sdk.transforms.DoFn) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) Logger(org.slf4j.Logger) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) BigQueryDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.BigQueryDeadLetterQueueSanitizer) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollection(org.apache.beam.sdk.values.PCollection) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) BigQueryMappers(com.google.cloud.teleport.v2.cdc.mappers.BigQueryMappers) ResourceUtils(com.google.cloud.teleport.v2.utils.ResourceUtils) InputUDFToTableRow(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow) TextIO(org.apache.beam.sdk.io.TextIO) 

Example 37 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in the DataflowTemplates project by GoogleCloudPlatform.

From the class CsvConvertersTest, method testLineToFailsafeJsonNoHeadersUdfDeadletter.

/**
 * Tests that {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JavaScript UDF. UDF processing is handled by {@link
 * JavascriptTextTransformer}. The record should be output to the deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
    FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    PCollection<String> lines = pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));
    PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);
    CsvConverters.CsvPipelineOptions options = PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);
    options.setDelimiter(",");
    options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");
    PCollectionTuple failsafe =
        linesTuple.apply(
            "TestLineToFailsafeJsonNoHeadersUdfBad",
            CsvConverters.LineToFailsafeJson.newBuilder()
                .setDelimiter(options.getDelimiter())
                .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                .setJsonSchemaPath(options.getJsonSchemaPath())
                .setJsonSchemaPath(null)
                .setHeaderTag(CSV_HEADERS)
                .setLineTag(CSV_LINES)
                .setUdfOutputTag(PROCESSING_OUT)
                .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                .build());
    PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
    PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT)).satisfies(collection -> {
        FailsafeElement result = collection.iterator().next();
        assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
        return null;
    });
    pipeline.run();
}
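The tags, fixtures, and coder referenced in this test are declared in the surrounding test class, which is not shown here. A hedged sketch of plausible declarations (the names match the test; the concrete values are assumptions):

// Hypothetical fixtures matching the names used in the test above.
private static final String BAD_JSON_STRING_RECORD = "this is not valid json";
private static final TupleTag<String> CSV_HEADERS = new TupleTag<String>() {};
private static final TupleTag<String> CSV_LINES = new TupleTag<String>() {};
private static final TupleTag<FailsafeElement<String, String>> PROCESSING_OUT =
    new TupleTag<FailsafeElement<String, String>>() {};
private static final TupleTag<FailsafeElement<String, String>> PROCESSING_DEADLETTER_OUT =
    new TupleTag<FailsafeElement<String, String>>() {};
private static final FailsafeElementCoder<String, String> FAILSAFE_ELEMENT_CODER =
    FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());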
Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Test(org.junit.Test)

Example 38 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in the DataflowTemplates project by GoogleCloudPlatform.

From the class DataStreamToBigQuery, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to TableRow Collection
     *       - Optionally apply a UDF
     *   3) BigQuery Output of TableRow Data
     *     a) Map New Columns & Write to Staging Tables
     *     b) Map New Columns & Merge Staging to Target Table
     *   4) Write Failures to GCS Dead Letter Queue
     */
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String bigqueryProjectId = getBigQueryProjectId(options);
    String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
    String tempDlqDir = dlqManager.getRetryDlqDirectory() + "tmp/";
    InputUDFToTableRow<String> failsafeTableRowTransformer =
        new InputUDFToTableRow<String>(
            options.getJavascriptTextTransformGcsPath(),
            options.getJavascriptTextTransformFunctionName(),
            options.getPythonTextTransformGcsPath(),
            options.getPythonTextTransformFunctionName(),
            options.getRuntimeRetries(),
            FAILSAFE_ELEMENT_CODER);
    StatefulRowCleaner statefulCleaner = StatefulRowCleaner.of();
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     *   b) Reconsume Dead Letter Queue data from GCS into JSON String FailsafeElements
     *     (dlqJsonRecords)
     *   c) Flatten DataStream and DLQ Streams (jsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords = pipeline.apply(new DataStreamIO(options.getStreamName(), options.getInputFilePattern(), options.getInputFileFormat(), options.getGcsPubSubSubscription(), options.getRfcStartDateTime()).withFileReadConcurrency(options.getFileReadConcurrency()));
    // Elements sent to the Dead Letter Queue are to be reconsumed.
    // A DLQManager is to be created using PipelineOptions, and it is in charge
    // of building pieces of the DLQ.
    PCollection<FailsafeElement<String, String>> dlqJsonRecords = pipeline.apply("DLQ Consumer/reader", dlqManager.dlqReconsumer(options.getDlqRetryMinutes())).apply("DLQ Consumer/cleaner", ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

        @ProcessElement
        public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
            receiver.output(FailsafeElement.of(input, input));
        }
    })).setCoder(FAILSAFE_ELEMENT_CODER);
    PCollection<FailsafeElement<String, String>> jsonRecords = PCollectionList.of(datastreamJsonRecords).and(dlqJsonRecords).apply("Merge Datastream & DLQ", Flatten.pCollections());
    /*
     * Stage 2: Write JSON Strings to TableRow PCollectionTuple
     *   a) Optionally apply a Javascript or Python UDF
     *   b) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
     */
    PCollectionTuple tableRowRecords = jsonRecords.apply("UDF to TableRow/udf", failsafeTableRowTransformer);
    PCollectionTuple cleanedRows = tableRowRecords.get(failsafeTableRowTransformer.transformOut).apply("UDF to TableRow/Oracle Cleaner", statefulCleaner);
    PCollection<TableRow> shuffledTableRows = cleanedRows.get(statefulCleaner.successTag).apply("UDF to TableRow/ReShuffle", Reshuffle.<TableRow>viaRandomKey().withNumBuckets(100));
    /*
     * Stage 3: BigQuery Output of TableRow Data
     *   a) Map New Columns & Write to Staging Tables (writeResult)
     *   b) Map New Columns & Merge Staging to Target Table (null)
     *
     *   failsafe: writeResult.getFailedInsertsWithErr()
     */
    // TODO(beam 2.23): InsertRetryPolicy should be CDC compliant
    Set<String> fieldsToIgnore = getFieldsToIgnore(options.getIgnoreFields());
    WriteResult writeResult =
        shuffledTableRows
            .apply(
                "Map to Staging Tables",
                new DataStreamMapper(
                        options.as(GcpOptions.class),
                        options.getOutputProjectId(),
                        options.getOutputStagingDatasetTemplate(),
                        options.getOutputStagingTableNameTemplate())
                    .withDataStreamRootUrl(options.getDataStreamRootUrl())
                    .withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA)
                    .withDayPartitioning(true)
                    .withIgnoreFields(fieldsToIgnore))
            .apply(
                "Write Successful Records",
                BigQueryIO.<KV<TableId, TableRow>>write()
                    .to(new BigQueryDynamicConverters().bigQueryDynamicDestination())
                    .withFormatFunction(element -> removeTableRowFields(element.getValue(), fieldsToIgnore))
                    .withFormatRecordOnFailureFunction(element -> element.getValue())
                    .withoutValidation()
                    .ignoreInsertIds()
                    .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                    .withExtendedErrorInfo()
                    .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
    if (options.getApplyMerge()) {
        shuffledTableRows.apply("Map To Replica Tables", new DataStreamMapper(options.as(GcpOptions.class), options.getOutputProjectId(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate()).withDataStreamRootUrl(options.getDataStreamRootUrl()).withDefaultSchema(BigQueryDefaultSchemas.DATASTREAM_METADATA_SCHEMA).withIgnoreFields(fieldsToIgnore)).apply("BigQuery Merge/Build MergeInfo", new MergeInfoMapper(bigqueryProjectId, options.getOutputStagingDatasetTemplate(), options.getOutputStagingTableNameTemplate(), options.getOutputDatasetTemplate(), options.getOutputTableNameTemplate())).apply("BigQuery Merge/Merge into Replica Tables", BigQueryMerger.of(MergeConfiguration.bigQueryConfiguration().withMergeWindowDuration(Duration.standardMinutes(options.getMergeFrequencyMinutes()))));
    }
    /*
     * Stage 4: Write Failures to GCS Dead Letter Queue
     */
    PCollection<String> udfDlqJson =
        PCollectionList.of(tableRowRecords.get(failsafeTableRowTransformer.udfDeadletterOut))
            .and(tableRowRecords.get(failsafeTableRowTransformer.transformDeadletterOut))
            .apply("Transform Failures/Flatten", Flatten.pCollections())
            .apply("Transform Failures/Sanitize", MapElements.via(new StringDeadLetterQueueSanitizer()));
    PCollection<String> rowCleanerJson =
        cleanedRows
            .get(statefulCleaner.failureTag)
            .apply("Transform Failures/Oracle Cleaner Failures", MapElements.via(new RowCleanerDeadLetterQueueSanitizer()));
    PCollection<String> bqWriteDlqJson =
        writeResult
            .getFailedInsertsWithErr()
            .apply("BigQuery Failures", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
    PCollectionList.of(udfDlqJson)
        .and(rowCleanerJson)
        .and(bqWriteDlqJson)
        .apply("Write To DLQ/Flatten", Flatten.pCollections())
        .apply(
            "Write To DLQ/Writer",
            DLQWriteTransform.WriteDLQ.newBuilder()
                .withDlqDirectory(dlqDirectory)
                .withTmpDirectory(tempDlqDir)
                .setIncludePaneInfo(true)
                .build());
    // Execute the pipeline and return the result.
    return pipeline.run();
}
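getFieldsToIgnore and removeTableRowFields are private helpers of the template that are not included in this excerpt; only their signatures are implied by the calls above. A rough sketch of what they could look like (the implementations are assumptions):

// Sketch only: plausible bodies for the helpers referenced in run().
private static Set<String> getFieldsToIgnore(String ignoreFields) {
    // Comma-separated option value -> set of column names to drop before writing.
    return new HashSet<>(Splitter.on(",").trimResults().omitEmptyStrings().splitToList(ignoreFields));
}

private static TableRow removeTableRowFields(TableRow row, Set<String> ignoreFields) {
    // Copy every field except the ignored ones; the incoming row is left untouched.
    TableRow cleaned = new TableRow();
    row.forEach((name, value) -> {
        if (!ignoreFields.contains(name)) {
            cleaned.set(name, value);
        }
    });
    return cleaned;
}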
Also used : TableId(com.google.cloud.bigquery.TableId) PipelineResult(org.apache.beam.sdk.PipelineResult) TableId(com.google.cloud.bigquery.TableId) InsertRetryPolicy(org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy) LoggerFactory(org.slf4j.LoggerFactory) DLQWriteTransform(com.google.cloud.teleport.v2.transforms.DLQWriteTransform) InputUDFOptions(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFOptions) Description(org.apache.beam.sdk.options.Description) PCollectionList(org.apache.beam.sdk.values.PCollectionList) TableRow(com.google.api.services.bigquery.model.TableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) BigQueryMerger(com.google.cloud.teleport.v2.cdc.merge.BigQueryMerger) Splitter(com.google.common.base.Splitter) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) MergeInfoMapper(com.google.cloud.teleport.v2.cdc.mappers.MergeInfoMapper) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) StreamingOptions(org.apache.beam.sdk.options.StreamingOptions) Set(java.util.Set) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) RowCleanerDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.transforms.StatefulRowCleaner.RowCleanerDeadLetterQueueSanitizer) ParDo(org.apache.beam.sdk.transforms.ParDo) StringDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Pattern(java.util.regex.Pattern) MergeConfiguration(com.google.cloud.teleport.v2.cdc.merge.MergeConfiguration) KV(org.apache.beam.sdk.values.KV) DataStreamIO(com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) Default(org.apache.beam.sdk.options.Default) Duration(org.joda.time.Duration) StatefulRowCleaner(com.google.cloud.teleport.v2.transforms.StatefulRowCleaner) BigQueryOptions(org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) HashSet(java.util.HashSet) DataStreamMapper(com.google.cloud.teleport.v2.cdc.mappers.DataStreamMapper) FailsafeElementCoder(com.google.cloud.teleport.v2.coders.FailsafeElementCoder) BigQueryDefaultSchemas(com.google.cloud.teleport.v2.cdc.mappers.BigQueryDefaultSchemas) TupleTag(org.apache.beam.sdk.values.TupleTag) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) DoFn(org.apache.beam.sdk.transforms.DoFn) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) Logger(org.slf4j.Logger) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollection(org.apache.beam.sdk.values.PCollection) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) InputUDFToTableRow(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) RowCleanerDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.transforms.StatefulRowCleaner.RowCleanerDeadLetterQueueSanitizer) 

Example 39 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in the DataflowTemplates project by GoogleCloudPlatform.

From the class PubsubToJdbc, method run.

/**
 * Runs a pipeline which reads message from Pub/Sub and writes to JdbcIO.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(PubsubToJdbcOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    LOG.info("Starting Pubsub-to-Jdbc Pipeline.");
    /*
     * Steps:
     *  1) Read data from a Pub/Sub subscription
     *  2) Write to Jdbc Table
     *  3) Write errors to deadletter topic
     */
    PCollection<String> pubsubData = pipeline.apply("readFromPubSubSubscription", PubsubIO.readStrings().fromSubscription(options.getInputSubscription()));
    DynamicJdbcIO.DynamicDataSourceConfiguration dataSourceConfiguration = DynamicJdbcIO.DynamicDataSourceConfiguration.create(options.getDriverClassName(), maybeDecrypt(options.getConnectionUrl(), options.getKMSEncryptionKey())).withDriverJars(options.getDriverJars());
    if (options.getUsername() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withUsername(maybeDecrypt(options.getUsername(), options.getKMSEncryptionKey()));
    }
    if (options.getPassword() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withPassword(maybeDecrypt(options.getPassword(), options.getKMSEncryptionKey()));
    }
    if (options.getConnectionProperties() != null) {
        dataSourceConfiguration = dataSourceConfiguration.withConnectionProperties(options.getConnectionProperties());
    }
    PCollection<FailsafeElement<String, String>> errors = pubsubData.apply("writeToJdbc", DynamicJdbcIO.<String>write().withDataSourceConfiguration(dataSourceConfiguration).withStatement(options.getStatement()).withPreparedStatementSetter(new MapJsonStringToQuery(getKeyOrder(options.getStatement())))).setCoder(FAILSAFE_ELEMENT_CODER);
    errors.apply("WriteFailedRecords", ErrorConverters.WriteStringMessageErrorsToPubSub.newBuilder().setErrorRecordsTopic(options.getOutputDeadletterTopic()).build());
    return pipeline.run();
}
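The failed writes come back as FailsafeElement values, so the original Pub/Sub payload is still available when the error is published to the deadletter topic. If the failures need to be inspected in-pipeline, a small extra branch such as the following could be added (a sketch; the TypeDescriptors import and the step name are assumptions, not part of the original example):

    // Sketch only: extract the original payloads of failed JDBC writes for logging or metrics.
    PCollection<String> failedPayloads =
        errors.apply(
            "ExtractFailedPayloads",
            MapElements.into(TypeDescriptors.strings())
                .via((FailsafeElement<String, String> e) -> e.getOriginalPayload()));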
Also used : DynamicJdbcIO(com.google.cloud.teleport.v2.io.DynamicJdbcIO) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)

Example 40 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in the DataflowTemplates project by GoogleCloudPlatform.

From the class SpannerChangeStreamsToBigQuery, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(SpannerChangeStreamsToBigQueryOptions options) {
    setOptions(options);
    validateOptions(options);
    /**
     * Stages:
     *   1) Read {@link DataChangeRecord} from change stream.
     *   2) Create {@link FailsafeElement} of {@link Mod} JSON and merge from:
     *      - {@link DataChangeRecord}.
     *      - GCS Dead letter queue.
     *   3) Convert {@link Mod} JSON into {@link TableRow} by reading from Spanner at commit timestamp.
     *   4) Append {@link TableRow} to BigQuery.
     *   5) Write Failures from 2), 3) and 4) to GCS dead letter queue.
     */
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String spannerProjectId = getSpannerProjectId(options);
    String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
    String tempDlqDirectory = dlqManager.getRetryDlqDirectory() + "tmp/";
    // Retrieve and parse the startTimestamp and endTimestamp.
    Timestamp startTimestamp = options.getStartTimestamp().isEmpty() ? Timestamp.now() : Timestamp.parseTimestamp(options.getStartTimestamp());
    Timestamp endTimestamp = options.getEndTimestamp().isEmpty() ? Timestamp.MAX_VALUE : Timestamp.parseTimestamp(options.getEndTimestamp());
    SpannerConfig spannerConfig = SpannerConfig.create().withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost())).withProjectId(spannerProjectId).withInstanceId(options.getSpannerInstanceId()).withDatabaseId(options.getSpannerDatabase()).withRpcPriority(options.getSpannerRpcPriority());
    SpannerIO.ReadChangeStream readChangeStream = SpannerIO.readChangeStream().withSpannerConfig(spannerConfig).withMetadataInstance(options.getSpannerMetadataInstanceId()).withMetadataDatabase(options.getSpannerMetadataDatabase()).withChangeStreamName(options.getSpannerChangeStreamName()).withInclusiveStartAt(startTimestamp).withInclusiveEndAt(endTimestamp).withRpcPriority(options.getSpannerRpcPriority());
    String spannerMetadataTableName = options.getSpannerMetadataTableName();
    if (spannerMetadataTableName != null) {
        readChangeStream = readChangeStream.withMetadataTable(spannerMetadataTableName);
    }
    PCollection<DataChangeRecord> dataChangeRecord = pipeline.apply("Read from Spanner Change Streams", readChangeStream).apply("Reshuffle DataChangeRecord", Reshuffle.viaRandomKey());
    PCollection<FailsafeElement<String, String>> sourceFailsafeModJson = dataChangeRecord.apply("DataChangeRecord To Mod JSON", ParDo.of(new DataChangeRecordToModJsonFn())).apply("Wrap Mod JSON In FailsafeElement", ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

        @ProcessElement
        public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
            receiver.output(FailsafeElement.of(input, input));
        }
    })).setCoder(FAILSAFE_ELEMENT_CODER);
    PCollectionTuple dlqModJson = dlqManager.getReconsumerDataTransform(pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
    PCollection<FailsafeElement<String, String>> retryableDlqFailsafeModJson = dlqModJson.get(DeadLetterQueueManager.RETRYABLE_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);
    PCollection<FailsafeElement<String, String>> failsafeModJson = PCollectionList.of(sourceFailsafeModJson).and(retryableDlqFailsafeModJson).apply("Merge Source And DLQ Mod JSON", Flatten.pCollections());
    ImmutableSet.Builder<String> ignoreFieldsBuilder = ImmutableSet.builder();
    for (String ignoreField : options.getIgnoreFields().split(",")) {
        ignoreFieldsBuilder.add(ignoreField);
    }
    ImmutableSet<String> ignoreFields = ignoreFieldsBuilder.build();
    FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions failsafeModJsonToTableRowOptions =
        FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions.builder()
            .setSpannerConfig(spannerConfig)
            .setSpannerChangeStream(options.getSpannerChangeStreamName())
            .setIgnoreFields(ignoreFields)
            .setCoder(FAILSAFE_ELEMENT_CODER)
            .build();
    FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow failsafeModJsonToTableRow =
        new FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow(failsafeModJsonToTableRowOptions);
    PCollectionTuple tableRowTuple = failsafeModJson.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions bigQueryDynamicDestinationsOptions =
        BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions.builder()
            .setSpannerConfig(spannerConfig)
            .setChangeStreamName(options.getSpannerChangeStreamName())
            .setIgnoreFields(ignoreFields)
            .setBigQueryProject(getBigQueryProjectId(options))
            .setBigQueryDataset(options.getBigQueryDataset())
            .setBigQueryTableTemplate(options.getBigQueryChangelogTableNameTemplate())
            .build();
    WriteResult writeResult =
        tableRowTuple
            .get(failsafeModJsonToTableRow.transformOut)
            .apply(
                "Write To BigQuery",
                BigQueryIO.<TableRow>write()
                    .to(BigQueryDynamicDestinations.of(bigQueryDynamicDestinationsOptions))
                    .withFormatFunction(element -> removeIntermediateMetadataFields(element))
                    .withFormatRecordOnFailureFunction(element -> element)
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(Write.WriteDisposition.WRITE_APPEND)
                    .withExtendedErrorInfo()
                    .withMethod(Write.Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
    PCollection<String> transformDlqJson = tableRowTuple.get(failsafeModJsonToTableRow.transformDeadLetterOut).apply("Failed Mod JSON During Table Row Transformation", MapElements.via(new StringDeadLetterQueueSanitizer()));
    PCollection<String> bqWriteDlqJson = writeResult.getFailedInsertsWithErr().apply("Failed Mod JSON During BigQuery Writes", MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
    PCollectionList.of(transformDlqJson).and(bqWriteDlqJson).apply("Merge Failed Mod JSON From Transform And BigQuery", Flatten.pCollections()).apply("Write Failed Mod JSON To DLQ", DLQWriteTransform.WriteDLQ.newBuilder().withDlqDirectory(dlqDirectory).withTmpDirectory(tempDlqDirectory).setIncludePaneInfo(true).build());
    PCollection<FailsafeElement<String, String>> nonRetryableDlqModJsonFailsafe = dlqModJson.get(DeadLetterQueueManager.PERMANENT_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);
    nonRetryableDlqModJsonFailsafe.apply("Write Mod JSON With Non-retryable Error To DLQ", MapElements.via(new StringDeadLetterQueueSanitizer())).setCoder(StringUtf8Coder.of()).apply(DLQWriteTransform.WriteDLQ.newBuilder().withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime()).withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/").setIncludePaneInfo(true).build());
    return pipeline.run();
}
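DataChangeRecordToModJsonFn is defined elsewhere in the template; the pipeline above only implies its shape, a DoFn from DataChangeRecord to a Mod JSON string. A heavily hedged skeleton follows (the body is an assumption, shown only to make the data flow concrete):

// Sketch only: shape implied by the "DataChangeRecord To Mod JSON" step above.
static class DataChangeRecordToModJsonFn extends DoFn<DataChangeRecord, String> {

    @ProcessElement
    public void process(@Element DataChangeRecord record, OutputReceiver<String> out) {
        // Assumption: each mod of the change record becomes one Mod JSON string.
        record.getMods().forEach(mod -> out.output(mod.toString()));
    }
}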
Also used : SpannerConfig(org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) DataflowPipelineWorkerPoolOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions) Mod(com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod) PipelineResult(org.apache.beam.sdk.PipelineResult) InsertRetryPolicy(org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy) LoggerFactory(org.slf4j.LoggerFactory) Timestamp(com.google.cloud.Timestamp) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) DLQWriteTransform(com.google.cloud.teleport.v2.transforms.DLQWriteTransform) DataChangeRecord(org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord) ArrayList(java.util.ArrayList) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PCollectionList(org.apache.beam.sdk.values.PCollectionList) FailsafeElementCoder(com.google.cloud.teleport.v2.coders.FailsafeElementCoder) TableRow(com.google.api.services.bigquery.model.TableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Pipeline(org.apache.beam.sdk.Pipeline) ValueProvider(org.apache.beam.sdk.options.ValueProvider) Flatten(org.apache.beam.sdk.transforms.Flatten) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) SpannerChangeStreamsToBigQueryOptions(com.google.cloud.teleport.v2.options.SpannerChangeStreamsToBigQueryOptions) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) ImmutableSet(com.google.common.collect.ImmutableSet) Logger(org.slf4j.Logger) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) Set(java.util.Set) IOException(java.io.IOException) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollection(org.apache.beam.sdk.values.PCollection) SpannerIO(org.apache.beam.sdk.io.gcp.spanner.SpannerIO) SpannerConfig(org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) BigQueryUtils(com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.schemautils.BigQueryUtils) StringDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) DeadLetterQueueManager(com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) DataChangeRecord(org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord) Timestamp(com.google.cloud.Timestamp) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) ImmutableSet(com.google.common.collect.ImmutableSet) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) StringDeadLetterQueueSanitizer(com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) Pipeline(org.apache.beam.sdk.Pipeline) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) SpannerIO(org.apache.beam.sdk.io.gcp.spanner.SpannerIO)

Aggregations

FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement) : 31 usages
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple) : 26 usages
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry) : 21 usages
Test (org.junit.Test) : 21 usages
Pipeline (org.apache.beam.sdk.Pipeline) : 14 usages
TableRow (com.google.api.services.bigquery.model.TableRow) : 8 usages
PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) : 6 usages
DoFn (org.apache.beam.sdk.transforms.DoFn) : 6 usages
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions) : 5 usages
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest) : 5 usages
JSONObject (org.json.JSONObject) : 5 usages
DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) : 4 usages
StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) : 4 usages
DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) : 4 usages
FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder) : 4 usages
GCSToSplunk.flattenErrorsAndConvertToString (com.google.cloud.teleport.v2.templates.GCSToSplunk.flattenErrorsAndConvertToString) : 4 usages
PipelineResult (org.apache.beam.sdk.PipelineResult) : 4 usages
SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) : 4 usages
KV (org.apache.beam.sdk.values.KV) : 4 usages
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) : 3 usages