
Example 1 with Mod

Use of com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod in the project DataflowTemplates by GoogleCloudPlatform.

From the class FailsafeModJsonToTableRowTransformerTest, method testFailsafeModJsonToTableRowFailedSnapshotRead:

// Test the case where the snapshot read to Spanner fails and we can capture the failures from
// transformDeadLetterOut of FailsafeModJsonToTableRow.
@Test
public void testFailsafeModJsonToTableRowFailedSnapshotRead() throws Exception {
    ObjectNode fakePkColJsonNode = new ObjectNode(JsonNodeFactory.instance);
    fakePkColJsonNode.put("fakePkCol", true);
    ObjectNode fakeNonPkColJsonNode = new ObjectNode(JsonNodeFactory.instance);
    fakeNonPkColJsonNode.put("fakeNonPkCol", true);
    Mod mod = new Mod(
        fakePkColJsonNode.toString(),
        fakeNonPkColJsonNode.toString(),
        Timestamp.ofTimeSecondsAndNanos(1650908264L, 925679000),
        "1", true, "00000001", TEST_SPANNER_TABLE, ModType.INSERT, 1L, 1L);
    TestStream<String> testStream = TestStream.create(SerializableCoder.of(String.class))
        .addElements(mod.toJson())
        .advanceWatermarkTo(Instant.now())
        .advanceWatermarkToInfinity();
    Pipeline p = Pipeline.create();
    PCollection<FailsafeElement<String, String>> input = p
        .apply(testStream)
        .apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

            @ProcessElement
            public void process(@Element String input,
                    OutputReceiver<FailsafeElement<String, String>> receiver) {
                receiver.output(FailsafeElement.of(input, input));
            }
        }))
        .setCoder(SpannerChangeStreamsToBigQuery.FAILSAFE_ELEMENT_CODER);
    PCollectionTuple out = input.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    PAssert.that(out.get(failsafeModJsonToTableRow.transformOut)).empty();
    String expectedPayload =
        "{\"keysJson\":\"{\\\"fakePkCol\\\":true}\","
            + "\"newValuesJson\":\"{\\\"fakeNonPkCol\\\":true}\","
            + "\"commitTimestampSeconds\":1650908264,\"commitTimestampNanos\":925679000,"
            + "\"serverTransactionId\":\"1\",\"isLastRecordInTransactionInPartition\":true,"
            + "\"recordSequence\":\"00000001\",\"tableName\":\"AllTypes\",\"modType\":\"INSERT\","
            + "\"numberOfRecordsInTransaction\":1,\"numberOfPartitionsInTransaction\":1}";
    PAssert.that(
            out.get(failsafeModJsonToTableRow.transformDeadLetterOut)
                .apply(ParDo.of(new DoFn<FailsafeElement<String, String>, String>() {

                    @ProcessElement
                    public void process(@Element FailsafeElement<String, String> input,
                            OutputReceiver<String> receiver) {
                        receiver.output(String.format(
                            "originalPayload=%s, payload=%s, errorMessage=%s",
                            input.getOriginalPayload(), input.getPayload(),
                            input.getErrorMessage()));
                    }
                })))
        .containsInAnyOrder(ImmutableList.of(String.format(
            "originalPayload=%s, payload=%s, errorMessage=Cannot find value for key column"
                + " BooleanPkCol",
            expectedPayload, expectedPayload)));
    p.run().waitUntilFinish();
}
Also used: Mod (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod), ObjectNode (org.codehaus.jackson.node.ObjectNode), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Pipeline (org.apache.beam.sdk.Pipeline), DoFn (org.apache.beam.sdk.transforms.DoFn), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), Test (org.junit.Test), IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest)
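For reference, the expectedPayload asserted above is simply the Mod built in this test serialized with toJson(). Below is a minimal standalone sketch of that round trip, assuming Mod and ModType both live in the project's model package and are on the classpath; "AllTypes" is the value of TEST_SPANNER_TABLE, per the asserted payload.

import com.google.cloud.Timestamp;
import com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod;
import com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.ModType;

public class ModJsonRoundTrip {

    public static void main(String[] args) throws Exception {
        // Same argument order as the test above: keys JSON, new-values JSON, commit timestamp,
        // server transaction id, isLastRecordInTransactionInPartition, record sequence,
        // table name, mod type, and the record/partition counts for the transaction.
        Mod mod = new Mod(
            "{\"fakePkCol\":true}",
            "{\"fakeNonPkCol\":true}",
            Timestamp.ofTimeSecondsAndNanos(1650908264L, 925679000),
            "1", true, "00000001", "AllTypes", ModType.INSERT, 1L, 1L);
        // Prints the same JSON string the test asserts in expectedPayload.
        System.out.println(mod.toJson());
    }
}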

Example 2 with Mod

Use of com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod in the project DataflowTemplates by GoogleCloudPlatform.

From the class FailsafeModJsonToTableRowTransformerTest, method validateBigQueryRow:

private void validateBigQueryRow(String spannerDatabaseName, Timestamp commitTimestamp, ModType modType, String keysJson, String newValuesJson) throws Exception {
    Mod mod = new Mod(
        keysJson, newValuesJson, commitTimestamp,
        "1", true, "00000001", TEST_SPANNER_TABLE, modType, 1L, 1L);
    TableRow expectedTableRow = new TableRow();
    BigQueryUtils.setMetadataFiledsOfTableRow(TEST_SPANNER_TABLE, mod, mod.toJson(), commitTimestamp, expectedTableRow);
    expectedTableRow.set(BOOLEAN_PK_COL, BOOLEAN_RAW_VAL);
    expectedTableRow.set(BYTES_PK_COL, BYTES_RAW_VAL.toBase64());
    expectedTableRow.set(DATE_PK_COL, DATE_RAW_VAL.toString());
    expectedTableRow.set(FLOAT64_PK_COL, FLOAT64_RAW_VAL);
    expectedTableRow.set(INT64_PK_COL, INT64_RAW_VAL);
    expectedTableRow.set(NUMERIC_PK_COL, NUMERIC_RAW_VAL);
    expectedTableRow.set(STRING_PK_COL, STRING_RAW_VAL);
    expectedTableRow.set(TIMESTAMP_PK_COL, TIMESTAMP_RAW_VAL.toString());
    if (modType == ModType.INSERT || modType == ModType.UPDATE) {
        // The non-key columns below are populated for both INSERT and UPDATE.
        if (modType == ModType.UPDATE) {
            expectedTableRow.set(TIMESTAMP_COL, commitTimestamp.toString());
        }
        expectedTableRow.set(BOOLEAN_ARRAY_COL, BOOLEAN_ARRAY_RAW_VAL);
        expectedTableRow.set(BYTES_ARRAY_COL, BYTES_ARRAY_RAW_VAL);
        expectedTableRow.set(DATE_ARRAY_COL, DATE_ARRAY_RAW_VAL);
        expectedTableRow.set(FLOAT64_ARRAY_COL, FLOAT64_ARRAY_RAW_VAL);
        expectedTableRow.set(INT64_ARRAY_COL, INT64_ARRAY_RAW_VAL);
        expectedTableRow.set(JSON_ARRAY_COL, JSON_ARRAY_RAW_VAL);
        expectedTableRow.set(NUMERIC_ARRAY_COL, NUMERIC_ARRAY_RAW_VAL);
        expectedTableRow.set(STRING_ARRAY_COL, STRING_ARRAY_RAW_VAL);
        expectedTableRow.set(TIMESTAMP_ARRAY_COL, TIMESTAMP_ARRAY_RAW_VAL);
        expectedTableRow.set(BOOLEAN_COL, BOOLEAN_RAW_VAL);
        expectedTableRow.set(BYTES_COL, BYTES_RAW_VAL.toBase64());
        expectedTableRow.set(DATE_COL, DATE_RAW_VAL.toString());
        expectedTableRow.set(FLOAT64_COL, FLOAT64_RAW_VAL);
        expectedTableRow.set(INT64_COL, INT64_RAW_VAL);
        expectedTableRow.set(JSON_COL, JSON_RAW_VAL);
        expectedTableRow.set(NUMERIC_COL, NUMERIC_RAW_VAL);
        expectedTableRow.set(STRING_COL, STRING_RAW_VAL);
        if (modType == ModType.INSERT) {
            expectedTableRow.set(TIMESTAMP_COL, commitTimestamp.toString());
        }
    }
    TestStream<String> testStream = TestStream.create(SerializableCoder.of(String.class))
        .addElements(mod.toJson())
        .advanceWatermarkTo(Instant.now())
        .advanceWatermarkToInfinity();
    Pipeline p = Pipeline.create();
    PCollection<FailsafeElement<String, String>> input = p
        .apply(testStream)
        .apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

            @ProcessElement
            public void process(@Element String input,
                    OutputReceiver<FailsafeElement<String, String>> receiver) {
                receiver.output(FailsafeElement.of(input, input));
            }
        }))
        .setCoder(SpannerChangeStreamsToBigQuery.FAILSAFE_ELEMENT_CODER);
    PCollectionTuple out = input.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    PAssert.that(
            out.get(failsafeModJsonToTableRow.transformOut)
                .apply(ParDo.of(new DoFn<TableRow, String>() {

                    @ProcessElement
                    public void process(@Element TableRow input, OutputReceiver<String> receiver) {
                        receiver.output(input.toString());
                    }
                })))
        .containsInAnyOrder(ImmutableList.of(expectedTableRow.toString()));
    PAssert.that(out.get(failsafeModJsonToTableRow.transformDeadLetterOut)).empty();
    p.run().waitUntilFinish();
}
Also used: Mod (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Pipeline (org.apache.beam.sdk.Pipeline), DoFn (org.apache.beam.sdk.transforms.DoFn), FailsafeModJsonToTableRow (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow), TableRow (com.google.api.services.bigquery.model.TableRow), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)
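A hedged sketch of how a test might invoke this private helper from within the same test class. In the real test the two JSON payloads cover every *_PK_COL and *_COL constant set above; the single-column payloads here are hypothetical and only illustrate the shape of the call (BooleanPkCol is the key column named in Example 1's error message, and spannerDatabaseName is an assumed test fixture field):

// Hypothetical invocation from a @Test method in the same class; it throws Exception as the helper does.
ObjectNode keysJson = new ObjectNode(JsonNodeFactory.instance);
// Assumed column name; the full test would add all of the *_PK_COL key columns here.
keysJson.put("BooleanPkCol", true);
ObjectNode newValuesJson = new ObjectNode(JsonNodeFactory.instance);
// Hypothetical non-key column; the full test would add all of the *_COL columns here.
newValuesJson.put("BooleanCol", true);
validateBigQueryRow(
    spannerDatabaseName,  // assumed fixture field holding the test database name
    Timestamp.ofTimeSecondsAndNanos(1650908264L, 925679000),
    ModType.INSERT,
    keysJson.toString(),
    newValuesJson.toString());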

Example 3 with Mod

Use of com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod in the project DataflowTemplates by GoogleCloudPlatform.

From the class SpannerChangeStreamsToBigQuery, method run:

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(SpannerChangeStreamsToBigQueryOptions options) {
    setOptions(options);
    validateOptions(options);
    /*
     * Stages:
     *   1) Read {@link DataChangeRecord} from the change stream.
     *   2) Create {@link FailsafeElement} of {@link Mod} JSON, merged from:
     *      - {@link DataChangeRecord}.
     *      - The GCS dead letter queue.
     *   3) Convert {@link Mod} JSON into {@link TableRow} by reading from Spanner at the commit
     *      timestamp.
     *   4) Append {@link TableRow} to BigQuery.
     *   5) Write failures from 2), 3), and 4) to the GCS dead letter queue.
     */
    Pipeline pipeline = Pipeline.create(options);
    DeadLetterQueueManager dlqManager = buildDlqManager(options);
    String spannerProjectId = getSpannerProjectId(options);
    String dlqDirectory = dlqManager.getRetryDlqDirectoryWithDateTime();
    String tempDlqDirectory = dlqManager.getRetryDlqDirectory() + "tmp/";
    // Retrieve and parse the startTimestamp and endTimestamp.
    Timestamp startTimestamp = options.getStartTimestamp().isEmpty()
        ? Timestamp.now()
        : Timestamp.parseTimestamp(options.getStartTimestamp());
    Timestamp endTimestamp = options.getEndTimestamp().isEmpty()
        ? Timestamp.MAX_VALUE
        : Timestamp.parseTimestamp(options.getEndTimestamp());
    SpannerConfig spannerConfig = SpannerConfig.create()
        .withHost(ValueProvider.StaticValueProvider.of(options.getSpannerHost()))
        .withProjectId(spannerProjectId)
        .withInstanceId(options.getSpannerInstanceId())
        .withDatabaseId(options.getSpannerDatabase())
        .withRpcPriority(options.getSpannerRpcPriority());
    SpannerIO.ReadChangeStream readChangeStream = SpannerIO.readChangeStream()
        .withSpannerConfig(spannerConfig)
        .withMetadataInstance(options.getSpannerMetadataInstanceId())
        .withMetadataDatabase(options.getSpannerMetadataDatabase())
        .withChangeStreamName(options.getSpannerChangeStreamName())
        .withInclusiveStartAt(startTimestamp)
        .withInclusiveEndAt(endTimestamp)
        .withRpcPriority(options.getSpannerRpcPriority());
    String spannerMetadataTableName = options.getSpannerMetadataTableName();
    if (spannerMetadataTableName != null) {
        readChangeStream = readChangeStream.withMetadataTable(spannerMetadataTableName);
    }
    PCollection<DataChangeRecord> dataChangeRecord = pipeline
        .apply("Read from Spanner Change Streams", readChangeStream)
        .apply("Reshuffle DataChangeRecord", Reshuffle.viaRandomKey());
    PCollection<FailsafeElement<String, String>> sourceFailsafeModJson = dataChangeRecord
        .apply("DataChangeRecord To Mod JSON", ParDo.of(new DataChangeRecordToModJsonFn()))
        .apply("Wrap Mod JSON In FailsafeElement",
            ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

                @ProcessElement
                public void process(@Element String input,
                        OutputReceiver<FailsafeElement<String, String>> receiver) {
                    receiver.output(FailsafeElement.of(input, input));
                }
            }))
        .setCoder(FAILSAFE_ELEMENT_CODER);
    PCollectionTuple dlqModJson = dlqManager.getReconsumerDataTransform(
        pipeline.apply(dlqManager.dlqReconsumer(options.getDlqRetryMinutes())));
    PCollection<FailsafeElement<String, String>> retryableDlqFailsafeModJson =
        dlqModJson.get(DeadLetterQueueManager.RETRYABLE_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);
    PCollection<FailsafeElement<String, String>> failsafeModJson =
        PCollectionList.of(sourceFailsafeModJson)
            .and(retryableDlqFailsafeModJson)
            .apply("Merge Source And DLQ Mod JSON", Flatten.pCollections());
    ImmutableSet.Builder<String> ignoreFieldsBuilder = ImmutableSet.builder();
    for (String ignoreField : options.getIgnoreFields().split(",")) {
        ignoreFieldsBuilder.add(ignoreField);
    }
    ImmutableSet<String> ignoreFields = ignoreFieldsBuilder.build();
    FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions
        failsafeModJsonToTableRowOptions =
            FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRowOptions.builder()
                .setSpannerConfig(spannerConfig)
                .setSpannerChangeStream(options.getSpannerChangeStreamName())
                .setIgnoreFields(ignoreFields)
                .setCoder(FAILSAFE_ELEMENT_CODER)
                .build();
    FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow failsafeModJsonToTableRow =
        new FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow(
            failsafeModJsonToTableRowOptions);
    PCollectionTuple tableRowTuple =
        failsafeModJson.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions
        bigQueryDynamicDestinationsOptions =
            BigQueryDynamicDestinations.BigQueryDynamicDestinationsOptions.builder()
                .setSpannerConfig(spannerConfig)
                .setChangeStreamName(options.getSpannerChangeStreamName())
                .setIgnoreFields(ignoreFields)
                .setBigQueryProject(getBigQueryProjectId(options))
                .setBigQueryDataset(options.getBigQueryDataset())
                .setBigQueryTableTemplate(options.getBigQueryChangelogTableNameTemplate())
                .build();
    WriteResult writeResult = tableRowTuple
        .get(failsafeModJsonToTableRow.transformOut)
        .apply("Write To BigQuery",
            BigQueryIO.<TableRow>write()
                .to(BigQueryDynamicDestinations.of(bigQueryDynamicDestinationsOptions))
                .withFormatFunction(element -> removeIntermediateMetadataFields(element))
                .withFormatRecordOnFailureFunction(element -> element)
                .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(Write.WriteDisposition.WRITE_APPEND)
                .withExtendedErrorInfo()
                .withMethod(Write.Method.STREAMING_INSERTS)
                .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
    PCollection<String> transformDlqJson = tableRowTuple
        .get(failsafeModJsonToTableRow.transformDeadLetterOut)
        .apply("Failed Mod JSON During Table Row Transformation",
            MapElements.via(new StringDeadLetterQueueSanitizer()));
    PCollection<String> bqWriteDlqJson = writeResult
        .getFailedInsertsWithErr()
        .apply("Failed Mod JSON During BigQuery Writes",
            MapElements.via(new BigQueryDeadLetterQueueSanitizer()));
    PCollectionList.of(transformDlqJson)
        .and(bqWriteDlqJson)
        .apply("Merge Failed Mod JSON From Transform And BigQuery", Flatten.pCollections())
        .apply("Write Failed Mod JSON To DLQ",
            DLQWriteTransform.WriteDLQ.newBuilder()
                .withDlqDirectory(dlqDirectory)
                .withTmpDirectory(tempDlqDirectory)
                .setIncludePaneInfo(true)
                .build());
    PCollection<FailsafeElement<String, String>> nonRetryableDlqModJsonFailsafe =
        dlqModJson.get(DeadLetterQueueManager.PERMANENT_ERRORS).setCoder(FAILSAFE_ELEMENT_CODER);
    nonRetryableDlqModJsonFailsafe
        .apply("Write Mod JSON With Non-retryable Error To DLQ",
            MapElements.via(new StringDeadLetterQueueSanitizer()))
        .setCoder(StringUtf8Coder.of())
        .apply(DLQWriteTransform.WriteDLQ.newBuilder()
            .withDlqDirectory(dlqManager.getSevereDlqDirectoryWithDateTime())
            .withTmpDirectory(dlqManager.getSevereDlqDirectory() + "tmp/")
            .setIncludePaneInfo(true)
            .build());
    return pipeline.run();
}
Also used: SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig), DataflowPipelineWorkerPoolOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions), Mod (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod), PipelineResult (org.apache.beam.sdk.PipelineResult), InsertRetryPolicy (org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy), LoggerFactory (org.slf4j.LoggerFactory), Timestamp (com.google.cloud.Timestamp), PipelineOptionsFactory (org.apache.beam.sdk.options.PipelineOptionsFactory), DLQWriteTransform (com.google.cloud.teleport.v2.transforms.DLQWriteTransform), DataChangeRecord (org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord), ArrayList (java.util.ArrayList), StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder), PCollectionList (org.apache.beam.sdk.values.PCollectionList), FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder), TableRow (com.google.api.services.bigquery.model.TableRow), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), Pipeline (org.apache.beam.sdk.Pipeline), ValueProvider (org.apache.beam.sdk.options.ValueProvider), Flatten (org.apache.beam.sdk.transforms.Flatten), DoFn (org.apache.beam.sdk.transforms.DoFn), MapElements (org.apache.beam.sdk.transforms.MapElements), SpannerChangeStreamsToBigQueryOptions (com.google.cloud.teleport.v2.options.SpannerChangeStreamsToBigQueryOptions), Reshuffle (org.apache.beam.sdk.transforms.Reshuffle), DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager), ImmutableSet (com.google.common.collect.ImmutableSet), Logger (org.slf4j.Logger), BigQueryIO (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO), Set (java.util.Set), IOException (java.io.IOException), CreateDisposition (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition), DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult), PCollection (org.apache.beam.sdk.values.PCollection), SpannerIO (org.apache.beam.sdk.io.gcp.spanner.SpannerIO), List (java.util.List), ParDo (org.apache.beam.sdk.transforms.ParDo), BigQueryUtils (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.schemautils.BigQueryUtils), StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Write (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write)
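As a usage note, a template entry point would typically parse the command-line arguments into the options interface and hand them to run(). A minimal sketch using the standard Beam PipelineOptionsFactory pattern (the template's actual main() is not shown in this excerpt):

public static void main(String[] args) {
    // Parse and validate the pipeline arguments into the template's options interface.
    SpannerChangeStreamsToBigQueryOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(SpannerChangeStreamsToBigQueryOptions.class);
    run(options);
}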

Aggregations

Mod (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod): 3
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 3
Pipeline (org.apache.beam.sdk.Pipeline): 3
DoFn (org.apache.beam.sdk.transforms.DoFn): 3
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 3
TableRow (com.google.api.services.bigquery.model.TableRow): 2
Timestamp (com.google.cloud.Timestamp): 1
DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager): 1
StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer): 1
FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder): 1
SpannerChangeStreamsToBigQueryOptions (com.google.cloud.teleport.v2.options.SpannerChangeStreamsToBigQueryOptions): 1
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest): 1
FailsafeModJsonToTableRow (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow): 1
BigQueryUtils (com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.schemautils.BigQueryUtils): 1
DLQWriteTransform (com.google.cloud.teleport.v2.transforms.DLQWriteTransform): 1
ImmutableSet (com.google.common.collect.ImmutableSet): 1
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
List (java.util.List): 1
Set (java.util.Set): 1