
Example 16 with FailsafeElement

use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

the class DatabaseMigrationUtils method convertJsonToDmlInfo.

public KV<String, DmlInfo> convertJsonToDmlInfo(FailsafeElement<String, String> element) {
    String jsonString = element.getPayload();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode rowObj;
    try {
        rowObj = mapper.readTree(jsonString);
    } catch (IOException e) {
        LOG.error("IOException: {} :: {}", jsonString, e.toString());
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", "", "", new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
    try {
        // Oracle uses upper case while Postgres uses all lowercase.
        // We lowercase the values of these metadata fields to align with
        // our schema conversion rules.
        String schemaName = this.getPostgresSchemaName(rowObj);
        String tableName = this.getPostgresTableName(rowObj);
        Map<String, String> tableSchema = this.getTableSchema(schemaName, tableName);
        List<String> primaryKeys = this.getPrimaryKeys(schemaName, tableName, rowObj);
        List<String> orderByFields = Arrays.asList("_metadata_timestamp", "_metadata_scn");
        List<String> primaryKeyValues = getFieldValues(rowObj, primaryKeys);
        List<String> orderByValues = getFieldValues(rowObj, orderByFields);
        if (tableSchema.isEmpty()) {
            // If the table does not exist, supply an empty SQL value (no-op).
            DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", schemaName, tableName, primaryKeys, orderByFields, primaryKeyValues, orderByValues);
            return KV.of(jsonString, dmlInfo);
        }
        String dmlSql;
        if (rowObj.get("_metadata_deleted").asBoolean()) {
            dmlSql = convertJsonToDeleteSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        } else if (primaryKeys.size() == 0) {
            // TODO(dhercher): Do we choose to support this case?
            dmlSql = convertJsonToInsertSql(rowObj, tableSchema, schemaName, tableName);
        } else {
            dmlSql = convertJsonToUpsertSql(rowObj, tableSchema, schemaName, tableName, primaryKeys);
        }
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), dmlSql, schemaName, tableName, primaryKeys, orderByFields, primaryKeyValues, orderByValues);
        return KV.of(dmlInfo.getStateWindowKey(), dmlInfo);
    } catch (Exception e) {
        LOG.error("Value Error: {} :: {}", rowObj.toString(), e.toString());
        DmlInfo dmlInfo = DmlInfo.of(element.getOriginalPayload(), "", "", "", new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>(), new ArrayList<String>());
        // TODO(dhercher): how should we handle bad data?
        return KV.of(jsonString, dmlInfo);
    }
}
Also used : ArrayList(java.util.ArrayList) JsonNode(org.codehaus.jackson.JsonNode) IOException(java.io.IOException) DmlInfo(com.google.cloud.teleport.v2.values.DmlInfo) ObjectMapper(org.codehaus.jackson.map.ObjectMapper) SQLException(java.sql.SQLException)
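
For context, a minimal sketch of how a conversion method like this one might be wired into a pipeline stage. The wrapper DoFn below, its name, and the way the DatabaseMigrationUtils instance is supplied are hypothetical and not taken from the template source; the sketch only illustrates that convertJsonToDmlInfo already absorbs parse and schema failures, so every input element yields exactly one KV.

import com.google.cloud.teleport.v2.values.DmlInfo;
import com.google.cloud.teleport.v2.values.FailsafeElement;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

// Hypothetical wrapper DoFn; assumes DatabaseMigrationUtils is Serializable
// (its import is omitted here, see the class above).
class JsonToDmlInfoFn extends DoFn<FailsafeElement<String, String>, KV<String, DmlInfo>> {

    private final DatabaseMigrationUtils migrationUtils;

    JsonToDmlInfoFn(DatabaseMigrationUtils migrationUtils) {
        this.migrationUtils = migrationUtils;
    }

    @ProcessElement
    public void processElement(
            @Element FailsafeElement<String, String> element,
            OutputReceiver<KV<String, DmlInfo>> out) {
        // convertJsonToDmlInfo never throws: bad JSON and schema lookup failures
        // fall back to an empty DmlInfo keyed by the raw JSON string.
        out.output(migrationUtils.convertJsonToDmlInfo(element));
    }
}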

Example 17 with FailsafeElement

use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

the class FailsafeModJsonToTableRowTransformerTest method testFailsafeModJsonToTableRowFailedSnapshotRead.

// Test the case where the snapshot read from Spanner fails and we can capture the failures from
// transformDeadLetterOut of FailsafeModJsonToTableRow.
@Test
public void testFailsafeModJsonToTableRowFailedSnapshotRead() throws Exception {
    ObjectNode fakePkColJsonNode = new ObjectNode(JsonNodeFactory.instance);
    fakePkColJsonNode.put("fakePkCol", true);
    ObjectNode fakeNonPkColJsonNode = new ObjectNode(JsonNodeFactory.instance);
    fakeNonPkColJsonNode.put("fakeNonPkCol", true);
    Mod mod = new Mod(fakePkColJsonNode.toString(), fakeNonPkColJsonNode.toString(), Timestamp.ofTimeSecondsAndNanos(1650908264L, 925679000), "1", true, "00000001", TEST_SPANNER_TABLE, ModType.INSERT, 1L, 1L);
    TestStream<String> testStream = TestStream.create(SerializableCoder.of(String.class)).addElements(mod.toJson()).advanceWatermarkTo(Instant.now()).advanceWatermarkToInfinity();
    Pipeline p = Pipeline.create();
    PCollection<FailsafeElement<String, String>> input = p.apply(testStream).apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

        @ProcessElement
        public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
            receiver.output(FailsafeElement.of(input, input));
        }
    })).setCoder(SpannerChangeStreamsToBigQuery.FAILSAFE_ELEMENT_CODER);
    PCollectionTuple out = input.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    PAssert.that(out.get(failsafeModJsonToTableRow.transformOut)).empty();
    String expectedPayload = "{\"keysJson\":\"{\\\"fakePkCol\\\":true}\"," + "\"newValuesJson\":\"{\\\"fakeNonPkCol\\\":true}\"," + "\"commitTimestampSeconds\":1650908264,\"commitTimestampNanos\":925679000," + "\"serverTransactionId\":\"1\",\"isLastRecordInTransactionInPartition\":true," + "\"recordSequence\":\"00000001\",\"tableName\":\"AllTypes\",\"modType\":\"INSERT\"," + "\"numberOfRecordsInTransaction\":1,\"numberOfPartitionsInTransaction\":1}";
    PAssert.that(out.get(failsafeModJsonToTableRow.transformDeadLetterOut).apply(ParDo.of(new DoFn<FailsafeElement<String, String>, String>() {

        @ProcessElement
        public void process(@Element FailsafeElement<String, String> input, OutputReceiver<String> receiver) {
            receiver.output(String.format("originalPayload=%s, payload=%s, errorMessage=%s", input.getOriginalPayload(), input.getPayload(), input.getErrorMessage()));
        }
    }))).containsInAnyOrder(ImmutableList.of(String.format("originalPayload=%s, payload=%s, errorMessage=Cannot find value for key column" + " BooleanPkCol", expectedPayload, expectedPayload)));
    p.run().waitUntilFinish();
}
Also used : Mod(com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod) ObjectNode(org.codehaus.jackson.node.ObjectNode) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Test(org.junit.Test) IntegrationTest(com.google.cloud.teleport.v2.spanner.IntegrationTest)
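
The dead-letter assertion above reads back getOriginalPayload(), getPayload(), and getErrorMessage(). As a rough sketch of the producing side (not code from the transform itself), a failing step would typically re-wrap the element and attach the error before routing it to transformDeadLetterOut; the fluent setErrorMessage call below is assumed from the getter used in the test and should be checked against the FailsafeElement class.

import com.google.cloud.teleport.v2.values.FailsafeElement;

final class DeadLetterSketch {

    // Hypothetical helper: keep the original JSON and record why it failed so the
    // dead-letter sink has enough context for later reprocessing.
    static FailsafeElement<String, String> toDeadLetter(String originalJson, Exception error) {
        return FailsafeElement.of(originalJson, originalJson)
                .setErrorMessage(error.getMessage());
    }
}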

Example 18 with FailsafeElement

use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

the class FailsafeModJsonToTableRowTransformerTest method validateBigQueryRow.

private void validateBigQueryRow(String spannerDatabaseName, Timestamp commitTimestamp, ModType modType, String keysJson, String newValuesJson) throws Exception {
    Mod mod = new Mod(keysJson, newValuesJson, commitTimestamp, "1", true, "00000001", TEST_SPANNER_TABLE, modType, 1L, 1L);
    TableRow expectedTableRow = new TableRow();
    BigQueryUtils.setMetadataFiledsOfTableRow(TEST_SPANNER_TABLE, mod, mod.toJson(), commitTimestamp, expectedTableRow);
    expectedTableRow.set(BOOLEAN_PK_COL, BOOLEAN_RAW_VAL);
    expectedTableRow.set(BYTES_PK_COL, BYTES_RAW_VAL.toBase64());
    expectedTableRow.set(DATE_PK_COL, DATE_RAW_VAL.toString());
    expectedTableRow.set(FLOAT64_PK_COL, FLOAT64_RAW_VAL);
    expectedTableRow.set(INT64_PK_COL, INT64_RAW_VAL);
    expectedTableRow.set(NUMERIC_PK_COL, NUMERIC_RAW_VAL);
    expectedTableRow.set(STRING_PK_COL, STRING_RAW_VAL);
    expectedTableRow.set(TIMESTAMP_PK_COL, TIMESTAMP_RAW_VAL.toString());
    if (modType == ModType.INSERT || modType == ModType.UPDATE) {
        // Non-key columns are populated for both INSERT and UPDATE.
        if (modType == ModType.UPDATE) {
            expectedTableRow.set(TIMESTAMP_COL, commitTimestamp.toString());
        }
        expectedTableRow.set(BOOLEAN_ARRAY_COL, BOOLEAN_ARRAY_RAW_VAL);
        expectedTableRow.set(BYTES_ARRAY_COL, BYTES_ARRAY_RAW_VAL);
        expectedTableRow.set(DATE_ARRAY_COL, DATE_ARRAY_RAW_VAL);
        expectedTableRow.set(FLOAT64_ARRAY_COL, FLOAT64_ARRAY_RAW_VAL);
        expectedTableRow.set(INT64_ARRAY_COL, INT64_ARRAY_RAW_VAL);
        expectedTableRow.set(JSON_ARRAY_COL, JSON_ARRAY_RAW_VAL);
        expectedTableRow.set(NUMERIC_ARRAY_COL, NUMERIC_ARRAY_RAW_VAL);
        expectedTableRow.set(STRING_ARRAY_COL, STRING_ARRAY_RAW_VAL);
        expectedTableRow.set(TIMESTAMP_ARRAY_COL, TIMESTAMP_ARRAY_RAW_VAL);
        expectedTableRow.set(BOOLEAN_COL, BOOLEAN_RAW_VAL);
        expectedTableRow.set(BYTES_COL, BYTES_RAW_VAL.toBase64());
        expectedTableRow.set(DATE_COL, DATE_RAW_VAL.toString());
        expectedTableRow.set(FLOAT64_COL, FLOAT64_RAW_VAL);
        expectedTableRow.set(INT64_COL, INT64_RAW_VAL);
        expectedTableRow.set(JSON_COL, JSON_RAW_VAL);
        expectedTableRow.set(NUMERIC_COL, NUMERIC_RAW_VAL);
        expectedTableRow.set(STRING_COL, STRING_RAW_VAL);
        if (modType == ModType.INSERT) {
            expectedTableRow.set(TIMESTAMP_COL, commitTimestamp.toString());
        }
    }
    TestStream<String> testStream = TestStream.create(SerializableCoder.of(String.class)).addElements(mod.toJson()).advanceWatermarkTo(Instant.now()).advanceWatermarkToInfinity();
    Pipeline p = Pipeline.create();
    PCollection<FailsafeElement<String, String>> input = p.apply(testStream).apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

        @ProcessElement
        public void process(@Element String input, OutputReceiver<FailsafeElement<String, String>> receiver) {
            receiver.output(FailsafeElement.of(input, input));
        }
    })).setCoder(SpannerChangeStreamsToBigQuery.FAILSAFE_ELEMENT_CODER);
    PCollectionTuple out = input.apply("Mod JSON To TableRow", failsafeModJsonToTableRow);
    PAssert.that(out.get(failsafeModJsonToTableRow.transformOut).apply(ParDo.of(new DoFn<TableRow, String>() {

        @ProcessElement
        public void process(@Element TableRow input, OutputReceiver<String> receiver) {
            receiver.output(input.toString());
        }
    }))).containsInAnyOrder(ImmutableList.of(expectedTableRow.toString()));
    PAssert.that(out.get(failsafeModJsonToTableRow.transformDeadLetterOut)).empty();
    p.run().waitUntilFinish();
}
Also used : Mod(com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.model.Mod) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) FailsafeModJsonToTableRow(com.google.cloud.teleport.v2.templates.spannerchangestreamstobigquery.FailsafeModJsonToTableRowTransformer.FailsafeModJsonToTableRow) TableRow(com.google.api.services.bigquery.model.TableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
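
Both tests above set SpannerChangeStreamsToBigQuery.FAILSAFE_ELEMENT_CODER on the wrapped input collection. A plausible shape for such a constant, following the FailsafeElementCoder.of(...) pattern used in the KafkaToBigQuery example below; the holder class here is invented for illustration, so verify the actual coder components against the pipeline class.

import com.google.cloud.teleport.v2.coders.FailsafeElementCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;

final class FailsafeCoders {
    // String-in / String-out failsafe coder: both the original and the current
    // payload are encoded as UTF-8 strings.
    static final FailsafeElementCoder<String, String> FAILSAFE_ELEMENT_CODER =
            FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
}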

Example 19 with FailsafeElement

use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

the class KafkaToBigQuery method run.

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(KafkaToBQOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    FailsafeElementCoder<KV<String, String>, String> coder = FailsafeElementCoder.of(KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    List<String> topicsList;
    if (options.getKafkaReadTopics() != null) {
        topicsList = new ArrayList<>(Arrays.asList(options.getKafkaReadTopics().split(",")));
    } else if (options.getInputTopics() != null) {
        topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
    } else {
        throw new IllegalArgumentException("Please Provide --kafkaReadTopic");
    }
    String bootstrapServers;
    if (options.getReadBootstrapServers() != null) {
        bootstrapServers = options.getReadBootstrapServers();
    } else if (options.getBootstrapServers() != null) {
        bootstrapServers = options.getBootstrapServers();
    } else {
        throw new IllegalArgumentException("Please Provide --bootstrapServers");
    }
    /*
     * Steps:
     *  1) Read messages in from Kafka
     *  2) Transform the messages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *  4) Write failed records out to BigQuery
     *  5) Insert records that failed BigQuery inserts into a deadletter table
     */
    PCollectionTuple convertedTableRows = pipeline.apply("ReadFromKafka", readFromKafka(bootstrapServers, topicsList, ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"), null)).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
    /*
     * Step #3: Write the successful records out to BigQuery
     */
    WriteResult writeResult = convertedTableRows.get(TRANSFORM_OUT).apply("WriteSuccessfulRecords", BigQueryIO.writeTableRows().withoutValidation().withCreateDisposition(CreateDisposition.CREATE_NEVER).withWriteDisposition(WriteDisposition.WRITE_APPEND).withExtendedErrorInfo().withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS).withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()).to(options.getOutputTableSpec()));
    /*
     * Step 3 Contd.
     * Elements that failed inserts into BigQuery are extracted and converted to FailsafeElement
     */
    PCollection<FailsafeElement<String, String>> failedInserts = writeResult.getFailedInsertsWithErr().apply("WrapInsertionErrors", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via(KafkaToBigQuery::wrapBigQueryInsertError)).setCoder(FAILSAFE_ELEMENT_CODER);
    /*
     * Step #4: Write failed records out to BigQuery
     */
    PCollectionList.of(convertedTableRows.get(UDF_DEADLETTER_OUT)).and(convertedTableRows.get(TRANSFORM_DEADLETTER_OUT)).apply("Flatten", Flatten.pCollections()).apply("WriteTransformationFailedRecords", WriteKafkaMessageErrors.newBuilder().setErrorRecordsTable(ObjectUtils.firstNonNull(options.getOutputDeadletterTable(), options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX)).setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA).build());
    /*
     * Step #5: Insert records that failed BigQuery inserts into a deadletter table.
     */
    failedInserts.apply("WriteInsertionFailedRecords", ErrorConverters.WriteStringMessageErrors.newBuilder().setErrorRecordsTable(ObjectUtils.firstNonNull(options.getOutputDeadletterTable(), options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX)).setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA).build());
    return pipeline.run();
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
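
Per the Javadoc above, run(...) returns without blocking. A minimal launcher sketch that blocks on the returned result; the class name and argument parsing below are illustrative, not the template's actual entry point.

import com.google.cloud.teleport.v2.templates.KafkaToBigQuery;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public final class KafkaToBigQueryLauncher {
    public static void main(String[] args) {
        KafkaToBigQuery.KafkaToBQOptions options =
                PipelineOptionsFactory.fromArgs(args)
                        .withValidation()
                        .as(KafkaToBigQuery.KafkaToBQOptions.class);
        PipelineResult result = KafkaToBigQuery.run(options);
        // Block until the (streaming) pipeline terminates; omit for fire-and-forget launches.
        result.waitUntilFinish();
    }
}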

Example 20 with FailsafeElement

use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

the class KafkaToBigQueryTest method testKafkaToBigQueryE2E.

/**
 * Tests the {@link KafkaToBigQuery} pipeline end-to-end.
 */
@Test
public void testKafkaToBigQueryE2E() throws Exception {
    // Test input
    final String key = "{\"id\": \"1001\"}";
    final String badKey = "{\"id\": \"1002\"}";
    final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
    final String badPayload = "{\"tickets\": \"AMZ\", \"proctor\": 007";
    final KV<String, String> message = KV.of(key, payload);
    final KV<String, String> badMessage = KV.of(badKey, badPayload);
    final Instant timestamp = new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();
    final FailsafeElementCoder<KV<String, String>, String> coder = FailsafeElementCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), StringUtf8Coder.of());
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    KafkaToBigQuery.KafkaToBQOptions options = PipelineOptionsFactory.create().as(KafkaToBigQuery.KafkaToBQOptions.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");
    // Build pipeline
    PCollectionTuple transformOut = pipeline.apply("CreateInput", Create.of(message).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
    // Assert
    PAssert.that(transformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT)).empty();
    PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
    PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_OUT)).satisfies(collection -> {
        TableRow result = collection.iterator().next();
        assertThat(result.get("ticker"), is(equalTo("GOOGL")));
        assertThat(result.get("price"), is(equalTo(1006.94)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
    // Build pipeline with malformed payload
    PCollectionTuple badTransformOut = pipeline.apply("CreateBadInput", Create.of(badMessage).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))).apply("ConvertMessageToTableRow", new MessageToTableRow(options));
    // Assert
    PAssert.that(badTransformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT)).satisfies(collection -> {
        FailsafeElement badResult = collection.iterator().next();
        assertThat(badResult.getOriginalPayload(), is(equalTo(badMessage)));
        assertThat(badResult.getPayload(), is(equalTo(badPayload)));
        return null;
    });
    PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
    PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_OUT)).empty();
    // Execute pipeline
    pipeline.run();
}
Also used : Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) DateTime(org.joda.time.DateTime) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) TableRow(com.google.api.services.bigquery.model.TableRow) MessageToTableRow(com.google.cloud.teleport.v2.templates.KafkaToBigQuery.MessageToTableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Test(org.junit.Test)
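
To make the dead-letter assertions above concrete: the UDF dead-letter element keeps the whole Kafka KV as its original payload and the raw message value as its current payload. A minimal, hypothetical illustration of that shape (not template code):

import com.google.cloud.teleport.v2.values.FailsafeElement;
import org.apache.beam.sdk.values.KV;

final class DeadLetterShape {
    static FailsafeElement<KV<String, String>, String> badElement() {
        KV<String, String> badMessage =
                KV.of("{\"id\": \"1002\"}", "{\"tickets\": \"AMZ\", \"proctor\": 007");
        // getOriginalPayload() returns the KV and getPayload() returns the value string,
        // matching the two assertThat checks in the test above.
        return FailsafeElement.of(badMessage, badMessage.getValue());
    }
}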

Aggregations

FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement) 31
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple) 26
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry) 21
Test (org.junit.Test) 21
Pipeline (org.apache.beam.sdk.Pipeline) 14
TableRow (com.google.api.services.bigquery.model.TableRow) 8
PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) 6
DoFn (org.apache.beam.sdk.transforms.DoFn) 6
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions) 5
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest) 5
JSONObject (org.json.JSONObject) 5
DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager) 4
StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer) 4
DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) 4
FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder) 4
GCSToSplunk.flattenErrorsAndConvertToString (com.google.cloud.teleport.v2.templates.GCSToSplunk.flattenErrorsAndConvertToString) 4
PipelineResult (org.apache.beam.sdk.PipelineResult) 4
SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig) 4
KV (org.apache.beam.sdk.values.KV) 4
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) 3