Search in sources :

Example 6 with FailsafeElementCoder

use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in project DataflowTemplates by GoogleCloudPlatform.

the class KafkaToBigQuery method run.

/**
 * Builds the Kafka-to-BigQuery pipeline from the specified options and starts it. This method does
 * not wait until the pipeline is finished before returning. Invoke {@code
 * result.waitUntilFinish()} on the result object to block until the pipeline is finished running
 * if blocking programmatic execution is required.
 *
 * <p>The topic list is read from {@code getKafkaReadTopics()} when it is non-null, otherwise from
 * {@code getInputTopics()}. The bootstrap servers are read from {@code getReadBootstrapServers()}
 * when it is non-null, otherwise from {@code getBootstrapServers()}.
 *
 * @param options The execution options.
 * @return The pipeline result.
 * @throws IllegalArgumentException if neither topic option, or neither bootstrap-server option,
 *     is set.
 */
public static PipelineResult run(KafkaToBQOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    FailsafeElementCoder<KV<String, String>, String> coder = FailsafeElementCoder.of(
        KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())),
        NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    // Prefer the read-specific topic option and fall back to the generic one.
    String topics = options.getKafkaReadTopics() != null
        ? options.getKafkaReadTopics()
        : options.getInputTopics();
    if (topics == null) {
        throw new IllegalArgumentException("Please Provide --kafkaReadTopics");
    }
    List<String> topicsList = new ArrayList<>(Arrays.asList(topics.split(",")));

    // Same fallback rule for the bootstrap servers.
    String bootstrapServers = options.getReadBootstrapServers() != null
        ? options.getReadBootstrapServers()
        : options.getBootstrapServers();
    if (bootstrapServers == null) {
        throw new IllegalArgumentException("Please Provide --bootstrapServers");
    }

    // Both dead-letter sinks below write to the same table: the explicitly configured one, or the
    // output table name with the default dead-letter suffix appended.
    String deadletterTable = ObjectUtils.firstNonNull(
        options.getOutputDeadletterTable(),
        options.getOutputTableSpec() + DEFAULT_DEADLETTER_TABLE_SUFFIX);

    /*
     * Steps:
     *  1) Read messages in from Kafka
     *  2) Transform the messages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *  4) Write records that failed the UDF or the TableRow conversion to the dead-letter table
     *  5) Write records that failed the BigQuery insert to the dead-letter table
     */
    PCollectionTuple convertedTableRows = pipeline
        .apply("ReadFromKafka", readFromKafka(bootstrapServers, topicsList, ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"), null))
        .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

    /*
     * Step #3: Write the successful records out to BigQuery
     */
    WriteResult writeResult = convertedTableRows.get(TRANSFORM_OUT)
        .apply("WriteSuccessfulRecords", BigQueryIO.writeTableRows()
            .withoutValidation()
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(WriteDisposition.WRITE_APPEND)
            .withExtendedErrorInfo()
            .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
            .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
            .to(options.getOutputTableSpec()));

    /*
     * Step #3 (continued): Elements that failed inserts into BigQuery are extracted and converted
     * to FailsafeElement
     */
    PCollection<FailsafeElement<String, String>> failedInserts = writeResult.getFailedInsertsWithErr()
        .apply("WrapInsertionErrors", MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor()).via(KafkaToBigQuery::wrapBigQueryInsertError))
        .setCoder(FAILSAFE_ELEMENT_CODER);

    /*
     * Step #4: Write records that failed the UDF or the TableRow conversion to the dead-letter table
     */
    PCollectionList.of(convertedTableRows.get(UDF_DEADLETTER_OUT))
        .and(convertedTableRows.get(TRANSFORM_DEADLETTER_OUT))
        .apply("Flatten", Flatten.pCollections())
        .apply("WriteTransformationFailedRecords", WriteKafkaMessageErrors.newBuilder()
            .setErrorRecordsTable(deadletterTable)
            .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
            .build());

    /*
     * Step #5: Insert records that failed BigQuery inserts into a deadletter table.
     */
    failedInserts.apply("WriteInsertionFailedRecords", ErrorConverters.WriteStringMessageErrors.newBuilder()
        .setErrorRecordsTable(deadletterTable)
        .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
        .build());

    return pipeline.run();
}
Also used : ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) WriteResult(org.apache.beam.sdk.io.gcp.bigquery.WriteResult) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)

Example 7 with FailsafeElementCoder

use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in project DataflowTemplates by GoogleCloudPlatform.

the class KafkaToBigQueryTest method testKafkaToBigQueryE2E.

/**
 * Runs the {@link KafkaToBigQuery} message conversion end-to-end twice: once with a well-formed
 * message, which must reach the success output only, and once with a malformed payload, which
 * must reach the UDF dead-letter output only.
 */
@Test
public void testKafkaToBigQueryE2E() throws Exception {
    // Fixtures: a well-formed key/payload pair and a pair whose payload is truncated JSON.
    final String key = "{\"id\": \"1001\"}";
    final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
    final KV<String, String> message = KV.of(key, payload);

    final String badKey = "{\"id\": \"1002\"}";
    final String badPayload = "{\"tickets\": \"AMZ\", \"proctor\": 007";
    final KV<String, String> badMessage = KV.of(badKey, badPayload);

    final Instant timestamp = new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

    // Register a failsafe coder on the pipeline under test.
    final FailsafeElementCoder<KV<String, String>, String> coder =
        FailsafeElementCoder.of(
            KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()),
            StringUtf8Coder.of());
    CoderRegistry registry = pipeline.getCoderRegistry();
    registry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    // Point the transform at the test UDF.
    KafkaToBigQuery.KafkaToBQOptions options =
        PipelineOptionsFactory.create().as(KafkaToBigQuery.KafkaToBQOptions.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");

    // --- Happy path -----------------------------------------------------------------------
    PCollectionTuple transformOut =
        pipeline
            .apply(
                "CreateInput",
                Create.of(message).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())))
            .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

    // Nothing may be dead-lettered, and the converted row must carry both payload fields.
    PAssert.that(transformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT)).empty();
    PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
    PAssert.that(transformOut.get(KafkaToBigQuery.TRANSFORM_OUT))
        .satisfies(
            rows -> {
                TableRow row = rows.iterator().next();
                assertThat(row.get("ticker"), is(equalTo("GOOGL")));
                assertThat(row.get("price"), is(equalTo(1006.94)));
                return null;
            });

    pipeline.run();

    // --- Malformed payload ----------------------------------------------------------------
    PCollectionTuple badTransformOut =
        pipeline
            .apply(
                "CreateBadInput",
                Create.of(badMessage).withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())))
            .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

    // The bad message must appear on the UDF dead-letter output with its input preserved.
    PAssert.that(badTransformOut.get(KafkaToBigQuery.UDF_DEADLETTER_OUT))
        .satisfies(
            failures -> {
                FailsafeElement failure = failures.iterator().next();
                assertThat(failure.getOriginalPayload(), is(equalTo(badMessage)));
                assertThat(failure.getPayload(), is(equalTo(badPayload)));
                return null;
            });
    PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
    PAssert.that(badTransformOut.get(KafkaToBigQuery.TRANSFORM_OUT)).empty();

    pipeline.run();
}
Also used : Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) DateTime(org.joda.time.DateTime) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) TableRow(com.google.api.services.bigquery.model.TableRow) MessageToTableRow(com.google.cloud.teleport.v2.templates.KafkaToBigQuery.MessageToTableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Test(org.junit.Test)

Example 8 with FailsafeElementCoder

use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in project DataflowTemplates by GoogleCloudPlatform.

the class KafkaToPubsub method run.

/**
 * Builds and starts a pipeline which reads messages from Kafka, passes them through the
 * {@code FormatTransform.UdfProcess} transform and writes the results to Pub/Sub.
 *
 * <p>Records on the UDF dead-letter output are written to the dead-letter topic only when
 * {@code getOutputDeadLetterTopic()} is non-null.
 *
 * @param options arguments to the pipeline
 * @return the result returned by {@code pipeline.run()}
 */
public static PipelineResult run(KafkaToPubsubOptions options) {
    // Validate the comma-separated inputs: every entry must be non-blank.
    List<String> topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
    checkArgument(
        !topicsList.isEmpty() && topicsList.stream().allMatch(topic -> !topic.trim().isEmpty()),
        "inputTopics cannot be an empty string.");

    List<String> bootstrapServersList =
        new ArrayList<>(Arrays.asList(options.getBootstrapServers().split(",")));
    checkArgument(
        !bootstrapServersList.isEmpty()
            && bootstrapServersList.stream().allMatch(server -> !server.trim().isEmpty()),
        "bootstrapServers cannot be an empty string.");

    // Configure Kafka consumer properties; without both Vault settings the defaults stay empty.
    Map<String, Object> kafkaConfig = new HashMap<>();
    Map<String, String> sslConfig = null;
    if (options.getSecretStoreUrl() == null || options.getVaultToken() == null) {
        LOG.warn("No information to retrieve Kafka credentials was provided. " + "Trying to initiate an unauthorized connection.");
    } else {
        Map<String, Map<String, String>> credentials =
            getKafkaCredentialsFromVault(options.getSecretStoreUrl(), options.getVaultToken());
        kafkaConfig = configureKafka(credentials.get(KafkaPubsubConstants.KAFKA_CREDENTIALS));
        sslConfig = credentials.get(KafkaPubsubConstants.SSL_CREDENTIALS);
    }

    // Create the pipeline and register the failsafe coder on it.
    Pipeline pipeline = Pipeline.create(options);
    FailsafeElementCoder<KV<String, String>, String> coder =
        FailsafeElementCoder.of(
            KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())),
            NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    TypeDescriptor<String> stringTypeDescriptor = TypeDescriptors.strings();

    LOG.info(
        "Starting Kafka-To-PubSub Pipeline with parameters bootstrap servers:{} input topics:{}" + " output pubsub topic:{} ",
        options.getBootstrapServers(),
        options.getInputTopics(),
        options.getOutputTopic());

    /*
     * Steps:
     *  1) Read messages in from Kafka
     *  2) Transform message payload via UDF
     *  3) Write successful records out to Pub/Sub
     *  4) Write failed records out to Pub/Sub dead-letter topic
     */
    PCollectionTuple appliedUdf =
        pipeline
            .apply("readFromKafka", readFromKafka(options.getBootstrapServers(), topicsList, kafkaConfig, sslConfig))
            .apply("applyUDF", new FormatTransform.UdfProcess(options));

    /* Step #3: Write the successful records out to Pub/Sub */
    appliedUdf
        .get(KafkaPubsubConstants.UDF_OUT)
        .apply("getSuccessUDFOutElements", MapElements.into(stringTypeDescriptor).via(FailsafeElement::getPayload))
        .setCoder(NullableCoder.of(StringUtf8Coder.of()))
        .apply("writeSuccessMessages", PubsubIO.writeStrings().to(options.getOutputTopic()));

    /* Step #4: Write failed messages out to Pub/Sub */
    if (options.getOutputDeadLetterTopic() != null) {
        appliedUdf
            .get(KafkaPubsubConstants.UDF_DEADLETTER_OUT)
            .apply(
                "getFailedMessages",
                MapElements.into(TypeDescriptors.kvs(stringTypeDescriptor, stringTypeDescriptor))
                    .via(FailsafeElement::getOriginalPayload))
            .apply("extractMessageValues", MapElements.into(stringTypeDescriptor).via(KV<String, String>::getValue))
            .setCoder(NullableCoder.of(StringUtf8Coder.of()))
            .apply("writeFailureMessages", PubsubIO.writeStrings().to(options.getOutputDeadLetterTopic()));
    }

    return pipeline.run();
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) FormatTransform(com.google.cloud.teleport.v2.transforms.FormatTransform) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Map(java.util.Map)

Example 9 with FailsafeElementCoder

use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearchTest method testGCSToElasticsearchUdfE2E.

/**
 * Tests the {@link GCSToElasticsearch} CSV stages with a JavaScript UDF: a header-less CSV file is
 * read and each line is converted to JSON, and the first element on the processing output must
 * keep the original CSV line alongside the expected JSON string.
 */
@Test
public void testGCSToElasticsearchUdfE2E() {
    // Expected original line and the JSON the UDF should produce for it.
    final String record = "007,CA,26.23";
    final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";

    // Register a failsafe coder on the pipeline under test.
    final FailsafeElementCoder<String, String> coder =
        FailsafeElementCoder.of(
            NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry registry = pipeline.getCoderRegistry();
    registry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    // Options: test UDF, header-less input file, placeholder API key.
    GCSToElasticsearchOptions options =
        PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");
    options.setContainsHeaders(false);
    options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);
    options.setApiKey("key");

    // Build pipeline with no headers: first read the CSV file ...
    PCollectionTuple csvOut =
        pipeline.apply(
            "ReadCsv",
            CsvConverters.ReadCsv.newBuilder()
                .setCsvFormat(options.getCsvFormat())
                .setDelimiter(options.getDelimiter())
                .setHasHeaders(options.getContainsHeaders())
                .setInputFileSpec(options.getInputFileSpec())
                .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                .setLineTag(GCSToElasticsearch.CSV_LINES)
                .setFileEncoding(options.getCsvFileEncoding())
                .build());

    // ... then convert each line to a failsafe JSON element.
    PCollectionTuple readCsvOut =
        csvOut.apply(
            "ConvertLine",
            CsvConverters.LineToFailsafeJson.newBuilder()
                .setDelimiter(options.getDelimiter())
                .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                .setJsonSchemaPath(options.getJsonSchemaPath())
                .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                .setLineTag(GCSToElasticsearch.CSV_LINES)
                .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                .build());

    // Assert on the first element of the processing output.
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT))
        .satisfies(
            elements -> {
                FailsafeElement first = elements.iterator().next();
                assertThat(first.getOriginalPayload(), is(equalTo(record)));
                assertThat(first.getPayload(), is(equalTo(stringifiedJsonRecord)));
                return null;
            });

    // Execute pipeline
    pipeline.run();
}
Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Test(org.junit.Test)

Example 10 with FailsafeElementCoder

use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in project DataflowTemplates by GoogleCloudPlatform.

the class PubSubCdcToBigQueryTest method testPubSubCdcToBigQueryApplyJavaScriptUDF.

/**
 * Tests the JavaScript UDF stage of the {@link PubSubCdcToBigQuery} pipeline: a single timestamped
 * Pub/Sub message is converted to a failsafe element and passed through
 * {@code InputUDFToTableRow}. Both dead-letter outputs must be empty and the resulting row must
 * carry the payload's {@code ticker} and {@code price} fields.
 */
@Test
public void testPubSubCdcToBigQueryApplyJavaScriptUDF() throws Exception {
    // Test input. The payload bytes are encoded explicitly as UTF-8 so the test does not depend
    // on the platform default charset.
    final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
    final PubsubMessage message = new PubsubMessage(
        payload.getBytes(java.nio.charset.StandardCharsets.UTF_8),
        ImmutableMap.of("id", "123", "type", "custom_event"));
    final Instant timestamp = new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

    // Register a failsafe coder on the pipeline under test.
    final FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    // Parameters: only the JavaScript UDF path and function name are set explicitly.
    PubSubCdcToBigQuery.Options options = PipelineOptionsFactory.create().as(PubSubCdcToBigQuery.Options.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");

    InputUDFToTableRow<String> deadletterHandler = new InputUDFToTableRow<>(
        options.getJavascriptTextTransformGcsPath(),
        options.getJavascriptTextTransformFunctionName(),
        options.getPythonTextTransformGcsPath(),
        options.getPythonTextTransformFunctionName(),
        options.getRuntimeRetries(),
        coder);

    // Build pipeline
    PCollectionTuple transformOut = pipeline
        .apply("CreateInput", Create.timestamped(TimestampedValue.of(message, timestamp)).withCoder(PubsubMessageWithAttributesCoder.of()))
        .apply("ConvertPubSubToFailsafe", ParDo.of(new PubSubToFailSafeElement()))
        .apply("ConvertMessageToTableRow", deadletterHandler);

    // Set the failsafe coder explicitly on both dead-letter outputs.
    transformOut.get(deadletterHandler.udfDeadletterOut).setCoder(coder);
    transformOut.get(deadletterHandler.transformDeadletterOut).setCoder(coder);

    // Assert
    PAssert.that(transformOut.get(deadletterHandler.udfDeadletterOut)).empty();
    PAssert.that(transformOut.get(deadletterHandler.transformDeadletterOut)).empty();
    PAssert.that(transformOut.get(deadletterHandler.transformOut)).satisfies(collection -> {
        TableRow result = collection.iterator().next();
        assertThat(result.get("ticker"), is(equalTo("GOOGL")));
        assertThat(result.get("price"), is(equalTo(1006.94)));
        return null;
    });

    // Execute pipeline
    pipeline.run();
}
Also used : Instant(org.joda.time.Instant) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) DateTime(org.joda.time.DateTime) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PubSubToFailSafeElement(com.google.cloud.teleport.v2.transforms.PubSubToFailSafeElement) TableRow(com.google.api.services.bigquery.model.TableRow) InputUDFToTableRow(com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Test(org.junit.Test)

Aggregations

CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 10; PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 9; FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 8; Test (org.junit.Test): 8; TableRow (com.google.api.services.bigquery.model.TableRow): 4; GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 3; KV (org.apache.beam.sdk.values.KV): 3; DateTime (org.joda.time.DateTime): 3; Instant (org.joda.time.Instant): 3; ArrayList (java.util.ArrayList): 2; Pipeline (org.apache.beam.sdk.Pipeline): 2; PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage): 2; FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder): 1; MessageToTableRow (com.google.cloud.teleport.v2.templates.KafkaToBigQuery.MessageToTableRow): 1; FailsafeJsonToTableRow (com.google.cloud.teleport.v2.transforms.BigQueryConverters.FailsafeJsonToTableRow): 1; FailedStringToTableRowFn (com.google.cloud.teleport.v2.transforms.ErrorConverters.FailedStringToTableRowFn): 1; FormatDatastreamJsonToJson (com.google.cloud.teleport.v2.transforms.FormatDatastreamJsonToJson): 1; FormatTransform (com.google.cloud.teleport.v2.transforms.FormatTransform): 1; PubSubToFailSafeElement (com.google.cloud.teleport.v2.transforms.PubSubToFailSafeElement): 1; InputUDFToTableRow (com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow): 1