
Example 21 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

The class KafkaToPubsub, method run.

/**
 * Runs a pipeline which reads messages from Kafka and writes them to Pub/Sub.
 *
 * @param options arguments to the pipeline
 */
public static PipelineResult run(KafkaToPubsubOptions options) {
    List<String> topicsList =
        new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));
    checkArgument(
        topicsList.size() > 0 && topicsList.stream().allMatch((s) -> s.trim().length() > 0),
        "inputTopics cannot be an empty string.");
    List<String> bootstrapServersList =
        new ArrayList<>(Arrays.asList(options.getBootstrapServers().split(",")));
    checkArgument(
        bootstrapServersList.size() > 0
            && bootstrapServersList.stream().allMatch((s) -> s.trim().length() > 0),
        "bootstrapServers cannot be an empty string.");
    // Configure Kafka consumer properties
    Map<String, Object> kafkaConfig = new HashMap<>();
    Map<String, String> sslConfig = null;
    if (options.getSecretStoreUrl() != null && options.getVaultToken() != null) {
        Map<String, Map<String, String>> credentials = getKafkaCredentialsFromVault(options.getSecretStoreUrl(), options.getVaultToken());
        kafkaConfig = configureKafka(credentials.get(KafkaPubsubConstants.KAFKA_CREDENTIALS));
        sslConfig = credentials.get(KafkaPubsubConstants.SSL_CREDENTIALS);
    } else {
        LOG.warn("No information to retrieve Kafka credentials was provided. " + "Trying to initiate an unauthorized connection.");
    }
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    FailsafeElementCoder<KV<String, String>, String> coder =
        FailsafeElementCoder.of(
            KvCoder.of(
                NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of())),
            NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    TypeDescriptor<String> stringTypeDescriptor = TypeDescriptors.strings();
    LOG.info("Starting Kafka-To-PubSub Pipeline with parameters bootstrap servers:{} input topics:{}" + " output pubsub topic:{} ", options.getBootstrapServers(), options.getInputTopics(), options.getOutputTopic());
    /*
     * Steps:
     *  1) Read messages in from Kafka
     *  2) Transform message payload via UDF
     *  3) Write successful records out to Pub/Sub
     *  4) Write failed records out to Pub/Sub dead-letter topic
     */
    /* Steps #1 & #2: Read messages in from Kafka and transform them via the UDF */
    PCollectionTuple appliedUdf =
        pipeline
            .apply(
                "readFromKafka",
                readFromKafka(options.getBootstrapServers(), topicsList, kafkaConfig, sslConfig))
            .apply("applyUDF", new FormatTransform.UdfProcess(options));
    /* Step #3: Write the successful records out to Pub/Sub */
    appliedUdf
        .get(KafkaPubsubConstants.UDF_OUT)
        .apply(
            "getSuccessUDFOutElements",
            MapElements.into(stringTypeDescriptor).via(FailsafeElement::getPayload))
        .setCoder(NullableCoder.of(StringUtf8Coder.of()))
        .apply("writeSuccessMessages", PubsubIO.writeStrings().to(options.getOutputTopic()));
    /* Step #4: Write failed messages out to Pub/Sub */
    if (options.getOutputDeadLetterTopic() != null) {
        appliedUdf
            .get(KafkaPubsubConstants.UDF_DEADLETTER_OUT)
            .apply(
                "getFailedMessages",
                MapElements.into(TypeDescriptors.kvs(stringTypeDescriptor, stringTypeDescriptor))
                    .via(FailsafeElement::getOriginalPayload))
            .apply(
                "extractMessageValues",
                MapElements.into(stringTypeDescriptor).via(KV<String, String>::getValue))
            .setCoder(NullableCoder.of(StringUtf8Coder.of()))
            .apply(
                "writeFailureMessages",
                PubsubIO.writeStrings().to(options.getOutputDeadLetterTopic()));
    }
    return pipeline.run();
}
Also used: HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), FormatTransform (com.google.cloud.teleport.v2.transforms.FormatTransform), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), Map (java.util.Map)
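
For context, a template like this is normally launched from a small main method that parses command-line arguments into the options class. The sketch below is an assumption about what that entry point looks like, not code taken from the example above; the class name KafkaToPubsubLauncher is hypothetical.

import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class KafkaToPubsubLauncher {

    // Hypothetical entry point: parse and validate the CLI arguments into
    // KafkaToPubsubOptions, then hand off to the run method shown above.
    public static void main(String[] args) {
        KafkaToPubsubOptions options =
            PipelineOptionsFactory.fromArgs(args).withValidation().as(KafkaToPubsubOptions.class);
        KafkaToPubsub.run(options);
    }
}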

Example 22 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

The class PubsubProtoToBigQuery, method runUdf.

/**
 * Handles running the UDF.
 *
 * <p>If {@code options} is not configured to run a UDF, the UDF is skipped and the input is
 * returned unchanged.
 *
 * <p>This may add a branch to the pipeline for outputting failed UDF records to an unprocessed
 * topic.
 *
 * @param jsonCollection {@link PCollection} of JSON strings for use as input to the UDF
 * @param options the options containing info on running the UDF
 * @return the {@link PCollection} of UDF output as JSON or {@code jsonCollection} if UDF not
 *     called
 */
@VisibleForTesting
static PCollection<String> runUdf(PCollection<String> jsonCollection, PubSubProtoToBigQueryOptions options) {
    // If no UDF path is configured, running a UDF is clearly not
    // intended; simply return the input as "success" output.
    if (Strings.isNullOrEmpty(options.getJavascriptTextTransformGcsPath())) {
        return jsonCollection;
    }
    // If a UDF path is set, the function name must also have a value.
    if (Strings.isNullOrEmpty(options.getJavascriptTextTransformFunctionName())) {
        throw new IllegalArgumentException("JavaScript function name cannot be null or empty if file is set");
    }
    PCollectionTuple maybeSuccess = jsonCollection.apply("Run UDF", new RunUdf(options));
    maybeSuccess
        .get(UDF_FAILURE_TAG)
        .setCoder(FAILSAFE_CODER)
        .apply(
            "Get UDF Failures",
            ConvertFailsafeElementToPubsubMessage.<String, String>builder()
                .setOriginalPayloadSerializeFn(s -> ArrayUtils.toObject(s.getBytes(UTF_8)))
                .setErrorMessageAttributeKey("udfErrorMessage")
                .build())
        .apply("Write Failed UDF", writeUdfFailures(options));
    return maybeSuccess
        .get(UDF_SUCCESS_TAG)
        .setCoder(FAILSAFE_CODER)
        .apply(
            "Get UDF Output",
            MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
        .setCoder(NullableCoder.of(StringUtf8Coder.of()));
}
Also used: PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
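
runUdf references UDF_SUCCESS_TAG, UDF_FAILURE_TAG, and FAILSAFE_CODER, which are declared elsewhere in the class. Below is a plausible sketch of those declarations, assuming String-to-String failsafe elements; the exact constants in the repository may differ.

import com.google.cloud.teleport.v2.coders.FailsafeElementCoder;
import com.google.cloud.teleport.v2.values.FailsafeElement;
import org.apache.beam.sdk.coders.NullableCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.values.TupleTag;

// Assumed declarations: one tag per UDF branch, plus the coder used to
// encode FailsafeElement<String, String> on both branches.
private static final TupleTag<FailsafeElement<String, String>> UDF_SUCCESS_TAG =
    new TupleTag<FailsafeElement<String, String>>() {};
private static final TupleTag<FailsafeElement<String, String>> UDF_FAILURE_TAG =
    new TupleTag<FailsafeElement<String, String>>() {};
private static final FailsafeElementCoder<String, String> FAILSAFE_CODER =
    FailsafeElementCoder.of(
        NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));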

Example 23 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

The class GCSToSplunkTest, method testGCSToSplunkReadHeaders.

@Test
public void testGCSToSplunkReadHeaders() {
    // Arrange
    String stringifiedJsonRecord = "{\"id\":\"008\",\"state\":\"CA\",\"price\":\"26.23\"}";
    SplunkEvent expectedSplunkEvent = SplunkEvent.newBuilder().withEvent(stringifiedJsonRecord).create();
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForClass(SplunkEvent.class, SplunkEventCoder.of());
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    GCSToSplunkOptions options = PipelineOptionsFactory.create().as(GCSToSplunkOptions.class);
    options.setContainsHeaders(true);
    options.setInputFileSpec(HEADER_CSV_FILE_PATH);
    // Act
    PCollectionTuple readCsvOut = pipeline.apply("Read CSV", readFromCsv(options));
    PCollectionTuple transformedLines = readCsvOut.apply("Convert to JSON", convertToFailsafeAndMaybeApplyUdf(options));
    PCollectionTuple splunkEventTuple = transformedLines.get(UDF_OUT).apply("Convert to Splunk Event", convertToSplunkEvent());
    // Assert
    PAssert.that(transformedLines.get(UDF_OUT)).satisfies(collection -> {
        FailsafeElement element = collection.iterator().next();
        assertThat(element.getPayload()).isEqualTo(stringifiedJsonRecord);
        return null;
    });
    PAssert.that(transformedLines.get(UDF_ERROR_OUT)).empty();
    PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_OUT)).containsInAnyOrder(expectedSplunkEvent);
    PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_ERROR_OUT)).empty();
    // Execute pipeline
    pipeline.run();
}
Also used: GCSToSplunk.convertToSplunkEvent (com.google.cloud.teleport.v2.templates.GCSToSplunk.convertToSplunkEvent), SplunkEvent (org.apache.beam.sdk.io.splunk.SplunkEvent), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), GCSToSplunkOptions (com.google.cloud.teleport.v2.templates.GCSToSplunk.GCSToSplunkOptions), GCSToSplunk.flattenErrorsAndConvertToString (com.google.cloud.teleport.v2.templates.GCSToSplunk.flattenErrorsAndConvertToString), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Test (org.junit.Test)
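
The assertion above inspects a FailsafeElement through its accessors. As a standalone illustration of that API, using only the of, getOriginalPayload, getPayload, and setErrorMessage calls seen throughout these examples, here is a minimal sketch (the demo class and sample values are hypothetical):

import com.google.cloud.teleport.v2.values.FailsafeElement;

public class FailsafeElementDemo {

    public static void main(String[] args) {
        // A FailsafeElement pairs the untouched original record with the
        // current (possibly transformed) payload, so a downstream failure
        // can always be replayed from the original input.
        FailsafeElement<String, String> element =
            FailsafeElement.of(
                "008,CA,26.23", "{\"id\":\"008\",\"state\":\"CA\",\"price\":\"26.23\"}");
        System.out.println(element.getOriginalPayload()); // 008,CA,26.23
        System.out.println(element.getPayload());         // {"id":"008","state":"CA","price":"26.23"}
        element.setErrorMessage("illustrative error");    // set when a transform fails
    }
}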

Example 24 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

The class GCSToSplunkTest, method testGCSToSplunkReadUdf.

@Test
public void testGCSToSplunkReadUdf() {
    // Arrange
    String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";
    SplunkEvent expectedSplunkEvent = SplunkEvent.newBuilder().withEvent(stringifiedJsonRecord).create();
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForClass(SplunkEvent.class, SplunkEventCoder.of());
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    GCSToSplunkOptions options = PipelineOptionsFactory.create().as(GCSToSplunkOptions.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");
    options.setContainsHeaders(false);
    options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);
    // Act
    PCollectionTuple readCsvOut = pipeline.apply("Read CSV", readFromCsv(options));
    PCollectionTuple transformedLines = readCsvOut.apply("Convert to JSON", convertToFailsafeAndMaybeApplyUdf(options));
    PCollectionTuple splunkEventTuple = transformedLines.get(UDF_OUT).apply("Convert to Splunk Event", convertToSplunkEvent());
    // Assert
    PAssert.that(transformedLines.get(UDF_OUT)).satisfies(collection -> {
        FailsafeElement element = collection.iterator().next();
        assertThat(element.getPayload()).isEqualTo(stringifiedJsonRecord);
        return null;
    });
    PAssert.that(transformedLines.get(UDF_ERROR_OUT)).empty();
    PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_OUT)).containsInAnyOrder(expectedSplunkEvent);
    PAssert.that(splunkEventTuple.get(SPLUNK_EVENT_ERROR_OUT)).empty();
    // Execute pipeline
    pipeline.run();
}
Also used: GCSToSplunk.convertToSplunkEvent (com.google.cloud.teleport.v2.templates.GCSToSplunk.convertToSplunkEvent), SplunkEvent (org.apache.beam.sdk.io.splunk.SplunkEvent), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), GCSToSplunkOptions (com.google.cloud.teleport.v2.templates.GCSToSplunk.GCSToSplunkOptions), GCSToSplunk.flattenErrorsAndConvertToString (com.google.cloud.teleport.v2.templates.GCSToSplunk.flattenErrorsAndConvertToString), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Test (org.junit.Test)
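
This test points the pipeline at a JavaScript file (TRANSFORM_FILE_PATH) exposing a function named transform. The real fixture ships with the repository; the sketch below only illustrates how such a fixture could be generated, and the UDF body it writes is a guess at a CSV-to-JSON transform, not the repository's file.

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

public class UdfFixture {

    // Writes an illustrative UDF to a temp file and returns its path.
    // The UDF body is hypothetical, not the one used by the test above.
    static String writeSampleUdf() throws Exception {
        String udf =
            "function transform(line) {\n"
                + "  var values = line.split(',');\n"
                + "  return JSON.stringify({id: values[0], state: values[1], price: values[2]});\n"
                + "}\n";
        Path path = Files.createTempFile("transform", ".js");
        Files.write(path, udf.getBytes(StandardCharsets.UTF_8));
        return path.toString();
    }
}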

Example 25 with FailsafeElement

Use of com.google.cloud.teleport.v2.values.FailsafeElement in project DataflowTemplates by GoogleCloudPlatform.

The class DeadLetterQueueManager, method getReconsumerDataTransform.

public PCollectionTuple getReconsumerDataTransform(PCollection<String> reconsumedElements) {
    return reconsumedElements.apply(ParDo.of(new DoFn<String, FailsafeElement<String, String>>() {

        @ProcessElement
        public void process(@Element String input, MultiOutputReceiver output) {
            FailsafeElement<String, String> element = FailsafeElement.of(input, input);
            // Early Return if maxRetries is set to 0
            if (maxRetries == 0) {
                output.get(RETRYABLE_ERRORS).output(element);
                return;
            }
            try {
                /*
                 * Remove error from metadata and populate error field
                 * in failsafe element.
                 */
                ObjectMapper mapper = new ObjectMapper();
                JsonNode jsonDLQElement = mapper.readTree(input);
                int retryCount = jsonDLQElement.get("_metadata_retry_count").asInt();
                if (retryCount <= maxRetries) {
                    output.get(RETRYABLE_ERRORS).output(element);
                    return;
                }
                String error = jsonDLQElement.get("_metadata_error").asText();
                element.setErrorMessage(error);
                output.get(PERMANENT_ERRORS).output(element);
            } catch (IOException e) {
                LOG.error("Issue parsing JSON record {}. Unable to continue.", input, e);
                output.get(PERMANENT_ERRORS).output(element);
            }
        }
    }).withOutputTags(RETRYABLE_ERRORS, TupleTagList.of(PERMANENT_ERRORS)));
}
Also used: JsonNode (org.codehaus.jackson.JsonNode), IOException (java.io.IOException), ObjectMapper (org.codehaus.jackson.map.ObjectMapper), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
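
A caller splits the returned tuple by tag. The fragment below sketches that pattern; the dlqManager and reconsumedElements names and the downstream comments are assumptions, while RETRYABLE_ERRORS and PERMANENT_ERRORS are the tags used above.

// Hypothetical call site: route reconsumed DLQ records by severity.
PCollectionTuple reconsumed =
    dlqManager.getReconsumerDataTransform(reconsumedElements);

// Records still under the retry budget go back into the main pipeline.
PCollection<FailsafeElement<String, String>> retryable =
    reconsumed.get(RETRYABLE_ERRORS);

// Records over the budget (or unparseable) go to a terminal error sink.
PCollection<FailsafeElement<String, String>> permanent =
    reconsumed.get(PERMANENT_ERRORS);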

Aggregations

FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 31 usages
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 26 usages
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 21 usages
Test (org.junit.Test): 21 usages
Pipeline (org.apache.beam.sdk.Pipeline): 14 usages
TableRow (com.google.api.services.bigquery.model.TableRow): 8 usages
PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage): 6 usages
DoFn (org.apache.beam.sdk.transforms.DoFn): 6 usages
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 5 usages
IntegrationTest (com.google.cloud.teleport.v2.spanner.IntegrationTest): 5 usages
JSONObject (org.json.JSONObject): 5 usages
DeadLetterQueueManager (com.google.cloud.teleport.v2.cdc.dlq.DeadLetterQueueManager): 4 usages
StringDeadLetterQueueSanitizer (com.google.cloud.teleport.v2.cdc.dlq.StringDeadLetterQueueSanitizer): 4 usages
DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO): 4 usages
FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder): 4 usages
GCSToSplunk.flattenErrorsAndConvertToString (com.google.cloud.teleport.v2.templates.GCSToSplunk.flattenErrorsAndConvertToString): 4 usages
PipelineResult (org.apache.beam.sdk.PipelineResult): 4 usages
SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig): 4 usages
KV (org.apache.beam.sdk.values.KV): 4 usages
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 3 usages