
Example 1 with FailsafeElement

Use of com.google.cloud.teleport.values.FailsafeElement in the project DataflowTemplates by GoogleCloudPlatform.

From the class ErrorConvertersTest, the method testFailedStringToPubsubMessageFn:

/**
 * Test successful conversion of {@link FailsafeElement} records into {@link PubsubMessage} with
 * attributes.
 */
@Test
@Category(NeedsRunner.class)
public void testFailedStringToPubsubMessageFn() {
    String testMessage = "original-test-message";
    FailsafeElement<String, String> element = FailsafeElement.of(testMessage, testMessage);
    Instant expectedTimestamp = Instant.now();
    String errorMessage = "my-error-message";
    element.setErrorMessage(errorMessage);
    TimestampedValue<FailsafeElement<String, String>> input = TimestampedValue.of(element, expectedTimestamp);
    PCollection<PubsubMessage> pCollection =
        pipeline
            .apply(
                Create.timestamped(input)
                    .withCoder(FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())))
            .apply(ParDo.of(new FailedStringToPubsubMessageFn()));
    String expectedTimestampString =
        FailedStringToPubsubMessageFn.TIMESTAMP_FORMATTER.print(
            expectedTimestamp.toDateTime(DateTimeZone.UTC));
    PAssert.that(pCollection).satisfies(collection -> {
        PubsubMessage actual = collection.iterator().next();
        assertThat(new String(actual.getPayload(), StandardCharsets.UTF_8), is(equalTo(testMessage)));
        assertThat(actual.getAttribute(FailedStringToPubsubMessageFn.ERROR_MESSAGE), is(equalTo(errorMessage)));
        assertThat(actual.getAttribute(FailedStringToPubsubMessageFn.TIMESTAMP), is(equalTo(expectedTimestampString)));
        return null;
    });
    pipeline.run();
}
Also used: Instant (org.joda.time.Instant), FailedStringToPubsubMessageFn (com.google.cloud.teleport.templates.common.ErrorConverters.FailedStringToPubsubMessageFn), PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage), FailsafeElement (com.google.cloud.teleport.values.FailsafeElement), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
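
The transform under test turns each failed FailsafeElement into a PubsubMessage whose payload is the original record and whose attributes carry the error message and the element's timestamp. The sketch below shows what such a DoFn could look like; it is an illustration only, and the attribute values, formatter choice, and method body are assumptions rather than the template's actual implementation (the constant names ERROR_MESSAGE, TIMESTAMP, and TIMESTAMP_FORMATTER are taken from the test above).

/** Sketch of a FailsafeElement-to-PubsubMessage DoFn; not the template's real code. */
static class FailedStringToPubsubMessageFnSketch
    extends DoFn<FailsafeElement<String, String>, PubsubMessage> {

    static final String ERROR_MESSAGE = "errorMessage"; // assumed attribute key
    static final String TIMESTAMP = "timestamp"; // assumed attribute key
    static final DateTimeFormatter TIMESTAMP_FORMATTER = ISODateTimeFormat.dateTime(); // assumed format

    @ProcessElement
    public void processElement(ProcessContext context) {
        FailsafeElement<String, String> failsafeElement = context.element();
        Map<String, String> attributes = new HashMap<>();
        // Record the element timestamp in UTC, mirroring the assertion in the test above.
        attributes.put(
            TIMESTAMP,
            TIMESTAMP_FORMATTER.print(context.timestamp().toDateTime(DateTimeZone.UTC)));
        if (failsafeElement.getErrorMessage() != null) {
            attributes.put(ERROR_MESSAGE, failsafeElement.getErrorMessage());
        }
        // The original payload becomes the message body.
        byte[] payload = failsafeElement.getOriginalPayload().getBytes(StandardCharsets.UTF_8);
        context.output(new PubsubMessage(payload, attributes));
    }
}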

Example 2 with FailsafeElement

Use of com.google.cloud.teleport.values.FailsafeElement in the project DataflowTemplates by GoogleCloudPlatform.

From the class PubSubToBigQuery, the method run:

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {
    Pipeline pipeline = Pipeline.create(options);
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(CODER.getEncodedTypeDescriptor(), CODER);
    /*
     * Steps:
     *  1) Read messages in from Pub/Sub
     *  2) Transform the PubsubMessages into TableRows
     *     - Transform message payload via UDF
     *     - Convert UDF result to TableRow objects
     *  3) Write successful records out to BigQuery
     *  4) Write failed records out to BigQuery
     */
    /*
     * Step #1: Read messages in from Pub/Sub
     * Either from a Subscription or Topic
     */
    PCollection<PubsubMessage> messages = null;
    if (options.getUseSubscription()) {
        messages = pipeline.apply("ReadPubSubSubscription", PubsubIO.readMessagesWithAttributes().fromSubscription(options.getInputSubscription()));
    } else {
        messages = pipeline.apply("ReadPubSubTopic", PubsubIO.readMessagesWithAttributes().fromTopic(options.getInputTopic()));
    }
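    /*
     * Step #2: Transform the PubsubMessages into TableRows
     */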
    PCollectionTuple convertedTableRows = messages.apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));
    /*
     * Step #3: Write the successful records out to BigQuery
     */
    WriteResult writeResult =
        convertedTableRows
            .get(TRANSFORM_OUT)
            .apply(
                "WriteSuccessfulRecords",
                BigQueryIO.writeTableRows()
                    .withoutValidation()
                    .withCreateDisposition(CreateDisposition.CREATE_NEVER)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                    .withExtendedErrorInfo()
                    .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                    .to(options.getOutputTableSpec()));
    /*
     * Step 3 Contd.
     * Elements that failed inserts into BigQuery are extracted and converted to FailsafeElement
     */
    PCollection<FailsafeElement<String, String>> failedInserts =
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "WrapInsertionErrors",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via((BigQueryInsertError e) -> wrapBigQueryInsertError(e)))
            .setCoder(FAILSAFE_ELEMENT_CODER);
    /*
     * Step #4: Write records that failed table row transformation
     * or conversion out to BigQuery deadletter table.
     */
    PCollectionList.of(
            ImmutableList.of(
                convertedTableRows.get(UDF_DEADLETTER_OUT),
                convertedTableRows.get(TRANSFORM_DEADLETTER_OUT)))
        .apply("Flatten", Flatten.pCollections())
        .apply(
            "WriteFailedRecords",
            ErrorConverters.WritePubsubMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    ValueProviderUtils.maybeUseDefaultDeadletterTable(
                        options.getOutputDeadletterTable(),
                        options.getOutputTableSpec(),
                        DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                .build());
    // Step #4 Contd.: Insert records that failed insertion into the deadletter table
    failedInserts.apply(
        "WriteFailedRecords",
        ErrorConverters.WriteStringMessageErrors.newBuilder()
            .setErrorRecordsTable(
                ValueProviderUtils.maybeUseDefaultDeadletterTable(
                    options.getOutputDeadletterTable(),
                    options.getOutputTableSpec(),
                    DEFAULT_DEADLETTER_TABLE_SUFFIX))
            .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
            .build());
    return pipeline.run();
}
Also used: CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult), TextToBigQueryStreaming.wrapBigQueryInsertError (com.google.cloud.teleport.templates.TextToBigQueryStreaming.wrapBigQueryInsertError), BigQueryInsertError (org.apache.beam.sdk.io.gcp.bigquery.BigQueryInsertError), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.values.FailsafeElement)
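
For completeness, a typical way to invoke this run(Options) method from a template entry point is sketched below. This is a minimal illustration assuming Options is the template's own PipelineOptions sub-interface named in the signature above; the real main method may set additional options before running.

public static void main(String[] args) {
    // Parse and validate the pipeline options from the command-line arguments.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // run() returns immediately; call waitUntilFinish() on the returned result to block if needed.
    run(options);
}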

Example 3 with FailsafeElement

Use of com.google.cloud.teleport.values.FailsafeElement in the project DataflowTemplates by GoogleCloudPlatform.

From the class TextToBigQueryStreaming, the method run:

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(TextToBigQueryStreamingOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    /*
     * Steps:
     *  1) Read from the text source continuously.
     *  2) Convert to FailsafeElement.
     *  3) Apply JavaScript UDF transformation.
     *    - Tag records that were successfully transformed and those
     *      that failed transformation.
     *  4) Convert records to TableRow.
     *    - Tag records that were successfully converted and those
     *      that failed conversion.
     *  5) Insert successfully converted records into BigQuery.
     *    - Errors encountered while streaming will be sent to deadletter table.
     *  6) Insert records that failed into deadletter table.
     */
    PCollectionTuple transformedOutput =
        pipeline
            .apply(
                "ReadFromSource",
                TextIO.read()
                    .from(options.getInputFilePattern())
                    .watchForNewFiles(DEFAULT_POLL_INTERVAL, Growth.never()))
            .apply(
                "ConvertToFailsafeElement",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(input -> FailsafeElement.of(input, input)))
            .apply(
                "ApplyUDFTransformation",
                FailsafeJavascriptUdf.<String>newBuilder()
                    .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setSuccessTag(UDF_OUT)
                    .setFailureTag(UDF_DEADLETTER_OUT)
                    .build());
    PCollectionTuple convertedTableRows =
        transformedOutput
            .get(UDF_OUT)
            .apply(
                "ConvertJSONToTableRow",
                FailsafeJsonToTableRow.<String>newBuilder()
                    .setSuccessTag(TRANSFORM_OUT)
                    .setFailureTag(TRANSFORM_DEADLETTER_OUT)
                    .build());
    WriteResult writeResult =
        convertedTableRows
            .get(TRANSFORM_OUT)
            .apply(
                "InsertIntoBigQuery",
                BigQueryIO.writeTableRows()
                    .withJsonSchema(getSchemaFromGCS(options.getJSONPath()))
                    .to(options.getOutputTable())
                    .withExtendedErrorInfo()
                    .withoutValidation()
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND)
                    .withMethod(Method.STREAMING_INSERTS)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                    .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));
    // Elements that failed inserts into BigQuery are extracted and converted to FailsafeElement
    PCollection<FailsafeElement<String, String>> failedInserts =
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "WrapInsertionErrors",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(TextToBigQueryStreaming::wrapBigQueryInsertError));
    // 6) Insert records that failed transformation, conversion, or insertion into the deadletter table
    PCollectionList.of(
            ImmutableList.of(
                transformedOutput.get(UDF_DEADLETTER_OUT),
                convertedTableRows.get(TRANSFORM_DEADLETTER_OUT),
                failedInserts))
        .apply("Flatten", Flatten.pCollections())
        .apply(
            "WriteFailedRecords",
            WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(
                    ValueProviderUtils.maybeUseDefaultDeadletterTable(
                        options.getOutputDeadletterTable(),
                        options.getOutputTable(),
                        DEFAULT_DEADLETTER_TABLE_SUFFIX))
                .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
                .build());
    return pipeline.run();
}
Also used: CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.values.FailsafeElement)
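
The method reference TextToBigQueryStreaming::wrapBigQueryInsertError used in the WrapInsertionErrors step converts each BigQueryInsertError into a FailsafeElement<String, String> whose payload is the failed row and whose error message describes the insert failure. A minimal sketch of such a helper is shown below; the exact serialization of the row and error is an assumption, not a copy of the template's implementation.

// Sketch only: wraps a failed BigQuery insert into a FailsafeElement for the deadletter path.
protected static FailsafeElement<String, String> wrapBigQueryInsertError(BigQueryInsertError insertError) {
    // getRow() is the TableRow that failed; getError() carries the insert error details.
    // toString() is used here for brevity; the real helper may serialize these differently.
    String rowPayload = insertError.getRow().toString();
    FailsafeElement<String, String> failsafeElement = FailsafeElement.of(rowPayload, rowPayload);
    failsafeElement.setErrorMessage(insertError.getError().toString());
    return failsafeElement;
}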

Example 4 with FailsafeElement

Use of com.google.cloud.teleport.values.FailsafeElement in the project DataflowTemplates by GoogleCloudPlatform.

From the class PubSubToSplunk, the method run:

/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(PubSubToSplunkOptions options) {
    Pipeline pipeline = Pipeline.create(options);
    // Register coders.
    CoderRegistry registry = pipeline.getCoderRegistry();
    registry.registerCoderForClass(SplunkEvent.class, SplunkEventCoder.of());
    registry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    /*
     * Steps:
     *  1) Read messages in from Pub/Sub
     *  2) Convert message to FailsafeElement for processing.
     *  3) Apply user-provided UDF (if any) on the input strings.
     *  4) Convert successfully transformed messages into SplunkEvent objects
     *  5) Write SplunkEvents to Splunk's HEC end point.
     *  5a) Wrap write failures into a FailsafeElement.
     *  6) Collect errors from UDF transform (#3), SplunkEvent transform (#4)
     *     and writing to Splunk HEC (#5) and stream into a Pub/Sub deadletter topic.
     */
    // 1) Read messages in from Pub/Sub
    PCollection<String> stringMessages = pipeline.apply("ReadMessages", new ReadMessages(options.getInputSubscription(), options.getIncludePubsubMessage()));
    // 2) Convert message to FailsafeElement for processing.
    // 3) Apply user-provided UDF (if any) on the input strings.
    PCollectionTuple transformedOutput =
        stringMessages
            .apply(
                "ConvertToFailsafeElement",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(input -> FailsafeElement.of(input, input)))
            .apply(
                "ApplyUDFTransformation",
                FailsafeJavascriptUdf.<String>newBuilder()
                    .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setLoggingEnabled(ValueProvider.StaticValueProvider.of(true))
                    .setSuccessTag(UDF_OUT)
                    .setFailureTag(UDF_DEADLETTER_OUT)
                    .build());
    // 4) Convert successfully transformed messages into SplunkEvent objects
    PCollectionTuple convertToEventTuple =
        transformedOutput
            .get(UDF_OUT)
            .apply(
                "ConvertToSplunkEvent",
                SplunkConverters.failsafeStringToSplunkEvent(SPLUNK_EVENT_OUT, SPLUNK_EVENT_DEADLETTER_OUT));
    // 5) Write SplunkEvents to Splunk's HEC end point.
    PCollection<SplunkWriteError> writeErrors =
        convertToEventTuple
            .get(SPLUNK_EVENT_OUT)
            .apply(
                "WriteToSplunk",
                SplunkIO.writeBuilder()
                    .withToken(
                        new TokenNestedValueProvider(
                            options.getTokenSecretId(),
                            options.getTokenKMSEncryptionKey(),
                            options.getToken(),
                            options.getTokenSource()))
                    .withUrl(options.getUrl())
                    .withBatchCount(options.getBatchCount())
                    .withParallelism(options.getParallelism())
                    .withDisableCertificateValidation(options.getDisableCertificateValidation())
                    .withRootCaCertificatePath(options.getRootCaCertificatePath())
                    .withEnableBatchLogs(options.getEnableBatchLogs())
                    .build());
    // 5a) Wrap write failures into a FailsafeElement.
    PCollection<FailsafeElement<String, String>> wrappedSplunkWriteErrors = writeErrors.apply("WrapSplunkWriteErrors", ParDo.of(new DoFn<SplunkWriteError, FailsafeElement<String, String>>() {

        @ProcessElement
        public void processElement(ProcessContext context) {
            SplunkWriteError error = context.element();
            FailsafeElement<String, String> failsafeElement = FailsafeElement.of(error.payload(), error.payload());
            if (error.statusMessage() != null) {
                failsafeElement.setErrorMessage(error.statusMessage());
            }
            if (error.statusCode() != null) {
                failsafeElement.setErrorMessage(String.format("Splunk write status code: %d", error.statusCode()));
            }
            context.output(failsafeElement);
        }
    }));
    // 6) Collect errors from the UDF transform (#3), the SplunkEvent transform (#4),
    // and writing to Splunk HEC (#5), and stream them into a Pub/Sub deadletter topic.
    PCollectionList.of(
            ImmutableList.of(
                convertToEventTuple.get(SPLUNK_EVENT_DEADLETTER_OUT),
                wrappedSplunkWriteErrors,
                transformedOutput.get(UDF_DEADLETTER_OUT)))
        .apply("FlattenErrors", Flatten.pCollections())
        .apply(
            "WriteFailedRecords",
            ErrorConverters.WriteStringMessageErrorsToPubSub.newBuilder()
                .setErrorRecordsTopic(options.getOutputDeadletterTopic())
                .build());
    return pipeline.run();
}
Also used: SplunkWriteError (com.google.cloud.teleport.splunk.SplunkWriteError), Pipeline (org.apache.beam.sdk.Pipeline), TokenNestedValueProvider (com.google.cloud.teleport.util.TokenNestedValueProvider), FailsafeElement (com.google.cloud.teleport.values.FailsafeElement), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), DoFn (org.apache.beam.sdk.transforms.DoFn), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)
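
Across all four examples, FailsafeElement is the common error-handling wrapper: it pairs the original payload with the current (possibly transformed) payload and, once a step fails, an error message describing what went wrong. A stripped-down sketch of such a value class is given below, modeled only on how the examples above use it; the real class in com.google.cloud.teleport.values likely carries additional state (such as a stack trace) and is encoded with FailsafeElementCoder.

// Illustrative sketch of a FailsafeElement-style wrapper, not the actual Teleport class.
public class FailsafeElementSketch<OriginalT, CurrentT> {

    private final OriginalT originalPayload; // the input as it was first read
    private final CurrentT payload;          // the payload after the latest transform
    private String errorMessage;             // populated only when a step fails

    private FailsafeElementSketch(OriginalT originalPayload, CurrentT payload) {
        this.originalPayload = originalPayload;
        this.payload = payload;
    }

    public static <OriginalT, CurrentT> FailsafeElementSketch<OriginalT, CurrentT> of(
            OriginalT originalPayload, CurrentT payload) {
        return new FailsafeElementSketch<>(originalPayload, payload);
    }

    public OriginalT getOriginalPayload() {
        return originalPayload;
    }

    public CurrentT getPayload() {
        return payload;
    }

    public String getErrorMessage() {
        return errorMessage;
    }

    public FailsafeElementSketch<OriginalT, CurrentT> setErrorMessage(String errorMessage) {
        this.errorMessage = errorMessage;
        return this;
    }
}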

Aggregations

FailsafeElement (com.google.cloud.teleport.values.FailsafeElement): 4 usages
Pipeline (org.apache.beam.sdk.Pipeline): 3 usages
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 3 usages
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 3 usages
WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult): 2 usages
PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage): 2 usages
SplunkWriteError (com.google.cloud.teleport.splunk.SplunkWriteError): 1 usage
TextToBigQueryStreaming.wrapBigQueryInsertError (com.google.cloud.teleport.templates.TextToBigQueryStreaming.wrapBigQueryInsertError): 1 usage
FailedStringToPubsubMessageFn (com.google.cloud.teleport.templates.common.ErrorConverters.FailedStringToPubsubMessageFn): 1 usage
TokenNestedValueProvider (com.google.cloud.teleport.util.TokenNestedValueProvider): 1 usage
BigQueryInsertError (org.apache.beam.sdk.io.gcp.bigquery.BigQueryInsertError): 1 usage
DoFn (org.apache.beam.sdk.transforms.DoFn): 1 usage
Instant (org.joda.time.Instant): 1 usage
Test (org.junit.Test): 1 usage
Category (org.junit.experimental.categories.Category): 1 usage