Example 1 with FailsafeElementCoder

Use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in the DataflowTemplates project by GoogleCloudPlatform.

From class ErrorConvertersTest, method testFailedStringMessageToTableRowFn:

/**
 * Tests that {@link ErrorConverters.FailedStringToTableRowFn} properly formats failed String
 * objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedStringMessageToTableRowFn() {
    // Test input
    final String message = "Super secret";
    final String errorMessage = "Failed to parse input JSON";
    final String stacktrace = "Error at com.google.cloud.teleport.TextToBigQueryStreaming";
    final FailsafeElement<String, String> input =
        FailsafeElement.of(message, message)
            .setErrorMessage(errorMessage)
            .setStacktrace(stacktrace);
    final Instant timestamp = new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();
    // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
    // many transforms.
    FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    // Build pipeline
    PCollection<TableRow> output =
        pipeline
            .apply(
                "CreateInput",
                Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
            .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()));
    // Assert
    PAssert.that(output).satisfies(collection -> {
        final TableRow result = collection.iterator().next();
        assertThat(result.get("timestamp")).isEqualTo("2022-02-22 22:22:22.222000");
        assertThat(result.get("attributes")).isNull();
        assertThat(result.get("payloadString")).isEqualTo(message);
        assertThat(result.get("payloadBytes")).isNotNull();
        assertThat(result.get("errorMessage")).isEqualTo(errorMessage);
        assertThat(result.get("stacktrace")).isEqualTo(stacktrace);
        return null;
    });
    // Execute pipeline
    pipeline.run();
}
Also used: FailedStringToTableRowFn (com.google.cloud.teleport.v2.transforms.ErrorConverters.FailedStringToTableRowFn), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), Instant (org.joda.time.Instant), TableRow (com.google.api.services.bigquery.model.TableRow), DateTime (org.joda.time.DateTime), Test (org.junit.Test)
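
For reference, the coder itself can also be exercised outside a pipeline. The sketch below is illustrative only: it assumes Beam's CoderUtils helpers (org.apache.beam.sdk.util.CoderUtils), which throw the checked CoderException, and reuses the FailsafeElement accessors shown in these examples.

// Minimal round-trip sketch, assuming CoderUtils from the Beam SDK.
// Call from a method that declares (or handles) CoderException.
FailsafeElementCoder<String, String> coder =
    FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
FailsafeElement<String, String> element =
    FailsafeElement.of("raw record", "raw record").setErrorMessage("parse failure");
byte[] encoded = CoderUtils.encodeToByteArray(coder, element);
FailsafeElement<String, String> decoded = CoderUtils.decodeFromByteArray(coder, encoded);
// decoded.getOriginalPayload() and decoded.getPayload() both return "raw record".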

Example 2 with FailsafeElementCoder

Use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in the DataflowTemplates project by GoogleCloudPlatform.

From class GCSToElasticsearchTest, method testGCSToElasticsearchHeadersE2E:

/**
 * Tests the {@link GCSToElasticsearch} pipeline using the headers of the CSV to parse it.
 */
@Test
public void testGCSToElasticsearchHeadersE2E() {
    final String record = "007,CA,26.23";
    final String stringJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":\"26.23\"}";
    final FailsafeElementCoder<String, String> coder =
        FailsafeElementCoder.of(
            NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    GCSToElasticsearchOptions options = PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setContainsHeaders(true);
    options.setInputFileSpec(HEADER_CSV_FILE_PATH);
    options.setApiKey("key");
    // Build the pipeline, reading the header row from the CSV.
    PCollectionTuple readCsvOut =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                    .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                    .build());
    // Assert
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT)).satisfies(collection -> {
        FailsafeElement<String, String> element = collection.iterator().next();
        assertThat(element.getOriginalPayload(), is(equalTo(record)));
        assertThat(element.getPayload(), is(equalTo(stringJsonRecord)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
}
Also used: CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Test (org.junit.Test)
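
Note the difference from Example 1: here the payload coders are wrapped in NullableCoder. A minimal sketch of why, assuming Beam's NullableCoder and CoderUtils behave as documented (hypothetical values, CoderException handling omitted):

// NullableCoder can encode a null payload; plain StringUtf8Coder cannot.
NullableCoder<String> nullable = NullableCoder.of(StringUtf8Coder.of());
byte[] bytes = CoderUtils.encodeToByteArray(nullable, null); // succeeds
// CoderUtils.encodeToByteArray(StringUtf8Coder.of(), null) would throw a
// CoderException, since UTF-8 strings cannot be null.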

Example 3 with FailsafeElementCoder

Use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in the DataflowTemplates project by GoogleCloudPlatform.

From class GCSToElasticsearchTest, method testGCSToElasticsearchJsonSchemaE2E:

/**
 * Tests the {@link GCSToElasticsearch} pipeline using a JSON schema to parse the CSV.
 */
@Test
public void testGCSToElasticsearchJsonSchemaE2E() {
    final String record = "007,CA,26.23";
    final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";
    final FailsafeElementCoder<String, String> coder =
        FailsafeElementCoder.of(
            NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    GCSToElasticsearchOptions options = PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setJsonSchemaPath(JSON_SCHEMA_FILE_PATH);
    options.setContainsHeaders(false);
    options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);
    options.setApiKey("key");
    // Build pipeline with no headers.
    PCollectionTuple readCsvOut =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                    .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                    .build());
    // Assert
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT)).satisfies(collection -> {
        FailsafeElement<String, String> element = collection.iterator().next();
        assertThat(element.getOriginalPayload(), is(equalTo(record)));
        assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
}
Also used: CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement), Test (org.junit.Test)
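
The observable difference from Example 2 is the typing of "price": with headers alone the value stays a string ("26.23"), while the JSON schema lets the converter emit a bare number (26.23). A hypothetical schema file along these lines would produce that output; the actual JSON_SCHEMA_FILE_PATH resource may differ:

// Illustrative schema content only; field names match the test's CSV columns.
// [
//   {"name": "id",    "type": "STRING"},
//   {"name": "state", "type": "STRING"},
//   {"name": "price", "type": "DOUBLE"}
// ]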

Example 4 with FailsafeElementCoder

Use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in the DataflowTemplates project by GoogleCloudPlatform.

From class BigQueryConvertersTest, method testFailsafeJsonToTableRowValidInput:

/**
 * Tests the {@link BigQueryConverters.FailsafeJsonToTableRow} transform with good input.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowValidInput() {
    // Test input
    final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
    final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
    final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);
    final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);
    // Expected Output
    TableRow expectedRow = new TableRow().set("ticker", "GOOGL").set("price", 1006.94);
    // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
    // many transforms.
    FailsafeElementCoder<PubsubMessage, String> coder = FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    // Build the pipeline
    PCollectionTuple output =
        pipeline
            .apply("CreateInput", Create.of(input).withCoder(coder))
            .apply(
                "JsonToTableRow",
                FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                    .setSuccessTag(TABLE_ROW_TAG)
                    .setFailureTag(FAILSAFE_ELM_TAG)
                    .build());
    // Assert
    PAssert.that(output.get(TABLE_ROW_TAG)).containsInAnyOrder(expectedRow);
    PAssert.that(output.get(FAILSAFE_ELM_TAG)).empty();
    // Execute the test
    pipeline.run();
}
Also used: CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), FailsafeJsonToTableRow (com.google.cloud.teleport.v2.transforms.BigQueryConverters.FailsafeJsonToTableRow), TableRow (com.google.api.services.bigquery.model.TableRow), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
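
The failure branch can be exercised the same way. Below is a hedged sketch of a companion test (not from the repository), reusing the tags and pipeline rule above and assuming FailsafeJsonToTableRow routes unparseable payloads to the failure tag instead of throwing:

@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowInvalidInput() {
    // Hypothetical malformed input; everything else mirrors the test above.
    final String badPayload = "not json";
    final PubsubMessage badMessage = new PubsubMessage(badPayload.getBytes(), ImmutableMap.of());
    final FailsafeElement<PubsubMessage, String> badInput = FailsafeElement.of(badMessage, badPayload);
    FailsafeElementCoder<PubsubMessage, String> coder =
        FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());
    pipeline.getCoderRegistry().registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    PCollectionTuple output =
        pipeline
            .apply("CreateBadInput", Create.of(badInput).withCoder(coder))
            .apply(
                "JsonToTableRow",
                FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                    .setSuccessTag(TABLE_ROW_TAG)
                    .setFailureTag(FAILSAFE_ELM_TAG)
                    .build());
    // Nothing should parse successfully; the element lands on the failure tag.
    PAssert.that(output.get(TABLE_ROW_TAG)).empty();
    pipeline.run();
}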

Example 5 with FailsafeElementCoder

Use of com.google.cloud.teleport.v2.coders.FailsafeElementCoder in the DataflowTemplates project by GoogleCloudPlatform.

From class DataStreamIO, method expandDataStreamJsonStrings:

public PCollection<FailsafeElement<String, String>> expandDataStreamJsonStrings(
        PCollection<ReadableFile> datastreamFiles) {
    PCollection<FailsafeElement<String, String>> datastreamRecords;
    FailsafeElementCoder<String, String> coder =
        FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    if (this.fileType.equals(JSON_SUFFIX)) {
        datastreamRecords =
            datastreamFiles
                .apply(
                    "FileReadConcurrency",
                    Reshuffle.<ReadableFile>viaRandomKey().withNumBuckets(fileReadConcurrency))
                .apply("ReadFiles", TextIO.readFiles())
                .apply("ReshuffleRecords", Reshuffle.viaRandomKey())
                .apply(
                    "ParseJsonRecords",
                    ParDo.of(
                        FormatDatastreamJsonToJson.create()
                            .withStreamName(this.streamName)
                            .withHashColumnValues(this.hashedColumns)
                            .withLowercaseSourceColumns(this.lowercaseSourceColumns)))
                .setCoder(coder);
    } else {
        SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn =
            FormatDatastreamRecordToJson.create()
                .withStreamName(this.streamName)
                .withHashColumnValues(this.hashedColumns)
                .withLowercaseSourceColumns(this.lowercaseSourceColumns);
        datastreamRecords =
            datastreamFiles
                .apply("ReshuffleFiles", Reshuffle.<ReadableFile>viaRandomKey())
                .apply(
                    "ParseAvroRows",
                    ParDo.of(
                        new ReadFileRangesFn<FailsafeElement<String, String>>(
                            new CreateParseSourceFn(parseFn, coder),
                            new ReadFileRangesFn.ReadFileRangesFnExceptionHandler())))
                .setCoder(coder);
    }
    return datastreamRecords.apply("Reshuffle", Reshuffle.viaRandomKey());
}
Also used: FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder), GenericRecord (org.apache.avro.generic.GenericRecord), ReadableFile (org.apache.beam.sdk.io.FileIO.ReadableFile), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
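
The repeated Reshuffle.viaRandomKey() steps above break fusion: they checkpoint the collection and redistribute elements across workers so that file reading and record parsing can scale independently. A minimal, self-contained sketch of that pattern, with hypothetical data and assuming a runner such as Beam's direct runner on the classpath:

// Sketch only: demonstrates Reshuffle.viaRandomKey() between two stages.
Pipeline p = Pipeline.create();
p.apply("Lines", Create.of("a", "b", "c"))
    .apply("BreakFusion", Reshuffle.viaRandomKey())
    .apply(
        "Print",
        ParDo.of(
            new DoFn<String, Void>() {
                @ProcessElement
                public void processElement(@Element String line) {
                    System.out.println(line);
                }
            }));
p.run().waitUntilFinish();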

Aggregations

CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 10
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 9
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 8
Test (org.junit.Test): 8
TableRow (com.google.api.services.bigquery.model.TableRow): 4
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 3
KV (org.apache.beam.sdk.values.KV): 3
DateTime (org.joda.time.DateTime): 3
Instant (org.joda.time.Instant): 3
ArrayList (java.util.ArrayList): 2
Pipeline (org.apache.beam.sdk.Pipeline): 2
PubsubMessage (org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage): 2
FailsafeElementCoder (com.google.cloud.teleport.v2.coders.FailsafeElementCoder): 1
MessageToTableRow (com.google.cloud.teleport.v2.templates.KafkaToBigQuery.MessageToTableRow): 1
FailsafeJsonToTableRow (com.google.cloud.teleport.v2.transforms.BigQueryConverters.FailsafeJsonToTableRow): 1
FailedStringToTableRowFn (com.google.cloud.teleport.v2.transforms.ErrorConverters.FailedStringToTableRowFn): 1
FormatTransform (com.google.cloud.teleport.v2.transforms.FormatTransform): 1
PubSubToFailSafeElement (com.google.cloud.teleport.v2.transforms.PubSubToFailSafeElement): 1
InputUDFToTableRow (com.google.cloud.teleport.v2.transforms.UDFTextTransformer.InputUDFToTableRow): 1
HashMap (java.util.HashMap): 1