Examples with GCSToElasticsearchOptions - com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions

Example 1 with GCSToElasticsearchOptions

use of com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearch method run.

/**
 * Builds the CSV-to-Elasticsearch pipeline from the supplied options and runs it.
 *
 * <p>Stages, in order:
 * <ol>
 *   <li>Read CSV records via {@link CsvConverters.ReadCsv}.</li>
 *   <li>Convert each line to a JSON string via {@link CsvConverters.LineToFailsafeJson}.</li>
 *   <li>Write successfully converted documents via {@link WriteToElasticsearch}.</li>
 *   <li>Write elements that failed conversion to the dead-letter table via
 *       {@link WriteStringMessageErrors}.</li>
 * </ol>
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(GCSToElasticsearchOptions options) {
    Pipeline p = Pipeline.create(options);

    // Make the failsafe element coder available to every stage of the pipeline.
    p.getCoderRegistry().registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

    // A file with headers cannot be combined with a UDF or a JSON schema.
    if (options.getContainsHeaders()) {
        boolean neitherUdfNorSchema =
            options.getJavascriptTextTransformGcsPath() == null && options.getJsonSchemaPath() == null;
        checkArgument(neitherUdfNorSchema, "Cannot parse file containing headers with UDF or Json schema.");
    }

    // Retry settings are all-or-nothing: either both are given or neither is.
    boolean attemptsGiven = options.getMaxRetryAttempts() != null;
    boolean durationGiven = options.getMaxRetryDuration() != null;
    checkArgument(attemptsGiven == durationGiven, "To specify retry configuration both max attempts and max duration must be set.");

    // Step 1: read the CSV file(s) into header and line outputs.
    PCollectionTuple csvRecords =
        p.apply(
            "ReadCsv",
            CsvConverters.ReadCsv.newBuilder()
                .setCsvFormat(options.getCsvFormat())
                .setDelimiter(options.getDelimiter())
                .setHasHeaders(options.getContainsHeaders())
                .setInputFileSpec(options.getInputFileSpec())
                .setHeaderTag(CSV_HEADERS)
                .setLineTag(CSV_LINES)
                .setFileEncoding(options.getCsvFileEncoding())
                .build());

    // Step 2: turn every line into a JSON string, routing failures to the dead-letter output.
    PCollectionTuple jsonRecords =
        csvRecords.apply(
            "ConvertLine",
            CsvConverters.LineToFailsafeJson.newBuilder()
                .setDelimiter(options.getDelimiter())
                .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                .setJsonSchemaPath(options.getJsonSchemaPath())
                .setHeaderTag(CSV_HEADERS)
                .setLineTag(CSV_LINES)
                .setUdfOutputTag(PROCESSING_OUT)
                .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                .build());

    // Step 3a: successfully converted payloads go to Elasticsearch.
    jsonRecords
        .get(PROCESSING_OUT)
        .apply("GetJsonDocuments", MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
        .apply(
            "WriteToElasticsearch",
            WriteToElasticsearch.newBuilder().setOptions(options.as(GCSToElasticsearchOptions.class)).build());

    // Step 3b: failed elements are timestamped and written to the dead-letter table.
    jsonRecords
        .get(PROCESSING_DEADLETTER_OUT)
        .apply("AddTimestamps", WithTimestamps.of((FailsafeElement<String, String> failed) -> new Instant()))
        .apply(
            "WriteFailedElementsToBigQuery",
            WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(options.getDeadletterTable())
                .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
                .build());

    return p.run();
}

Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) Instant(org.joda.time.Instant) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)

Example 2 with GCSToElasticsearchOptions

use of com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearchTest method testGCSToElasticsearchHeadersE2E.

/**
 * Tests the read and convert stages of the {@link GCSToElasticsearch} pipeline when the
 * {@code containsHeaders} option is enabled and neither a UDF nor a JSON schema is configured.
 *
 * <p>Only the "ReadCsv" and "ConvertLine" stages are applied; the Elasticsearch write step is
 * not part of this test. The assertion inspects the first element of the successful output.
 */
@Test
public void testGCSToElasticsearchHeadersE2E() {
    final String record = "007,CA,26.23";
    // With headers alone, every value is expected to be emitted as a JSON string.
    final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":\"26.23\"}";
    final FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    GCSToElasticsearchOptions options = PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setContainsHeaders(true);
    options.setInputFileSpec(HEADER_CSV_FILE_PATH);
    options.setApiKey("key");
    // Build the read/convert stages for a CSV file that contains a header row.
    PCollectionTuple readCsvOut =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                    .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                    .build());
    // Assert on the first successfully converted element.
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT)).satisfies(collection -> {
        FailsafeElement<String, String> element = collection.iterator().next();
        assertThat(element.getOriginalPayload(), is(equalTo(record)));
        assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
}

Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Test(org.junit.Test)

Example 3 with GCSToElasticsearchOptions

use of com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearchTest method testGCSToElasticsearchJsonSchemaE2E.

/**
 * Tests the read and convert stages of the {@link GCSToElasticsearch} pipeline using a JSON
 * schema to parse a CSV file without headers.
 *
 * <p>Only the "ReadCsv" and "ConvertLine" stages are applied; the Elasticsearch write step is
 * not part of this test. The assertion inspects the first element of the successful output.
 */
@Test
public void testGCSToElasticsearchJsonSchemaE2E() {
    final String record = "007,CA,26.23";
    // Unlike the header-only case, the expected document carries price as a JSON number.
    final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";
    final FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    GCSToElasticsearchOptions options = PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setJsonSchemaPath(JSON_SCHEMA_FILE_PATH);
    options.setContainsHeaders(false);
    options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);
    options.setApiKey("key");
    // Build the read/convert stages for a headerless CSV file described by a JSON schema.
    PCollectionTuple readCsvOut =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                    .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                    .build());
    // Assert on the first successfully converted element.
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT)).satisfies(collection -> {
        FailsafeElement<String, String> element = collection.iterator().next();
        assertThat(element.getOriginalPayload(), is(equalTo(record)));
        assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
}

Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Test(org.junit.Test)

Example 4 with GCSToElasticsearchOptions

use of com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearch method main.

/**
 * Command-line entry point: parses and validates the pipeline options from the arguments,
 * then hands them to {@code run}.
 *
 * @param args Command line arguments to the pipeline.
 */
public static void main(String[] args) {
    GCSToElasticsearchOptions pipelineOptions =
        PipelineOptionsFactory
            .fromArgs(args)
            .withValidation()
            .as(GCSToElasticsearchOptions.class);

    run(pipelineOptions);
}

Also used : GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions)

Example 5 with GCSToElasticsearchOptions

use of com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions in project DataflowTemplates by GoogleCloudPlatform.

the class GCSToElasticsearchTest method testGCSToElasticsearchUdfE2E.

/**
 * Tests the read and convert stages of the {@link GCSToElasticsearch} pipeline using a
 * JavaScript UDF to parse a CSV file without headers.
 *
 * <p>Only the "ReadCsv" and "ConvertLine" stages are applied; the Elasticsearch write step is
 * not part of this test. The assertion inspects the first element of the successful output.
 */
@Test
public void testGCSToElasticsearchUdfE2E() {
    final String record = "007,CA,26.23";
    // Expected output of the UDF for the record above; price is a JSON number here.
    final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";
    final FailsafeElementCoder<String, String> coder = FailsafeElementCoder.of(NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    GCSToElasticsearchOptions options = PipelineOptionsFactory.create().as(GCSToElasticsearchOptions.class);
    options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
    options.setJavascriptTextTransformFunctionName("transform");
    options.setContainsHeaders(false);
    options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);
    options.setApiKey("key");
    // Build the read/convert stages for a headerless CSV file transformed by the UDF.
    PCollectionTuple readCsvOut =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(GCSToElasticsearch.CSV_HEADERS)
                    .setLineTag(GCSToElasticsearch.CSV_LINES)
                    .setUdfOutputTag(GCSToElasticsearch.PROCESSING_OUT)
                    .setUdfDeadletterTag(GCSToElasticsearch.PROCESSING_DEADLETTER_OUT)
                    .build());
    // Assert on the first successfully converted element.
    PAssert.that(readCsvOut.get(GCSToElasticsearch.PROCESSING_OUT)).satisfies(collection -> {
        FailsafeElement<String, String> element = collection.iterator().next();
        assertThat(element.getOriginalPayload(), is(equalTo(record)));
        assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
        return null;
    });
    // Execute pipeline
    pipeline.run();
}

Also used : CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement) Test(org.junit.Test)

Aggregations

GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6, FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 4, CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 4, PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 4, Test (org.junit.Test): 4, ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 1, Pipeline (org.apache.beam.sdk.Pipeline): 1, Instant (org.joda.time.Instant): 1