Example 11 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

In class FileFormatConversionTest, method testCsvToParquetE2E:

/**
 * Tests if the Csv to Parquet pipeline transforms data correctly and stores it in a Parquet file.
 */
@Test
public void testCsvToParquetE2E() {
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(CSV);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(CSV_FILE_PATH);
    options.setOutputBucket(tempDir);
    options.setContainsHeaders(true);
    options.setSchema(SCHEMA_FILE_PATH);
    options.setDelimiter("|");
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", 26.23);
    mainPipeline.apply("TestCsvToParquet", FileFormatConversionFactory.FileFormat.newBuilder().setOptions(options).setInputFileFormat(CSV).setOutputFileFormat(PARQUET).build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
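The fixtures CSV_FILE_PATH and SCHEMA_FILE_PATH are test resources not shown on this page. A minimal sketch of what they plausibly contain, given the pipe delimiter, the header flag, and the asserted record above; the record name and file locations are illustrative, and the real resource files may differ:

import java.nio.file.Files;
import java.nio.file.Path;

public class CsvFixtureSketch {
    public static void main(String[] args) throws Exception {
        // Pipe-delimited CSV with a header row, matching setDelimiter("|")
        // and setContainsHeaders(true) in the test above.
        Path csv = Files.createTempFile("input", ".csv");
        Files.writeString(csv, "id|state|price\n007|CA|26.23\n");
        // Avro schema (.avsc) with the three fields the expected record populates.
        // The record name "Record" is an assumption.
        Path avsc = Files.createTempFile("schema", ".avsc");
        Files.writeString(avsc,
            "{\"type\":\"record\",\"name\":\"Record\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"state\",\"type\":\"string\"},"
                + "{\"name\":\"price\",\"type\":\"double\"}]}");
        System.out.println("csv=" + csv + ", schema=" + avsc);
    }
}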

Example 12 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

In class FileFormatConversionTest, method testCsvToParquetWithEmptyField:

/**
 * Tests if the Csv to Parquet pipeline can handle empty fields in the Csv file.
 */
@Test
public void testCsvToParquetWithEmptyField() {
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(CSV);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(CSV_FILE_WITH_MISSING_FIELD_PATH);
    options.setOutputBucket(tempDir);
    options.setContainsHeaders(true);
    options.setSchema(SCHEMA_FILE_TWO_PATH);
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_TWO_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", null);
    mainPipeline.apply("TestCsvToParquetWithEmptyField", FileFormatConversionFactory.FileFormat.newBuilder().setOptions(options).setInputFileFormat(CSV).setOutputFileFormat(PARQUET).build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_TWO_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
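For genericRecords.put("price", null) to be a legal Avro value, the schema at SCHEMA_FILE_TWO_PATH presumably declares price as a union with null. A sketch of such a schema, parsed with the standard Avro API; the record name and default are assumptions, and the real file may differ:

import org.apache.avro.Schema;

public class NullableFieldSketch {
    public static void main(String[] args) {
        // Declaring "price" as ["null", "double"] lets the missing CSV field
        // map to null when the line is converted to a GenericRecord.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Record\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"string\"},"
                + "{\"name\":\"state\",\"type\":\"string\"},"
                + "{\"name\":\"price\",\"type\":[\"null\",\"double\"],\"default\":null}]}");
        // Prints the union type of the nullable field: ["null","double"]
        System.out.println(schema.getField("price").schema());
    }
}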

Example 13 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

In class FileFormatConversionTest, method testAvroToParquetE2E:

/**
 * Tests if the Avro to Parquet pipeline transforms data correctly and stores it in a Parquet
 * file.
 */
@Test
public void testAvroToParquetE2E() {
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(AVRO);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(AVRO_FILE_PATH);
    options.setOutputBucket(tempDir);
    options.setSchema(SCHEMA_FILE_PATH);
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", 26.23);
    mainPipeline.apply("TestAvroToParquet", FileFormatConversionFactory.FileFormat.newBuilder().setOptions(options).setInputFileFormat(AVRO).setOutputFileFormat(PARQUET).build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
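The AVRO_FILE_PATH fixture is likewise a test resource. A sketch of how such an input file could be produced with the standard Avro container-file writer; file names and the schema source are illustrative, not taken from the test:

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class AvroFixtureSketch {
    public static void main(String[] args) throws Exception {
        Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
        GenericRecord record = new GenericData.Record(schema);
        record.put("id", "007");
        record.put("state", "CA");
        record.put("price", 26.23);
        // DataFileWriter emits the self-describing .avro container format
        // that the Avro-to-Parquet pipeline reads as its input.
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<>(schema))) {
            writer.create(schema, new File("input.avro"));
            writer.append(record);
        }
    }
}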

Example 14 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

In class GCSToElasticsearch, method run:

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(GCSToElasticsearchOptions options) {
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    // Throw error if containsHeaders is true and a schema or Udf is also set.
    if (options.getContainsHeaders()) {
        checkArgument(options.getJavascriptTextTransformGcsPath() == null && options.getJsonSchemaPath() == null, "Cannot parse file containing headers with UDF or Json schema.");
    }
    // Throw error if only one retry configuration parameter is set.
    checkArgument((options.getMaxRetryAttempts() == null && options.getMaxRetryDuration() == null) || (options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null), "To specify retry configuration both max attempts and max duration must be set.");
    /*
     * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
     *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
     *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
     *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
     */
    PCollectionTuple convertedCsvLines =
        pipeline
            .apply(
                "ReadCsv",
                CsvConverters.ReadCsv.newBuilder()
                    .setCsvFormat(options.getCsvFormat())
                    .setDelimiter(options.getDelimiter())
                    .setHasHeaders(options.getContainsHeaders())
                    .setInputFileSpec(options.getInputFileSpec())
                    .setHeaderTag(CSV_HEADERS)
                    .setLineTag(CSV_LINES)
                    .setFileEncoding(options.getCsvFileEncoding())
                    .build())
            .apply(
                "ConvertLine",
                CsvConverters.LineToFailsafeJson.newBuilder()
                    .setDelimiter(options.getDelimiter())
                    .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                    .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                    .setJsonSchemaPath(options.getJsonSchemaPath())
                    .setHeaderTag(CSV_HEADERS)
                    .setLineTag(CSV_LINES)
                    .setUdfOutputTag(PROCESSING_OUT)
                    .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                    .build());
    /*
     * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
     */
    convertedCsvLines
        .get(PROCESSING_OUT)
        .apply(
            "GetJsonDocuments",
            MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
        .apply(
            "WriteToElasticsearch",
            WriteToElasticsearch.newBuilder()
                .setOptions(options.as(GCSToElasticsearchOptions.class))
                .build());
    /*
     * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
     */
    convertedCsvLines
        .get(PROCESSING_DEADLETTER_OUT)
        .apply(
            "AddTimestamps",
            WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
        .apply(
            "WriteFailedElementsToBigQuery",
            WriteStringMessageErrors.newBuilder()
                .setErrorRecordsTable(options.getDeadletterTable())
                .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
                .build());
    return pipeline.run();
}
Also used: CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) GCSToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions) Instant(org.joda.time.Instant) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)
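Since run(...) is private, options arrive through a main method. A minimal sketch of the entry point, assuming the standard Beam PipelineOptionsFactory flag-to-getter mapping and mirroring the PubSubToElasticsearch entry point in Example 15 below; the template's real main may differ:

public static void main(String[] args) {
    // Flag names are inferred from the getters used in run(), e.g.:
    //   --inputFileSpec=gs://bucket/input/*.csv --containsHeaders=true
    //   --delimiter=, --deadletterTable=project:dataset.errors
    GCSToElasticsearchOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(GCSToElasticsearchOptions.class);
    run(options);
}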

Example 15 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

In class PubSubToElasticsearch, method main:

/**
 * Main entry point for executing the pipeline.
 *
 * @param args The command-line arguments to the pipeline.
 */
public static void main(String[] args) {
    // Parse the user options passed from the command-line.
    PubSubToElasticsearchOptions pubSubToElasticsearchOptions =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(PubSubToElasticsearchOptions.class);
    pubSubToElasticsearchOptions.setIndex(
        new ElasticsearchIndex(
                pubSubToElasticsearchOptions.getDataset(),
                pubSubToElasticsearchOptions.getNamespace())
            .getIndex());
    run(pubSubToElasticsearchOptions);
}
Also used: ElasticsearchIndex(com.google.cloud.teleport.v2.elasticsearch.utils.ElasticsearchIndex) PubSubToElasticsearchOptions(com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions)

Aggregations

Test (org.junit.Test): 63 uses
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25 uses
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20 uses
Pipeline (org.apache.beam.sdk.Pipeline): 19 uses
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19 uses
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15 uses
GenericRecord (org.apache.avro.generic.GenericRecord): 12 uses
Category (org.junit.experimental.categories.Category): 12 uses
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10 uses
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10 uses
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9 uses
TableRow (com.google.api.services.bigquery.model.TableRow): 8 uses
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8 uses
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8 uses
KV (org.apache.beam.sdk.values.KV): 8 uses
ArrayList (java.util.ArrayList): 7 uses
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6 uses
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6 uses
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6 uses
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6 uses