
Example 1 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From the class DataStreamToSQL, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and normalize data to FailsafeElement with JSON strings
     *   2) Write JSON strings to SQL DML objects
     *   3) Filter stale rows using a stateful PK transform
     *   4) Write DML statements to the SQL database via JDBC
     */
    Pipeline pipeline = Pipeline.create(options);
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
        getDataSourceConfiguration(options);
    validateOptions(options, dataSourceConfiguration);
    Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());
    /*
     * Stage 1: Ingest and normalize data to FailsafeElement with JSON strings
     *   a) Read DataStream data from GCS into JSON string FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(), options.getInputFilePattern(),
                    options.getInputFileFormat(), options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withHashColumnValue("_metadata_row_id", "rowid"));
    /*
     * Stage 2: Write JSON strings to SQL DML statements
     *   a) Convert JSON string FailsafeElements to DmlInfo objects (dmlStatements)
     * Stage 3: Filter stale rows using a stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write inserts to Cloud SQL
     */
    dmlStatements.apply(
        "Write to SQL",
        CdcJdbcIO.<DmlInfo>write()
            .withDataSourceConfiguration(dataSourceConfiguration)
            .withStatementFormatter(
                new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                  @Override
                  public String formatStatement(DmlInfo element) {
                    return element.getDmlSql();
                  }
                }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used: DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO), DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo), CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)
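
For context, a minimal sketch of how run(Options) is typically invoked from a template entry point. The launcher class below and the nested DataStreamToSQL.Options reference are assumptions based on the standard Beam PipelineOptionsFactory pattern, not code from the repository.

import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class LaunchDataStreamToSQL {
    public static void main(String[] args) {
        // Hypothetical launcher: parse command-line flags into the template's Options.
        DataStreamToSQL.Options options =
            PipelineOptionsFactory.fromArgs(args)
                .withValidation()
                .as(DataStreamToSQL.Options.class);
        // Run the pipeline; a launcher may or may not block on completion.
        PipelineResult result = DataStreamToSQL.run(options);
        result.waitUntilFinish();
    }
}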

Example 2 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From the class FileFormatConversionTest, method testParquetToAvroE2E.

/**
 * Tests if the Parquet to Avro pipeline transforms data correctly and stores it in an Avro file.
 */
@Test
public void testParquetToAvroE2E() {
    FileFormatConversionOptions options =
        PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(PARQUET);
    options.setOutputFileFormat(AVRO);
    options.setInputFileSpec(PARQUET_FILE_PATH);
    options.setOutputBucket(tempDir);
    options.setSchema(SCHEMA_FILE_PATH);
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", 26.23);
    mainPipeline.apply(
        "TestParquetToAvro",
        FileFormatConversionFactory.FileFormat.newBuilder()
            .setOptions(options)
            .setInputFileFormat(PARQUET)
            .setOutputFileFormat(AVRO)
            .build());
    mainPipeline.run();
    PCollection<GenericRecord> readAvroFile =
        readPipeline.apply(
            "ReadAvroFile",
            AvroConverters.ReadAvroFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_PATH)
                .build());
    PAssert.that(readAvroFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions), Schema (org.apache.avro.Schema), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
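
The fields mainPipeline, readPipeline, and temporaryFolder are not shown in these excerpts. A minimal sketch of the JUnit fixture they suggest; the exact declarations are assumptions inferred from how the tests use them:

import org.apache.beam.sdk.testing.TestPipeline;
import org.junit.Rule;
import org.junit.rules.TemporaryFolder;

public class FileFormatConversionTest {
    // One pipeline runs the conversion; a second reads the output back for PAssert.
    @Rule public final transient TestPipeline mainPipeline = TestPipeline.create();
    @Rule public final transient TestPipeline readPipeline = TestPipeline.create();
    // Scratch directory that the tests pass as the output bucket.
    @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder();
}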

Example 3 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From the class FileFormatConversionTest, method testCsvToParquetE2E.

/**
 * Tests if the CSV to Parquet pipeline transforms data correctly and stores it in a Parquet file.
 */
@Test
public void testCsvToParquetE2E() {
    FileFormatConversionOptions options =
        PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(CSV);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(CSV_FILE_PATH);
    options.setOutputBucket(tempDir);
    options.setContainsHeaders(true);
    options.setSchema(SCHEMA_FILE_PATH);
    options.setDelimiter("|");
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", 26.23);
    mainPipeline.apply(
        "TestCsvToParquet",
        FileFormatConversionFactory.FileFormat.newBuilder()
            .setOptions(options)
            .setInputFileFormat(CSV)
            .setOutputFileFormat(PARQUET)
            .build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions), Schema (org.apache.avro.Schema), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
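
Since the test sets setDelimiter("|") and setContainsHeaders(true), the input is presumably pipe-delimited with a header row. An illustrative fragment showing what such a fixture could look like; the real contents of CSV_FILE_PATH may differ:

// Hypothetical input matching the id/state/price schema used above.
String sampleCsv = "id|state|price\n007|CA|26.23\n";
java.nio.file.Files.write(
    java.nio.file.Paths.get(tempDir, "input.csv"),
    sampleCsv.getBytes(java.nio.charset.StandardCharsets.UTF_8));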

Example 4 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From the class FileFormatConversionTest, method testAvroToParquetE2E.

/**
 * Tests if the Avro to Parquet pipeline transforms data correctly and stores it in a Parquet
 * file.
 */
@Test
public void testAvroToParquetE2E() {
    FileFormatConversionOptions options =
        PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(AVRO);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(AVRO_FILE_PATH);
    options.setOutputBucket(tempDir);
    options.setSchema(SCHEMA_FILE_PATH);
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", 26.23);
    mainPipeline.apply(
        "TestAvroToParquet",
        FileFormatConversionFactory.FileFormat.newBuilder()
            .setOptions(options)
            .setInputFileFormat(AVRO)
            .setOutputFileFormat(PARQUET)
            .build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions), Schema (org.apache.avro.Schema), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
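
All three conversion tests load the same schema from SCHEMA_FILE_PATH. A programmatic equivalent built with Avro's SchemaBuilder, with field names and types inferred from the records the tests construct (the record name is made up):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

// Presumed equivalent of the JSON schema file: three required fields.
Schema schema = SchemaBuilder.record("TestRecord").fields()
    .requiredString("id")
    .requiredString("state")
    .requiredDouble("price")
    .endRecord();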

Example 5 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From the class FileFormatConversionTest, method testCsvToParquetWithEmptyField.

/**
 * Tests if the CSV to Parquet pipeline can handle empty fields in the CSV file.
 */
@Test
public void testCsvToParquetWithEmptyField() {
    FileFormatConversionOptions options =
        PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
    options.setInputFileFormat(CSV);
    options.setOutputFileFormat(PARQUET);
    options.setInputFileSpec(CSV_FILE_WITH_MISSING_FIELD_PATH);
    options.setOutputBucket(tempDir);
    options.setContainsHeaders(true);
    options.setSchema(SCHEMA_FILE_TWO_PATH);
    Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_TWO_PATH);
    GenericRecord genericRecords = new GenericData.Record(schema);
    genericRecords.put("id", "007");
    genericRecords.put("state", "CA");
    genericRecords.put("price", null);
    mainPipeline.apply(
        "TestCsvToParquetWithEmptyField",
        FileFormatConversionFactory.FileFormat.newBuilder()
            .setOptions(options)
            .setInputFileFormat(CSV)
            .setOutputFileFormat(PARQUET)
            .build());
    mainPipeline.run();
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(tempDir + "*")
                .withSchema(SCHEMA_FILE_TWO_PATH)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
    readPipeline.run();
}
Also used: FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions), Schema (org.apache.avro.Schema), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
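
For genericRecords.put("price", null) to be valid, SCHEMA_FILE_TWO_PATH must declare price as nullable, i.e. a union with null. A hedged SchemaBuilder equivalent, using the same imports as the sketch above (record name assumed):

// Presumed shape of the second schema file: price is optional.
Schema schemaTwo = SchemaBuilder.record("TestRecord").fields()
    .requiredString("id")
    .requiredString("state")
    .optionalDouble("price") // union of null and double, defaults to null
    .endRecord();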

Aggregations

Test (org.junit.Test): 55
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 19
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 16
Pipeline (org.apache.beam.sdk.Pipeline): 16
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 16
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
Category (org.junit.experimental.categories.Category): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
TableRow (com.google.api.services.bigquery.model.TableRow): 7
KV (org.apache.beam.sdk.values.KV): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6
Schema (org.apache.avro.Schema): 6