Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
In class DataStreamToSQL, method run:
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
  /*
   * Stages:
   *   1) Ingest and normalize data to FailsafeElement with JSON strings
   *   2) Write JSON strings to SQL DML objects
   *   3) Filter stale rows using a stateful PK transform
   *   4) Write DML statements to the SQL database via JDBC
   */
  Pipeline pipeline = Pipeline.create(options);
  CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(options);
  validateOptions(options, dataSourceConfiguration);
  Map<String, String> schemaMap = parseSchemaMap(options.getSchemaMap());

  /*
   * Stage 1: Ingest and normalize data to FailsafeElement with JSON strings
   *   a) Read DataStream data from GCS into JSON string FailsafeElements (datastreamJsonRecords)
   */
  PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
      pipeline.apply(
          new DataStreamIO(
                  options.getStreamName(),
                  options.getInputFilePattern(),
                  options.getInputFileFormat(),
                  options.getGcsPubSubSubscription(),
                  options.getRfcStartDateTime())
              .withLowercaseSourceColumns()
              .withHashColumnValue("_metadata_row_id", "rowid"));

  /*
   * Stage 2: Write JSON strings to SQL DML objects
   *   a) Convert JSON string FailsafeElements to DmlInfo objects (dmlStatements)
   * Stage 3: Filter stale rows using a stateful PK transform
   */
  PCollection<DmlInfo> dmlStatements =
      datastreamJsonRecords
          .apply("Format to DML", CreateDml.of(dataSourceConfiguration).withSchemaMap(schemaMap))
          .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());

  /*
   * Stage 4: Write DML statements to CloudSQL
   */
  dmlStatements.apply(
      "Write to SQL",
      CdcJdbcIO.<DmlInfo>write()
          .withDataSourceConfiguration(dataSourceConfiguration)
          .withStatementFormatter(
              new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                @Override
                public String formatStatement(DmlInfo element) {
                  return element.getDmlSql();
                }
              }));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
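For context, run(Options) is normally driven from a main entry point that parses command-line arguments into the template's Options interface. The following is a minimal sketch of that wiring, assuming Options extends PipelineOptions as is standard for Dataflow templates; it is illustrative and not copied from the repository.

public static void main(String[] args) {
  // Parse command-line arguments into the template's Options interface
  // (assumed to extend PipelineOptions) and validate required values.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  // Build and launch the pipeline, then optionally block until it finishes.
  PipelineResult result = run(options);
  result.waitUntilFinish();
}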
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
In class FileFormatConversionTest, method testParquetToAvroE2E:
/**
* Tests if the Parquet to Avro pipeline transforms data correctly and stores it in an Avro file.
*/
@Test
public void testParquetToAvroE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";

  options.setInputFileFormat(PARQUET);
  options.setOutputFileFormat(AVRO);
  options.setInputFileSpec(PARQUET_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setSchema(SCHEMA_FILE_PATH);

  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);

  mainPipeline.apply(
      "TestParquetToAvro",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(PARQUET)
          .setOutputFileFormat(AVRO)
          .build());
  mainPipeline.run();

  PCollection<GenericRecord> readAvroFile =
      readPipeline.apply(
          "ReadAvroFile",
          AvroConverters.ReadAvroFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());

  PAssert.that(readAvroFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
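Note that the tests on this page reference mainPipeline, readPipeline, and temporaryFolder fixtures that are declared outside the excerpts shown here. A plausible sketch of such declarations, assuming Beam's TestPipeline and JUnit's TemporaryFolder rules (the actual fields in FileFormatConversionTest may differ):

// Hypothetical fixture declarations assumed by the test methods on this page.
@Rule public final transient TestPipeline mainPipeline = TestPipeline.create();
@Rule public final transient TestPipeline readPipeline = TestPipeline.create();
@Rule public final TemporaryFolder temporaryFolder = new TemporaryFolder();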
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
In class FileFormatConversionTest, method testCsvToParquetE2E:
/**
* Tests if the Csv to Parquet pipeline transforms data correctly and stores it in a Parquet file.
*/
@Test
public void testCsvToParquetE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";

  options.setInputFileFormat(CSV);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(CSV_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setContainsHeaders(true);
  options.setSchema(SCHEMA_FILE_PATH);
  options.setDelimiter("|");

  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);

  mainPipeline.apply(
      "TestCsvToParquet",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(CSV)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();

  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());

  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
In class FileFormatConversionTest, method testAvroToParquetE2E:
/**
* Tests if the Avro to Parquet pipeline transforms data correctly and stores it in a Parquet
* file.
*/
@Test
public void testAvroToParquetE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";

  options.setInputFileFormat(AVRO);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(AVRO_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setSchema(SCHEMA_FILE_PATH);

  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);

  mainPipeline.apply(
      "TestAvroToParquet",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(AVRO)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();

  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());

  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
In class FileFormatConversionTest, method testCsvToParquetWithEmptyField:
/**
* Tests if the Csv to Parquet pipeline can handle empty fields in the Csv file.
*/
@Test
public void testCsvToParquetWithEmptyField() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";

  options.setInputFileFormat(CSV);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(CSV_FILE_WITH_MISSING_FIELD_PATH);
  options.setOutputBucket(tempDir);
  options.setContainsHeaders(true);
  options.setSchema(SCHEMA_FILE_TWO_PATH);

  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_TWO_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", null);

  mainPipeline.apply(
      "TestCsvToParquetWithEmptyField",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(CSV)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();

  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_TWO_PATH)
              .build());

  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
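Each test builds its expected GenericRecord against a schema loaded from a resource file (SCHEMA_FILE_PATH, or SCHEMA_FILE_TWO_PATH for the empty-field case) that is not shown on this page. As a rough, hypothetical illustration of the shape such a schema needs, a record with string id and state fields plus a price field that can be null in the empty-field test, an equivalent schema could be assembled programmatically with Avro's SchemaBuilder:

// Illustrative only: an Avro schema consistent with the fields the tests populate.
// The real schema files under the test resources may differ in name and nullability.
Schema exampleSchema =
    SchemaBuilder.record("Example")
        .fields()
        .requiredString("id")
        .requiredString("state")
        .optionalDouble("price") // allows the null price used in the empty-field test
        .endRecord();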