Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
Class DataStreamToPostgres, method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(Options options) {
  /*
   * Stages:
   *   1) Ingest and normalize data to FailsafeElement with JSON strings
   *   2) Convert the JSON strings to Postgres DML objects
   *   3) Filter stale rows using the stateful PK transform
   *   4) Write the DML statements to Postgres
   */
  Pipeline pipeline = Pipeline.create(options);

  String jdbcDriverConnectionString =
      String.format(
          "jdbc:postgresql://%s:%s/%s",
          options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
  CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
      CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString)
          .withUsername(options.getDatabaseUser())
          .withPassword(options.getDatabasePassword())
          .withMaxIdleConnections(0); // autoboxing replaces the deprecated new Integer(0)
  validateOptions(options, dataSourceConfiguration);

  /*
   * Stage 1: Ingest and normalize data to FailsafeElement with JSON strings.
   *   a) Read DataStream data from GCS into JSON-string FailsafeElements (datastreamJsonRecords).
   */
  PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
      pipeline.apply(
          new DataStreamIO(
                  options.getStreamName(),
                  options.getInputFilePattern(),
                  options.getInputFileFormat(),
                  options.getGcsPubSubSubscription(),
                  options.getRfcStartDateTime())
              .withLowercaseSourceColumns()
              .withRenameColumnValue("_metadata_row_id", "rowid")
              .withHashRowId());

  /*
   * Stage 2: Convert the JSON-string FailsafeElements into Postgres DML objects (dmlStatements).
   * Stage 3: Filter stale rows using the stateful PK transform.
   */
  PCollection<DmlInfo> dmlStatements =
      datastreamJsonRecords
          .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
          .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());

  /*
   * Stage 4: Write the DML statements to Cloud SQL for PostgreSQL.
   */
  dmlStatements.apply(
      "Write to Postgres",
      CdcJdbcIO.<DmlInfo>write()
          .withDataSourceConfiguration(dataSourceConfiguration)
          .withStatementFormatter(
              new CdcJdbcIO.StatementFormatter<DmlInfo>() {
                @Override
                public String formatStatement(DmlInfo element) {
                  return element.getDmlSql();
                }
              }));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
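For orientation, a minimal sketch of an entry point that could hand parsed command-line flags to run(Options) above. It assumes the Options parameter is the template's nested DataStreamToPostgres.Options interface extending PipelineOptions, so PipelineOptionsFactory can bind flags derived from the getters (for example --databaseHost or --streamName); the launcher class name here is illustrative, not the template's actual main class.

import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class DataStreamToPostgresLauncher {
  public static void main(String[] args) {
    // Illustrative launcher: parse and validate the flags, then delegate to run(Options).
    DataStreamToPostgres.Options options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(DataStreamToPostgres.Options.class);
    DataStreamToPostgres.run(options);
  }
}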
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
Class DataplexBigQueryToGcsTest, method setUp.
@Before
public void setUp() throws InterruptedException, IOException {
  options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
  options.setProject(PROJECT);
  options.setUpdateDataplexMetadata(true);
  options.setEnforceSamePartitionKey(false);
  // Required when using BigQueryIO.withMethod(EXPORT).
  options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
  outDir = tmpDir.newFolder("out");
  bqSchema = new TableSchema().setFields(ImmutableList.of(
      new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
      new TableFieldSchema().setName("s1").setType("STRING"),
      new TableFieldSchema().setName("d1").setType("DATE"),
      new TableFieldSchema().setName("t1").setType("TIME").setMode("REQUIRED"),
      new TableFieldSchema().setName("dt").setType("DATETIME"),
      new TableFieldSchema().setName("i1").setType("INTEGER")));
  avroSchema = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
          + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
          + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
          + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
          + "{\"name\":\"t1\",\"type\":{\"type\":\"long\",\"logicalType\":\"time-micros\"}},"
          + "{\"name\":\"dt\",\"type\":[\"null\",{\"type\":\"string\",\"logicalType\":\"datetime\"}]},"
          + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
  long modTime = System.currentTimeMillis() * 1000; // epoch microseconds
  BigQueryTablePartition p1 =
      BigQueryTablePartition.builder().setPartitionName("p1").setLastModificationTime(modTime).build();
  BigQueryTablePartition p2 =
      BigQueryTablePartition.builder().setPartitionName("p2").setLastModificationTime(modTime).build();
  BigQueryTable t1 = BigQueryTable.builder()
      .setTableName("partitioned_table").setProject(PROJECT).setDataset(DATASET).setSchema(avroSchema)
      .setLastModificationTime(modTime).setPartitioningColumn("ts").setPartitions(Arrays.asList(p1, p2)).build();
  BigQueryTable t2 = BigQueryTable.builder()
      .setTableName("unpartitioned_table").setProject(PROJECT).setDataset(DATASET).setSchema(avroSchema)
      .setLastModificationTime(modTime).build();
  tableByName = new HashMap<>();
  tableByName.put(t1.getTableName(), t1);
  tableByName.put(t2.getTableName(), t2);
  defaultRecords = new TableRow[] {
      new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01").set("t1", "00:00:00.000001").set("dt", "2020-01-01T00:42:00.123").set("i1", 2001L),
      new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02").set("t1", "00:00:00.000002").set("dt", "2020-01-02T00:42:00.123").set("i1", 2002L),
      new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03").set("t1", "00:00:00.000003").set("dt", "2020-01-03T00:42:00.123").set("i1", 2003L),
      new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04").set("t1", "00:00:00.000004").set("dt", "2020-01-04T00:42:00.123").set("i1", null),
      new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05").set("t1", "00:00:00.000005").set("dt", "2020-01-05T00:42:00.123").set("i1", 2005L)};
  defaultExpectedRecords = new String[] {
      "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"t1\": 1, \"dt\": \"2020-01-01T00:42:00.123\", \"i1\": 2001}",
      "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"t1\": 2, \"dt\": \"2020-01-02T00:42:00.123\", \"i1\": 2002}",
      "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"t1\": 3, \"dt\": \"2020-01-03T00:42:00.123\", \"i1\": 2003}",
      "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"t1\": 4, \"dt\": \"2020-01-04T00:42:00.123\", \"i1\": null}",
      "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"t1\": 5, \"dt\": \"2020-01-05T00:42:00.123\", \"i1\": 2005}"};
  FakeDatasetService.setUp();
  fakeDatasetService = new FakeDatasetService();
  fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
  fakeDatasetService.createTable(new Table().setTableReference(t1.toTableReference()).setSchema(bqSchema)
      .setRequirePartitionFilter(true)
      .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
  fakeDatasetService.createTable(new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
  fakeJobService = new CustomFakeJobService();
  bqFakeServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
  when(tableResultMock.iterateAll()).thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
  when(bqMock.query(any())).thenReturn(tableResultMock);
  when(bqMock.delete(any(TableId.class))).thenReturn(true);
  when(bqsMock.createReadSession(any())).thenReturn(ReadSession.newBuilder()
      .setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString()))
      .build());
  metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
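The small integers in defaultExpectedRecords come straight from Avro's logical-type encodings: the date logical type counts days since the Unix epoch and time-micros counts microseconds since midnight. A standalone sketch (not part of the test class) showing the arithmetic for the third record:

import java.time.Duration;
import java.time.LocalDate;
import java.time.LocalTime;

public class AvroLogicalTypeMath {
  public static void main(String[] args) {
    // "1970-01-03" is two days after the epoch, so the date logical type stores 2.
    long d1 = LocalDate.parse("1970-01-03").toEpochDay();
    // "00:00:00.000003" is three microseconds after midnight, so time-micros stores 3.
    long t1 =
        Duration.between(LocalTime.MIDNIGHT, LocalTime.parse("00:00:00.000003")).toNanos() / 1_000;
    System.out.printf("d1=%d, t1=%d%n", d1, t1); // prints d1=2, t1=3
  }
}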
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
Class DataplexFileFormatConversionTest, method testAssetWithEntityParquetToAvroE2E.
/**
 * Tests Parquet-to-Avro conversion for an asset with an entity.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityParquetToAvroE2E() throws IOException {
  DataplexClient dataplex = mock(DataplexClient.class);
  when(dataplex.getEntities(ImmutableList.of(entity4.getName()))).thenReturn(ImmutableList.of(entity4));
  when(dataplex.getPartitions(entity4.getName())).thenReturn(ImmutableList.of());
  when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  options.setInputAssetOrEntitiesList(entity4.getName());
  options.setOutputFileFormat(FileFormatOptions.AVRO);
  options.setOutputAsset(outputAsset.getName());
  DataplexFileFormatConversion.run(
      mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
  PCollection<GenericRecord> readAvroFile =
      readPipeline.apply(
          "ReadAvroFile",
          AvroConverters.ReadAvroFile.newBuilder()
              .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro")
              .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
              .build());
  PAssert.that(readAvroFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
  readPipeline.run();
}
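If the converted files ever need a quick manual check outside Beam, the plain Avro file reader is enough. The helper below is a standalone sketch, not part of this test class, and assumes only that the avro library is on the classpath.

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

/** Standalone sketch: prints the writer schema and every record of a converted Avro file. */
public class AvroFileDump {
  public static void main(String[] args) throws Exception {
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(new File(args[0]), new GenericDatumReader<GenericRecord>())) {
      System.out.println("Writer schema: " + reader.getSchema());
      reader.forEach(System.out::println); // one line per converted GenericRecord
    }
  }
}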
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
Class DataplexFileFormatConversionTest, method testAssetWithEntityAvroToParquetE2E.
/**
 * Tests Avro-to-Parquet conversion for an asset with an entity.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityAvroToParquetE2E() throws IOException {
  DataplexClient dataplex = mock(DataplexClient.class);
  when(dataplex.getEntities(ImmutableList.of(entity3.getName()))).thenReturn(ImmutableList.of(entity3));
  when(dataplex.getPartitions(entity3.getName())).thenReturn(ImmutableList.of());
  when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  options.setInputAssetOrEntitiesList(entity3.getName());
  options.setOutputFileFormat(FileFormatOptions.PARQUET);
  options.setOutputAsset(outputAsset.getName());
  DataplexFileFormatConversion.run(
      mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
              .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
              .build());
  PAssert.that(readParquetFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
  readPipeline.run();
}
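An analogous spot-check for the Parquet output can use parquet-avro's reader. Again this is a standalone sketch, not part of the test, and it assumes parquet-avro and a Hadoop client are on the classpath (the Path-based builder is deprecated but still available).

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

/** Standalone sketch: prints every record of a converted Parquet file via its Avro view. */
public class ParquetFileDump {
  public static void main(String[] args) throws Exception {
    try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(new Path(args[0])).build()) {
      for (GenericRecord record = reader.read(); record != null; record = reader.read()) {
        System.out.println(record); // records are decoded through the file's Avro schema
      }
    }
  }
}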
Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
Class SpannerChangeStreamsToGcsTest, method testInvalidWindowDuration.
@Test
public void testInvalidWindowDuration() {
  exception.expect(IllegalArgumentException.class);
  exception.expectMessage("The window duration must be greater than 0!");
  SpannerChangeStreamsToGcsOptions options =
      PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
  options.setOutputFileFormat(FileFormat.AVRO);
  options.setGcsOutputDirectory(fakeDir);
  options.setOutputFilenamePrefix(FILENAME_PREFIX);
  options.setNumShards(NUM_SHARDS);
  options.setTempLocation(fakeTempLocation);
  options.setWindowDuration("invalidWindowDuration");
  Pipeline p = Pipeline.create(options);
  Timestamp startTimestamp = Timestamp.now();
  Timestamp endTimestamp = Timestamp.now();
  p.apply(
          SpannerIO.readChangeStream()
              .withSpannerConfig(
                  SpannerConfig.create().withProjectId("project").withInstanceId("instance").withDatabaseId("db"))
              .withMetadataInstance("instance")
              .withMetadataDatabase("db")
              .withChangeStreamName("changestream")
              .withInclusiveStartAt(startTimestamp)
              .withInclusiveEndAt(endTimestamp)
              .withRpcPriority(RpcPriority.HIGH))
      .apply(
          "Creating " + options.getWindowDuration() + " Window",
          Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
      .apply(
          "Write To GCS",
          FileFormatFactorySpannerChangeStreams.newBuilder().setOptions(options).build());
  p.run();
}
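The test above relies on JUnit 4's ExpectedException rule. With JUnit 4.13 or later the same expectation can be phrased with assertThrows; the sketch below assumes a hypothetical helper buildAndRunPipeline(options) that sets the remaining option values and builds and runs the pipeline exactly as in the test body above.

import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

@Test
public void testInvalidWindowDurationWithAssertThrows() {
  SpannerChangeStreamsToGcsOptions options =
      PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
  options.setWindowDuration("invalidWindowDuration");
  // buildAndRunPipeline is a hypothetical helper wrapping the pipeline construction and run() above.
  IllegalArgumentException thrown =
      assertThrows(IllegalArgumentException.class, () -> buildAndRunPipeline(options));
  assertTrue(thrown.getMessage().contains("The window duration must be greater than 0!"));
}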