
Example 31 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataStreamToPostgres, method run.

/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
public static PipelineResult run(Options options) {
    /*
     * Stages:
     *   1) Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   2) Write JSON Strings to Postgres DML Objects
     *   3) Filter stale rows using stateful PK transform
     *   4) Write DML statements to Postgres
     */
    Pipeline pipeline = Pipeline.create(options);
    String jdbcDriverConnectionString =
        String.format(
            "jdbc:postgresql://%s:%s/%s",
            options.getDatabaseHost(), options.getDatabasePort(), options.getDatabaseName());
    CdcJdbcIO.DataSourceConfiguration dataSourceConfiguration =
        CdcJdbcIO.DataSourceConfiguration.create("org.postgresql.Driver", jdbcDriverConnectionString)
            .withUsername(options.getDatabaseUser())
            .withPassword(options.getDatabasePassword())
            // Keep no idle connections in the pool.
            .withMaxIdleConnections(0);
    validateOptions(options, dataSourceConfiguration);
    /*
     * Stage 1: Ingest and Normalize Data to FailsafeElement with JSON Strings
     *   a) Read DataStream data from GCS into JSON String FailsafeElements (datastreamJsonRecords)
     */
    PCollection<FailsafeElement<String, String>> datastreamJsonRecords =
        pipeline.apply(
            new DataStreamIO(
                    options.getStreamName(),
                    options.getInputFilePattern(),
                    options.getInputFileFormat(),
                    options.getGcsPubSubSubscription(),
                    options.getRfcStartDateTime())
                .withLowercaseSourceColumns()
                .withRenameColumnValue("_metadata_row_id", "rowid")
                .withHashRowId());
    /*
     * Stage 2: Write JSON Strings to Postgres Insert Strings
     *   a) Convert JSON String FailsafeElements to TableRow's (tableRowRecords)
     * Stage 3) Filter stale rows using stateful PK transform
     */
    PCollection<DmlInfo> dmlStatements =
        datastreamJsonRecords
            .apply("Format to Postgres DML", CreateDml.createDmlObjects(dataSourceConfiguration))
            .apply("DML Stateful Processing", ProcessDml.statefulOrderByPK());
    /*
     * Stage 4: Write Inserts to CloudSQL
     */
    dmlStatements.apply("Write to Postgres", CdcJdbcIO.<DmlInfo>write().withDataSourceConfiguration(dataSourceConfiguration).withStatementFormatter(new CdcJdbcIO.StatementFormatter<DmlInfo>() {

        public String formatStatement(DmlInfo element) {
            return element.getDmlSql();
        }
    }));
    // Execute the pipeline and return the result.
    return pipeline.run();
}
Also used : DataStreamIO(com.google.cloud.teleport.v2.cdc.sources.DataStreamIO) DmlInfo(com.google.cloud.teleport.v2.values.DmlInfo) CdcJdbcIO(com.google.cloud.teleport.v2.io.CdcJdbcIO) Pipeline(org.apache.beam.sdk.Pipeline) FailsafeElement(com.google.cloud.teleport.v2.values.FailsafeElement)
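
A minimal launcher sketch for this example, assuming Options is the template's nested options interface and that the class lives in com.google.cloud.teleport.v2.templates (both assumptions, not shown in the snippet above); it simply parses command-line flags with Beam's standard PipelineOptionsFactory and hands them to run.

// Hypothetical launcher; the package and the nested Options location are assumptions.
import com.google.cloud.teleport.v2.templates.DataStreamToPostgres;
import com.google.cloud.teleport.v2.templates.DataStreamToPostgres.Options;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class DataStreamToPostgresLauncher {

    public static void main(String[] args) {
        // Parse flags such as --databaseHost and --streamName into the template's Options.
        Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
        // Build and run the pipeline defined in run(Options), then block until it finishes.
        PipelineResult result = DataStreamToPostgres.run(options);
        result.waitUntilFinish();
    }
}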

Example 32 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataplexBigQueryToGcsTest, method setUp.

@Before
public void setUp() throws InterruptedException, IOException {
    options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
    options.setProject(PROJECT);
    options.setUpdateDataplexMetadata(true);
    options.setEnforceSamePartitionKey(false);
    // Required when using BigQueryIO.withMethod(EXPORT).
    options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
    outDir = tmpDir.newFolder("out");
    bqSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
                    new TableFieldSchema().setName("s1").setType("STRING"),
                    new TableFieldSchema().setName("d1").setType("DATE"),
                    new TableFieldSchema().setName("t1").setType("TIME").setMode("REQUIRED"),
                    new TableFieldSchema().setName("dt").setType("DATETIME"),
                    new TableFieldSchema().setName("i1").setType("INTEGER")));
    avroSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
                    + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
                    + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
                    + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
                    + "{\"name\":\"t1\",\"type\":{\"type\":\"long\",\"logicalType\":\"time-micros\"}},"
                    + "{\"name\":\"dt\",\"type\":[\"null\",{\"type\":\"string\",\"logicalType\":\"datetime\"}]},"
                    + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
    long modTime = System.currentTimeMillis() * 1000;
    BigQueryTablePartition p1 =
        BigQueryTablePartition.builder()
            .setPartitionName("p1")
            .setLastModificationTime(modTime)
            .build();
    BigQueryTablePartition p2 =
        BigQueryTablePartition.builder()
            .setPartitionName("p2")
            .setLastModificationTime(modTime)
            .build();
    BigQueryTable t1 =
        BigQueryTable.builder()
            .setTableName("partitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .setPartitioningColumn("ts")
            .setPartitions(Arrays.asList(p1, p2))
            .build();
    BigQueryTable t2 =
        BigQueryTable.builder()
            .setTableName("unpartitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .build();
    tableByName = new HashMap<>();
    tableByName.put(t1.getTableName(), t1);
    tableByName.put(t2.getTableName(), t2);
    defaultRecords =
        new TableRow[] {
            new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01")
                .set("t1", "00:00:00.000001").set("dt", "2020-01-01T00:42:00.123").set("i1", 2001L),
            new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02")
                .set("t1", "00:00:00.000002").set("dt", "2020-01-02T00:42:00.123").set("i1", 2002L),
            new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03")
                .set("t1", "00:00:00.000003").set("dt", "2020-01-03T00:42:00.123").set("i1", 2003L),
            new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04")
                .set("t1", "00:00:00.000004").set("dt", "2020-01-04T00:42:00.123").set("i1", null),
            new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05")
                .set("t1", "00:00:00.000005").set("dt", "2020-01-05T00:42:00.123").set("i1", 2005L)
        };
    defaultExpectedRecords =
        new String[] {
            "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"t1\": 1, \"dt\": \"2020-01-01T00:42:00.123\", \"i1\": 2001}",
            "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"t1\": 2, \"dt\": \"2020-01-02T00:42:00.123\", \"i1\": 2002}",
            "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"t1\": 3, \"dt\": \"2020-01-03T00:42:00.123\", \"i1\": 2003}",
            "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"t1\": 4, \"dt\": \"2020-01-04T00:42:00.123\", \"i1\": null}",
            "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"t1\": 5, \"dt\": \"2020-01-05T00:42:00.123\", \"i1\": 2005}"
        };
    FakeDatasetService.setUp();
    fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
    fakeDatasetService.createTable(
        new Table()
            .setTableReference(t1.toTableReference())
            .setSchema(bqSchema)
            .setRequirePartitionFilter(true)
            .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
    fakeDatasetService.createTable(new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
    fakeJobService = new CustomFakeJobService();
    bqFakeServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    when(tableResultMock.iterateAll()).thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
    when(bqMock.query(any())).thenReturn(tableResultMock);
    when(bqMock.delete(any(TableId.class))).thenReturn(true);
    when(bqsMock.createReadSession(any())).thenReturn(ReadSession.newBuilder().setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString())).build());
    metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
Also used : TableId(com.google.cloud.bigquery.TableId) BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) BigQueryMetadataLoader(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) DataplexBigQueryToGcsOptions(com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions) Before(org.junit.Before)
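
To make the fixture above easier to read, here is a small standalone sketch that parses a trimmed subset of the same Avro schema literal with the standard Avro API and prints each field; the class name is illustrative and the snippet is not part of the test.

// Standalone sketch using only the standard Avro API; the schema below is a trimmed
// subset of the fixture's fields (ts, s1, i1) kept for brevity.
import org.apache.avro.Schema;

public class InspectFixtureSchema {

    public static void main(String[] args) {
        Schema schema =
            new Schema.Parser()
                .parse(
                    "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
                        + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
                        + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
                        + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
        // Print each field's name and its Avro type.
        for (Schema.Field field : schema.getFields()) {
            System.out.println(field.name() + " -> " + field.schema());
        }
    }
}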

Example 33 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataplexFileFormatConversionTest, method testAssetWithEntityParquetToAvroE2E.

/**
 * Tests Parquet to Avro conversion for an asset with an entity.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityParquetToAvroE2E() throws IOException {
    DataplexClient dataplex = mock(DataplexClient.class);
    when(dataplex.getEntities(ImmutableList.of(entity4.getName()))).thenReturn(ImmutableList.of(entity4));
    when(dataplex.getPartitions(entity4.getName())).thenReturn(ImmutableList.of());
    when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    options.setInputAssetOrEntitiesList(entity4.getName());
    options.setOutputFileFormat(FileFormatOptions.AVRO);
    options.setOutputAsset(outputAsset.getName());
    DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
    PCollection<GenericRecord> readAvroFile =
        readPipeline.apply(
            "ReadAvroFile",
            AvroConverters.ReadAvroFile.newBuilder()
                .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro")
                .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
                .build());
    PAssert.that(readAvroFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
    readPipeline.run();
}
Also used : DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions) GenericRecord(org.apache.avro.generic.GenericRecord) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
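
For comparison, a hedged sketch of how the converted Avro files could also be verified with Beam's stock AvroIO instead of the template's AvroConverters.ReadAvroFile; the helper class and method names are illustrative, and depending on the Beam version AvroIO may live in org.apache.beam.sdk.extensions.avro.io rather than org.apache.beam.sdk.io.

// Illustrative verification helper; not part of the test suite above.
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.io.AvroIO;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.values.PCollection;

public class VerifyAvroOutputSketch {

    public static void verify(
            TestPipeline readPipeline, Schema schema, String outputDir, GenericRecord... expected) {
        // Read every Avro file the conversion produced under the output directory.
        PCollection<GenericRecord> records =
            readPipeline.apply(
                "ReadAvroOutput", AvroIO.readGenericRecords(schema).from(outputDir + "/**/*.avro"));
        // Assert the records match, then run the verification pipeline to completion.
        PAssert.that(records).containsInAnyOrder(expected);
        readPipeline.run().waitUntilFinish();
    }
}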

Example 34 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class DataplexFileFormatConversionTest, method testAssetWithEntityAvroToParquetE2E.

/**
 * Tests Avro to Parquet conversion for an asset with an entity.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityAvroToParquetE2E() throws IOException {
    DataplexClient dataplex = mock(DataplexClient.class);
    when(dataplex.getEntities(ImmutableList.of(entity3.getName()))).thenReturn(ImmutableList.of(entity3));
    when(dataplex.getPartitions(entity3.getName())).thenReturn(ImmutableList.of());
    when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    options.setInputAssetOrEntitiesList(entity3.getName());
    options.setOutputFileFormat(FileFormatOptions.PARQUET);
    options.setOutputAsset(outputAsset.getName());
    DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
    PCollection<GenericRecord> readParquetFile =
        readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
                .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
                .build());
    PAssert.that(readParquetFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
    readPipeline.run();
}
Also used : DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions) GenericRecord(org.apache.avro.generic.GenericRecord) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
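
The Mockito stubbing in the last two tests is identical except for the entity, so a shared helper along these lines could factor it out. This is a hypothetical sketch: the entity and asset parameter types are assumptions based on the Dataplex v1 API client models, not types shown in the snippets above.

// Hypothetical helper; the GoogleCloudDataplexV1Entity/Asset parameter types are assumptions.
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset;
import com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity;
import com.google.cloud.teleport.v2.clients.DataplexClient;
import com.google.common.collect.ImmutableList;
import java.io.IOException;

final class DataplexStubs {

    // Returns a DataplexClient mock that serves one entity (with no partitions) and one output asset.
    static DataplexClient stubbedClient(GoogleCloudDataplexV1Entity entity, GoogleCloudDataplexV1Asset asset)
            throws IOException {
        DataplexClient dataplex = mock(DataplexClient.class);
        when(dataplex.getEntities(ImmutableList.of(entity.getName()))).thenReturn(ImmutableList.of(entity));
        when(dataplex.getPartitions(entity.getName())).thenReturn(ImmutableList.of());
        when(dataplex.getAsset(asset.getName())).thenReturn(asset);
        return dataplex;
    }

    private DataplexStubs() {}
}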

Example 35 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

The class SpannerChangeStreamsToGcsTest, method testInvalidWindowDuration.

@Test
public void testInvalidWindowDuration() {
    exception.expect(IllegalArgumentException.class);
    exception.expectMessage("The window duration must be greater than 0!");
    SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
    options.setOutputFileFormat(FileFormat.AVRO);
    options.setGcsOutputDirectory(fakeDir);
    options.setOutputFilenamePrefix(FILENAME_PREFIX);
    options.setNumShards(NUM_SHARDS);
    options.setTempLocation(fakeTempLocation);
    options.setWindowDuration("invalidWindowDuration");
    Pipeline p = Pipeline.create(options);
    Timestamp startTimestamp = Timestamp.now();
    Timestamp endTimestamp = Timestamp.now();
    p.apply(
            SpannerIO.readChangeStream()
                .withSpannerConfig(
                    SpannerConfig.create()
                        .withProjectId("project")
                        .withInstanceId("instance")
                        .withDatabaseId("db"))
                .withMetadataInstance("instance")
                .withMetadataDatabase("db")
                .withChangeStreamName("changestream")
                .withInclusiveStartAt(startTimestamp)
                .withInclusiveEndAt(endTimestamp)
                .withRpcPriority(RpcPriority.HIGH))
        .apply(
            "Creating " + options.getWindowDuration() + " Window",
            Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
        .apply(
            "Write To GCS",
            FileFormatFactorySpannerChangeStreams.newBuilder().setOptions(options).build());
    p.run();
}
Also used : SpannerChangeStreamsToGcsOptions(com.google.cloud.teleport.v2.options.SpannerChangeStreamsToGcsOptions) Timestamp(com.google.cloud.Timestamp) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test) IntegrationTest(com.google.cloud.teleport.v2.spanner.IntegrationTest)
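
As a counterpart to the invalid value above, a small sketch of the happy path: parsing a valid duration string and building the kind of fixed-window transform the pipeline applies before writing to GCS. The "5m" format and the import path for DurationUtils are assumptions; the test itself only exercises the failure case.

// Sketch of the valid-duration path; the "5m" format and the DurationUtils import path are assumptions.
import com.google.cloud.teleport.v2.utils.DurationUtils;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;

public class WindowDurationSketch {

    public static void main(String[] args) {
        // Parse the human-readable window duration into a Joda Duration.
        Duration windowDuration = DurationUtils.parseDuration("5m");
        // Build the same kind of fixed-window transform the pipeline applies before the GCS write.
        Window<String> window = Window.<String>into(FixedWindows.of(windowDuration));
        System.out.println("Fixed window of " + windowDuration.getStandardMinutes() + " minutes: " + window);
    }
}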

Aggregations

Test (org.junit.Test): 63
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20
Pipeline (org.apache.beam.sdk.Pipeline): 19
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Category (org.junit.experimental.categories.Category): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
TableRow (com.google.api.services.bigquery.model.TableRow): 8
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
KV (org.apache.beam.sdk.values.KV): 8
ArrayList (java.util.ArrayList): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6