Example 81 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From class DataplexFileFormatConversionTest, method testEntityWithPartitionsCsvToAvroE2E:

/**
 * Tests CSV to Avro conversion for an entity with partitions.
 */
@Test
@Category(NeedsRunner.class)
public void testEntityWithPartitionsCsvToAvroE2E() throws IOException {
    DataplexClient dataplex = mock(DataplexClient.class);
    when(dataplex.getEntities(ImmutableList.of(entity1.getName()))).thenReturn(ImmutableList.of(entity1));
    when(dataplex.getPartitions(entity1.getName())).thenReturn(ImmutableList.of(partition11, partition12));
    when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
    FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    options.setInputAssetOrEntitiesList(entity1.getName());
    options.setOutputFileFormat(FileFormatOptions.AVRO);
    options.setOutputAsset(outputAsset.getName());
    DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
    PCollection<GenericRecord> readAvroFile =
        readPipeline.apply(
            "ReadAvroFile",
            AvroConverters.ReadAvroFile.newBuilder()
                .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro")
                .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
                .build());
    PAssert.that(readAvroFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
    readPipeline.run();
}
Also used : DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions) GenericRecord(org.apache.avro.generic.GenericRecord) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
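
The run(...) call above supplies the output path provider as a method reference. Below is a minimal sketch of what such a provider could look like, assuming a functional interface that takes the entity's input path plus an output location and returns the destination path; the name outputPathProvider comes from the test itself, but the parameter list and re-rooting logic here are illustrative, not copied from the template.

import java.nio.file.Paths;

// Illustrative sketch only: re-roots the entity's input path under the output
// location so converted files land in a predictable, assertable directory.
private static String outputPathProvider(String inputPath, String outputBucket) {
    return outputBucket + "/" + Paths.get(inputPath).getFileName();
}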

Example 82 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From class SpannerChangeStreamsToGcsTest, method testFileFormatFactoryInvalid:

/**
 * Tests that {@link FileFormatFactory} maps the output file format to the transform to be
 * carried out, and throws an {@link IllegalArgumentException} if an invalid file format is
 * passed.
 */
@Test
public void testFileFormatFactoryInvalid() {
    exception.expect(IllegalArgumentException.class);
    exception.expectMessage("Invalid output format:PARQUET. Supported output formats: TEXT, AVRO");
    SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
    options.setOutputFileFormat(FileFormat.PARQUET);
    options.setGcsOutputDirectory(fakeDir);
    options.setOutputFilenamePrefix(FILENAME_PREFIX);
    options.setNumShards(NUM_SHARDS);
    options.setTempLocation(fakeTempLocation);
    Pipeline p = Pipeline.create(options);
    Timestamp startTimestamp = Timestamp.now();
    Timestamp endTimestamp = Timestamp.now();
    p.apply(
            SpannerIO.readChangeStream()
                .withSpannerConfig(
                    SpannerConfig.create()
                        .withProjectId("project")
                        .withInstanceId("instance")
                        .withDatabaseId("db"))
                .withMetadataInstance("instance")
                .withMetadataDatabase("db")
                .withChangeStreamName("changestream")
                .withInclusiveStartAt(startTimestamp)
                .withInclusiveEndAt(endTimestamp)
                .withRpcPriority(RpcPriority.HIGH))
        .apply(
            "Creating " + options.getWindowDuration() + " Window",
            Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
        .apply(
            "Write To GCS",
            FileFormatFactorySpannerChangeStreams.newBuilder().setOptions(options).build());
    p.run();
}
Also used : SpannerChangeStreamsToGcsOptions(com.google.cloud.teleport.v2.options.SpannerChangeStreamsToGcsOptions) Timestamp(com.google.cloud.Timestamp) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test) IntegrationTest(com.google.cloud.teleport.v2.spanner.IntegrationTest)
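
The exception.expect(...) and exception.expectMessage(...) calls rely on JUnit 4's ExpectedException rule, which the test class presumably declares as a @Rule field along these lines:

import org.junit.Rule;
import org.junit.rules.ExpectedException;

// JUnit 4 rule backing the exception.expect(...) calls in the test above.
@Rule
public ExpectedException exception = ExpectedException.none();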

Example 83 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From class SpannerChangeStreamsToGcsTest, method testWriteToGCSText:

// Run with: mvn -Dexcluded.spanner.tests="" -Dtest=SpannerChangeStreamsToGcsTest test
@Test
@Category(IntegrationTest.class)
public void testWriteToGCSText() throws Exception {
    // Create a test database.
    String testDatabase = generateDatabaseName();
    fakeDir = tmpDir.newFolder("output").getAbsolutePath();
    fakeTempLocation = tmpDir.newFolder("temporaryLocation").getAbsolutePath();
    spannerServer.dropDatabase(testDatabase);
    // Create a table.
    List<String> statements = new ArrayList<String>();
    final String createTable = "CREATE TABLE " + TEST_TABLE + " (" + "user_id INT64 NOT NULL," + "name STRING(MAX) " + ") PRIMARY KEY(user_id)";
    final String createChangeStream = "CREATE CHANGE STREAM " + TEST_CHANGE_STREAM + " FOR Users";
    statements.add(createTable);
    statements.add(createChangeStream);
    spannerServer.createDatabase(testDatabase, statements);
    Timestamp startTimestamp = Timestamp.now();
    // Create mutations for the table that will generate data change records.
    List<Mutation> mutations = new ArrayList<>();
    mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(1).set("name").to("Name1").build());
    mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(2).set("name").to("Name2").build());
    spannerServer.getDbClient(testDatabase).write(mutations);
    Timestamp endTimestamp = Timestamp.now();
    SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
    options.setSpannerProjectId(TEST_PROJECT);
    options.setSpannerInstanceId(TEST_INSTANCE);
    options.setSpannerDatabase(testDatabase);
    options.setSpannerMetadataInstanceId(TEST_INSTANCE);
    options.setSpannerMetadataDatabase(testDatabase);
    options.setSpannerChangeStreamName(TEST_CHANGE_STREAM);
    options.setStartTimestamp(startTimestamp.toString());
    options.setEndTimestamp(endTimestamp.toString());
    List<String> experiments = new ArrayList<String>();
    options.setExperiments(experiments);
    options.setOutputFileFormat(FileFormat.TEXT);
    options.setGcsOutputDirectory(fakeDir);
    options.setOutputFilenamePrefix(TEXT_FILENAME_PREFIX);
    options.setNumShards(NUM_SHARDS);
    options.setTempLocation(fakeTempLocation);
    // Run the pipeline.
    PipelineResult result = run(options);
    result.waitUntilFinish();
    // Read the output text files to assert that the expected data change records were generated.
    PCollection<String> dataChangeRecords =
        pipeline.apply("readRecords", TextIO.read().from(fakeDir + "/text-output-*.txt"));
    PAssert.that(dataChangeRecords).satisfies(new VerifyDataChangeRecordText());
    pipeline.run();
    // Drop the database.
    spannerServer.dropDatabase(testDatabase);
}
Also used : SpannerChangeStreamsToGcsOptions(com.google.cloud.teleport.v2.options.SpannerChangeStreamsToGcsOptions) ArrayList(java.util.ArrayList) PipelineResult(org.apache.beam.sdk.PipelineResult) Mutation(com.google.cloud.spanner.Mutation) Timestamp(com.google.cloud.Timestamp) Category(org.junit.experimental.categories.Category) Test(org.junit.Test) IntegrationTest(com.google.cloud.teleport.v2.spanner.IntegrationTest)
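
PAssert.satisfies expects a SerializableFunction<Iterable<String>, Void>, so VerifyDataChangeRecordText must implement that interface. A hedged sketch of its likely shape follows; the actual assertions in the repository will differ, and the check on TEST_TABLE is illustrative.

import static org.junit.Assert.assertTrue;

import org.apache.beam.sdk.transforms.SerializableFunction;

// Illustrative verifier: PAssert hands over all lines read from the output
// files; returning null signals that every assertion passed.
static class VerifyDataChangeRecordText
        implements SerializableFunction<Iterable<String>, Void> {
    @Override
    public Void apply(Iterable<String> lines) {
        for (String line : lines) {
            // Each serialized data change record should reference the test table.
            assertTrue(line.contains(TEST_TABLE));
        }
        return null;
    }
}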

Example 84 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From class DataplexBigQueryToGcs, method transformPipeline:

@VisibleForTesting
static void transformPipeline(Pipeline pipeline, List<BigQueryTable> tables, DataplexBigQueryToGcsOptions options, String targetRootPath, BigQueryServices testBqServices, BigQueryClientFactory testBqClientFactory) {
    List<PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>>> fileCollections = new ArrayList<>(tables.size());
    tables.forEach(
        table ->
            fileCollections.add(
                pipeline
                    .apply(
                        String.format("ExportTable-%s", table.getTableName()),
                        new BigQueryTableToGcsTransform(
                                table,
                                targetRootPath,
                                options.getFileFormat(),
                                options.getFileCompression(),
                                options.getEnforceSamePartitionKey())
                            .withTestServices(testBqServices))
                    .apply(
                        String.format("AttachTableKeys-%s", table.getTableName()),
                        WithKeys.of(table))));
    PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>> exportFileResults =
        PCollectionList.of(fileCollections).apply("FlattenTableResults", Flatten.pCollections());
    PCollection<Void> metadataUpdateResults =
        exportFileResults.apply(
            "UpdateDataplexMetadata", new UpdateDataplexBigQueryToGcsExportMetadataTransform());
    exportFileResults
        .apply(
            MapElements.into(
                    TypeDescriptors.kvs(
                        TypeDescriptor.of(BigQueryTable.class),
                        TypeDescriptor.of(BigQueryTablePartition.class)))
                .via(
                    (SerializableFunction<
                            KV<BigQueryTable, KV<BigQueryTablePartition, String>>,
                            KV<BigQueryTable, BigQueryTablePartition>>)
                        input -> KV.of(input.getKey(), input.getValue().getKey())))
        .apply("WaitForMetadataUpdate", Wait.on(metadataUpdateResults))
        .apply(
            "TruncateBigQueryData",
            ParDo.of(new DeleteBigQueryDataFn().withTestBqClientFactory(testBqClientFactory)));
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) ArrayList(java.util.ArrayList) DeleteBigQueryDataFn(com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn) KV(org.apache.beam.sdk.values.KV) PCollection(org.apache.beam.sdk.values.PCollection) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) UpdateDataplexBigQueryToGcsExportMetadataTransform(com.google.cloud.teleport.v2.transforms.UpdateDataplexBigQueryToGcsExportMetadataTransform) BigQueryTableToGcsTransform(com.google.cloud.teleport.v2.transforms.BigQueryTableToGcsTransform) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
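
The key ordering trick here is Wait.on: elements of exportFileResults are held back until metadataUpdateResults completes, so BigQuery data is truncated only after the Dataplex metadata update has finished. The same Beam pattern in isolation, where mainResults, signal, and CleanupFn are illustrative placeholders:

import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.values.PCollection;

// `gated` emits the same elements as `mainResults`, but only once the
// `signal` collection has completed, sequencing the two pipeline stages.
PCollection<String> gated = mainResults.apply("WaitForSignal", Wait.on(signal));
gated.apply("RunAfterSignal", ParDo.of(new CleanupFn()));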

Example 85 with Options

Use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.

From class DataplexBigQueryToGcs, method main:

/**
 * Main entry point for pipeline execution.
 *
 * @param args Command line arguments to the pipeline.
 */
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
    DataplexBigQueryToGcsOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(DataplexBigQueryToGcsOptions.class);
    List<String> experiments = new ArrayList<>();
    if (options.getExperiments() != null) {
        experiments.addAll(options.getExperiments());
    }
    if (!experiments.contains("upload_graph")) {
        experiments.add("upload_graph");
    }
    options.setExperiments(experiments);
    Pipeline pipeline;
    DataplexClient dataplex = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
    BigQuery bqClient = BigQueryOptions.getDefaultInstance().getService();
    try (BigQueryStorageClient bqsClient = BigQueryStorageClient.create()) {
        LOG.info("Building the pipeline...");
        pipeline = setUpPipeline(options, dataplex, bqClient, bqsClient);
    }
    LOG.info("Running the pipeline.");
    pipeline.run();
}
Also used : BigQuery(com.google.cloud.bigquery.BigQuery) DefaultDataplexClient(com.google.cloud.teleport.v2.clients.DefaultDataplexClient) DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) BigQueryStorageClient(com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient) ArrayList(java.util.ArrayList) DataplexBigQueryToGcsOptions(com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions) Pipeline(org.apache.beam.sdk.Pipeline)
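
main() force-enables the upload_graph experiment, which makes Dataflow upload the pipeline graph to Cloud Storage rather than embedding it in the job request, a common workaround for large graphs. Since the options come from PipelineOptionsFactory.fromArgs, the same effect can be achieved from the command line; a minimal sketch using Beam's ExperimentalOptions:

import org.apache.beam.sdk.options.ExperimentalOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Parsing --experiments=upload_graph populates getExperiments() directly,
// making the defensive append in main() a no-op.
String[] cliArgs = {"--experiments=upload_graph"};
ExperimentalOptions opts =
    PipelineOptionsFactory.fromArgs(cliArgs).withValidation().as(ExperimentalOptions.class);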

Aggregations

Test (org.junit.Test): 63
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 25
FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement): 20
Pipeline (org.apache.beam.sdk.Pipeline): 19
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 19
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 12
Category (org.junit.experimental.categories.Category): 12
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter): 10
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 10
PubSubToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.PubSubToElasticsearchOptions): 9
TableRow (com.google.api.services.bigquery.model.TableRow): 8
DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient): 8
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions): 8
KV (org.apache.beam.sdk.values.KV): 8
ArrayList (java.util.ArrayList): 7
ElasticsearchWriteOptions (com.google.cloud.teleport.v2.elasticsearch.options.ElasticsearchWriteOptions): 6
GCSToElasticsearchOptions (com.google.cloud.teleport.v2.elasticsearch.options.GCSToElasticsearchOptions): 6
FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions): 6
PubSubProtoToBigQueryOptions (com.google.cloud.teleport.v2.templates.PubsubProtoToBigQuery.PubSubProtoToBigQueryOptions): 6