use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversionTest method testEntityWithPartitionsCsvToAvroE2E.
/**
* Tests CSV to Avro conversion for an entity with partitions.
*/
@Test
@Category(NeedsRunner.class)
public void testEntityWithPartitionsCsvToAvroE2E() throws IOException {
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getEntities(ImmutableList.of(entity1.getName()))).thenReturn(ImmutableList.of(entity1));
when(dataplex.getPartitions(entity1.getName())).thenReturn(ImmutableList.of(partition11, partition12));
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(entity1.getName());
options.setOutputFileFormat(FileFormatOptions.AVRO);
options.setOutputAsset(outputAsset.getName());
DataplexFileFormatConversion.run(mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
PCollection<GenericRecord> readAvroFile =
    readPipeline.apply(
        "ReadAvroFile",
        AvroConverters.ReadAvroFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
PAssert.that(readAvroFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
readPipeline.run();
}
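The snippet relies on fixtures declared elsewhere in DataplexFileFormatConversionTest (two Beam TestPipelines and a JUnit TemporaryFolder) and on an outputPathProvider helper that is not shown here. A minimal sketch of what the pipeline and folder fixtures typically look like, assuming standard JUnit 4 rules; the exact declarations are assumptions, not copied from the test class:
import org.apache.beam.sdk.testing.TestPipeline;
import org.junit.Rule;
import org.junit.rules.TemporaryFolder;

// Assumed fixtures: one pipeline writes the converted files, a second one verifies them,
// and a temporary folder provides the output location used by outputPathProvider.
@Rule public final transient TestPipeline mainPipeline = TestPipeline.create();
@Rule public final transient TestPipeline readPipeline = TestPipeline.create();
@Rule public final TemporaryFolder temporaryFolder = new TemporaryFolder();
Using two pipelines keeps the PAssert verification in a separate job that only reads the Avro files after the conversion pipeline has produced them.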
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class SpannerChangeStreamsToGcsTest method testFileFormatFactoryInvalid.
/**
 * Tests that {@link FileFormatFactory} maps the output file format to the transform to be
 * carried out, and throws an {@link IllegalArgumentException} if an invalid file format is
 * passed.
 */
@Test
public void testFileFormatFactoryInvalid() {
exception.expect(IllegalArgumentException.class);
exception.expectMessage("Invalid output format:PARQUET. Supported output formats: TEXT, AVRO");
SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
options.setOutputFileFormat(FileFormat.PARQUET);
options.setGcsOutputDirectory(fakeDir);
options.setOutputFilenamePrefix(FILENAME_PREFIX);
options.setNumShards(NUM_SHARDS);
options.setTempLocation(fakeTempLocation);
Pipeline p = Pipeline.create(options);
Timestamp startTimestamp = Timestamp.now();
Timestamp endTimestamp = Timestamp.now();
p.apply(
        SpannerIO.readChangeStream()
            .withSpannerConfig(
                SpannerConfig.create().withProjectId("project").withInstanceId("instance").withDatabaseId("db"))
            .withMetadataInstance("instance")
            .withMetadataDatabase("db")
            .withChangeStreamName("changestream")
            .withInclusiveStartAt(startTimestamp)
            .withInclusiveEndAt(endTimestamp)
            .withRpcPriority(RpcPriority.HIGH))
    .apply(
        "Creating " + options.getWindowDuration() + " Window",
        Window.into(FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))))
    .apply(
        "Write To GCS",
        FileFormatFactorySpannerChangeStreams.newBuilder().setOptions(options).build());
p.run();
}
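The exception.expect(...) and exception.expectMessage(...) calls assume a JUnit 4 ExpectedException rule on the test class; a minimal sketch of that assumed declaration:
import org.junit.Rule;
import org.junit.rules.ExpectedException;

// Assumed rule declaration elsewhere in SpannerChangeStreamsToGcsTest; it intercepts the
// IllegalArgumentException thrown while the write transform validates the output format.
@Rule public ExpectedException exception = ExpectedException.none();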
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class SpannerChangeStreamsToGcsTest method testWriteToGCSText.
@Test
@Category(IntegrationTest.class)
// Run with: mvn -Dexcluded.spanner.tests="" -Dtest=SpannerChangeStreamsToGcsTest test
public void testWriteToGCSText() throws Exception {
// Create a test database.
String testDatabase = generateDatabaseName();
fakeDir = tmpDir.newFolder("output").getAbsolutePath();
fakeTempLocation = tmpDir.newFolder("temporaryLocation").getAbsolutePath();
spannerServer.dropDatabase(testDatabase);
// Create a table.
List<String> statements = new ArrayList<String>();
final String createTable = "CREATE TABLE " + TEST_TABLE + " (" + "user_id INT64 NOT NULL," + "name STRING(MAX) " + ") PRIMARY KEY(user_id)";
final String createChangeStream = "CREATE CHANGE STREAM " + TEST_CHANGE_STREAM + " FOR Users";
statements.add(createTable);
statements.add(createChangeStream);
spannerServer.createDatabase(testDatabase, statements);
Timestamp startTimestamp = Timestamp.now();
// Create mutations for the table that will generate 1 data change record.
List<Mutation> mutations = new ArrayList<>();
mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(1).set("name").to("Name1").build());
mutations.add(Mutation.newInsertBuilder(TEST_TABLE).set("user_id").to(2).set("name").to("Name2").build());
spannerServer.getDbClient(testDatabase).write(mutations);
Timestamp endTimestamp = Timestamp.now();
SpannerChangeStreamsToGcsOptions options = PipelineOptionsFactory.create().as(SpannerChangeStreamsToGcsOptions.class);
options.setSpannerProjectId(TEST_PROJECT);
options.setSpannerInstanceId(TEST_INSTANCE);
options.setSpannerDatabase(testDatabase);
options.setSpannerMetadataInstanceId(TEST_INSTANCE);
options.setSpannerMetadataDatabase(testDatabase);
options.setSpannerChangeStreamName(TEST_CHANGE_STREAM);
options.setStartTimestamp(startTimestamp.toString());
options.setEndTimestamp(endTimestamp.toString());
List<String> experiments = new ArrayList<String>();
options.setExperiments(experiments);
options.setOutputFileFormat(FileFormat.TEXT);
options.setGcsOutputDirectory(fakeDir);
options.setOutputFilenamePrefix(TEXT_FILENAME_PREFIX);
options.setNumShards(NUM_SHARDS);
options.setTempLocation(fakeTempLocation);
// Run the pipeline.
PipelineResult result = run(options);
result.waitUntilFinish();
// Read from the output text files to assert that the expected data change records have been generated.
PCollection<String> dataChangeRecords = pipeline.apply("readRecords", TextIO.read().from(fakeDir + "/text-output-*.txt"));
PAssert.that(dataChangeRecords).satisfies(new VerifyDataChangeRecordText());
pipeline.run();
// Drop the database.
spannerServer.dropDatabase(testDatabase);
}
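PAssert.that(...).satisfies(...) takes a SerializableFunction over the whole contents of the PCollection. The VerifyDataChangeRecordText class is not shown in this snippet, so the following is only a hedged sketch of the shape such a verifier usually has; the actual assertions in the template's test may differ:
import org.apache.beam.sdk.transforms.SerializableFunction;
import static org.junit.Assert.assertFalse;

// Hypothetical verifier; the real VerifyDataChangeRecordText may parse the serialized
// DataChangeRecord and check specific fields (table name, mods, commit timestamp).
static class VerifyDataChangeRecordText
    implements SerializableFunction<Iterable<String>, Void> {
  @Override
  public Void apply(Iterable<String> lines) {
    for (String line : lines) {
      // Placeholder assertion: every output line should carry a non-empty serialized record.
      assertFalse(line.isEmpty());
    }
    return null;
  }
}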
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcs method transformPipeline.
@VisibleForTesting
static void transformPipeline(
    Pipeline pipeline,
    List<BigQueryTable> tables,
    DataplexBigQueryToGcsOptions options,
    String targetRootPath,
    BigQueryServices testBqServices,
    BigQueryClientFactory testBqClientFactory) {
List<PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>>> fileCollections =
    new ArrayList<>(tables.size());
tables.forEach(table -> {
fileCollections.add(
    pipeline
        .apply(
            String.format("ExportTable-%s", table.getTableName()),
            new BigQueryTableToGcsTransform(
                    table,
                    targetRootPath,
                    options.getFileFormat(),
                    options.getFileCompression(),
                    options.getEnforceSamePartitionKey())
                .withTestServices(testBqServices))
        .apply(String.format("AttachTableKeys-%s", table.getTableName()), WithKeys.of(table)));
});
PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>> exportFileResults =
    PCollectionList.of(fileCollections).apply("FlattenTableResults", Flatten.pCollections());
PCollection<Void> metadataUpdateResults =
    exportFileResults.apply("UpdateDataplexMetadata", new UpdateDataplexBigQueryToGcsExportMetadataTransform());
exportFileResults
    .apply(
        MapElements.into(
                TypeDescriptors.kvs(
                    TypeDescriptor.of(BigQueryTable.class),
                    TypeDescriptor.of(BigQueryTablePartition.class)))
            .via(
                (SerializableFunction<KV<BigQueryTable, KV<BigQueryTablePartition, String>>, KV<BigQueryTable, BigQueryTablePartition>>)
                    input -> KV.of(input.getKey(), input.getValue().getKey())))
    .apply("WaitForMetadataUpdate", Wait.on(metadataUpdateResults))
    .apply("TruncateBigQueryData", ParDo.of(new DeleteBigQueryDataFn().withTestBqClientFactory(testBqClientFactory)));
}
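The ordering in this method is the interesting part: the per-table export results are flattened, Dataplex metadata is updated, and only elements that have waited on the metadata update are handed to DeleteBigQueryDataFn, which presumably reads its delete behaviour from the pipeline's DeleteBigQueryDataFn.Options (the interface this page indexes). A small self-contained sketch of Beam's Wait.on pattern used here, with illustrative names that are not the template's:
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WaitOnSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    PCollection<String> exported = p.apply("Export", Create.of("table1", "table2"));
    // Stand-in for the metadata-update step; its output acts purely as a completion signal.
    PCollection<String> metadataUpdated =
        exported.apply(
            "UpdateMetadata",
            MapElements.into(TypeDescriptors.strings()).via((String t) -> t + ":updated"));
    // Wait.on holds back the elements of 'exported' until the signal collection's windows
    // are complete, so the destructive step only runs after the metadata update has finished.
    exported
        .apply("WaitForMetadataUpdate", Wait.on(metadataUpdated))
        .apply(
            "Truncate",
            MapElements.into(TypeDescriptors.strings()).via((String t) -> t + ":truncated"));
    p.run().waitUntilFinish();
  }
}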
use of com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcs method main.
/**
* Main entry point for pipeline execution.
*
* @param args Command line arguments to the pipeline.
*/
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
DataplexBigQueryToGcsOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(DataplexBigQueryToGcsOptions.class);
List<String> experiments = new ArrayList<>();
if (options.getExperiments() != null) {
experiments.addAll(options.getExperiments());
}
if (!experiments.contains("upload_graph")) {
experiments.add("upload_graph");
}
options.setExperiments(experiments);
Pipeline pipeline;
DataplexClient dataplex = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
BigQuery bqClient = BigQueryOptions.getDefaultInstance().getService();
try (BigQueryStorageClient bqsClient = BigQueryStorageClient.create()) {
LOG.info("Building the pipeline...");
pipeline = setUpPipeline(options, dataplex, bqClient, bqsClient);
}
LOG.info("Running the pipeline.");
pipeline.run();
}
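Two details worth noting: the upload_graph experiment is forced on so that the Dataflow service fetches the potentially large job graph from Cloud Storage instead of receiving it inline in the job request, which matters when the pipeline fans out over many tables, and the BigQueryStorageClient is opened in a try-with-resources block because it is only needed while the graph is built. The options themselves are bound from command-line flags by PipelineOptionsFactory; a minimal sketch of that mechanism with a hypothetical options interface (the getters below are illustrative, not DataplexBigQueryToGcsOptions' actual members):
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsBindingSketch {
  // Hypothetical options interface; the template's DataplexBigQueryToGcsOptions follows the
  // same getter/setter pattern, which PipelineOptionsFactory binds to --camelCase flags.
  public interface SketchOptions extends PipelineOptions {
    @Description("Root GCS path to export files to")
    String getTargetRootPath();
    void setTargetRootPath(String value);

    @Description("Whether to truncate the source BigQuery data after a successful export")
    @Default.Boolean(false)
    Boolean getDeleteSourceData();
    void setDeleteSourceData(Boolean value);
  }

  public static void main(String[] args) {
    SketchOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SketchOptions.class);
    System.out.println(options.getTargetRootPath() + " delete=" + options.getDeleteSourceData());
  }
}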