Use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexFileFormatConversionTest, method testAssetWithEntityJsonToParquetFailOnExistingFilesE2E.
/**
* Tests JSON to Parquet conversion for an asset with entity when one of the files already exists
* and the existing file behaviour is FAIL.
*/
@Test(expected = RuntimeException.class)
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToParquetFailOnExistingFilesE2E() throws IOException {
// setup Dataplex client to return entity 2
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
// setup options to fail on existing files
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setWriteDisposition(WriteDispositionOptions.FAIL);
// simulate the 1.json -> 1.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/1.parquet");
// simulate the 2.json -> 2.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/2.parquet");
// run the pipeline, the job should fail because 1.parquet already exists
DataplexFileFormatConversion.run(
        mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider)
    .waitUntilFinish();
}
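The copyFileToOutputBucket helper used above is defined elsewhere in the test class. A minimal sketch of what such a helper could look like, assuming the test stages fixture files on the local filesystem and treats a directory under the JUnit temporaryFolder rule as the output bucket (the resourcesDir field and the copy mechanics below are assumptions, not the project's actual implementation):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Hypothetical helper: copies a pre-built Parquet fixture into the simulated
// output bucket so the pipeline sees an "already existing" target file.
private void copyFileToOutputBucket(String sourceRelativePath, String destinationRelativePath)
    throws IOException {
  Path source = Paths.get(resourcesDir, sourceRelativePath); // resourcesDir: assumed fixture root
  Path destination =
      Paths.get(temporaryFolder.getRoot().getAbsolutePath(), destinationRelativePath);
  Files.createDirectories(destination.getParent());
  Files.copy(source, destination);
}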
Use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexFileFormatConversionTest, method testAssetWithEntityJsonToParquetSkipExistingFilesE2E.
/**
* Tests JSON to Parquet conversion for an asset with entity when one of the files already exists
* and the existing file behaviour is SKIP.
*/
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToParquetSkipExistingFilesE2E() throws IOException {
// setup Dataplex client to return entity 2
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
// setup options to skip existing files
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setWriteDisposition(WriteDispositionOptions.SKIP);
// simulate the 1.json -> 1.parquet conversion already happened
copyFileToOutputBucket("entity2.existing/1.parquet", "entity2/1.parquet");
// run the pipeline, only 2.json -> 2.parquet conversion should happen
DataplexFileFormatConversion.run(
    mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
// read the conversion results
PCollection<GenericRecord> readParquetFile =
    readPipeline.apply(
        "ReadParquetFile",
        ParquetConverters.ReadParquetFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
// expect old 1.parquet (from entity2.existing) and newly converted 2.parquet (from entity2)
ImmutableList.Builder<GenericRecord> expected = ImmutableList.builder();
Record record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "abc.existing");
record.put("Number", 1);
expected.add(record);
record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "def");
record.put("Number", 2);
expected.add(record);
record = new Record(EXPECTED_AVRO_SCHEMA);
record.put("Word", "ghi");
record.put("Number", 3);
expected.add(record);
PAssert.that(readParquetFile).containsInAnyOrder(expected.build());
readPipeline.run();
}
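In this test the pre-staged entity2/1.parquet contributes the abc.existing record, while converting the remaining JSON supplies the def and ghi records. The expected-record construction above is repetitive; a small hypothetical helper in the same Avro GenericData.Record style could tighten it (a sketch, not part of the original test):

import org.apache.avro.generic.GenericData;

// Hypothetical helper that builds one expected record against the schema
// constant already used above.
private static GenericData.Record expectedRecord(String word, int number) {
  GenericData.Record record = new GenericData.Record(EXPECTED_AVRO_SCHEMA);
  record.put("Word", word);
  record.put("Number", number);
  return record;
}

// Usage in place of the builder block above:
PAssert.that(readParquetFile)
    .containsInAnyOrder(
        expectedRecord("abc.existing", 1), expectedRecord("def", 2), expectedRecord("ghi", 3));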
Use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexFileFormatConversionTest, method testAssetWithEntityJsonToGzippedParquetE2E.
/**
* Tests JSON to Parquet conversion for an asset with entity using non-default compression.
*/
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityJsonToGzippedParquetE2E() throws IOException {
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getCloudStorageEntities(asset2.getName())).thenReturn(ImmutableList.of(entity2));
when(dataplex.getPartitions(entity2.getName())).thenReturn(ImmutableList.of());
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(asset2.getName());
options.setOutputFileFormat(FileFormatOptions.PARQUET);
options.setOutputAsset(outputAsset.getName());
options.setOutputFileCompression(DataplexCompression.GZIP);
DataplexFileFormatConversion.run(
    mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
PCollection<GenericRecord> readParquetFile =
    readPipeline.apply(
        "ReadParquetFile",
        ParquetConverters.ReadParquetFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
PAssert.that(readParquetFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
readPipeline.run();
}
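The assertion above verifies record content but not that GZIP was actually applied to the output. A sketch of an additional check one could add, assuming the Apache Parquet Hadoop artifacts are on the test classpath and outputParquetFile points at one of the files written under temporaryFolder (this check is not part of the original test):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopInputFile;

// Reads the footer of one output file and asserts its column chunks were
// written with the GZIP codec.
try (ParquetFileReader reader =
    ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(outputParquetFile), new Configuration()))) {
  CompressionCodecName codec =
      reader.getFooter().getBlocks().get(0).getColumns().get(0).getCodec();
  assertEquals(CompressionCodecName.GZIP, codec);
}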
Use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexFileFormatConversionTest, method testEntityWithPartitionsCsvToAvroE2E.
/**
* Tests CSV to Avro conversion for an entity with partitions.
*/
@Test
@Category(NeedsRunner.class)
public void testEntityWithPartitionsCsvToAvroE2E() throws IOException {
DataplexClient dataplex = mock(DataplexClient.class);
when(dataplex.getEntities(ImmutableList.of(entity1.getName()))).thenReturn(ImmutableList.of(entity1));
when(dataplex.getPartitions(entity1.getName())).thenReturn(ImmutableList.of(partition11, partition12));
when(dataplex.getAsset(outputAsset.getName())).thenReturn(outputAsset);
FileFormatConversionOptions options = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
options.setInputAssetOrEntitiesList(entity1.getName());
options.setOutputFileFormat(FileFormatOptions.AVRO);
options.setOutputAsset(outputAsset.getName());
DataplexFileFormatConversion.run(
    mainPipeline, options, dataplex, DataplexFileFormatConversionTest::outputPathProvider);
PCollection<GenericRecord> readAvroFile =
    readPipeline.apply(
        "ReadAvroFile",
        AvroConverters.ReadAvroFile.newBuilder()
            .withInputFileSpec(temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro")
            .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
            .build());
PAssert.that(readAvroFile).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
readPipeline.run();
}
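The partition11 and partition12 fixtures are defined elsewhere in the test class. A rough sketch of how such fixtures might be built with the Dataplex API model classes, assuming each partition points at its own storage prefix (the class usage, names, and paths below are illustrative assumptions, not the project's actual fixture code):

import com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition;

// Hypothetical partition fixtures; the conversion pipeline lists input files
// under each partition's location when the entity is partitioned.
GoogleCloudDataplexV1Partition partition11 =
    new GoogleCloudDataplexV1Partition()
        .setName("partition11")
        .setLocation("gs://example-bucket/entity1/part=1");
GoogleCloudDataplexV1Partition partition12 =
    new GoogleCloudDataplexV1Partition()
        .setName("partition12")
        .setLocation("gs://example-bucket/entity1/part=2");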
Use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.
From the class DataplexBigQueryToGcs, method main.
/**
* Main entry point for pipeline execution.
*
* @param args Command line arguments to the pipeline.
*/
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
DataplexBigQueryToGcsOptions options =
    PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(DataplexBigQueryToGcsOptions.class);
List<String> experiments = new ArrayList<>();
if (options.getExperiments() != null) {
experiments.addAll(options.getExperiments());
}
if (!experiments.contains("upload_graph")) {
experiments.add("upload_graph");
}
options.setExperiments(experiments);
Pipeline pipeline;
DataplexClient dataplex = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
BigQuery bqClient = BigQueryOptions.getDefaultInstance().getService();
try (BigQueryStorageClient bqsClient = BigQueryStorageClient.create()) {
LOG.info("Building the pipeline...");
pipeline = setUpPipeline(options, dataplex, bqClient, bqsClient);
}
LOG.info("Running the pipeline.");
pipeline.run();
}
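The experiment handling in main ensures the upload_graph experiment ends up in the options exactly once, whether or not the caller passed it (upload_graph makes Dataflow fetch the job graph from storage, which helps with large graphs; that rationale is an inference, not stated in the code). A minimal, self-contained illustration of the same logic using Beam's ExperimentalOptions, where the use_runner_v2 value is only an example argument:

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.options.ExperimentalOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Mirrors the snippet above: "upload_graph" is appended only when the user
// did not already request it via --experiments.
ExperimentalOptions opts =
    PipelineOptionsFactory.fromArgs("--experiments=use_runner_v2").as(ExperimentalOptions.class);
List<String> experiments =
    opts.getExperiments() == null ? new ArrayList<>() : new ArrayList<>(opts.getExperiments());
if (!experiments.contains("upload_graph")) {
  experiments.add("upload_graph");
}
opts.setExperiments(experiments);
// experiments now contains [use_runner_v2, upload_graph]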