use of com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexFileFormatConversion method run.
/**
* Runs the pipeline to completion with the specified options.
*
* @return The pipeline result.
*/
public static PipelineResult run(
    Pipeline pipeline,
    FileFormatConversionOptions options,
    DataplexClient dataplex,
    OutputPathProvider outputPathProvider)
    throws IOException {
  boolean isInputAsset = ASSET_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches();
  if (!isInputAsset
      && !ENTITIES_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches()) {
    throw new IllegalArgumentException(
        "Either input asset or input entities list must be provided");
  }
  GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
  if (outputAsset == null
      || outputAsset.getResourceSpec() == null
      || !DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(outputAsset.getResourceSpec().getType())
      || outputAsset.getResourceSpec().getName() == null) {
    throw new IllegalArgumentException(
        "Output asset must be an existing asset with resource spec name being a GCS bucket and"
            + " resource spec type of " + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
  }
  String outputBucket = outputAsset.getResourceSpec().getName();
  Predicate<String> inputFilesFilter;
  switch (options.getWriteDisposition()) {
    case OVERWRITE:
      inputFilesFilter = inputFilePath -> true;
      break;
    case FAIL:
      Set<String> outputFilePaths = getAllOutputFilePaths(outputBucket);
      inputFilesFilter =
          inputFilePath -> {
            if (outputFilePaths.contains(
                inputFilePathToOutputFilePath(
                    outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()))) {
              throw new WriteDispositionException(
                  String.format(
                      "The file %s already exists in the output asset bucket: %s",
                      inputFilePath, outputBucket));
            }
            return true;
          };
      break;
    case SKIP:
      // Separate local so the lambda below captures an effectively final variable.
      Set<String> existingOutputFilePaths = getAllOutputFilePaths(outputBucket);
      inputFilesFilter =
          inputFilePath ->
              !existingOutputFilePaths.contains(
                  inputFilePathToOutputFilePath(
                      outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()));
      break;
    default:
      throw new UnsupportedOperationException(
          "Unsupported existing file behaviour: " + options.getWriteDisposition());
  }
  ImmutableList<GoogleCloudDataplexV1Entity> entities =
      isInputAsset
          ? dataplex.getCloudStorageEntities(options.getInputAssetOrEntitiesList())
          : dataplex.getEntities(
              Splitter.on(',').trimResults().splitToList(options.getInputAssetOrEntitiesList()));
  boolean convertingFiles = false;
  for (GoogleCloudDataplexV1Entity entity : entities) {
    ImmutableList<GoogleCloudDataplexV1Partition> partitions =
        dataplex.getPartitions(entity.getName());
    if (partitions.isEmpty()) {
      String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
      Iterator<String> inputFilePaths =
          getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
      convertingFiles = inputFilePaths.hasNext();
      inputFilePaths.forEachRemaining(
          inputFilePath ->
              pipeline.apply(
                  "Convert " + shortenDataplexName(entity.getName()),
                  new ConvertFiles(entity, inputFilePath, options, outputPath)));
    } else {
      for (GoogleCloudDataplexV1Partition partition : partitions) {
        String outputPath = outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
        Iterator<String> inputFilePaths =
            getFilesFromFilePattern(partitionToFileSpec(partition)).filter(inputFilesFilter).iterator();
        convertingFiles = inputFilePaths.hasNext();
        inputFilePaths.forEachRemaining(
            inputFilePath ->
                pipeline.apply(
                    "Convert " + shortenDataplexName(partition.getName()),
                    new ConvertFiles(entity, inputFilePath, options, outputPath)));
      }
    }
  }
  if (!convertingFiles) {
    pipeline.apply("Nothing to convert", new NoopTransform());
  }
  return pipeline.run();
}
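The switch above builds a Predicate<String> that decides, per input file, whether conversion should proceed, fail, or be skipped under the configured write disposition. Below is a minimal stand-alone sketch of that filter logic using only java.util types; the helper toOutputPath and the sample gs:// paths are hypothetical stand-ins for inputFilePathToOutputFilePath and real Cloud Storage paths, and are not part of the template.

import java.util.Set;
import java.util.function.Predicate;

class WriteDispositionFilterSketch {

  // Hypothetical stand-in for inputFilePathToOutputFilePath(...): map an input file to its output path.
  static String toOutputPath(String inputFilePath, String outputBucket) {
    String fileName = inputFilePath.substring(inputFilePath.lastIndexOf('/') + 1);
    return outputBucket + "/" + fileName;
  }

  static Predicate<String> filterFor(String disposition, Set<String> existingOutputs, String outputBucket) {
    switch (disposition) {
      case "OVERWRITE":
        return inputFilePath -> true; // convert everything; existing outputs get replaced
      case "FAIL":
        return inputFilePath -> {
          if (existingOutputs.contains(toOutputPath(inputFilePath, outputBucket))) {
            throw new IllegalStateException("Output already exists for " + inputFilePath);
          }
          return true;
        };
      case "SKIP":
        return inputFilePath -> !existingOutputs.contains(toOutputPath(inputFilePath, outputBucket));
      default:
        throw new UnsupportedOperationException("Unsupported write disposition: " + disposition);
    }
  }

  public static void main(String[] args) {
    Set<String> existing = Set.of("gs://output-bucket/a.parquet");
    Predicate<String> skip = filterFor("SKIP", existing, "gs://output-bucket");
    System.out.println(skip.test("gs://input-bucket/a.parquet")); // false: output already present, skip
    System.out.println(skip.test("gs://input-bucket/b.parquet")); // true: safe to convert
  }
}

Note that the template's FAIL case throws WriteDispositionException rather than a JDK exception; the sketch substitutes IllegalStateException only to stay self-contained.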
use of com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions in project DataflowTemplates by GoogleCloudPlatform.
the class FileFormatConversionTest method testParquetToAvroE2E.
/**
* Tests if the Parquet to Avro pipeline transforms data correctly and stores it in an Avro file.
*/
@Test
public void testParquetToAvroE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
  options.setInputFileFormat(PARQUET);
  options.setOutputFileFormat(AVRO);
  options.setInputFileSpec(PARQUET_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setSchema(SCHEMA_FILE_PATH);
  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);
  mainPipeline.apply(
      "TestParquetToAvro",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(PARQUET)
          .setOutputFileFormat(AVRO)
          .build());
  mainPipeline.run();
  PCollection<GenericRecord> readAvroFile =
      readPipeline.apply(
          "ReadAvroFile",
          AvroConverters.ReadAvroFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());
  PAssert.that(readAvroFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
use of com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions in project DataflowTemplates by GoogleCloudPlatform.
the class FileFormatConversionTest method testCsvToParquetE2E.
/**
* Tests if the CSV to Parquet pipeline transforms data correctly and stores it in a Parquet file.
*/
@Test
public void testCsvToParquetE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
  options.setInputFileFormat(CSV);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(CSV_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setContainsHeaders(true);
  options.setSchema(SCHEMA_FILE_PATH);
  options.setDelimiter("|");
  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);
  mainPipeline.apply(
      "TestCsvToParquet",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(CSV)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();
  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());
  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
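Because the test sets containsHeaders(true) and a "|" delimiter, the CSV fixture is expected to carry a header row and pipe-separated values matching the schema fields id, state, and price. The actual file behind CSV_FILE_PATH is not shown on this page; the snippet below writes a hypothetical fixture of that shape purely for illustration.

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

class SampleCsvFixture {
  public static void main(String[] args) throws Exception {
    // Hypothetical pipe-delimited input with a header row, mirroring the record asserted in the test.
    List<String> lines = List.of(
        "id|state|price",
        "007|CA|26.23");
    Path csv = Files.createTempFile("sample", ".csv");
    Files.write(csv, lines);
    System.out.println("Wrote sample CSV fixture to " + csv);
  }
}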
use of com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions in project DataflowTemplates by GoogleCloudPlatform.
the class FileFormatConversionTest method testCsvToParquetWithEmptyField.
/**
* Tests if the CSV to Parquet pipeline can handle empty fields in the CSV file.
*/
@Test
public void testCsvToParquetWithEmptyField() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
  options.setInputFileFormat(CSV);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(CSV_FILE_WITH_MISSING_FIELD_PATH);
  options.setOutputBucket(tempDir);
  options.setContainsHeaders(true);
  options.setSchema(SCHEMA_FILE_TWO_PATH);
  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_TWO_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", null);
  mainPipeline.apply(
      "TestCsvToParquetWithEmptyField",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(CSV)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();
  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_TWO_PATH)
              .build());
  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}
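The expected record above sets price to null, which Avro can only represent when the schema declares the field as a union with "null". The contents of SCHEMA_FILE_TWO_PATH are not shown on this page, so the snippet below parses a hypothetical schema of the shape this test implies and builds the same record.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

class NullablePriceSchemaSketch {
  public static void main(String[] args) {
    // Hypothetical schema: "price" is a ["null", "double"] union, so a missing CSV field can map to null.
    String schemaJson =
        "{\"type\":\"record\",\"name\":\"Quote\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"string\"},"
            + "{\"name\":\"state\",\"type\":\"string\"},"
            + "{\"name\":\"price\",\"type\":[\"null\",\"double\"],\"default\":null}]}";
    Schema schema = new Schema.Parser().parse(schemaJson);
    GenericRecord record = new GenericData.Record(schema);
    record.put("id", "007");
    record.put("state", "CA");
    record.put("price", null);
    System.out.println(record);
  }
}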
use of com.google.cloud.teleport.v2.templates.FileFormatConversion.FileFormatConversionOptions in project DataflowTemplates by GoogleCloudPlatform.
the class FileFormatConversionTest method testAvroToParquetE2E.
/**
* Tests if the Avro to Parquet pipeline transforms data correctly and stores it in a Parquet
* file.
*/
@Test
public void testAvroToParquetE2E() {
  FileFormatConversionOptions options =
      PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
  final String tempDir = temporaryFolder.getRoot().getAbsolutePath() + "/";
  options.setInputFileFormat(AVRO);
  options.setOutputFileFormat(PARQUET);
  options.setInputFileSpec(AVRO_FILE_PATH);
  options.setOutputBucket(tempDir);
  options.setSchema(SCHEMA_FILE_PATH);
  Schema schema = SchemaUtils.getAvroSchema(SCHEMA_FILE_PATH);
  GenericRecord genericRecords = new GenericData.Record(schema);
  genericRecords.put("id", "007");
  genericRecords.put("state", "CA");
  genericRecords.put("price", 26.23);
  mainPipeline.apply(
      "TestAvroToParquet",
      FileFormatConversionFactory.FileFormat.newBuilder()
          .setOptions(options)
          .setInputFileFormat(AVRO)
          .setOutputFileFormat(PARQUET)
          .build());
  mainPipeline.run();
  PCollection<GenericRecord> readParquetFile =
      readPipeline.apply(
          "ReadParquetFile",
          ParquetConverters.ReadParquetFile.newBuilder()
              .withInputFileSpec(tempDir + "*")
              .withSchema(SCHEMA_FILE_PATH)
              .build());
  PAssert.that(readParquetFile).containsInAnyOrder(genericRecords);
  readPipeline.run();
}