Use of com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexFileFormatConversion, method run.
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @return The pipeline result.
 */
public static PipelineResult run(
    Pipeline pipeline,
    FileFormatConversionOptions options,
    DataplexClient dataplex,
    OutputPathProvider outputPathProvider)
    throws IOException {
  boolean isInputAsset = ASSET_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches();
  if (!isInputAsset
      && !ENTITIES_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches()) {
    throw new IllegalArgumentException(
        "Either input asset or input entities list must be provided");
  }

  GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
  if (outputAsset == null
      || outputAsset.getResourceSpec() == null
      || !DataplexAssetResourceSpec.STORAGE_BUCKET
          .name()
          .equals(outputAsset.getResourceSpec().getType())
      || outputAsset.getResourceSpec().getName() == null) {
    throw new IllegalArgumentException(
        "Output asset must be an existing asset with resource spec name being a GCS bucket and"
            + " resource spec type of "
            + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
  }
  String outputBucket = outputAsset.getResourceSpec().getName();

  // Decide which input files to convert, based on the requested write disposition.
  Predicate<String> inputFilesFilter;
  switch (options.getWriteDisposition()) {
    case OVERWRITE:
      // Convert every input file, overwriting any existing output.
      inputFilesFilter = inputFilePath -> true;
      break;
    case FAIL:
      // Fail fast if a converted file already exists in the output bucket.
      Set<String> outputFilePaths = getAllOutputFilePaths(outputBucket);
      inputFilesFilter =
          inputFilePath -> {
            if (outputFilePaths.contains(
                inputFilePathToOutputFilePath(
                    outputPathProvider,
                    inputFilePath,
                    outputBucket,
                    options.getOutputFileFormat()))) {
              throw new WriteDispositionException(
                  String.format(
                      "The file %s already exists in the output asset bucket: %s",
                      inputFilePath, outputBucket));
            }
            return true;
          };
      break;
    case SKIP:
      // Silently skip input files whose converted output already exists.
      outputFilePaths = getAllOutputFilePaths(outputBucket);
      inputFilesFilter =
          inputFilePath ->
              !outputFilePaths.contains(
                  inputFilePathToOutputFilePath(
                      outputPathProvider,
                      inputFilePath,
                      outputBucket,
                      options.getOutputFileFormat()));
      break;
    default:
      throw new UnsupportedOperationException(
          "Unsupported existing file behaviour: " + options.getWriteDisposition());
  }

  ImmutableList<GoogleCloudDataplexV1Entity> entities =
      isInputAsset
          ? dataplex.getCloudStorageEntities(options.getInputAssetOrEntitiesList())
          : dataplex.getEntities(
              Splitter.on(',').trimResults().splitToList(options.getInputAssetOrEntitiesList()));

  boolean convertingFiles = false;
  for (GoogleCloudDataplexV1Entity entity : entities) {
    ImmutableList<GoogleCloudDataplexV1Partition> partitions =
        dataplex.getPartitions(entity.getName());
    if (partitions.isEmpty()) {
      // Unpartitioned entity: convert its files directly.
      String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
      Iterator<String> inputFilePaths =
          getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
      convertingFiles = inputFilePaths.hasNext();
      inputFilePaths.forEachRemaining(
          inputFilePath ->
              pipeline.apply(
                  "Convert " + shortenDataplexName(entity.getName()),
                  new ConvertFiles(entity, inputFilePath, options, outputPath)));
    } else {
      // Partitioned entity: convert the files of each partition separately.
      for (GoogleCloudDataplexV1Partition partition : partitions) {
        String outputPath =
            outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
        Iterator<String> inputFilePaths =
            getFilesFromFilePattern(partitionToFileSpec(partition))
                .filter(inputFilesFilter)
                .iterator();
        convertingFiles = inputFilePaths.hasNext();
        inputFilePaths.forEachRemaining(
            inputFilePath ->
                pipeline.apply(
                    "Convert " + shortenDataplexName(partition.getName()),
                    new ConvertFiles(entity, inputFilePath, options, outputPath)));
      }
    }
  }

  if (!convertingFiles) {
    pipeline.apply("Nothing to convert", new NoopTransform());
  }
  return pipeline.run();
}
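
To see the three write dispositions in isolation, here is a minimal, self-contained sketch of the filter logic used in run() above. The WriteDispositionException stand-in, the local enum, and the direct input-to-output path mapping are simplifications for illustration; the real template maps each input path through inputFilePathToOutputFilePath and uses the project's own com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException.

import java.util.Set;
import java.util.function.Predicate;

public class WriteDispositionFilterSketch {

  /** Simplified stand-in for the template's WriteDispositionException. */
  static class WriteDispositionException extends RuntimeException {
    WriteDispositionException(String message) {
      super(message);
    }
  }

  enum WriteDisposition { OVERWRITE, FAIL, SKIP }

  /**
   * Builds a filter over input paths, mirroring the switch in run(). For simplicity this
   * assumes the output path equals the input path; the real code derives it from the input.
   */
  static Predicate<String> filterFor(WriteDisposition disposition, Set<String> existingOutputs) {
    switch (disposition) {
      case OVERWRITE:
        // Convert everything, clobbering prior output.
        return path -> true;
      case FAIL:
        // Fail the whole run as soon as one target already exists.
        return path -> {
          if (existingOutputs.contains(path)) {
            throw new WriteDispositionException("The file " + path + " already exists");
          }
          return true;
        };
      case SKIP:
        // Silently drop files that were already converted.
        return path -> !existingOutputs.contains(path);
      default:
        throw new UnsupportedOperationException("Unsupported: " + disposition);
    }
  }

  public static void main(String[] args) {
    Set<String> existing = Set.of("gs://bucket/a.parquet");
    System.out.println(
        filterFor(WriteDisposition.SKIP, existing).test("gs://bucket/a.parquet")); // false
    System.out.println(
        filterFor(WriteDisposition.OVERWRITE, existing).test("gs://bucket/a.parquet")); // true
    // filterFor(WriteDisposition.FAIL, existing).test("gs://bucket/a.parquet")
    // would throw WriteDispositionException.
  }
}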
Use of com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcsTest, method testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException.
/**
* Tests that the pipeline throws an exception if {@code writeDisposition = FAIL}, {@code
* enforceSamePartitionKey = true}, and one of the target files exists, when processing a
* partitioned table.
*
* <p>This is a special case because depending on the {@code enforceSamePartitionKey} param the
* generated file path can be different (for partitioned tables only!), so this verifies that
* {@link com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter
* DataplexBigQueryToGcsFilter} can find such files correctly.
*/
@Test
public void testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException()
    throws Exception {
  options.setFileFormat(FileFormatOptions.PARQUET);
  options.setWriteDisposition(WriteDispositionOptions.FAIL);
  options.setEnforceSamePartitionKey(true);

  // Pre-create one of the expected target files so the FAIL disposition trips on it.
  writeOutputFile("partitioned_table/ts=p2", "output-partitioned_table-p2.parquet", "Test data");

  // Stub BigQuery: return one partitioned table, then its two partitions.
  when(bqMock.query(any()))
      .then(
          invocation -> {
            Iterable<FieldValueList> result = null;
            QueryJobConfiguration q = (QueryJobConfiguration) invocation.getArguments()[0];
            if (TABLE_QUERY_PATTERN.matcher(q.getQuery()).find()) {
              result = Collections.singletonList(fields("partitioned_table", "0", "ts"));
            } else if (PARTITION_QUERY_PATTERN.matcher(q.getQuery()).find()) {
              result = Arrays.asList(fields("p1", "0"), fields("p2", "0"));
            }
            when(tableResultMock.iterateAll()).thenReturn(result);
            return tableResultMock;
          });

  try {
    DataplexBigQueryToGcs.buildPipeline(
        options, metadataLoader, outDir.getAbsolutePath(), DatasetId.of(PROJECT, DATASET));
    fail("Expected a WriteDispositionException");
  } catch (Exception e) {
    assertThat(e).hasCauseThat().hasCauseThat().isInstanceOf(WriteDispositionException.class);
    assertThat(e)
        .hasCauseThat()
        .hasCauseThat()
        .hasMessageThat()
        .contains(
            "Target File partitioned_table/ts=p2/output-partitioned_table-p2.parquet exists for"
                + " partitioned_table$p2.");
  }
}
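
The Truth assertions above (hasCauseThat().hasCauseThat()) expect the WriteDispositionException to sit two levels deep in the thrown exception's cause chain. As a more general illustration of that pattern, here is a hedged sketch that walks a cause chain looking for a given exception type; findCause and the local WriteDispositionException stand-in are illustrative helpers, not part of the template or its test code.

import java.util.Optional;

public class CauseUnwrapSketch {

  /** Simplified stand-in for the template's WriteDispositionException. */
  static class WriteDispositionException extends RuntimeException {
    WriteDispositionException(String message) {
      super(message);
    }
  }

  /** Walks the cause chain of a throwable, returning the first cause of the given type. */
  static <T extends Throwable> Optional<T> findCause(Throwable t, Class<T> type) {
    for (Throwable cur = t; cur != null; cur = cur.getCause()) {
      if (type.isInstance(cur)) {
        return Optional.of(type.cast(cur));
      }
    }
    return Optional.empty();
  }

  public static void main(String[] args) {
    // Mimics the double wrapping the test asserts on: outer -> cause -> cause.
    Exception nested =
        new RuntimeException(
            new RuntimeException(new WriteDispositionException("Target File ... exists")));
    System.out.println(
        findCause(nested, WriteDispositionException.class).isPresent()); // true
    System.out.println(
        findCause(nested, WriteDispositionException.class).get().getMessage());
  }
}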