Search in sources :

Example 1 with NoopTransform

use of com.google.cloud.teleport.v2.transforms.NoopTransform in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexBigQueryToGcs method buildPipeline.

/**
 * Assembles the export pipeline for the given dataset.
 *
 * <p>The files already present under {@code targetRootPath} are listed first and handed to a
 * {@link DataplexBigQueryToGcsFilter}, which is then used while loading the dataset metadata. If
 * at least one table is returned, the export transforms are attached via
 * {@code transformPipeline}; otherwise a single {@link NoopTransform} is applied so the pipeline
 * is not left without any transform.
 *
 * @param options The execution parameters to the pipeline.
 * @param metadataLoader Loader used to fetch the BigQuery table metadata of the dataset.
 * @param targetRootPath Root path whose existing files are listed and which is forwarded to
 *     {@code transformPipeline}.
 * @param datasetId Identifier of the dataset whose tables are loaded.
 * @return The resulting pipeline (not yet run).
 * @throws ExecutionException declared by this method; presumably propagated from the metadata
 *     loading step (confirm against {@code BigQueryMetadataLoader}).
 * @throws InterruptedException declared by this method; presumably propagated from the metadata
 *     loading step (confirm against {@code BigQueryMetadataLoader}).
 */
@VisibleForTesting
static Pipeline buildPipeline(DataplexBigQueryToGcsOptions options, BigQueryMetadataLoader metadataLoader, String targetRootPath, DatasetId datasetId) throws ExecutionException, InterruptedException {
    Pipeline result = Pipeline.create(options);
    List<String> filesAlreadyInTarget = StorageUtils.getFilesInDirectory(targetRootPath);

    LOG.info("Loading BigQuery metadata...");
    DataplexBigQueryToGcsFilter tableFilter = new DataplexBigQueryToGcsFilter(options, filesAlreadyInTarget);
    List<BigQueryTable> loadedTables = metadataLoader.loadDatasetMetadata(datasetId, tableFilter);
    LOG.info("Loaded {} table(s).", loadedTables.size());

    if (loadedTables.isEmpty()) {
        // Nothing matched the filter: apply a placeholder transform instead of export steps.
        result.apply("Nothing to export", new NoopTransform());
        return result;
    }

    transformPipeline(result, loadedTables, options, targetRootPath, null, null);
    return result;
}
Also used : NoopTransform(com.google.cloud.teleport.v2.transforms.NoopTransform) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) DataplexBigQueryToGcsFilter(com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter) Pipeline(org.apache.beam.sdk.Pipeline) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with NoopTransform

use of com.google.cloud.teleport.v2.transforms.NoopTransform in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexFileFormatConversion method run.

/**
 * Runs the pipeline to completion with the specified options.
 *
 * <p>The input is either a single asset or a comma-separated list of entities, decided by
 * matching {@code options.getInputAssetOrEntitiesList()} against {@code ASSET_PATTERN} and
 * {@code ENTITIES_PATTERN}. For every entity (or every partition of an entity that has
 * partitions) the matching input files are filtered according to the write disposition and one
 * {@code ConvertFiles} transform is applied per remaining file:
 *
 * <ul>
 *   <li>{@code OVERWRITE}: every input file is converted.
 *   <li>{@code FAIL}: a {@code WriteDispositionException} is thrown as soon as an input file maps
 *       to an output path that already exists in the output bucket.
 *   <li>{@code SKIP}: input files whose output path already exists are silently dropped.
 * </ul>
 *
 * <p>If no file at all is selected for conversion, a single {@link NoopTransform} is applied
 * before the pipeline is run.
 *
 * @param pipeline The pipeline the conversion transforms are attached to.
 * @param options The execution parameters (input, output asset, formats, write disposition).
 * @param dataplex Client used to look up the output asset, entities and partitions.
 * @param outputPathProvider Maps an input data path and the output bucket to an output path.
 * @return The pipeline result.
 * @throws IllegalArgumentException if the input matches neither the asset nor the entities
 *     pattern, or if the output asset is missing or is not a named storage bucket.
 * @throws UnsupportedOperationException if the write disposition is not one of the handled values.
 * @throws IOException declared by this method; presumably raised by the Dataplex or storage
 *     lookups (confirm against the helpers).
 */
public static PipelineResult run(Pipeline pipeline, FileFormatConversionOptions options, DataplexClient dataplex, OutputPathProvider outputPathProvider) throws IOException {
    String inputAssetOrEntities = options.getInputAssetOrEntitiesList();
    boolean isInputAsset = ASSET_PATTERN.matcher(inputAssetOrEntities).matches();
    if (!isInputAsset && !ENTITIES_PATTERN.matcher(inputAssetOrEntities).matches()) {
        throw new IllegalArgumentException("Either input asset or input entities list must be provided");
    }
    GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
    if (outputAsset == null || outputAsset.getResourceSpec() == null || !DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(outputAsset.getResourceSpec().getType()) || outputAsset.getResourceSpec().getName() == null) {
        throw new IllegalArgumentException("Output asset must be an existing asset with resource spec name being a GCS bucket and" + " resource spec type of " + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
    }
    String outputBucket = outputAsset.getResourceSpec().getName();

    // Each case that captures the set of existing output files gets its own braced scope, so the
    // captured local is effectively final (a variable shared and reassigned across cases cannot
    // be captured by a lambda).
    final Predicate<String> inputFilesFilter;
    switch (options.getWriteDisposition()) {
        case OVERWRITE:
            inputFilesFilter = inputFilePath -> true;
            break;
        case FAIL: {
            Set<String> existingOutputFiles = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath -> {
                if (existingOutputFiles.contains(inputFilePathToOutputFilePath(outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()))) {
                    throw new WriteDispositionException(String.format("The file %s already exists in the output asset bucket: %s", inputFilePath, outputBucket));
                }
                return true;
            };
            break;
        }
        case SKIP: {
            Set<String> existingOutputFiles = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath -> !existingOutputFiles.contains(inputFilePathToOutputFilePath(outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()));
            break;
        }
        default:
            throw new UnsupportedOperationException("Unsupported existing file behaviour: " + options.getWriteDisposition());
    }

    ImmutableList<GoogleCloudDataplexV1Entity> entities = isInputAsset ? dataplex.getCloudStorageEntities(inputAssetOrEntities) : dataplex.getEntities(Splitter.on(',').trimResults().splitToList(inputAssetOrEntities));

    // Tracks whether ANY entity/partition contributed at least one file. It must be accumulated
    // (|=) rather than overwritten, otherwise an empty last entity/partition would reset it and
    // a spurious "Nothing to convert" transform would be added next to real conversions.
    boolean convertingFiles = false;
    for (GoogleCloudDataplexV1Entity entity : entities) {
        ImmutableList<GoogleCloudDataplexV1Partition> partitions = dataplex.getPartitions(entity.getName());
        if (partitions.isEmpty()) {
            // Entity without partitions: its files are taken from the entity's own file spec.
            String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
            Iterator<String> inputFilePaths = getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
            convertingFiles |= inputFilePaths.hasNext();
            inputFilePaths.forEachRemaining(inputFilePath -> pipeline.apply("Convert " + shortenDataplexName(entity.getName()), new ConvertFiles(entity, inputFilePath, options, outputPath)));
        } else {
            for (GoogleCloudDataplexV1Partition partition : partitions) {
                String outputPath = outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
                Iterator<String> inputFilePaths = getFilesFromFilePattern(partitionToFileSpec(partition)).filter(inputFilesFilter).iterator();
                convertingFiles |= inputFilePaths.hasNext();
                inputFilePaths.forEachRemaining(inputFilePath -> pipeline.apply("Convert " + shortenDataplexName(partition.getName()), new ConvertFiles(entity, inputFilePath, options, outputPath)));
            }
        }
    }
    if (!convertingFiles) {
        pipeline.apply("Nothing to convert", new NoopTransform());
    }
    return pipeline.run();
}
Also used : NoopTransform(com.google.cloud.teleport.v2.transforms.NoopTransform) GoogleCloudDataplexV1Partition(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition) WriteDispositionException(com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException) GoogleCloudDataplexV1Asset(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) GoogleCloudDataplexV1Entity(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)

Aggregations

NoopTransform (com.google.cloud.teleport.v2.transforms.NoopTransform)2 GoogleCloudDataplexV1Asset (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset)1 GoogleCloudDataplexV1Entity (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)1 GoogleCloudDataplexV1Partition (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition)1 DataplexBigQueryToGcsFilter (com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter)1 WriteDispositionException (com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException)1 BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Pipeline (org.apache.beam.sdk.Pipeline)1