Search in sources :

Example 1 with DataplexClient

use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexBigQueryToGcs method setUpPipeline.

/**
 * Resolves the export destination and the source dataset, then hands over to
 * {@code buildPipeline}.
 *
 * <p>The destination root is {@code "gs://"} followed by whatever {@code resolveAsset} returns for
 * the destination storage bucket asset. The source dataset option is used verbatim unless it
 * contains {@code "/lakes/"} (compared after lower-casing), in which case it is first passed
 * through {@code resolveAsset} as a BigQuery dataset asset.
 *
 * @param options pipeline options supplying the asset names and the metadata request parallelism
 * @param dataplex client handed to {@code resolveAsset}
 * @param bqClient BigQuery client handed to the metadata loader
 * @param bqsClient BigQuery Storage client handed to the metadata loader
 * @return the pipeline produced by {@code buildPipeline}
 */
private static Pipeline setUpPipeline(DataplexBigQueryToGcsOptions options, DataplexClient dataplex, BigQuery bqClient, BigQueryStorageClient bqsClient) throws IOException, ExecutionException, InterruptedException {
    // Validate the parallelism option before any asset is resolved.
    final int parallelism = options.getMaxParallelBigQueryMetadataRequests();
    checkArgument(parallelism >= 1, "maxParallelBigQueryMetadataRequests must be >= 1, but was: %s", parallelism);

    final String destinationBucket = resolveAsset(dataplex, options.getDestinationStorageBucketAssetName(), DataplexAssetResourceSpec.STORAGE_BUCKET);

    // A value containing "/lakes/" is treated as a Dataplex resource name and resolved into a
    // BigQuery ID first; anything else is parsed as given.
    final String configuredDataset = options.getSourceBigQueryDataset();
    final boolean looksLikeDataplexAsset = configuredDataset.toLowerCase().contains("/lakes/");
    final String datasetUrn = looksLikeDataplexAsset
            ? resolveAsset(dataplex, configuredDataset, DataplexAssetResourceSpec.BIGQUERY_DATASET)
            : configuredDataset;

    final DatasetId datasetId = BigQueryUtils.parseDatasetUrn(datasetUrn);
    final BigQueryMetadataLoader metadataLoader = new BigQueryMetadataLoader(bqClient, bqsClient, parallelism);
    return buildPipeline(options, metadataLoader, "gs://" + destinationBucket, datasetId);
}
Also used : BigQueryMetadataLoader(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader) DatasetId(com.google.cloud.bigquery.DatasetId)

Example 2 with DataplexClient

use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexFileFormatConversion method run.

/**
 * Applies one {@code ConvertFiles} step per input file that passes the write-disposition filter,
 * then runs the pipeline.
 *
 * <p>The input is either a single asset or a comma-separated list of entities, decided by
 * matching the option value against {@code ASSET_PATTERN} and {@code ENTITIES_PATTERN}. For each
 * entity, files are taken from its partitions when it has any, otherwise from the entity itself.
 * If no file at all was scheduled for conversion, a single {@code NoopTransform} is applied
 * before running.
 *
 * @param pipeline the pipeline the conversion steps are applied to
 * @param options conversion options (input, output asset, output format, write disposition)
 * @param dataplex client used to look up the output asset, entities and partitions
 * @param outputPathProvider maps an input location and the output bucket to an output path
 * @return the value returned by {@code pipeline.run()}
 * @throws IllegalArgumentException if the input matches neither pattern, or the output asset is
 *     missing, has no resource spec or name, or is not of type STORAGE_BUCKET
 * @throws UnsupportedOperationException if the write disposition is not OVERWRITE, FAIL or SKIP
 * @throws WriteDispositionException under the FAIL disposition, when an input file maps to an
 *     output path that already exists in the output bucket
 */
public static PipelineResult run(Pipeline pipeline, FileFormatConversionOptions options, DataplexClient dataplex, OutputPathProvider outputPathProvider) throws IOException {
    boolean isInputAsset = ASSET_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches();
    if (!isInputAsset && !ENTITIES_PATTERN.matcher(options.getInputAssetOrEntitiesList()).matches()) {
        throw new IllegalArgumentException("Either input asset or input entities list must be provided");
    }
    GoogleCloudDataplexV1Asset outputAsset = dataplex.getAsset(options.getOutputAsset());
    if (outputAsset == null || outputAsset.getResourceSpec() == null || !DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(outputAsset.getResourceSpec().getType()) || outputAsset.getResourceSpec().getName() == null) {
        throw new IllegalArgumentException("Output asset must be an existing asset with resource spec name being a GCS bucket and" + " resource spec type of " + DataplexAssetResourceSpec.STORAGE_BUCKET.name());
    }
    String outputBucket = outputAsset.getResourceSpec().getName();
    // Decides, per input file, whether it gets converted (or fails the run) based on what already
    // exists in the output bucket.
    Predicate<String> inputFilesFilter;
    switch(options.getWriteDisposition()) {
        case OVERWRITE:
            inputFilesFilter = inputFilePath -> true;
            break;
        case FAIL:
            Set<String> outputFilePaths = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath -> {
                if (outputFilePaths.contains(inputFilePathToOutputFilePath(outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()))) {
                    throw new WriteDispositionException(String.format("The file %s already exists in the output asset bucket: %s", inputFilePath, outputBucket));
                }
                return true;
            };
            break;
        case SKIP:
            // Reuses the variable declared in the FAIL case; both share the switch block scope.
            outputFilePaths = getAllOutputFilePaths(outputBucket);
            inputFilesFilter = inputFilePath -> !outputFilePaths.contains(inputFilePathToOutputFilePath(outputPathProvider, inputFilePath, outputBucket, options.getOutputFileFormat()));
            break;
        default:
            throw new UnsupportedOperationException("Unsupported existing file behaviour: " + options.getWriteDisposition());
    }
    ImmutableList<GoogleCloudDataplexV1Entity> entities = isInputAsset ? dataplex.getCloudStorageEntities(options.getInputAssetOrEntitiesList()) : dataplex.getEntities(Splitter.on(',').trimResults().splitToList(options.getInputAssetOrEntitiesList()));
    // Becomes true as soon as any entity or partition contributes at least one file.
    boolean convertingFiles = false;
    for (GoogleCloudDataplexV1Entity entity : entities) {
        ImmutableList<GoogleCloudDataplexV1Partition> partitions = dataplex.getPartitions(entity.getName());
        if (partitions.isEmpty()) {
            String outputPath = outputPathProvider.outputPathFrom(entity.getDataPath(), outputBucket);
            Iterator<String> inputFilePaths = getFilesFromFilePattern(entityToFileSpec(entity)).filter(inputFilesFilter).iterator();
            // Accumulate with |=: a plain assignment would let a later entity without files reset
            // the flag and add a spurious "Nothing to convert" step next to real conversions.
            convertingFiles |= inputFilePaths.hasNext();
            inputFilePaths.forEachRemaining(inputFilePath -> pipeline.apply("Convert " + shortenDataplexName(entity.getName()), new ConvertFiles(entity, inputFilePath, options, outputPath)));
        } else {
            for (GoogleCloudDataplexV1Partition partition : partitions) {
                String outputPath = outputPathProvider.outputPathFrom(partition.getLocation(), outputBucket);
                Iterator<String> inputFilePaths = getFilesFromFilePattern(partitionToFileSpec(partition)).filter(inputFilesFilter).iterator();
                // Same accumulation as above, so an empty trailing partition cannot clear the flag.
                convertingFiles |= inputFilePaths.hasNext();
                inputFilePaths.forEachRemaining(inputFilePath -> pipeline.apply("Convert " + shortenDataplexName(partition.getName()), new ConvertFiles(entity, inputFilePath, options, outputPath)));
            }
        }
    }
    if (!convertingFiles) {
        pipeline.apply("Nothing to convert", new NoopTransform());
    }
    return pipeline.run();
}
Also used : NoopTransform(com.google.cloud.teleport.v2.transforms.NoopTransform) GoogleCloudDataplexV1Partition(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition) WriteDispositionException(com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException) GoogleCloudDataplexV1Asset(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) GoogleCloudDataplexV1Entity(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)

Example 3 with DataplexClient

use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexJdbcIngestion method main.

/**
 * Main entry point for pipeline execution.
 *
 * <p>Resolves the output asset named in the options and, depending on the type of its resource
 * spec, builds either the BigQuery or the GCS variant of the pipeline before running it. For a
 * storage bucket asset the target root path is {@code gs://<resource name>/<output table>}.
 *
 * @param args Command line arguments to the pipeline.
 * @throws IllegalArgumentException if the asset type is neither BIGQUERY_DATASET nor
 *     STORAGE_BUCKET
 */
public static void main(String[] args) throws IOException {
    DataplexJdbcIngestionOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(DataplexJdbcIngestionOptions.class);
    Pipeline pipeline = Pipeline.create(options);
    DataplexClient dataplexClient = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
    String assetName = options.getOutputAsset();
    GoogleCloudDataplexV1Asset asset = resolveAsset(assetName, dataplexClient);
    DynamicDataSourceConfiguration dataSourceConfig = configDataSource(options);
    String assetType = asset.getResourceSpec().getType();
    if (DataplexAssetResourceSpec.BIGQUERY_DATASET.name().equals(assetType)) {
        buildBigQueryPipeline(pipeline, options, dataSourceConfig);
    } else if (DataplexAssetResourceSpec.STORAGE_BUCKET.name().equals(assetType)) {
        String targetRootPath = "gs://" + asset.getResourceSpec().getName() + "/" + options.getOutputTable();
        buildGcsPipeline(pipeline, options, dataSourceConfig, targetRootPath);
    } else {
        // Values go in as format arguments, never as part of the format string: a '%' in the asset
        // name or type would otherwise make String.format throw instead of reporting the problem.
        throw new IllegalArgumentException(String.format("Asset %s is of type %s. Only %s and %s supported.", assetName, assetType, DataplexAssetResourceSpec.BIGQUERY_DATASET.name(), DataplexAssetResourceSpec.STORAGE_BUCKET.name()));
    }
    pipeline.run();
}
Also used : DataplexJdbcIngestionOptions(com.google.cloud.teleport.v2.options.DataplexJdbcIngestionOptions) DefaultDataplexClient(com.google.cloud.teleport.v2.clients.DefaultDataplexClient) DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) DynamicDataSourceConfiguration(com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration) GoogleCloudDataplexV1Asset(com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset) Pipeline(org.apache.beam.sdk.Pipeline)

Example 4 with DataplexClient

use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexFileFormatConversionTest method testAssetWithEntityParquetToAvroE2E.

/**
 * Runs the conversion with {@code entity4} as the only input entity (stubbed to have no
 * partitions) and AVRO as the output format, then reads every {@code .avro} file under the
 * temporary folder and compares the records with {@code EXPECTED_GENERIC_RECORDS}.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityParquetToAvroE2E() throws IOException {
    final String entityName = entity4.getName();
    final String outputAssetName = outputAsset.getName();

    // Stub the Dataplex lookups the conversion performs for this entity and output asset.
    DataplexClient dataplexMock = mock(DataplexClient.class);
    when(dataplexMock.getEntities(ImmutableList.of(entityName))).thenReturn(ImmutableList.of(entity4));
    when(dataplexMock.getPartitions(entityName)).thenReturn(ImmutableList.of());
    when(dataplexMock.getAsset(outputAssetName)).thenReturn(outputAsset);

    FileFormatConversionOptions conversionOptions = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    conversionOptions.setInputAssetOrEntitiesList(entityName);
    conversionOptions.setOutputFileFormat(FileFormatOptions.AVRO);
    conversionOptions.setOutputAsset(outputAssetName);

    DataplexFileFormatConversion.run(mainPipeline, conversionOptions, dataplexMock, DataplexFileFormatConversionTest::outputPathProvider);

    // Read back whatever Avro files were written and verify their content.
    String avroFilesGlob = temporaryFolder.getRoot().getAbsolutePath() + "/**/*.avro";
    PCollection<GenericRecord> convertedRecords = readPipeline.apply(
            "ReadAvroFile",
            AvroConverters.ReadAvroFile.newBuilder()
                    .withInputFileSpec(avroFilesGlob)
                    .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
                    .build());
    PAssert.that(convertedRecords).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
    readPipeline.run();
}
Also used : DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions) GenericRecord(org.apache.avro.generic.GenericRecord) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 5 with DataplexClient

use of com.google.cloud.teleport.v2.clients.DataplexClient in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexFileFormatConversionTest method testAssetWithEntityAvroToParquetE2E.

/**
 * Runs the conversion with {@code entity3} as the only input entity (stubbed to have no
 * partitions) and PARQUET as the output format, then reads every {@code .parquet} file under the
 * temporary folder and compares the records with {@code EXPECTED_GENERIC_RECORDS}.
 */
@Test
@Category(NeedsRunner.class)
public void testAssetWithEntityAvroToParquetE2E() throws IOException {
    final String entityName = entity3.getName();
    final String outputAssetName = outputAsset.getName();

    // Stub the Dataplex lookups the conversion performs for this entity and output asset.
    DataplexClient dataplexMock = mock(DataplexClient.class);
    when(dataplexMock.getEntities(ImmutableList.of(entityName))).thenReturn(ImmutableList.of(entity3));
    when(dataplexMock.getPartitions(entityName)).thenReturn(ImmutableList.of());
    when(dataplexMock.getAsset(outputAssetName)).thenReturn(outputAsset);

    FileFormatConversionOptions conversionOptions = PipelineOptionsFactory.create().as(FileFormatConversionOptions.class);
    conversionOptions.setInputAssetOrEntitiesList(entityName);
    conversionOptions.setOutputFileFormat(FileFormatOptions.PARQUET);
    conversionOptions.setOutputAsset(outputAssetName);

    DataplexFileFormatConversion.run(mainPipeline, conversionOptions, dataplexMock, DataplexFileFormatConversionTest::outputPathProvider);

    // Read back whatever Parquet files were written and verify their content.
    String parquetFilesGlob = temporaryFolder.getRoot().getAbsolutePath() + "/**/*.parquet";
    PCollection<GenericRecord> convertedRecords = readPipeline.apply(
            "ReadParquetFile",
            ParquetConverters.ReadParquetFile.newBuilder()
                    .withInputFileSpec(parquetFilesGlob)
                    .withSerializedSchema(EXPECT_SERIALIZED_AVRO_SCHEMA)
                    .build());
    PAssert.that(convertedRecords).containsInAnyOrder(EXPECTED_GENERIC_RECORDS);
    readPipeline.run();
}
Also used : DataplexClient(com.google.cloud.teleport.v2.clients.DataplexClient) FileFormatConversionOptions(com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions) GenericRecord(org.apache.avro.generic.GenericRecord) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Aggregations

DataplexClient (com.google.cloud.teleport.v2.clients.DataplexClient)8 FileFormatConversionOptions (com.google.cloud.teleport.v2.templates.DataplexFileFormatConversion.FileFormatConversionOptions)6 Test (org.junit.Test)6 Category (org.junit.experimental.categories.Category)6 GenericRecord (org.apache.avro.generic.GenericRecord)5 GoogleCloudDataplexV1Asset (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset)2 DefaultDataplexClient (com.google.cloud.teleport.v2.clients.DefaultDataplexClient)2 Pipeline (org.apache.beam.sdk.Pipeline)2 GoogleCloudDataplexV1Entity (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)1 GoogleCloudDataplexV1Partition (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition)1 BigQuery (com.google.cloud.bigquery.BigQuery)1 DatasetId (com.google.cloud.bigquery.DatasetId)1 BigQueryStorageClient (com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient)1 DynamicDataSourceConfiguration (com.google.cloud.teleport.v2.io.DynamicJdbcIO.DynamicDataSourceConfiguration)1 DataplexBigQueryToGcsOptions (com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions)1 DataplexJdbcIngestionOptions (com.google.cloud.teleport.v2.options.DataplexJdbcIngestionOptions)1 NoopTransform (com.google.cloud.teleport.v2.transforms.NoopTransform)1 BigQueryMetadataLoader (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader)1 WriteDispositionException (com.google.cloud.teleport.v2.utils.WriteDisposition.WriteDispositionException)1 ImmutableList (com.google.common.collect.ImmutableList)1