Use of com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcs, method setUpPipeline:
private static Pipeline setUpPipeline(
    DataplexBigQueryToGcsOptions options,
    DataplexClient dataplex,
    BigQuery bqClient,
    BigQueryStorageClient bqsClient)
    throws IOException, ExecutionException, InterruptedException {

  int maxParallelBigQueryRequests = options.getMaxParallelBigQueryMetadataRequests();
  checkArgument(
      maxParallelBigQueryRequests >= 1,
      "maxParallelBigQueryMetadataRequests must be >= 1, but was: %s",
      maxParallelBigQueryRequests);

  String gcsResource =
      resolveAsset(
          dataplex,
          options.getDestinationStorageBucketAssetName(),
          DataplexAssetResourceSpec.STORAGE_BUCKET);
  String targetRootPath = "gs://" + gcsResource;

  String bqResource = options.getSourceBigQueryDataset();
  // If the param contains "/lakes/", assume it's a Dataplex resource name
  // and resolve it to a BigQuery dataset ID first:
  if (bqResource.toLowerCase().contains("/lakes/")) {
    bqResource = resolveAsset(dataplex, bqResource, DataplexAssetResourceSpec.BIGQUERY_DATASET);
  }
  DatasetId datasetId = BigQueryUtils.parseDatasetUrn(bqResource);

  BigQueryMetadataLoader metadataLoader =
      new BigQueryMetadataLoader(bqClient, bqsClient, maxParallelBigQueryRequests);
  return buildPipeline(options, metadataLoader, targetRootPath, datasetId);
}
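For reference, here is a minimal, self-contained sketch of the kind of parsing BigQueryUtils.parseDatasetUrn performs above. The "projects/<project>/datasets/<dataset>" URN format and the example class name are assumptions for this sketch; the template's real utility may accept other forms:

import com.google.cloud.bigquery.DatasetId;

public class DatasetUrnExample {

  // Hypothetical re-implementation, assuming URNs of the form
  // "projects/<project>/datasets/<dataset>".
  static DatasetId parseDatasetUrn(String urn) {
    String[] parts = urn.split("/");
    if (parts.length != 4 || !"projects".equals(parts[0]) || !"datasets".equals(parts[2])) {
      throw new IllegalArgumentException("Unexpected dataset URN: " + urn);
    }
    return DatasetId.of(parts[1], parts[3]);
  }

  public static void main(String[] args) {
    DatasetId id = parseDatasetUrn("projects/my-project/datasets/my_dataset");
    System.out.println(id.getProject() + "." + id.getDataset()); // my-project.my_dataset
  }
}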
Use of com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcs, method buildPipeline:
/**
 * Builds the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @param metadataLoader The loader used to read table and partition metadata from BigQuery.
 * @param targetRootPath The "gs://..." root path that exported files are written under.
 * @param datasetId The source BigQuery dataset.
 * @return The resulting pipeline.
 */
@VisibleForTesting
static Pipeline buildPipeline(
    DataplexBigQueryToGcsOptions options,
    BigQueryMetadataLoader metadataLoader,
    String targetRootPath,
    DatasetId datasetId)
    throws ExecutionException, InterruptedException {

  Pipeline pipeline = Pipeline.create(options);
  List<String> existingTargetFiles = StorageUtils.getFilesInDirectory(targetRootPath);

  LOG.info("Loading BigQuery metadata...");
  List<BigQueryTable> tables =
      metadataLoader.loadDatasetMetadata(
          datasetId, new DataplexBigQueryToGcsFilter(options, existingTargetFiles));
  LOG.info("Loaded {} table(s).", tables.size());

  if (!tables.isEmpty()) {
    transformPipeline(pipeline, tables, options, targetRootPath, null, null);
  } else {
    pipeline.apply("Nothing to export", new NoopTransform());
  }
  return pipeline;
}
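Note that DataplexBigQueryToGcsFilter is constructed with the files already present under the target root path, which suggests an incremental export: work whose output already exists can be skipped. A hypothetical, deliberately simplified illustration of that idea (the class name and the path-matching rule below are mine, not the filter's actual logic):

import java.util.List;
import java.util.stream.Collectors;

public class TargetFileFilterExample {

  // Hypothetical stand-in for the incremental-export idea: skip tables
  // whose output files already exist under the target root path.
  static List<String> tablesToExport(List<String> tableNames, List<String> existingTargetFiles) {
    return tableNames.stream()
        .filter(t -> existingTargetFiles.stream().noneMatch(f -> f.contains("/" + t + "/")))
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<String> tables = List.of("orders", "customers");
    List<String> existing = List.of("gs://bucket/orders/output-orders-1.parquet");
    System.out.println(tablesToExport(tables, existing)); // prints [customers]
  }
}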
Use of com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcs, method transformPipeline:
@VisibleForTesting
static void transformPipeline(
    Pipeline pipeline,
    List<BigQueryTable> tables,
    DataplexBigQueryToGcsOptions options,
    String targetRootPath,
    BigQueryServices testBqServices,
    BigQueryClientFactory testBqClientFactory) {

  List<PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>>> fileCollections =
      new ArrayList<>(tables.size());
  tables.forEach(
      table -> {
        fileCollections.add(
            pipeline
                .apply(
                    String.format("ExportTable-%s", table.getTableName()),
                    new BigQueryTableToGcsTransform(
                            table,
                            targetRootPath,
                            options.getFileFormat(),
                            options.getFileCompression(),
                            options.getEnforceSamePartitionKey())
                        .withTestServices(testBqServices))
                .apply(
                    String.format("AttachTableKeys-%s", table.getTableName()),
                    WithKeys.of(table)));
      });

  PCollection<KV<BigQueryTable, KV<BigQueryTablePartition, String>>> exportFileResults =
      PCollectionList.of(fileCollections).apply("FlattenTableResults", Flatten.pCollections());

  PCollection<Void> metadataUpdateResults =
      exportFileResults.apply(
          "UpdateDataplexMetadata", new UpdateDataplexBigQueryToGcsExportMetadataTransform());

  exportFileResults
      .apply(
          MapElements.into(
                  TypeDescriptors.kvs(
                      TypeDescriptor.of(BigQueryTable.class),
                      TypeDescriptor.of(BigQueryTablePartition.class)))
              .via(
                  (SerializableFunction<
                          KV<BigQueryTable, KV<BigQueryTablePartition, String>>,
                          KV<BigQueryTable, BigQueryTablePartition>>)
                      input -> KV.of(input.getKey(), input.getValue().getKey())))
      .apply("WaitForMetadataUpdate", Wait.on(metadataUpdateResults))
      .apply(
          "TruncateBigQueryData",
          ParDo.of(new DeleteBigQueryDataFn().withTestBqClientFactory(testBqClientFactory)));
}
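The final chain above is the safety-critical part: BigQuery data is truncated only after the Dataplex metadata update completes, enforced by Wait.on. A minimal, runnable Beam sketch of that ordering pattern, with plain strings standing in for the table and partition types:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class WaitOnExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();

    // Stand-ins: "files" for the export results, "signal" for the
    // metadata-update results.
    PCollection<String> files = p.apply("Files", Create.of("gs://bucket/a", "gs://bucket/b"));
    PCollection<String> signal = p.apply("Signal", Create.of("metadata-updated"));

    // Wait.on holds back processing of `files` until `signal` is complete,
    // mirroring how TruncateBigQueryData runs only after UpdateDataplexMetadata.
    files
        .apply("WaitForSignal", Wait.on(signal))
        .apply(
            "Truncate",
            MapElements.into(TypeDescriptors.strings())
                .via(
                    (String f) -> {
                      System.out.println("safe to delete source data for " + f);
                      return f;
                    }));

    p.run().waitUntilFinish();
  }
}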
Use of com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcs, method main:
/**
* Main entry point for pipeline execution.
*
* @param args Command line arguments to the pipeline.
*/
public static void main(String[] args)
    throws IOException, InterruptedException, ExecutionException {
  DataplexBigQueryToGcsOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(DataplexBigQueryToGcsOptions.class);

  // Always enable the upload_graph experiment, which makes Dataflow upload the
  // job graph to GCS instead of embedding it in the job request, so large
  // graphs (e.g., exports of many tables) don't hit request size limits.
  List<String> experiments = new ArrayList<>();
  if (options.getExperiments() != null) {
    experiments.addAll(options.getExperiments());
  }
  if (!experiments.contains("upload_graph")) {
    experiments.add("upload_graph");
  }
  options.setExperiments(experiments);

  Pipeline pipeline;
  DataplexClient dataplex = DefaultDataplexClient.withDefaultClient(options.getGcpCredential());
  BigQuery bqClient = BigQueryOptions.getDefaultInstance().getService();
  try (BigQueryStorageClient bqsClient = BigQueryStorageClient.create()) {
    LOG.info("Building the pipeline...");
    pipeline = setUpPipeline(options, dataplex, bqClient, bqsClient);
  }
  LOG.info("Running the pipeline.");
  pipeline.run();
}
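The same experiment-flag handling can be exercised standalone against Beam's generic ExperimentalOptions interface (the interface behind the --experiments flag), without the template-specific options class. A minimal sketch; the class name UploadGraphExample is mine:

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.options.ExperimentalOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class UploadGraphExample {
  public static void main(String[] args) {
    ExperimentalOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(ExperimentalOptions.class);

    // Same logic as main() above: copy the existing experiments (if any) and
    // append upload_graph when it's missing.
    List<String> experiments = new ArrayList<>();
    if (options.getExperiments() != null) {
      experiments.addAll(options.getExperiments());
    }
    if (!experiments.contains("upload_graph")) {
      experiments.add("upload_graph");
    }
    options.setExperiments(experiments);

    System.out.println("experiments = " + options.getExperiments());
  }
}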