
Example 1 with BigQueryMetadataLoader

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader in project DataflowTemplates by GoogleCloudPlatform.

From the class DataplexBigQueryToGcs, method setUpPipeline:

private static Pipeline setUpPipeline(
        DataplexBigQueryToGcsOptions options,
        DataplexClient dataplex,
        BigQuery bqClient,
        BigQueryStorageClient bqsClient)
        throws IOException, ExecutionException, InterruptedException {
    int maxParallelBigQueryRequests = options.getMaxParallelBigQueryMetadataRequests();
    checkArgument(
        maxParallelBigQueryRequests >= 1,
        "maxParallelBigQueryMetadataRequests must be >= 1, but was: %s",
        maxParallelBigQueryRequests);
    // Resolve the Dataplex storage-bucket asset into the GCS bucket it points to.
    String gcsResource =
        resolveAsset(
            dataplex,
            options.getDestinationStorageBucketAssetName(),
            DataplexAssetResourceSpec.STORAGE_BUCKET);
    String targetRootPath = "gs://" + gcsResource;
    String bqResource = options.getSourceBigQueryDataset();
    // If the param contains "/lakes/", assume it's a Dataplex resource name and
    // resolve it into a BigQuery dataset ID first:
    if (bqResource.toLowerCase().contains("/lakes/")) {
        bqResource = resolveAsset(dataplex, bqResource, DataplexAssetResourceSpec.BIGQUERY_DATASET);
    }
    DatasetId datasetId = BigQueryUtils.parseDatasetUrn(bqResource);
    BigQueryMetadataLoader metadataLoader =
        new BigQueryMetadataLoader(bqClient, bqsClient, maxParallelBigQueryRequests);
    return buildPipeline(options, metadataLoader, targetRootPath, datasetId);
}
Also used : BigQueryMetadataLoader(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader) DatasetId(com.google.cloud.bigquery.DatasetId)
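
For readers who want to try the loader outside the template, here is a minimal standalone sketch. The constructor and the loadDatasetMetadata call mirror the usage above; the client setup (BigQueryOptions.getDefaultInstance(), BigQueryStorageClient.create()), the placeholder project and dataset IDs, and passing a null filter to mean "keep everything" are assumptions for illustration, not code from the repo.

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.storage.v1beta1.BigQueryStorageClient;
import com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader;
import com.google.cloud.teleport.v2.values.BigQueryTable;
import java.util.List;

public class MetadataLoaderSketch {
    public static void main(String[] args) throws Exception {
        // Default-credential clients; the template wires these in via setUpPipeline.
        BigQuery bqClient = BigQueryOptions.getDefaultInstance().getService();
        try (BigQueryStorageClient bqsClient = BigQueryStorageClient.create()) {
            // Cap concurrent metadata requests, like maxParallelBigQueryMetadataRequests above.
            BigQueryMetadataLoader loader =
                new BigQueryMetadataLoader(bqClient, bqsClient, /* maxParallelRequests= */ 5);
            // "my-project" and "my_dataset" are hypothetical placeholders.
            DatasetId datasetId = DatasetId.of("my-project", "my_dataset");
            // Assumption: a null filter loads every table; Example 2 always passes a real filter.
            List<BigQueryTable> tables = loader.loadDatasetMetadata(datasetId, null);
            System.out.printf("Loaded %d table(s)%n", tables.size());
        }
    }
}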

Example 2 with BigQueryMetadataLoader

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader in project DataflowTemplates by GoogleCloudPlatform.

From the class DataplexBigQueryToGcs, method buildPipeline:

/**
 * Builds the pipeline with the supplied options.
 *
 * @param options the execution parameters for the pipeline
 * @param metadataLoader loads the BigQuery table and partition metadata for the source dataset
 * @param targetRootPath the "gs://..." root path that exported files are written under
 * @param datasetId the source BigQuery dataset
 * @return the resulting pipeline
 */
@VisibleForTesting
static Pipeline buildPipeline(
        DataplexBigQueryToGcsOptions options,
        BigQueryMetadataLoader metadataLoader,
        String targetRootPath,
        DatasetId datasetId)
        throws ExecutionException, InterruptedException {
    Pipeline pipeline = Pipeline.create(options);
    List<String> existingTargetFiles = StorageUtils.getFilesInDirectory(targetRootPath);
    LOG.info("Loading BigQuery metadata...");
    List<BigQueryTable> tables =
        metadataLoader.loadDatasetMetadata(
            datasetId, new DataplexBigQueryToGcsFilter(options, existingTargetFiles));
    LOG.info("Loaded {} table(s).", tables.size());
    if (!tables.isEmpty()) {
        transformPipeline(pipeline, tables, options, targetRootPath, null, null);
    } else {
        pipeline.apply("Nothing to export", new NoopTransform());
    }
    return pipeline;
}
Also used : NoopTransform(com.google.cloud.teleport.v2.transforms.NoopTransform) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) DataplexBigQueryToGcsFilter(com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter) Pipeline(org.apache.beam.sdk.Pipeline) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
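
buildPipeline delegates per-table and per-partition selection to DataplexBigQueryToGcsFilter, which implements the loader's nested Filter interface. To make that contract concrete, here is a pass-through filter sketch that keeps everything. The three method signatures below are an assumption reconstructed from how the filter is used; verify against BigQueryMetadataLoader.Filter in the DataflowTemplates repo before relying on them.

import com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader;
import com.google.cloud.teleport.v2.values.BigQueryTable;
import com.google.cloud.teleport.v2.values.BigQueryTablePartition;
import java.util.List;

// Hypothetical pass-through filter: never skips anything, so the loader
// returns the full metadata of the dataset. Method names are assumed.
class KeepEverythingFilter implements BigQueryMetadataLoader.Filter {
    @Override
    public boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table) {
        return false; // keep all unpartitioned tables
    }

    @Override
    public boolean shouldSkipPartitionedTable(
            BigQueryTable.Builder table, List<BigQueryTablePartition> partitions) {
        return false; // keep all partitioned tables
    }

    @Override
    public boolean shouldSkipPartition(
            BigQueryTable.Builder table, BigQueryTablePartition partition) {
        return false; // keep every partition
    }
}

The real DataplexBigQueryToGcsFilter instead skips tables and partitions whose export files already exist under targetRootPath (that is why buildPipeline passes existingTargetFiles to it).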

Example 3 with BigQueryMetadataLoader

Use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader in project DataflowTemplates by GoogleCloudPlatform.

From the class DataplexBigQueryToGcsTest, method setUp:

@Before
public void setUp() throws InterruptedException, IOException {
    options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
    options.setProject(PROJECT);
    options.setUpdateDataplexMetadata(true);
    options.setEnforceSamePartitionKey(false);
    // Required when using BigQueryIO.withMethod(EXPORT).
    options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
    outDir = tmpDir.newFolder("out");
    bqSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
                    new TableFieldSchema().setName("s1").setType("STRING"),
                    new TableFieldSchema().setName("d1").setType("DATE"),
                    new TableFieldSchema().setName("i1").setType("INTEGER")));
    avroSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
                    + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
                    + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
                    + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
                    + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
    // Millis-to-micros conversion for the fake tables' last-modified timestamps.
    long modTime = System.currentTimeMillis() * 1000;
    BigQueryTablePartition p1 = BigQueryTablePartition.builder()
        .setPartitionName("p1").setLastModificationTime(modTime).build();
    BigQueryTablePartition p2 = BigQueryTablePartition.builder()
        .setPartitionName("p2").setLastModificationTime(modTime).build();
    BigQueryTable t1 = BigQueryTable.builder()
        .setTableName("partitioned_table")
        .setProject(PROJECT)
        .setDataset(DATASET)
        .setSchema(avroSchema)
        .setLastModificationTime(modTime)
        .setPartitioningColumn("ts")
        .setPartitions(Arrays.asList(p1, p2))
        .build();
    BigQueryTable t2 = BigQueryTable.builder()
        .setTableName("unpartitioned_table")
        .setProject(PROJECT)
        .setDataset(DATASET)
        .setSchema(avroSchema)
        .setLastModificationTime(modTime)
        .build();
    tableByName = new HashMap<>();
    tableByName.put(t1.getTableName(), t1);
    tableByName.put(t2.getTableName(), t2);
    defaultRecords =
        new TableRow[] {
          new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01").set("i1", 2001L),
          new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02").set("i1", 2002L),
          new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03").set("i1", 2003L),
          new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04").set("i1", null),
          new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05").set("i1", 2005L)
        };
    defaultExpectedRecords =
        new String[] {
          "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"i1\": 2001}",
          "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"i1\": 2002}",
          "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"i1\": 2003}",
          "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"i1\": null}",
          "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"i1\": 2005}"
        };
    FakeDatasetService.setUp();
    fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
    fakeDatasetService.createTable(
        new Table()
            .setTableReference(t1.toTableReference())
            .setSchema(bqSchema)
            .setRequirePartitionFilter(true)
            .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
    fakeDatasetService.createTable(
        new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
    fakeJobService = new CustomFakeJobService();
    bqFakeServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    when(tableResultMock.iterateAll()).thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
    when(bqMock.query(any())).thenReturn(tableResultMock);
    when(bqMock.delete(any(TableId.class))).thenReturn(true);
    when(bqsMock.createReadSession(any()))
        .thenReturn(
            ReadSession.newBuilder()
                .setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString()))
                .build());
    metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
Also used :
TableId (com.google.cloud.bigquery.TableId)
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition)
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable)
Table (com.google.api.services.bigquery.model.Table)
TableSchema (com.google.api.services.bigquery.model.TableSchema)
BigQueryMetadataLoader (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader)
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)
TimePartitioning (com.google.api.services.bigquery.model.TimePartitioning)
FakeDatasetService (org.apache.beam.sdk.io.gcp.testing.FakeDatasetService)
TableRow (com.google.api.services.bigquery.model.TableRow)
FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)
DataplexBigQueryToGcsOptions (com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions)
Before (org.junit.Before)
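
To close the loop, a test method might exercise the loader configured in setUp along these lines. This is a hedged sketch, not a test from the repo: KeepEverythingFilter is the hypothetical pass-through filter sketched under Example 2, and the result depends entirely on what bqMock.query(...) is stubbed to return (a single "unpartitioned_table" row above).

import static org.junit.Assert.assertEquals;

import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.teleport.v2.values.BigQueryTable;
import java.util.List;
import org.junit.Test;

// Inside DataplexBigQueryToGcsTest, after setUp() has run:
@Test
public void loadsMetadataThroughMockedClients() throws Exception {
    List<BigQueryTable> tables =
        metadataLoader.loadDatasetMetadata(
            DatasetId.of(PROJECT, DATASET), new KeepEverythingFilter());
    // Illustrative assertion: the mocked query yields one unpartitioned table.
    assertEquals("unpartitioned_table", tables.get(0).getTableName());
}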

Aggregations

BigQueryMetadataLoader (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader): 2
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable): 2
Table (com.google.api.services.bigquery.model.Table): 1
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 1
TableRow (com.google.api.services.bigquery.model.TableRow): 1
TableSchema (com.google.api.services.bigquery.model.TableSchema): 1
TimePartitioning (com.google.api.services.bigquery.model.TimePartitioning): 1
DatasetId (com.google.cloud.bigquery.DatasetId): 1
TableId (com.google.cloud.bigquery.TableId): 1
DataplexBigQueryToGcsOptions (com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions): 1
NoopTransform (com.google.cloud.teleport.v2.transforms.NoopTransform): 1
DataplexBigQueryToGcsFilter (com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter): 1
BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition): 1
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1
Pipeline (org.apache.beam.sdk.Pipeline): 1
FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices): 1
FakeDatasetService (org.apache.beam.sdk.io.gcp.testing.FakeDatasetService): 1
Before (org.junit.Before): 1