Example 1 with BigQueryTablePartition

use of com.google.cloud.teleport.v2.values.BigQueryTablePartition in project DataflowTemplates by GoogleCloudPlatform.

the class BigQueryTableToGcsTransform method expand.

@Override
public PCollection<KV<BigQueryTablePartition, String>> expand(PBegin begin) {
    Schema targetFileSchema = table.getSchema();
    if (table.isPartitioned() && enforceSamePartitionKey) {
        // Apart from renaming the field in the schema we don't need to do anything else (e.g. replace
        // the field in the actual GenericRecord being processed) because writers write fields
        // to the file based on their numeric position, not their name.
        targetFileSchema = Schemas.renameAvroField(targetFileSchema, table.getPartitioningColumn(), table.getPartitioningColumn() + PARTITION_COLUMN_RENAME_SUFFIX);
    }
    Sink<GenericRecord> sink;
    switch(outputFileFormat) {
        case PARQUET:
            sink = ParquetIO.sink(targetFileSchema).withCompressionCodec(outputFileCompression.getParquetCodec());
            break;
        case AVRO:
            sink = AvroIO.<GenericRecord>sink(targetFileSchema).withCodec(outputFileCompression.getAvroCodec());
            break;
        default:
            throw new UnsupportedOperationException("Output format is not implemented: " + outputFileFormat);
    }
    BigQueryToGcsDirectoryNaming dn = new BigQueryToGcsDirectoryNaming(enforceSamePartitionKey);
    if (!table.isPartitioned()) {
        return transformTable(begin, sink, dn);
    }
    if (table.getPartitions() == null || table.getPartitions().isEmpty()) {
        throw new IllegalStateException(String.format("Expected at least 1 partition for a partitioned table %s, but got none.", table.getTableName()));
    }
    List<PCollection<KV<BigQueryTablePartition, String>>> collections = new ArrayList<>();
    table.getPartitions().forEach(p -> collections.add(transformPartition(begin, sink, p, dn)));
    return PCollectionList.of(collections).apply(tableNodeName("FlattenPartitionResults"), Flatten.pCollections());
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) BigQueryToGcsDirectoryNaming(com.google.cloud.teleport.v2.utils.BigQueryToGcsDirectoryNaming) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) GenericRecord(org.apache.avro.generic.GenericRecord)
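
The outputFileCompression value used above is expected to expose both a Parquet codec and an Avro codec. A minimal sketch of such an enum is shown below, assuming the Beam codec types that ParquetIO.sink(...).withCompressionCodec(...) and AvroIO.sink(...).withCodec(...) accept; the FileCompression name and its constants are illustrative assumptions, not the template's actual type.

import org.apache.avro.file.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

// Illustrative only: one user-facing compression option mapped onto the codec
// objects that the Parquet and Avro sinks above expect.
enum FileCompression {
    UNCOMPRESSED(CompressionCodecName.UNCOMPRESSED, CodecFactory.nullCodec()),
    SNAPPY(CompressionCodecName.SNAPPY, CodecFactory.snappyCodec());

    private final CompressionCodecName parquetCodec;
    private final CodecFactory avroCodec;

    FileCompression(CompressionCodecName parquetCodec, CodecFactory avroCodec) {
        this.parquetCodec = parquetCodec;
        this.avroCodec = avroCodec;
    }

    CompressionCodecName getParquetCodec() {
        return parquetCodec;
    }

    CodecFactory getAvroCodec() {
        return avroCodec;
    }
}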

Example 2 with BigQueryTablePartition

use of com.google.cloud.teleport.v2.values.BigQueryTablePartition in project DataflowTemplates by GoogleCloudPlatform.

the class DeleteBigQueryDataFn method processElement.

@ProcessElement
public void processElement(@Element KV<BigQueryTable, BigQueryTablePartition> input, PipelineOptions options) {
    BigQueryTable t = input.getKey();
    BigQueryTablePartition p = input.getValue();
    if (t.isPartitioned() && p == null) {
        throw new IllegalStateException(String.format("No partition to delete provided for a partitioned table %s.", t.getTableName()));
    }
    if (!t.isPartitioned() && p != null) {
        throw new IllegalStateException(String.format("Got unexpected partition %s to delete for a non-partitioned table %s.", p.getPartitionName(), t.getTableName()));
    }
    if (!options.as(Options.class).getDeleteSourceData()) {
        if (t.isPartitioned()) {
            LOG.info("Skipping source BigQuery data deletion for partition {}${}.", t.getTableName(), p.getPartitionName());
        } else {
            LOG.info("Skipping source BigQuery data deletion for table {}.", t.getTableName());
        }
        return;
    }
    if (t.isPartitioned()) {
        deletePartition(t, p);
    } else {
        deleteTable(t);
    }
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable)
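
The deletePartition and deleteTable helpers called above are not shown on this page. Judging from the verifications in Example 5 below, a plausible sketch is that whole tables are truncated with a query while single partitions are dropped through the "table$partition" decorator; the bigQueryClient field and the exact SQL are assumptions for illustration (uses com.google.cloud.bigquery.QueryJobConfiguration and TableId).

// Hedged sketch only; inferred from the test expectations in Example 5, not copied from the DoFn.
private void deleteTable(BigQueryTable t) throws InterruptedException {
    String sql =
        String.format("truncate table `%s.%s.%s`", t.getProject(), t.getDataset(), t.getTableName());
    bigQueryClient.query(QueryJobConfiguration.newBuilder(sql).build());
}

private void deletePartition(BigQueryTable t, BigQueryTablePartition p) {
    // "<table>$<partition>" addresses a single partition of a partitioned table.
    bigQueryClient.delete(
        TableId.of(t.getProject(), t.getDataset(), t.getTableName() + "$" + p.getPartitionName()));
}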

Example 3 with BigQueryTablePartition

use of com.google.cloud.teleport.v2.values.BigQueryTablePartition in project DataflowTemplates by GoogleCloudPlatform.

the class BigQueryMetadataLoader method loadTableMetadata.

/**
 * Populates the {@code table} builder with additional metadata, such as partition names and the schema.
 *
 * @param filter optional filter to skip a subset of tables
 * @return {@code true} if the table matches all filters and should be included in the results,
 *     {@code false} if it should be skipped
 */
private boolean loadTableMetadata(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
    TableReadOptions.Builder readOptions = TableReadOptions.newBuilder();
    if (table.getPartitioningColumn() == null) {
        if (filter != null && filter.shouldSkipUnpartitionedTable(table)) {
            return false;
        }
    } else {
        List<BigQueryTablePartition> partitions = loadTablePartitions(table, filter);
        if (filter != null && filter.shouldSkipPartitionedTable(table, partitions)) {
            return false;
        }
        table.setPartitions(partitions);
        LOG.info("Loaded {} partitions for table {}: {}", partitions.size(), table.getTableName(), partitions);
        // Creating a ReadSession without a WHERE clause for a partitioned table that has
        // "require partition filter" param set to true would fail with the error:
        // "Cannot query over table ... without a filter over column(s) ...
        // that can be used for partition elimination".
        // The following is a hack that adds an "is null and is not null" filter over the
        // partitioning column, which shouldn't select any data but should make the query
        // analyzer happy and should be enough to extract the table schema.
        // TODO(an2x): do this only when "require partition filter" = true
        // or load schema differently?
        readOptions.setRowRestriction(String.format("%s is null and %s is not null", table.getPartitioningColumn(), table.getPartitioningColumn()));
    }
    ReadSession session = BigQueryUtils.createReadSession(bqsClient, DatasetId.of(table.getProject(), table.getDataset()), table.getTableName(), readOptions.build());
    table.setSchema(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
    LOG.info("Loaded schema for table {}: {}", table.getTableName(), table.getSchema());
    return true;
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) ReadSession(com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) TableReadOptions(com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions)
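
The Filter passed into the loader only surfaces here through its two callbacks, shouldSkipUnpartitionedTable and shouldSkipPartitionedTable. Assuming those are its only abstract methods (the real interface may declare additional per-partition callbacks that would also need overriding), a caller might supply something like the name-based filter below; includeTables and its criterion are made up for illustration (needs java.util.List and java.util.Set).

// Hedged sketch of a Filter implementation; only the two callback signatures come from the code above.
static BigQueryMetadataLoader.Filter includeTables(Set<String> tableNames) {
    return new BigQueryMetadataLoader.Filter() {
        @Override
        public boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table) {
            return !tableNames.contains(table.getTableName());
        }

        @Override
        public boolean shouldSkipPartitionedTable(
            BigQueryTable.Builder table, List<BigQueryTablePartition> partitions) {
            // Skip tables that were not requested, or that ended up with no partitions to process.
            return !tableNames.contains(table.getTableName()) || partitions.isEmpty();
        }
    };
}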

Example 4 with BigQueryTablePartition

use of com.google.cloud.teleport.v2.values.BigQueryTablePartition in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexBigQueryToGcsTest method setUp.

@Before
public void setUp() throws InterruptedException, IOException {
    options = TestPipeline.testingPipelineOptions().as(DataplexBigQueryToGcsOptions.class);
    options.setProject(PROJECT);
    options.setUpdateDataplexMetadata(true);
    options.setEnforceSamePartitionKey(false);
    // Required when using BigQueryIO.withMethod(EXPORT).
    options.setTempLocation(tmpDir.newFolder("bqTmp").getAbsolutePath());
    outDir = tmpDir.newFolder("out");
    bqSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("ts").setType("TIMESTAMP"),
                    new TableFieldSchema().setName("s1").setType("STRING"),
                    new TableFieldSchema().setName("d1").setType("DATE"),
                    new TableFieldSchema().setName("i1").setType("INTEGER")));
    avroSchema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"__root__\",\"fields\":"
                    + "[{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}]},"
                    + "{\"name\":\"s1\",\"type\":[\"null\",\"string\"]},"
                    + "{\"name\":\"d1\",\"type\":[\"null\",{\"type\":\"int\",\"logicalType\":\"date\"}]},"
                    + "{\"name\":\"i1\",\"type\":[\"null\",\"long\"]}]}");
    long modTime = System.currentTimeMillis() * 1000;
    BigQueryTablePartition p1 = BigQueryTablePartition.builder().setPartitionName("p1").setLastModificationTime(modTime).build();
    BigQueryTablePartition p2 = BigQueryTablePartition.builder().setPartitionName("p2").setLastModificationTime(modTime).build();
    BigQueryTable t1 =
        BigQueryTable.builder()
            .setTableName("partitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .setPartitioningColumn("ts")
            .setPartitions(Arrays.asList(p1, p2))
            .build();
    BigQueryTable t2 =
        BigQueryTable.builder()
            .setTableName("unpartitioned_table")
            .setProject(PROJECT)
            .setDataset(DATASET)
            .setSchema(avroSchema)
            .setLastModificationTime(modTime)
            .build();
    tableByName = new HashMap<>();
    tableByName.put(t1.getTableName(), t1);
    tableByName.put(t2.getTableName(), t2);
    defaultRecords =
        new TableRow[] {
          new TableRow().set("ts", 1L).set("s1", "1001").set("d1", "1970-01-01").set("i1", 2001L),
          new TableRow().set("ts", 2L).set("s1", "1002").set("d1", "1970-01-02").set("i1", 2002L),
          new TableRow().set("ts", 3L).set("s1", "1003").set("d1", "1970-01-03").set("i1", 2003L),
          new TableRow().set("ts", 4L).set("s1", "1004").set("d1", "1970-01-04").set("i1", null),
          new TableRow().set("ts", 5L).set("s1", "1005").set("d1", "1970-01-05").set("i1", 2005L)
        };
    defaultExpectedRecords =
        new String[] {
          "{\"ts\": 1, \"s1\": \"1001\", \"d1\": 0, \"i1\": 2001}",
          "{\"ts\": 2, \"s1\": \"1002\", \"d1\": 1, \"i1\": 2002}",
          "{\"ts\": 3, \"s1\": \"1003\", \"d1\": 2, \"i1\": 2003}",
          "{\"ts\": 4, \"s1\": \"1004\", \"d1\": 3, \"i1\": null}",
          "{\"ts\": 5, \"s1\": \"1005\", \"d1\": 4, \"i1\": 2005}"
        };
    FakeDatasetService.setUp();
    fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset(PROJECT, DATASET, "", "", null);
    fakeDatasetService.createTable(
        new Table()
            .setTableReference(t1.toTableReference())
            .setSchema(bqSchema)
            .setRequirePartitionFilter(true)
            .setTimePartitioning(new TimePartitioning().setField("ts").setType("DAY")));
    fakeDatasetService.createTable(new Table().setTableReference(t2.toTableReference()).setSchema(bqSchema));
    fakeJobService = new CustomFakeJobService();
    bqFakeServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    when(tableResultMock.iterateAll()).thenReturn(Collections.singleton(fields("unpartitioned_table", "0", null)));
    when(bqMock.query(any())).thenReturn(tableResultMock);
    when(bqMock.delete(any(TableId.class))).thenReturn(true);
    when(bqsMock.createReadSession(any())).thenReturn(ReadSession.newBuilder().setAvroSchema(AvroSchema.newBuilder().setSchema(avroSchema.toString())).build());
    metadataLoader = new BigQueryMetadataLoader(bqMock, bqsMock, MAX_PARALLEL_REQUESTS);
}
Also used : TableId(com.google.cloud.bigquery.TableId) BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) BigQueryMetadataLoader(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) DataplexBigQueryToGcsOptions(com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions) Before(org.junit.Before)
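
One detail worth noting in the fixtures above: the Avro schema stores d1 with the date logical type (days since the Unix epoch) and ts as timestamp-micros, which is why "1970-01-02" in defaultRecords appears as "d1": 1 in defaultExpectedRecords while the ts longs pass through unchanged. A quick standalone check of that mapping (not part of the test class):

import java.time.LocalDate;

public class DateLogicalTypeCheck {
    public static void main(String[] args) {
        // Days since 1970-01-01, matching the d1 values 0..4 in defaultExpectedRecords.
        System.out.println(LocalDate.parse("1970-01-02").toEpochDay()); // 1
        System.out.println(LocalDate.parse("1970-01-05").toEpochDay()); // 4
    }
}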

Example 5 with BigQueryTablePartition

use of com.google.cloud.teleport.v2.values.BigQueryTablePartition in project DataflowTemplates by GoogleCloudPlatform.

the class DeleteBigQueryDataFnTest method testTransform_withDeleteSourceDataEnabled_truncatesData.

@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataEnabled_truncatesData() throws InterruptedException {
    Options options = TestPipeline.testingPipelineOptions().as(Options.class);
    options.setDeleteSourceData(true);
    PCollection<Void> actual =
        testPipeline
            .apply(
                "CreateInput",
                Create.of(KV.of(partitionedTable, partition), KV.of(table, (BigQueryTablePartition) null))
                    .withCoder(fnCoder))
            .apply("TestDeleteBigQueryDataFn", ParDo.of(fnUnderTest));
    PAssert.that(actual).empty();
    testPipeline.run(options);
    verify(bqMock, times(1)).query(QueryJobConfiguration.newBuilder("truncate table `pr1.d1.t1`").build());
    verify(bqMock, times(1)).delete(TableId.of("pr1", "d1", "t1p$p1"));
    verifyNoMoreInteractions(bqMock);
}
Also used : Options(com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
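
A complementary test for the disabled path is not shown on this page; sketched below from the DoFn's behaviour in Example 2, it sets deleteSourceData to false and asserts that the BigQuery mock is never touched. Field names reuse the ones visible in the test above; the method name and the exact assertion are assumptions.

@Test
@Category(NeedsRunner.class)
public void testTransform_withDeleteSourceDataDisabled_doesNotTouchBigQuery() {
    Options options = TestPipeline.testingPipelineOptions().as(Options.class);
    options.setDeleteSourceData(false);
    PCollection<Void> actual =
        testPipeline
            .apply(
                "CreateInput",
                Create.of(KV.of(partitionedTable, partition), KV.of(table, (BigQueryTablePartition) null))
                    .withCoder(fnCoder))
            .apply("TestDeleteBigQueryDataFn", ParDo.of(fnUnderTest));
    PAssert.that(actual).empty();
    testPipeline.run(options);
    // The DoFn should only log the skip and never query or delete anything.
    verifyNoMoreInteractions(bqMock);
}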

Aggregations

BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition) 13
BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable) 11
Test (org.junit.Test) 9
Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter) 6
Options (com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn.Options) 3
ArrayList (java.util.ArrayList) 3
Category (org.junit.experimental.categories.Category) 3
PCollection (org.apache.beam.sdk.values.PCollection) 2
Table (com.google.api.services.bigquery.model.Table) 1
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema) 1
TableRow (com.google.api.services.bigquery.model.TableRow) 1
TableSchema (com.google.api.services.bigquery.model.TableSchema) 1
TimePartitioning (com.google.api.services.bigquery.model.TimePartitioning) 1
TableId (com.google.cloud.bigquery.TableId) 1
TableResult (com.google.cloud.bigquery.TableResult) 1
TableReadOptions (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions) 1
ReadSession (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession) 1
DataplexBigQueryToGcsOptions (com.google.cloud.teleport.v2.options.DataplexBigQueryToGcsOptions) 1
BigQueryTableToGcsTransform (com.google.cloud.teleport.v2.transforms.BigQueryTableToGcsTransform) 1
DeleteBigQueryDataFn (com.google.cloud.teleport.v2.transforms.DeleteBigQueryDataFn) 1