Search in sources :

Example 16 with Filter

use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.

The method test_whenBeforeDateIs1DayDuration_dateParsedCorrectly of the class DataplexBigQueryToGcsFilterTest.

/**
 * Verifies that a relative "before" date of {@code -P1D} is applied as a cutoff one day before
 * now: a table modified just after the cutoff is skipped, one modified just before it is kept.
 */
@Test
public void test_whenBeforeDateIs1DayDuration_dateParsedCorrectly() {
    // Cutoff instant: now minus one day, converted from milliseconds to microseconds.
    long cutoffMicros = Instant.now().minus(Duration.standardDays(1)).getMillis() * 1000L;

    // Two tables straddling the cutoff by 100,000 microseconds on either side.
    BigQueryTable.Builder modifiedBeforeCutoff =
        table().setLastModificationTime(cutoffMicros - 100000L);
    BigQueryTable.Builder modifiedAfterCutoff =
        table().setLastModificationTime(cutoffMicros + 100000L);

    // No explicit table list; only the "modified before" option is configured.
    options.setTables(null);
    options.setExportDataModifiedBeforeDateTime("-P1D");

    Filter filter = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());

    assertThat(filter.shouldSkipUnpartitionedTable(modifiedAfterCutoff)).isTrue();
    assertThat(filter.shouldSkipUnpartitionedTable(modifiedBeforeCutoff)).isFalse();
}
Also used : Filter(com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) Test(org.junit.Test)

Example 17 with Filter

use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.

The method loadTablePartitions of the class BigQueryMetadataLoader.

/**
 * Queries {@code INFORMATION_SCHEMA.PARTITIONS} of the table's dataset for the partitions of
 * {@code table} and returns those not rejected by {@code filter}.
 *
 * @param table builder describing the table whose partitions are loaded
 * @param filter if {@code null}, every partition returned by the query is included
 * @return the partitions (name and last modification time) that passed the filter
 * @throws InterruptedException propagated from the BigQuery client query call
 */
private List<BigQueryTablePartition> loadTablePartitions(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
    String sql =
        String.format(
            "select partition_id, last_modified_time\n"
                + "from `%s.%s.INFORMATION_SCHEMA.PARTITIONS`\n"
                + "where table_name = @table_name",
            table.getProject(),
            table.getDataset());

    // The table name is passed as a named query parameter rather than formatted into the SQL.
    QueryJobConfiguration queryConfig =
        QueryJobConfiguration.newBuilder(sql)
            .addNamedParameter("table_name", QueryParameterValue.string(table.getTableName()))
            .build();
    TableResult rows = bqClient.query(queryConfig);

    List<BigQueryTablePartition> accepted = new ArrayList<>();
    // TODO(an2x): Check we didn't get duplicate partition names.
    rows.iterateAll().forEach(row -> {
        // Column 0 is partition_id, column 1 is last_modified_time (see the select list above).
        BigQueryTablePartition partition =
            BigQueryTablePartition.builder()
                .setPartitionName(row.get(0).getStringValue())
                .setLastModificationTime(row.get(1).getTimestampValue())
                .build();
        if (filter != null && filter.shouldSkipPartition(table, partition)) {
            return;
        }
        accepted.add(partition);
    });
    return accepted;
}
Also used : BigQueryTablePartition(com.google.cloud.teleport.v2.values.BigQueryTablePartition) TableResult(com.google.cloud.bigquery.TableResult) ArrayList(java.util.ArrayList)

Example 18 with Filter

use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.

The method loadDatasetMetadata of the class BigQueryMetadataLoader.

/**
 * Loads metadata for all tables in the dataset {@code datasetId} returning only those accepted by
 * the {@code filter}.
 *
 * <p>The table list is fetched with a single query; per-table metadata is then loaded in parallel
 * on a fixed thread pool of {@code maxParallelRequests} threads. The pool is always shut down,
 * including when waiting for the tasks is interrupted.
 *
 * @param datasetId project and dataset whose tables are listed
 * @param filter if {@code null}, will include all tables and partitions
 * @return the tables for which {@code loadTableMetadata} returned {@code true}
 * @throws InterruptedException if the query or the wait for the per-table tasks is interrupted
 * @throws ExecutionException if any per-table task failed; the cause carries the table name when
 *     the failure was a {@link RuntimeException}
 */
public List<BigQueryTable> loadDatasetMetadata(DatasetId datasetId, @Nullable Filter filter) throws InterruptedException, ExecutionException {
    String tableSql =
        String.format(
            "select\n"
                + "    table_id,\n"
                + "    timestamp_millis(last_modified_time) as last_modified_time,\n"
                + "    (select column_name from `%s.%s.INFORMATION_SCHEMA.COLUMNS` c\n"
                + "      where c.table_catalog = t.project_id\n"
                + "        and c.table_schema = t.dataset_id\n"
                + "        and c.table_name = t.table_id\n"
                + "        and c.is_partitioning_column = 'YES') as partitioning_column,\n"
                + "  from `%s.%s.__TABLES__` t\n"
                // Tables only (1), not views (2), or external tables (3).
                + " where type = 1",
            datasetId.getProject(),
            datasetId.getDataset(),
            datasetId.getProject(),
            datasetId.getDataset());
    TableResult tableRows = bqClient.query(QueryJobConfiguration.newBuilder(tableSql).build());

    // One deferred task per table row; a task yields null when the table is filtered out.
    List<Callable<BigQueryTable>> tableQueries = new ArrayList<>();
    tableRows.iterateAll().forEach(row -> tableQueries.add(() -> {
        BigQueryTable.Builder table =
            BigQueryTable.builder()
                .setProject(datasetId.getProject())
                .setDataset(datasetId.getDataset())
                .setTableName(row.get(0).getStringValue())
                .setLastModificationTime(row.get(1).getTimestampValue())
                .setPartitioningColumn(!row.get(2).isNull() ? row.get(2).getStringValue() : null);
        try {
            if (!loadTableMetadata(table, filter)) {
                return null;
            }
        } catch (RuntimeException e) {
            // Re-wrap so the failing table is identifiable from the exception message.
            throw new RuntimeException("Error loading table " + table.getTableName() + " metadata.", e);
        }
        return table.build();
    }));

    ExecutorService executor = Executors.newFixedThreadPool(maxParallelRequests);
    List<Future<BigQueryTable>> tableFutures;
    try {
        // invokeAll blocks until every task has completed (normally or exceptionally).
        tableFutures = executor.invokeAll(tableQueries);
    } finally {
        // Shut down even if invokeAll throws, so the pool's threads are not leaked.
        executor.shutdown();
    }

    List<BigQueryTable> tables = new ArrayList<>(tableFutures.size());
    for (Future<BigQueryTable> ft : tableFutures) {
        BigQueryTable t = ft.get();
        if (t != null) {
            tables.add(t);
        }
    }
    return tables;
}
Also used : ArrayList(java.util.ArrayList) Callable(java.util.concurrent.Callable) TableResult(com.google.cloud.bigquery.TableResult) BigQueryTable(com.google.cloud.teleport.v2.values.BigQueryTable) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)

Example 19 with Filter

use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.

The method getZoneId of the class GenericRecordsToGcsPartitioned.

/**
 * Resolves the time zone associated with the partition column of {@code schema}.
 *
 * <p>If the partition field is declared as a UNION (the Avro encoding of a nullable field), the
 * first non-NULL branch is used as the effective field type. The zone is then looked up in
 * {@code AVRO_DATE_TIME_LOGICAL_TYPES} by the effective type's logical type, falling back to UTC.
 *
 * @param schema record schema containing the field named {@code partitionColumnName}
 * @return the zone mapped to the partition field's logical type, or UTC when there is no mapping
 * @throws IllegalArgumentException if the partition field is a UNION with no non-NULL branch, or
 *     if the lookup yields {@code null}
 */
private ZoneId getZoneId(Schema schema) {
    final Schema declaredFieldType = schema.getField(partitionColumnName).schema();
    Schema partitionFieldType = declaredFieldType;
    // check if the partition field is nullable, inspired by {@code Schema.isNullable()} of Avro 1.9
    // The check must be on the field's own schema: the enclosing record schema is not a UNION,
    // so testing it would leave nullable partition fields wrapped and hide their logical type.
    if (declaredFieldType.getType() == Schema.Type.UNION) {
        partitionFieldType =
            declaredFieldType.getTypes().stream()
                .filter(t -> t.getType() != Schema.Type.NULL)
                .findFirst()
                .orElseThrow(
                    () ->
                        new IllegalArgumentException(
                            String.format(
                                "Partition field %s is of unsupported type: %s",
                                partitionColumnName, declaredFieldType)));
    }
    // get zone according to the logical-type if there is no logical-type assume UTC time-zone
    ZoneId zoneId = AVRO_DATE_TIME_LOGICAL_TYPES.getOrDefault(partitionFieldType.getLogicalType(), ZoneOffset.UTC);
    // NOTE(review): getOrDefault also returns UTC for a logical type that is not a key of the map,
    // so this branch is reached only if the map stores a null value - confirm whether unsupported
    // logical types were meant to be rejected here.
    if (zoneId == null) {
        throw new IllegalArgumentException(
            String.format(
                "Partition field `%s` is of an unsupported type: %s, supported types are `long` types"
                    + " with logical types: %s",
                partitionColumnName,
                partitionFieldType,
                AVRO_DATE_TIME_LOGICAL_TYPES.keySet().stream()
                    .map(LogicalType::getName)
                    .collect(Collectors.joining(", "))));
    }
    return zoneId;
}
Also used : FileIO(org.apache.beam.sdk.io.FileIO) KV(org.apache.beam.sdk.values.KV) ZonedDateTime(java.time.ZonedDateTime) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) Sink(org.apache.beam.sdk.io.FileIO.Sink) LoggerFactory(org.slf4j.LoggerFactory) ListCoder(org.apache.beam.sdk.coders.ListCoder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Function(java.util.function.Function) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) ImmutableList(com.google.common.collect.ImmutableList) LogicalTypes(org.apache.avro.LogicalTypes) Write(org.apache.beam.sdk.io.FileIO.Write) FileFormatOptions(com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions) ZoneOffset(java.time.ZoneOffset) Nullable(javax.annotation.Nullable) MapElements(org.apache.beam.sdk.transforms.MapElements) GenericRecord(org.apache.avro.generic.GenericRecord) KvCoder(org.apache.beam.sdk.coders.KvCoder) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) ImmutableMap(com.google.common.collect.ImmutableMap) LogicalType(org.apache.avro.LogicalType) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReadableInstant(org.joda.time.ReadableInstant) Instant(java.time.Instant) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) ParquetIO(org.apache.beam.sdk.io.parquet.ParquetIO) ZoneId(java.time.ZoneId) List(java.util.List) SchemaUtils(com.google.cloud.teleport.v2.utils.SchemaUtils) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) AvroSinkWithJodaDatesConversion(com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion) ZoneId(java.time.ZoneId) Schema(org.apache.avro.Schema)

Aggregations

Filter (com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter)10 BigQueryTable (com.google.cloud.teleport.v2.values.BigQueryTable)10 Test (org.junit.Test)10 BigQueryTablePartition (com.google.cloud.teleport.v2.values.BigQueryTablePartition)8 TableResult (com.google.cloud.bigquery.TableResult)2 DataStreamIO (com.google.cloud.teleport.v2.cdc.sources.DataStreamIO)2 CdcJdbcIO (com.google.cloud.teleport.v2.io.CdcJdbcIO)2 DmlInfo (com.google.cloud.teleport.v2.values.DmlInfo)2 FailsafeElement (com.google.cloud.teleport.v2.values.FailsafeElement)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 Pipeline (org.apache.beam.sdk.Pipeline)2 GoogleCloudDataplexV1Asset (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Asset)1 GoogleCloudDataplexV1Entity (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Entity)1 GoogleCloudDataplexV1Partition (com.google.api.services.dataplex.v1.model.GoogleCloudDataplexV1Partition)1 TableReadOptions (com.google.cloud.bigquery.storage.v1beta1.ReadOptions.TableReadOptions)1 ReadSession (com.google.cloud.bigquery.storage.v1beta1.Storage.ReadSession)1 AvroSinkWithJodaDatesConversion (com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion)1 DatastreamConstants (com.google.cloud.teleport.v2.templates.datastream.DatastreamConstants)1