use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexBigQueryToGcsFilterTest method test_whenBeforeDateIs1DayDuration_dateParsedCorrectly.
/**
 * Verifies that a relative "-P1D" value for the modified-before option is applied as a cutoff of
 * roughly one day ago: an unpartitioned table modified just after that cutoff is skipped, and one
 * modified just before it is not.
 */
@Test
public void test_whenBeforeDateIs1DayDuration_dateParsedCorrectly() {
  // Reference cutoff: "now" minus one standard day, converted from millis to micros.
  final long cutoffMicros = Instant.now().minus(Duration.standardDays(1)).getMillis() * 1000L;
  // Offset (in micros) placed on either side of the cutoff.
  final long slackMicros = 100000L;

  BigQueryTable.Builder modifiedBeforeCutoff =
      table().setLastModificationTime(cutoffMicros - slackMicros);
  BigQueryTable.Builder modifiedAfterCutoff =
      table().setLastModificationTime(cutoffMicros + slackMicros);

  options.setTables(null);
  options.setExportDataModifiedBeforeDateTime("-P1D");

  Filter filter = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());

  boolean skipsNewer = filter.shouldSkipUnpartitionedTable(modifiedAfterCutoff);
  assertThat(skipsNewer).isTrue();

  boolean skipsOlder = filter.shouldSkipUnpartitionedTable(modifiedBeforeCutoff);
  assertThat(skipsOlder).isFalse();
}
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class BigQueryMetadataLoader method loadTablePartitions.
/**
 * Queries {@code INFORMATION_SCHEMA.PARTITIONS} of the table's dataset for the partitions of
 * {@code table} and returns those not rejected by {@code filter}.
 *
 * @param table builder supplying the project, dataset and table name to query
 * @param filter if {@code null}, every partition returned by the query is kept; otherwise
 *     partitions for which {@code filter.shouldSkipPartition(table, partition)} is true are dropped
 * @return the retained partitions, in the order the query returned them
 * @throws InterruptedException propagated from the BigQuery client query call
 */
private List<BigQueryTablePartition> loadTablePartitions(BigQueryTable.Builder table, Filter filter) throws InterruptedException {
  // Project and dataset are identifiers and are formatted into the statement;
  // the table name is passed as a named query parameter.
  String sql =
      String.format(
          "select partition_id, last_modified_time\nfrom `%s.%s.INFORMATION_SCHEMA.PARTITIONS`\nwhere table_name = @table_name",
          table.getProject(),
          table.getDataset());
  QueryJobConfiguration queryConfig =
      QueryJobConfiguration.newBuilder(sql)
          .addNamedParameter("table_name", QueryParameterValue.string(table.getTableName()))
          .build();
  TableResult queryResult = bqClient.query(queryConfig);

  List<BigQueryTablePartition> accepted = new ArrayList<>();
  // TODO(an2x): Check we didn't get duplicate partition names.
  queryResult
      .iterateAll()
      .forEach(
          row -> {
            // Column 0 is partition_id, column 1 is last_modified_time (see the select list).
            BigQueryTablePartition partition =
                BigQueryTablePartition.builder()
                    .setPartitionName(row.get(0).getStringValue())
                    .setLastModificationTime(row.get(1).getTimestampValue())
                    .build();
            if (filter != null && filter.shouldSkipPartition(table, partition)) {
              return;
            }
            accepted.add(partition);
          });
  return accepted;
}
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class BigQueryMetadataLoader method loadDatasetMetadata.
/**
 * Loads metadata for all tables in the dataset {@code datasetId} returning only those accepted by
 * the {@code filter}.
 *
 * <p>A single query lists the dataset's tables; then one task per table calls
 * {@code loadTableMetadata} on a fixed thread pool of {@code maxParallelRequests} threads. Tables
 * for which {@code loadTableMetadata} returns {@code false} are omitted from the result.
 *
 * @param datasetId project and dataset to list tables from
 * @param filter if {@code null}, will include all tables and partitions
 * @return the tables that were not filtered out, in the order the listing query returned them
 * @throws InterruptedException if interrupted while querying or while waiting for the per-table
 *     tasks
 * @throws ExecutionException if any per-table task failed; the task's exception is the cause
 */
public List<BigQueryTable> loadDatasetMetadata(DatasetId datasetId, @Nullable Filter filter) throws InterruptedException, ExecutionException {
  String tableSql =
      String.format(
          "select\n"
              + " table_id,\n"
              + " timestamp_millis(last_modified_time) as last_modified_time,\n"
              + " (select column_name from `%s.%s.INFORMATION_SCHEMA.COLUMNS` c\n"
              + " where c.table_catalog = t.project_id\n"
              + " and c.table_schema = t.dataset_id\n"
              + " and c.table_name = t.table_id\n"
              + " and c.is_partitioning_column = 'YES') as partitioning_column,\n"
              + " from `%s.%s.__TABLES__` t\n"
              // Tables only (1), not views (2), or external tables (3).
              + " where type = 1",
          datasetId.getProject(),
          datasetId.getDataset(),
          datasetId.getProject(),
          datasetId.getDataset());
  TableResult tableRows = bqClient.query(QueryJobConfiguration.newBuilder(tableSql).build());

  // Build one deferred task per table row; nothing is executed until invokeAll below.
  List<Callable<BigQueryTable>> tableQueries = new ArrayList<>();
  tableRows
      .iterateAll()
      .forEach(
          row ->
              tableQueries.add(
                  () -> {
                    BigQueryTable.Builder table =
                        BigQueryTable.builder()
                            .setProject(datasetId.getProject())
                            .setDataset(datasetId.getDataset())
                            .setTableName(row.get(0).getStringValue())
                            .setLastModificationTime(row.get(1).getTimestampValue())
                            .setPartitioningColumn(
                                !row.get(2).isNull() ? row.get(2).getStringValue() : null);
                    try {
                      if (!loadTableMetadata(table, filter)) {
                        // Table rejected: signalled to the collector below as null.
                        return null;
                      }
                    } catch (RuntimeException e) {
                      // Add the table name so the failure can be traced to a specific table.
                      throw new RuntimeException(
                          "Error loading table " + table.getTableName() + " metadata.", e);
                    }
                    return table.build();
                  }));

  ExecutorService executor = Executors.newFixedThreadPool(maxParallelRequests);
  List<Future<BigQueryTable>> tableFutures;
  try {
    // invokeAll returns only after every task has completed (normally or exceptionally).
    tableFutures = executor.invokeAll(tableQueries);
  } finally {
    // Always release the pool's threads, including when invokeAll is interrupted;
    // otherwise the worker threads would leak.
    executor.shutdown();
  }

  List<BigQueryTable> tables = new ArrayList<>(tableFutures.size());
  for (Future<BigQueryTable> ft : tableFutures) {
    BigQueryTable t = ft.get();
    if (t != null) {
      tables.add(t);
    }
  }
  return tables;
}
use of com.google.cloud.teleport.v2.utils.BigQueryMetadataLoader.Filter in project DataflowTemplates by GoogleCloudPlatform.
the class GenericRecordsToGcsPartitioned method getZoneId.
/**
 * Determines the time zone to use for the partition column of {@code schema}.
 *
 * <p>The schema of the field named {@code partitionColumnName} is inspected. If that field's schema
 * is a union (e.g. a nullable field), its first non-null branch is used. The zone is then looked up
 * in {@code AVRO_DATE_TIME_LOGICAL_TYPES} by the resulting logical type, with UTC as the default
 * when the map has no entry for it.
 *
 * @param schema schema containing the partition field
 * @return the zone associated with the partition field's logical type, or UTC by default
 * @throws IllegalArgumentException if the partition field is a union with no non-null branch, or
 *     if the lookup yields {@code null}
 */
private ZoneId getZoneId(Schema schema) {
  final Schema declaredFieldType = schema.getField(partitionColumnName).schema();
  Schema partitionFieldType = declaredFieldType;
  // check if the partition field is nullable, inspired by {@code Schema.isNullable()} of Avro 1.9
  // The check must be made on the FIELD's schema: the enclosing schema is the one getField() was
  // just called on, so testing it here would leave the union unresolved.
  if (partitionFieldType.getType() == Schema.Type.UNION) {
    partitionFieldType =
        partitionFieldType.getTypes().stream()
            .filter(t -> t.getType() != Schema.Type.NULL)
            .findFirst()
            .orElseThrow(
                () ->
                    new IllegalArgumentException(
                        String.format(
                            "Partition field %s is of unsupported type: %s",
                            partitionColumnName, declaredFieldType)));
  }
  // get zone according to the logical-type if there is no logical-type assume UTC time-zone
  ZoneId zoneId =
      AVRO_DATE_TIME_LOGICAL_TYPES.getOrDefault(
          partitionFieldType.getLogicalType(), ZoneOffset.UTC);
  if (zoneId == null) {
    throw new IllegalArgumentException(
        String.format(
            "Partition field `%s` is of an unsupported type: %s, supported types are `long` types"
                + " with logical types: %s",
            partitionColumnName,
            partitionFieldType,
            AVRO_DATE_TIME_LOGICAL_TYPES.keySet().stream()
                .map(LogicalType::getName)
                .collect(Collectors.joining(", "))));
  }
  return zoneId;
}
Aggregations