Use of com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcs, method buildPipeline.
/**
 * Builds the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The resulting pipeline.
 */
@VisibleForTesting
static Pipeline buildPipeline(
    DataplexBigQueryToGcsOptions options,
    BigQueryMetadataLoader metadataLoader,
    String targetRootPath,
    DatasetId datasetId)
    throws ExecutionException, InterruptedException {
  Pipeline pipeline = Pipeline.create(options);
  List<String> existingTargetFiles = StorageUtils.getFilesInDirectory(targetRootPath);

  LOG.info("Loading BigQuery metadata...");
  List<BigQueryTable> tables =
      metadataLoader.loadDatasetMetadata(
          datasetId, new DataplexBigQueryToGcsFilter(options, existingTargetFiles));
  LOG.info("Loaded {} table(s).", tables.size());

  if (!tables.isEmpty()) {
    transformPipeline(pipeline, tables, options, targetRootPath, null, null);
  } else {
    pipeline.apply("Nothing to export", new NoopTransform());
  }
  return pipeline;
}
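buildPipeline hands the filter to the metadata loader as the single gatekeeper for what gets exported: the filter is constructed from the pipeline options plus the list of files already present under the target root. Based only on the calls visible in the snippets on this page, the filter satisfies a contract along the following lines; this is an inferred sketch, not the project's declared interface (the real type may declare more members), and the package paths of the value types are assumed.

import java.util.List;

import com.google.cloud.teleport.v2.values.BigQueryTable;
import com.google.cloud.teleport.v2.values.BigQueryTablePartition;

// Inferred contract only; method names are taken verbatim from the calls on this page.
interface InferredTableFilter {
  // Whole-table decision for tables without partitioning.
  boolean shouldSkipUnpartitionedTable(BigQueryTable.Builder table);

  // Whole-table decision for partitioned tables, given their loaded partitions.
  boolean shouldSkipPartitionedTable(
      BigQueryTable.Builder table, List<BigQueryTablePartition> partitions);

  // Per-partition decision for partitioned tables.
  boolean shouldSkipPartition(BigQueryTable.Builder table, BigQueryTablePartition partition);
}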
Use of com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcsFilterTest, method test_whenPartitionedTableHasNoPartitions_filterExcludesTable.
@Test
public void test_whenPartitionedTableHasNoPartitions_filterExcludesTable() {
  options.setTables(null);
  options.setExportDataModifiedBeforeDateTime(null);

  Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());

  assertThat(f.shouldSkipPartitionedTable(table(), Collections.emptyList())).isTrue();
}
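Both filter options are cleared here, so the skip can only come from the empty partition list: a partitioned table that reports no partitions has nothing to export and is excluded outright. A minimal stand-in for that rule, assuming (as this test suggests but does not by itself prove) that it applies independently of the name and modification-time filters:

import java.util.Collections;
import java.util.List;

final class EmptyPartitionRuleSketch {
  // Hypothetical helper, not the project's API: a partitioned table with no
  // partitions is skipped outright.
  static <P> boolean shouldSkipForEmptyPartitions(List<P> partitions) {
    return partitions.isEmpty();
  }

  public static void main(String[] args) {
    System.out.println(shouldSkipForEmptyPartitions(Collections.emptyList())); // true
  }
}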
Use of com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcsFilterTest, method test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions.
@Test
public void test_whenNoFilterOptions_filterAcceptsAllTablesAndPartitions() {
  BigQueryTable.Builder t = table();
  BigQueryTablePartition p = partition().build();

  options.setTables(null);
  options.setExportDataModifiedBeforeDateTime(null);

  Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());

  assertThat(f.shouldSkipUnpartitionedTable(t)).isFalse();
  assertThat(f.shouldSkipPartitionedTable(t, Collections.singletonList(p))).isFalse();
  assertThat(f.shouldSkipPartition(t, p)).isFalse();
}
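With no table list and no cutoff time set, all three checks answer "don't skip", which is the permissive default buildPipeline relies on when the template is run without filter parameters. The following is a hypothetical sketch (with String stand-ins for the real table and partition types) of how a loader could consult the three methods; the actual control flow inside loadDatasetMetadata may differ.

import java.util.ArrayList;
import java.util.List;

final class FilterConsultationSketch {

  // Stand-in for the filter, mirroring only the three methods exercised above.
  interface Filter {
    boolean shouldSkipUnpartitionedTable(String table);
    boolean shouldSkipPartitionedTable(String table, List<String> partitions);
    boolean shouldSkipPartition(String table, String partition);
  }

  static List<String> selectForExport(
      List<String> unpartitionedTables,
      String partitionedTable,
      List<String> partitions,
      Filter filter) {
    List<String> selected = new ArrayList<>();
    for (String t : unpartitionedTables) {
      if (!filter.shouldSkipUnpartitionedTable(t)) {
        selected.add(t);
      }
    }
    if (!filter.shouldSkipPartitionedTable(partitionedTable, partitions)) {
      // Per-partition checks only matter for tables that survived the table-level check.
      partitions.removeIf(p -> filter.shouldSkipPartition(partitionedTable, p));
      selected.add(partitionedTable);
    }
    return selected;
  }
}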
Use of com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcsTest, method testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException.
/**
 * Tests that the pipeline throws an exception if {@code writeDisposition = FAIL}, {@code
 * enforceSamePartitionKey = true}, and one of the target files exists, when processing a
 * partitioned table.
 *
 * <p>This is a special case because, depending on the {@code enforceSamePartitionKey} param, the
 * generated file path can be different (for partitioned tables only!), so this verifies that
 * {@link com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter
 * DataplexBigQueryToGcsFilter} can find such files correctly.
 */
@Test
public void testE2E_withTargetStrategyFail_andEnforceSamePartitionKeyEnabled_throwsException()
    throws Exception {
  options.setFileFormat(FileFormatOptions.PARQUET);
  options.setWriteDisposition(WriteDispositionOptions.FAIL);
  options.setEnforceSamePartitionKey(true);
  writeOutputFile("partitioned_table/ts=p2", "output-partitioned_table-p2.parquet", "Test data");

  when(bqMock.query(any()))
      .then(
          invocation -> {
            Iterable<FieldValueList> result = null;
            QueryJobConfiguration q = (QueryJobConfiguration) invocation.getArguments()[0];
            if (TABLE_QUERY_PATTERN.matcher(q.getQuery()).find()) {
              result = Collections.singletonList(fields("partitioned_table", "0", "ts"));
            } else if (PARTITION_QUERY_PATTERN.matcher(q.getQuery()).find()) {
              result = Arrays.asList(fields("p1", "0"), fields("p2", "0"));
            }
            when(tableResultMock.iterateAll()).thenReturn(result);
            return tableResultMock;
          });

  try {
    DataplexBigQueryToGcs.buildPipeline(
        options, metadataLoader, outDir.getAbsolutePath(), DatasetId.of(PROJECT, DATASET));
    fail("Expected a WriteDispositionException");
  } catch (Exception e) {
    assertThat(e).hasCauseThat().hasCauseThat().isInstanceOf(WriteDispositionException.class);
    assertThat(e)
        .hasCauseThat()
        .hasCauseThat()
        .hasMessageThat()
        .contains(
            "Target File partitioned_table/ts=p2/output-partitioned_table-p2.parquet exists for"
                + " partitioned_table$p2.");
  }
}
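The double hasCauseThat() reflects that the WriteDispositionException does not escape buildPipeline directly; as the assertions show, it surfaces wrapped two levels deep in the exception thrown out of the metadata load. The check itself can be pictured roughly as below; the class, method, and exception names in this sketch are stand-ins, not the project's real ones (the real filter throws WriteDispositionException and receives the existing-file list through its constructor).

import java.util.Set;

// Hypothetical stand-in for the FAIL write-disposition check the test exercises:
// if the file the export would produce already exists under the target root,
// fail fast instead of overwriting or silently skipping.
final class FailDispositionSketch {
  static void checkTargetFileAbsent(
      Set<String> existingTargetFiles, String targetFilePath, String tableAndPartitionSpec) {
    if (existingTargetFiles.contains(targetFilePath)) {
      // The real template throws its own WriteDispositionException here.
      throw new IllegalStateException(
          String.format("Target File %s exists for %s.", targetFilePath, tableAndPartitionSpec));
    }
  }
}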
Use of com.google.cloud.teleport.v2.utils.DataplexBigQueryToGcsFilter in project DataflowTemplates by GoogleCloudPlatform.
The class DataplexBigQueryToGcsFilterTest, method test_whenTablesSet_filterExcludesTablesByName.
@Test
public void test_whenTablesSet_filterExcludesTablesByName() {
  BigQueryTable.Builder includedTable1 = table().setTableName("includedTable1");
  BigQueryTable.Builder includedTable2 = table().setTableName("includedTable2");
  BigQueryTable.Builder excludedTable = table().setTableName("excludedTable");
  BigQueryTablePartition p = partition().build();

  options.setTables("includedTable1,includedTable2");
  options.setExportDataModifiedBeforeDateTime(null);

  Filter f = new DataplexBigQueryToGcsFilter(options, new ArrayList<String>());

  assertThat(f.shouldSkipUnpartitionedTable(includedTable1)).isFalse();
  assertThat(f.shouldSkipUnpartitionedTable(includedTable2)).isFalse();
  assertThat(f.shouldSkipUnpartitionedTable(excludedTable)).isTrue();
  assertThat(f.shouldSkipPartitionedTable(includedTable1, Collections.singletonList(p))).isFalse();
  assertThat(f.shouldSkipPartitionedTable(includedTable2, Collections.singletonList(p))).isFalse();
  assertThat(f.shouldSkipPartitionedTable(excludedTable, Collections.singletonList(p))).isTrue();
  assertThat(f.shouldSkipPartition(includedTable1, p)).isFalse();
  assertThat(f.shouldSkipPartition(includedTable2, p)).isFalse();
  // Should NOT skip PARTITIONS, only tables as a whole because of their name:
  assertThat(f.shouldSkipPartition(excludedTable, p)).isFalse();
}
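The final assertion pins down an asymmetry worth noting: setTables installs an allow-list that is evaluated per table, never per partition, so a partition of an excluded table is not itself reported as skipped (the whole table has already been dropped). A stand-in for that name-based rule, with a hypothetical helper that parses the comma-separated option the same way the test supplies it:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the name-based allow-list behavior shown above.
final class NameAllowListSketch {
  private final Set<String> includedTables; // null means "no name filtering"

  NameAllowListSketch(String commaSeparatedTablesOption) {
    this.includedTables =
        commaSeparatedTablesOption == null
            ? null
            : new HashSet<>(Arrays.asList(commaSeparatedTablesOption.split(",")));
  }

  boolean shouldSkipTableByName(String tableName) {
    return includedTables != null && !includedTables.contains(tableName);
  }

  boolean shouldSkipPartitionByName() {
    return false; // name filtering applies to whole tables only, never per partition
  }
}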