Use of com.google.cloud.teleport.v2.utils.BigQueryToGcsDirectoryNaming in project DataflowTemplates by GoogleCloudPlatform.
The example below shows the expand method of the class BigQueryTableToGcsTransform.
@Override
public PCollection<KV<BigQueryTablePartition, String>> expand(PBegin begin) {
  Schema targetFileSchema = table.getSchema();
  if (table.isPartitioned() && enforceSamePartitionKey) {
    // Apart from renaming the field in the schema, we don't need to do anything else
    // (e.g. replace the field in the actual GenericRecord being processed) because
    // writers write fields to the file based on their numeric position, not their name.
    targetFileSchema =
        Schemas.renameAvroField(
            targetFileSchema,
            table.getPartitioningColumn(),
            table.getPartitioningColumn() + PARTITION_COLUMN_RENAME_SUFFIX);
  }

  Sink<GenericRecord> sink;
  switch (outputFileFormat) {
    case PARQUET:
      sink =
          ParquetIO.sink(targetFileSchema)
              .withCompressionCodec(outputFileCompression.getParquetCodec());
      break;
    case AVRO:
      sink =
          AvroIO.<GenericRecord>sink(targetFileSchema)
              .withCodec(outputFileCompression.getAvroCodec());
      break;
    default:
      throw new UnsupportedOperationException(
          "Output format is not implemented: " + outputFileFormat);
  }

  BigQueryToGcsDirectoryNaming dn = new BigQueryToGcsDirectoryNaming(enforceSamePartitionKey);

  if (!table.isPartitioned()) {
    return transformTable(begin, sink, dn);
  }

  if (table.getPartitions() == null || table.getPartitions().isEmpty()) {
    throw new IllegalStateException(
        String.format(
            "Expected at least 1 partition for a partitioned table %s, but got none.",
            table.getTableName()));
  }

  List<PCollection<KV<BigQueryTablePartition, String>>> collections = new ArrayList<>();
  table.getPartitions().forEach(p -> collections.add(transformPartition(begin, sink, p, dn)));

  return PCollectionList.of(collections)
      .apply(tableNodeName("FlattenPartitionResults"), Flatten.pCollections());
}
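Schemas.renameAvroField above is a utility from the DataflowTemplates project. A minimal sketch of how such a rename can be done with the plain Avro Schema API follows; the class name, helper name, and schema are illustrative assumptions, not the template's actual implementation.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;

public final class RenameAvroFieldSketch {

  // Hypothetical stand-in for Schemas.renameAvroField. Avro schemas are
  // immutable, so the record schema is rebuilt with one field renamed.
  // Field positions are preserved, which is why the GenericRecords
  // themselves don't need to be rewritten.
  static Schema renameField(Schema record, String oldName, String newName) {
    List<Field> fields =
        record.getFields().stream()
            .map(f -> new Field(
                f.name().equals(oldName) ? newName : f.name(),
                f.schema(), f.doc(), f.defaultVal()))
            .collect(Collectors.toList());
    return Schema.createRecord(
        record.getName(), record.getDoc(), record.getNamespace(), record.isError(), fields);
  }

  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"t\",\"fields\":["
            + "{\"name\":\"ts\",\"type\":\"long\"},"
            + "{\"name\":\"v\",\"type\":\"string\"}]}");
    // Prints the schema with "ts" renamed to "ts_pkey".
    System.out.println(renameField(schema, "ts", "ts_pkey"));
  }
}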
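For context on how a sink produced by the switch statement is consumed, here is a minimal, self-contained sketch that writes GenericRecords to Parquet through Beam's FileIO. The schema, bucket path, and single fixed destination are illustrative assumptions and deliberately simpler than the template's dynamic, per-partition destinations.

import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.parquet.ParquetIO;
import org.apache.beam.sdk.transforms.Create;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class WriteRecordsSketch {
  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"t\",\"fields\":"
            + "[{\"name\":\"v\",\"type\":\"string\"}]}");

    GenericRecord r = new GenericData.Record(schema);
    r.put("v", "hello");

    Pipeline p = Pipeline.create();
    p.apply(Create.of(List.of(r)).withCoder(AvroCoder.of(schema)))
        .apply(
            "WriteToGcs",
            FileIO.<GenericRecord>write()
                // ParquetIO.sink implements FileIO.Sink<GenericRecord>,
                // matching the Sink<GenericRecord> variable in the transform.
                .via(ParquetIO.sink(schema).withCompressionCodec(CompressionCodecName.SNAPPY))
                .to("gs://example-bucket/output/") // illustrative path
                .withSuffix(".parquet"));
    p.run().waitUntilFinish();
  }
}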
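Finally, the per-partition results are merged with Beam's Flatten. The same pattern in isolation, with illustrative string contents instead of the transform's KV results, looks like this:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class FlattenSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    // One PCollection per "partition"; contents are illustrative.
    PCollection<String> a = p.apply("PartitionA", Create.of("a1", "a2"));
    PCollection<String> b = p.apply("PartitionB", Create.of("b1"));
    // Merge them into a single PCollection, as the transform above does
    // with its per-partition results.
    PCollection<String> merged =
        PCollectionList.of(a).and(b)
            .apply("FlattenPartitionResults", Flatten.pCollections());
    p.run().waitUntilFinish();
  }
}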