use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.
the class GenericRecordsToGcsPartitionedTest method testHourlyPartitioning.
/**
 * Verifies that HOURLY partitioning writes records into year/month/day/hour directories and
 * reports one {@link PartitionMetadata} per distinct hour.
 */
@Test
public void testHourlyPartitioning() {
    // Two records share the same date arguments (2010, 1, 1, 1) and are expected in the same partition.
    Record firstHourTrue = new Record(SCHEMA);
    firstHourTrue.put("x", true);
    firstHourTrue.put("date", dateToEpochMillis(2010, 1, 1, 1));

    Record firstHourFalse = new Record(SCHEMA);
    firstHourFalse.put("x", false);
    firstHourFalse.put("date", dateToEpochMillis(2010, 1, 1, 1));

    // Same day, different hour.
    Record secondHour = new Record(SCHEMA);
    secondHour.put("x", true);
    secondHour.put("date", dateToEpochMillis(2010, 1, 1, 2));

    // Different day, same hour value as the first two records.
    Record nextDay = new Record(SCHEMA);
    nextDay.put("x", true);
    nextDay.put("date", dateToEpochMillis(2010, 1, 2, 1));

    String rootPath = temporaryFolder.getRoot().getAbsolutePath();

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstHourTrue, firstHourFalse, secondHour, nextDay)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> actual =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootPath, SERIALIZED_SCHEMA, "date", PartitioningSchema.HOURLY, FileFormatOptions.AVRO));

    String firstHourPath = rootPath + "/year=2010/month=1/day=1/hour=1";
    String secondHourPath = rootPath + "/year=2010/month=1/day=1/hour=2";
    String nextDayPath = rootPath + "/year=2010/month=1/day=2/hour=1";

    PartitionMetadata expectedFirstHour =
        PartitionMetadata.builder()
            .setLocation(firstHourPath)
            .setValues(ImmutableList.of("2010", "1", "1", "1"))
            .build();
    PartitionMetadata expectedSecondHour =
        PartitionMetadata.builder()
            .setLocation(secondHourPath)
            .setValues(ImmutableList.of("2010", "1", "1", "2"))
            .build();
    PartitionMetadata expectedNextDay =
        PartitionMetadata.builder()
            .setLocation(nextDayPath)
            .setValues(ImmutableList.of("2010", "1", "2", "1"))
            .build();

    PAssert.that(actual).containsInAnyOrder(expectedFirstHour, expectedSecondHour, expectedNextDay);

    mainPipeline.run();

    // After the run, each partition directory must contain exactly the records routed to it.
    verifyRecordsExists(firstHourPath + "/*", firstHourTrue, firstHourFalse);
    verifyRecordsExists(secondHourPath + "/*", secondHour);
    verifyRecordsExists(nextDayPath + "/*", nextDay);
}
use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.
the class DataplexJdbcIngestion method buildGcsPipeline.
/**
 * Assembles the JDBC-to-GCS pipeline: reads rows via the configured query, converts them to Avro
 * {@link GenericRecord}s and writes them under {@code targetRootPath} using
 * {@link GenericRecordsToGcsPartitioned}.
 *
 * @param pipeline the pipeline the transforms are applied to
 * @param options supplies the query, partition column, partitioning scheme and file format
 * @param dataSourceConfig JDBC data source configuration used for schema inference and reading
 * @param targetRootPath root path handed to the GCS writer
 */
@VisibleForTesting
static void buildGcsPipeline(Pipeline pipeline, DataplexJdbcIngestionOptions options, DynamicDataSourceConfiguration dataSourceConfig, String targetRootPath) {
    // Infer the Beam schema from the query, then derive the matching Avro schema.
    Schema inferredSchema =
        Schemas.jdbcSchemaToBeamSchema(dataSourceConfig.buildDatasource(), options.getQuery());
    org.apache.avro.Schema recordSchema = AvroUtils.toAvroSchema(inferredSchema);

    // Read from JdbcIO, mapping each ResultSet row to a Beam Row.
    PCollection<Row> jdbcRows =
        pipeline.apply(
            "Read from JdbcIO",
            DynamicJdbcIO.<Row>read()
                .withDataSourceConfiguration(dataSourceConfig)
                .withQuery(options.getQuery())
                .withCoder(RowCoder.of(inferredSchema))
                .withRowMapper(BeamSchemaUtil.of(inferredSchema)));

    // Convert each Beam Row to an Avro GenericRecord.
    PCollection<GenericRecord> avroRecords =
        jdbcRows
            .apply("convert to GenericRecord", ParDo.of(new BeamRowToGenericRecordFn(recordSchema)))
            .setCoder(AvroCoder.of(recordSchema));

    // Write to the GCS bucket.
    GenericRecordsToGcsPartitioned gcsWriter =
        new GenericRecordsToGcsPartitioned(
            targetRootPath,
            Schemas.serialize(recordSchema),
            options.getParitionColumn(),
            options.getPartitioningScheme(),
            options.getFileFormat());
    // The resulting partition metadata is not consumed further inside this method.
    PCollection<PartitionMetadata> partitionMetadata = avroRecords.apply("Write to GCS", gcsWriter);
}
use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.
the class GenericRecordsToGcsPartitioned method expand.
/**
 * Writes the input records to files under {@code gcsPath} and returns one
 * {@link PartitionMetadata} for every output filename reported by the write.
 *
 * <p>The file sink is chosen from {@code outputFileFormat} (Parquet or Avro). If either
 * {@code partitionColumnName} or {@code partitioningSchema} is {@code null}, the records are
 * written without partitioning and every emitted metadata element carries the single value
 * {@code "1"}. Otherwise each record is routed to a destination computed from its partition
 * column value, and the emitted metadata carries the partition's values as strings.
 *
 * @param input the records to write
 * @return metadata (values and location) for each written output file
 * @throws UnsupportedOperationException if {@code outputFileFormat} is neither PARQUET nor AVRO
 */
@Override
public PCollection<PartitionMetadata> expand(PCollection<GenericRecord> input) {
// The Avro schema is held in serialized form on this transform and parsed here.
Schema schema = SchemaUtils.parseAvroSchema(serializedAvroSchema);
// Pick the file sink matching the requested output format.
Sink<GenericRecord> sink;
switch(outputFileFormat) {
case PARQUET:
sink = ParquetIO.sink(schema);
break;
case AVRO:
sink = new AvroSinkWithJodaDatesConversion<>(schema);
break;
default:
throw new UnsupportedOperationException("Output format is not implemented: " + outputFileFormat);
}
// Unpartitioned path: taken when either the partition column or the partitioning schema is missing.
if (partitionColumnName == null || partitioningSchema == null) {
LOG.info("PartitionColumnName or/and PartitioningSchema not provided. " + "Writing to GCS without partition");
// Write with a plain FileIO.write(), keep only the output filename from each KV<Void, String>,
// and wrap it into PartitionMetadata with the fixed value "1".
// NOTE(review): withoutFileName presumably strips the file name to leave the directory — confirm.
return input.apply("Write to Storage with No Partition", FileIO.<GenericRecord>write().withSuffix(outputFileFormat.getFileSuffix()).via(sink).to(gcsPath)).getPerDestinationOutputFilenames().apply("MapFileNames", MapElements.into(TypeDescriptors.strings()).via((SerializableFunction<KV<Void, String>, String>) KV::getValue)).apply(MapElements.via(new SimpleFunction<String, PartitionMetadata>() {
@Override
public PartitionMetadata apply(String path) {
return PartitionMetadata.builder().setValues(ImmutableList.of("1")).setLocation(withoutFileName(path)).build();
}
}));
}
// Zone used to interpret the partition column's epoch-millis value as a zoned date-time.
// NOTE(review): how getZoneId derives the zone from the schema is not visible here.
ZoneId zoneId = getZoneId(schema);
// Partitioned path: the destination of each record is the partition returned by
// partitioningSchema.toPartition for the record's partition column value; the destination type
// is a list of (String, Integer) pairs, encoded with the explicit destination coder below.
// File names are built from partitionToPath(destination) plus the format's file suffix.
return input.apply(FileIO.<List<KV<String, Integer>>, GenericRecord>writeDynamic().by((GenericRecord r) -> partitioningSchema.toPartition(Instant.ofEpochMilli(partitionColumnValueToMillis(r.get(partitionColumnName))).atZone(zoneId))).withDestinationCoder(ListCoder.of(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))).via(sink).to(gcsPath).withNumShards(// must be 1 as we can only have 1 file per Dataplex partition
1).withNaming(p -> Write.defaultNaming(partitionToPath(p), outputFileFormat.getFileSuffix()))).getPerDestinationOutputFilenames().apply(MapElements.via(new SimpleFunction<KV<List<KV<String, Integer>>, String>, PartitionMetadata>() {
// Converts a (partition, output filename) pair into PartitionMetadata.
@Override
public PartitionMetadata apply(KV<List<KV<String, Integer>>, String> partitionAndPath) {
// Fail with a descriptive message if either half of the pair is missing.
if (partitionAndPath.getKey() == null) {
throw new IllegalStateException("Partition is null for path " + partitionAndPath.getValue());
}
if (partitionAndPath.getValue() == null) {
throw new IllegalStateException("Path is null for partition " + partitionAndPath.getKey());
}
// Values are the Integer component of each partition entry rendered as a string, in list order;
// the location is the output filename passed through withoutFileName.
return PartitionMetadata.builder().setValues(partitionAndPath.getKey().stream().map(e -> String.valueOf(e.getValue())).collect(toImmutableList())).setLocation(withoutFileName(partitionAndPath.getValue())).build();
}
}));
}
use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.
the class GenericRecordsToGcsPartitionedTest method testDailyPartitioning.
/**
 * Verifies that DAILY partitioning writes records into year/month/day directories and reports
 * one {@link PartitionMetadata} per distinct day.
 */
@Test
public void testDailyPartitioning() {
    // Two records share the same date arguments (2010, 1, 1) and are expected in the same partition.
    Record firstDayTrue = new Record(SCHEMA);
    firstDayTrue.put("x", true);
    firstDayTrue.put("date", dateToEpochMillis(2010, 1, 1));

    Record firstDayFalse = new Record(SCHEMA);
    firstDayFalse.put("x", false);
    firstDayFalse.put("date", dateToEpochMillis(2010, 1, 1));

    // Same month, different day.
    Record secondDay = new Record(SCHEMA);
    secondDay.put("x", true);
    secondDay.put("date", dateToEpochMillis(2010, 1, 2));

    // Different month, same day value as the first two records.
    Record nextMonth = new Record(SCHEMA);
    nextMonth.put("x", true);
    nextMonth.put("date", dateToEpochMillis(2010, 2, 1));

    String rootPath = temporaryFolder.getRoot().getAbsolutePath();

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstDayTrue, firstDayFalse, secondDay, nextMonth)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> actual =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootPath, SERIALIZED_SCHEMA, "date", PartitioningSchema.DAILY, FileFormatOptions.AVRO));

    String firstDayPath = rootPath + "/year=2010/month=1/day=1";
    String secondDayPath = rootPath + "/year=2010/month=1/day=2";
    String nextMonthPath = rootPath + "/year=2010/month=2/day=1";

    PartitionMetadata expectedFirstDay =
        PartitionMetadata.builder()
            .setLocation(firstDayPath)
            .setValues(ImmutableList.of("2010", "1", "1"))
            .build();
    PartitionMetadata expectedSecondDay =
        PartitionMetadata.builder()
            .setLocation(secondDayPath)
            .setValues(ImmutableList.of("2010", "1", "2"))
            .build();
    PartitionMetadata expectedNextMonth =
        PartitionMetadata.builder()
            .setLocation(nextMonthPath)
            .setValues(ImmutableList.of("2010", "2", "1"))
            .build();

    PAssert.that(actual).containsInAnyOrder(expectedFirstDay, expectedSecondDay, expectedNextMonth);

    mainPipeline.run();

    // After the run, each partition directory must contain exactly the records routed to it.
    verifyRecordsExists(firstDayPath + "/*", firstDayTrue, firstDayFalse);
    verifyRecordsExists(secondDayPath + "/*", secondDay);
    verifyRecordsExists(nextMonthPath + "/*", nextMonth);
}
use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.
the class GenericRecordsToGcsPartitionedTest method testMonthlyPartitioning.
/**
 * Verifies that MONTHLY partitioning writes records into year/month directories and reports one
 * {@link PartitionMetadata} per distinct month.
 */
@Test
public void testMonthlyPartitioning() {
    // Two records share the same date arguments (2010, 1) and are expected in the same partition.
    Record firstMonthTrue = new Record(SCHEMA);
    firstMonthTrue.put("x", true);
    firstMonthTrue.put("date", dateToEpochMillis(2010, 1));

    Record firstMonthFalse = new Record(SCHEMA);
    firstMonthFalse.put("x", false);
    firstMonthFalse.put("date", dateToEpochMillis(2010, 1));

    // Different month.
    Record secondMonth = new Record(SCHEMA);
    secondMonth.put("x", true);
    secondMonth.put("date", dateToEpochMillis(2010, 2));

    String rootPath = temporaryFolder.getRoot().getAbsolutePath();

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstMonthTrue, firstMonthFalse, secondMonth)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> actual =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootPath, SERIALIZED_SCHEMA, "date", PartitioningSchema.MONTHLY, FileFormatOptions.AVRO));

    String firstMonthPath = rootPath + "/year=2010/month=1";
    String secondMonthPath = rootPath + "/year=2010/month=2";

    PartitionMetadata expectedFirstMonth =
        PartitionMetadata.builder()
            .setLocation(firstMonthPath)
            .setValues(ImmutableList.of("2010", "1"))
            .build();
    PartitionMetadata expectedSecondMonth =
        PartitionMetadata.builder()
            .setLocation(secondMonthPath)
            .setValues(ImmutableList.of("2010", "2"))
            .build();

    PAssert.that(actual).containsInAnyOrder(expectedFirstMonth, expectedSecondMonth);

    mainPipeline.run();

    // After the run, each partition directory must contain exactly the records routed to it.
    verifyRecordsExists(firstMonthPath + "/*", firstMonthTrue, firstMonthFalse);
    verifyRecordsExists(secondMonthPath + "/*", secondMonth);
}
Aggregations