Search in sources :

Example 1 with AvroSinkWithJodaDatesConversion

Use of com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion in project DataflowTemplates by GoogleCloudPlatform.

The class GenericRecordsToGcsPartitioned, method expand.

/**
 * Writes the incoming records to {@code gcsPath} in the configured output file format and emits
 * one {@link PartitionMetadata} element per written file.
 *
 * <p>The Avro schema is parsed from {@code serializedAvroSchema} and used to build the file sink.
 * When either {@code partitionColumnName} or {@code partitioningSchema} is {@code null}, the
 * records are written without partitioning and every emitted metadata element carries the single
 * value {@code "1"}. Otherwise each record is routed to a destination computed by
 * {@code partitioningSchema} from the record's partition column value, and the emitted metadata
 * carries the values of that destination.
 *
 * @param input the records to write
 * @return metadata for each written file; the location is the output path passed through
 *     {@code withoutFileName}
 * @throws UnsupportedOperationException if {@code outputFileFormat} is neither PARQUET nor AVRO
 */
@Override
public PCollection<PartitionMetadata> expand(PCollection<GenericRecord> input) {
    final Schema avroSchema = SchemaUtils.parseAvroSchema(serializedAvroSchema);

    // Choose the file sink that matches the requested output format.
    final Sink<GenericRecord> fileSink;
    switch (outputFileFormat) {
        case AVRO:
            fileSink = new AvroSinkWithJodaDatesConversion<>(avroSchema);
            break;
        case PARQUET:
            fileSink = ParquetIO.sink(avroSchema);
            break;
        default:
            throw new UnsupportedOperationException(
                "Output format is not implemented: " + outputFileFormat);
    }

    final boolean partitioningConfigured =
        partitionColumnName != null && partitioningSchema != null;

    if (!partitioningConfigured) {
        LOG.info(
            "PartitionColumnName or/and PartitioningSchema not provided. "
                + "Writing to GCS without partition");

        Write<Void, GenericRecord> plainWrite =
            FileIO.<GenericRecord>write()
                .withSuffix(outputFileFormat.getFileSuffix())
                .via(fileSink)
                .to(gcsPath);

        PCollection<KV<Void, String>> writtenFiles =
            input
                .apply("Write to Storage with No Partition", plainWrite)
                .getPerDestinationOutputFilenames();

        PCollection<String> filePaths =
            writtenFiles.apply(
                "MapFileNames",
                MapElements.into(TypeDescriptors.strings())
                    .via((SerializableFunction<KV<Void, String>, String>) KV::getValue));

        return filePaths.apply(
            MapElements.via(
                new SimpleFunction<String, PartitionMetadata>() {

                    @Override
                    public PartitionMetadata apply(String filePath) {
                        return PartitionMetadata.builder()
                            .setValues(ImmutableList.of("1"))
                            .setLocation(withoutFileName(filePath))
                            .build();
                    }
                }));
    }

    final ZoneId zone = getZoneId(avroSchema);

    Write<List<KV<String, Integer>>, GenericRecord> partitionedWrite =
        FileIO.<List<KV<String, Integer>>, GenericRecord>writeDynamic()
            .by(
                (GenericRecord rec) -> {
                    // Interpret the partition column as epoch millis in the resolved zone.
                    ZonedDateTime partitionTime =
                        Instant.ofEpochMilli(
                                partitionColumnValueToMillis(rec.get(partitionColumnName)))
                            .atZone(zone);
                    return partitioningSchema.toPartition(partitionTime);
                })
            .withDestinationCoder(ListCoder.of(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())))
            .via(fileSink)
            .to(gcsPath)
            // must be 1 as we can only have 1 file per Dataplex partition
            .withNumShards(1)
            .withNaming(
                partition ->
                    Write.defaultNaming(
                        partitionToPath(partition), outputFileFormat.getFileSuffix()));

    PCollection<KV<List<KV<String, Integer>>, String>> filesByPartition =
        input.apply(partitionedWrite).getPerDestinationOutputFilenames();

    return filesByPartition.apply(
        MapElements.via(
            new SimpleFunction<KV<List<KV<String, Integer>>, String>, PartitionMetadata>() {

                @Override
                public PartitionMetadata apply(KV<List<KV<String, Integer>>, String> entry) {
                    List<KV<String, Integer>> partition = entry.getKey();
                    String filePath = entry.getValue();
                    if (partition == null) {
                        throw new IllegalStateException("Partition is null for path " + filePath);
                    }
                    if (filePath == null) {
                        throw new IllegalStateException("Path is null for partition " + partition);
                    }
                    ImmutableList<String> partitionValues =
                        partition.stream()
                            .map(component -> String.valueOf(component.getValue()))
                            .collect(toImmutableList());
                    return PartitionMetadata.builder()
                        .setValues(partitionValues)
                        .setLocation(withoutFileName(filePath))
                        .build();
                }
            }));
}
Also used : FileIO(org.apache.beam.sdk.io.FileIO) KV(org.apache.beam.sdk.values.KV) ZonedDateTime(java.time.ZonedDateTime) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) Sink(org.apache.beam.sdk.io.FileIO.Sink) LoggerFactory(org.slf4j.LoggerFactory) ListCoder(org.apache.beam.sdk.coders.ListCoder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Function(java.util.function.Function) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) ImmutableList(com.google.common.collect.ImmutableList) LogicalTypes(org.apache.avro.LogicalTypes) Write(org.apache.beam.sdk.io.FileIO.Write) FileFormatOptions(com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions) ZoneOffset(java.time.ZoneOffset) Nullable(javax.annotation.Nullable) MapElements(org.apache.beam.sdk.transforms.MapElements) GenericRecord(org.apache.avro.generic.GenericRecord) KvCoder(org.apache.beam.sdk.coders.KvCoder) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) ImmutableMap(com.google.common.collect.ImmutableMap) LogicalType(org.apache.avro.LogicalType) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReadableInstant(org.joda.time.ReadableInstant) Instant(java.time.Instant) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) ParquetIO(org.apache.beam.sdk.io.parquet.ParquetIO) ZoneId(java.time.ZoneId) List(java.util.List) SchemaUtils(com.google.cloud.teleport.v2.utils.SchemaUtils) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) AvroSinkWithJodaDatesConversion(com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion) ZoneId(java.time.ZoneId) Schema(org.apache.avro.Schema) KV(org.apache.beam.sdk.values.KV) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) 
ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

AvroSinkWithJodaDatesConversion (com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion)1 FileFormatOptions (com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions)1 SchemaUtils (com.google.cloud.teleport.v2.utils.SchemaUtils)1 PartitionMetadata (com.google.cloud.teleport.v2.values.PartitionMetadata)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 Instant (java.time.Instant)1 ZoneId (java.time.ZoneId)1 ZoneOffset (java.time.ZoneOffset)1 ZonedDateTime (java.time.ZonedDateTime)1 List (java.util.List)1 Function (java.util.function.Function)1 Collectors (java.util.stream.Collectors)1 Nullable (javax.annotation.Nullable)1 LogicalType (org.apache.avro.LogicalType)1 LogicalTypes (org.apache.avro.LogicalTypes)1 Schema (org.apache.avro.Schema)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 KvCoder (org.apache.beam.sdk.coders.KvCoder)1