Example 1 with Write

Use of org.apache.beam.sdk.io.FileIO.Write in project dataflow-pipelines by baeminbo.

The main method of the DynamicGcsWritePipeline class:

public static void main(String[] args) {
    PipelineOptionsFactory.register(Options.class);
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);
    PCollectionView<Integer> destinationCountView =
        pipeline
            .apply("DestinationCount",
                Create.ofProvider(options.getDestinationCount(), VarIntCoder.of()))
            .apply("DestinationCountView", View.asSingleton());
    pipeline
        .apply("Read", Read.from(new Input(options.getInputElementCount())))
        .apply("Write", FileIO.<Long, Long>writeDynamic()
            .by(Contextful.fn(new Fn<Long, Long>() {

                private MessageDigest hasher;

                private long hash(long element) throws NoSuchAlgorithmException {
                    if (hasher == null) {
                        hasher = MessageDigest.getInstance("SHA-256");
                    }
                    byte[] inputBytes = ByteBuffer.allocate(Long.BYTES).putLong(element).array();
                    byte[] hashBytes = hasher.digest(inputBytes);
                    // Take the first 8 bytes of the SHA-256 digest as a long.
                    return ByteBuffer.wrap(hashBytes).getLong();
                }

                @Override
                public Long apply(Long element, Context c) throws Exception {
                    Integer destinationCount = c.sideInput(destinationCountView);
                    long hash = hash(element);
                    // floorMod keeps the bucket index non-negative even for negative hashes.
                    return Math.floorMod(hash, (long) destinationCount);
                }
            }, Requirements.requiresSideInputs(destinationCountView)))
            .withDestinationCoder(VarLongCoder.of())
            .withNumShards(options.getShardCount())
            .withNaming(Contextful.fn((Fn<Long, FileNaming>) (destination, context) -> {
                Integer destinationCount = context.sideInput(destinationCountView);
                return Write.defaultNaming(
                    String.format("%08d-of-%08d", destination, destinationCount), "");
            }, Requirements.requiresSideInputs(destinationCountView)))
            .via(Contextful.fn((SerializableFunction<Long, String>) Object::toString),
                TextIO.sink())
            .to(options.getOutputLocation()));
    pipeline.run();
}
Also used : FileIO(org.apache.beam.sdk.io.FileIO) MessageDigest(java.security.MessageDigest) Default(org.apache.beam.sdk.options.Default) Coder(org.apache.beam.sdk.coders.Coder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) ByteBuffer(java.nio.ByteBuffer) Description(org.apache.beam.sdk.options.Description) Contextful(org.apache.beam.sdk.transforms.Contextful) Read(org.apache.beam.sdk.io.Read) Create(org.apache.beam.sdk.transforms.Create) Write(org.apache.beam.sdk.io.FileIO.Write) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ValueProvider(org.apache.beam.sdk.options.ValueProvider) CountingSource(org.apache.beam.sdk.io.CountingSource) IOException(java.io.IOException) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) FileNaming(org.apache.beam.sdk.io.FileIO.Write.FileNaming) List(java.util.List) BoundedSource(org.apache.beam.sdk.io.BoundedSource) Fn(org.apache.beam.sdk.transforms.Contextful.Fn) PCollectionView(org.apache.beam.sdk.values.PCollectionView) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) Requirements(org.apache.beam.sdk.transforms.Requirements) Collections(java.util.Collections) TextIO(org.apache.beam.sdk.io.TextIO)
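For orientation, the writeDynamic() chain above is easier to digest with the side-input plumbing stripped away. Below is a minimal, self-contained sketch of the same FileIO.Write dynamic-destinations pattern; the bucket count, output directory, and class name are placeholders invented for this sketch, since the original pipeline reads the destination count from a runtime side input instead.

import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Contextful;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.SerializableFunction;

public class MinimalDynamicWrite {

    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        pipeline
            .apply("Numbers",
                Create.of(LongStream.range(0, 100).boxed().collect(Collectors.toList())))
            .apply("Write", FileIO.<Integer, Long>writeDynamic()
                // Route each element to one of 10 buckets; the fixed count is a
                // made-up placeholder (the original derives it from a side input).
                .by(n -> (int) (n % 10))
                .withDestinationCoder(VarIntCoder.of())
                .via(Contextful.fn((SerializableFunction<Long, String>) Object::toString),
                    TextIO.sink())
                // Placeholder output directory.
                .to("/tmp/dynamic-write-demo")
                .withNaming(bucket -> FileIO.Write.defaultNaming("bucket-" + bucket, ".txt"))
                .withNumShards(1));
        pipeline.run().waitUntilFinish();
    }
}

This sketch can use the plain SerializableFunction overloads of by() and withNaming() because nothing depends on a side input; the original example needs the Contextful overloads plus Requirements.requiresSideInputs precisely because the destination count is only known when the pipeline runs.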

Example 2 with Write

Use of org.apache.beam.sdk.io.FileIO.Write in project DataflowTemplates by GoogleCloudPlatform.

The expand method of the GenericRecordsToGcsPartitioned class:

@Override
public PCollection<PartitionMetadata> expand(PCollection<GenericRecord> input) {
    Schema schema = SchemaUtils.parseAvroSchema(serializedAvroSchema);
    Sink<GenericRecord> sink;
    switch (outputFileFormat) {
        case PARQUET:
            sink = ParquetIO.sink(schema);
            break;
        case AVRO:
            sink = new AvroSinkWithJodaDatesConversion<>(schema);
            break;
        default:
            throw new UnsupportedOperationException("Output format is not implemented: " + outputFileFormat);
    }
    if (partitionColumnName == null || partitioningSchema == null) {
        LOG.info(
            "PartitionColumnName and/or PartitioningSchema not provided. "
                + "Writing to GCS without partition.");
        return input
            .apply("Write to Storage with No Partition",
                FileIO.<GenericRecord>write()
                    .withSuffix(outputFileFormat.getFileSuffix())
                    .via(sink)
                    .to(gcsPath))
            .getPerDestinationOutputFilenames()
            .apply("MapFileNames",
                MapElements.into(TypeDescriptors.strings())
                    .via((SerializableFunction<KV<Void, String>, String>) KV::getValue))
            .apply(MapElements.via(new SimpleFunction<String, PartitionMetadata>() {

                @Override
                public PartitionMetadata apply(String path) {
                    // Unpartitioned output is registered as a single partition with value "1".
                    return PartitionMetadata.builder()
                        .setValues(ImmutableList.of("1"))
                        .setLocation(withoutFileName(path))
                        .build();
                }
            }));
    }
    ZoneId zoneId = getZoneId(schema);
    return input
        .apply(FileIO.<List<KV<String, Integer>>, GenericRecord>writeDynamic()
            .by((GenericRecord r) ->
                partitioningSchema.toPartition(
                    Instant.ofEpochMilli(partitionColumnValueToMillis(r.get(partitionColumnName)))
                        .atZone(zoneId)))
            .withDestinationCoder(ListCoder.of(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())))
            .via(sink)
            .to(gcsPath)
            // Must be 1, as there can be only one file per Dataplex partition.
            .withNumShards(1)
            .withNaming(p -> Write.defaultNaming(partitionToPath(p), outputFileFormat.getFileSuffix())))
        .getPerDestinationOutputFilenames()
        .apply(MapElements.via(
            new SimpleFunction<KV<List<KV<String, Integer>>, String>, PartitionMetadata>() {

                @Override
                public PartitionMetadata apply(KV<List<KV<String, Integer>>, String> partitionAndPath) {
                    if (partitionAndPath.getKey() == null) {
                        throw new IllegalStateException(
                            "Partition is null for path " + partitionAndPath.getValue());
                    }
                    if (partitionAndPath.getValue() == null) {
                        throw new IllegalStateException(
                            "Path is null for partition " + partitionAndPath.getKey());
                    }
                    return PartitionMetadata.builder()
                        .setValues(
                            partitionAndPath.getKey().stream()
                                .map(e -> String.valueOf(e.getValue()))
                                .collect(toImmutableList()))
                        .setLocation(withoutFileName(partitionAndPath.getValue()))
                        .build();
                }
            }));
}
Also used : FileIO(org.apache.beam.sdk.io.FileIO) KV(org.apache.beam.sdk.values.KV) ZonedDateTime(java.time.ZonedDateTime) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) Sink(org.apache.beam.sdk.io.FileIO.Sink) LoggerFactory(org.slf4j.LoggerFactory) ListCoder(org.apache.beam.sdk.coders.ListCoder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Function(java.util.function.Function) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) ImmutableList(com.google.common.collect.ImmutableList) LogicalTypes(org.apache.avro.LogicalTypes) Write(org.apache.beam.sdk.io.FileIO.Write) FileFormatOptions(com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions) ZoneOffset(java.time.ZoneOffset) Nullable(javax.annotation.Nullable) MapElements(org.apache.beam.sdk.transforms.MapElements) GenericRecord(org.apache.avro.generic.GenericRecord) KvCoder(org.apache.beam.sdk.coders.KvCoder) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) ImmutableMap(com.google.common.collect.ImmutableMap) LogicalType(org.apache.avro.LogicalType) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReadableInstant(org.joda.time.ReadableInstant) Instant(java.time.Instant) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) ParquetIO(org.apache.beam.sdk.io.parquet.ParquetIO) ZoneId(java.time.ZoneId) List(java.util.List) SchemaUtils(com.google.cloud.teleport.v2.utils.SchemaUtils) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) AvroSinkWithJodaDatesConversion(com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion)
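The snippet above passes partitionToPath(p) to Write.defaultNaming to turn a destination (the List<KV<String, Integer>> produced by partitioningSchema.toPartition) into a path prefix, but that helper is not included in the excerpt. Purely as a hypothetical illustration of what such a helper could look like, not the actual DataflowTemplates implementation, a Hive-style rendering might be:

import java.util.List;
import org.apache.beam.sdk.values.KV;

class PartitionPaths {

    // Hypothetical sketch only: the real partitionToPath in
    // GenericRecordsToGcsPartitioned is not shown in this excerpt.
    static String partitionToPath(List<KV<String, Integer>> partition) {
        // e.g. [("year", 2024), ("month", 5)] -> "year=2024/month=5/output"
        StringBuilder path = new StringBuilder();
        for (KV<String, Integer> field : partition) {
            path.append(field.getKey()).append('=').append(field.getValue()).append('/');
        }
        return path.append("output").toString();
    }
}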
