Use of `org.apache.beam.sdk.io.FileIO.Write` in the project dataflow-pipelines by baeminbo.
The following snippet is the `main` method of the class `DynamicGcsWritePipeline`.
/**
 * Builds and runs a pipeline that writes generated Long elements to a dynamic
 * number of GCS destinations, chosen by hashing each element modulo a
 * runtime-provided destination count (supplied as a side input).
 */
public static void main(String[] args) {
  PipelineOptionsFactory.register(Options.class);
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Singleton side input carrying the destination count; resolved at run time
  // so the same graph works with a templated/provider-supplied value.
  PCollectionView<Integer> destinationCountView =
      pipeline
          .apply("DestinationCount",
              Create.ofProvider(options.getDestinationCount(), VarIntCoder.of()))
          .apply("DestinationCountView", View.asSingleton());

  pipeline
      .apply("Read", Read.from(new Input(options.getInputElementCount())))
      .apply("Write",
          FileIO.<Long, Long>writeDynamic()
              .by(Contextful.fn(new Fn<Long, Long>() {
                // FIX: marked transient. MessageDigest is not Serializable, and this
                // anonymous Fn is serialized into the pipeline graph; a non-transient
                // non-serializable field would throw NotSerializableException if it
                // were ever non-null at submission time. The lazy init below
                // re-creates it on each worker after deserialization.
                private transient MessageDigest hasher;

                /**
                 * Hashes the element with SHA-256 and folds the digest to a long.
                 *
                 * @throws NoSuchAlgorithmException never in practice — SHA-256 is a
                 *     mandatory JCA algorithm.
                 */
                private long hash(long element) throws NoSuchAlgorithmException {
                  if (hasher == null) {
                    hasher = MessageDigest.getInstance("SHA-256");
                  }
                  byte[] inputBytes = ByteBuffer.allocate(Long.BYTES).putLong(element).array();
                  byte[] hashBytes = hasher.digest(inputBytes);
                  // Take the first 8 bytes of the 32-byte digest as the hash value.
                  return ByteBuffer.wrap(hashBytes).getLong();
                }

                @Override
                public Long apply(Long element, Context c) throws Exception {
                  Integer destinationCount = c.sideInput(destinationCountView);
                  long hash = hash(element);
                  // floorMod keeps the destination in [0, destinationCount) even
                  // when the hash is negative.
                  return Math.floorMod(hash, (long) destinationCount);
                }
              }, Requirements.requiresSideInputs(destinationCountView)))
              .withDestinationCoder(VarLongCoder.of())
              .withNumShards(options.getShardCount())
              // Name output files "<destination>-of-<count>" with no suffix; the
              // naming Fn also needs the side input to know the total count.
              .withNaming(Contextful.fn((Fn<Long, FileNaming>) (destination, context) -> {
                Integer destinationCount = context.sideInput(destinationCountView);
                return Write.defaultNaming(
                    String.format("%08d-of-%08d", destination, destinationCount), "");
              }, Requirements.requiresSideInputs(destinationCountView)))
              .via(Contextful.fn((SerializableFunction<Long, String>) Object::toString),
                  TextIO.sink())
              .to(options.getOutputLocation()));
  pipeline.run();
}
Use of `org.apache.beam.sdk.io.FileIO.Write` in the project DataflowTemplates by GoogleCloudPlatform.
The following snippet is the `expand` method of the class `GenericRecordsToGcsPartitioned`.
/**
 * Writes the input GenericRecords to GCS as Parquet or Avro files, optionally
 * partitioned by a timestamp-like column, and returns one PartitionMetadata
 * per written partition (or a single synthetic partition when unpartitioned).
 */
@Override
public PCollection<PartitionMetadata> expand(PCollection<GenericRecord> input) {
Schema schema = SchemaUtils.parseAvroSchema(serializedAvroSchema);
// Pick the file sink matching the configured output format.
Sink<GenericRecord> sink;
switch(outputFileFormat) {
case PARQUET:
sink = ParquetIO.sink(schema);
break;
case AVRO:
// Project-local sink; presumably converts Joda date types for Avro — confirm in its source.
sink = new AvroSinkWithJodaDatesConversion<>(schema);
break;
default:
throw new UnsupportedOperationException("Output format is not implemented: " + outputFileFormat);
}
// No partition configuration: write everything to one location and emit a
// single PartitionMetadata with the placeholder value "1".
if (partitionColumnName == null || partitioningSchema == null) {
LOG.info("PartitionColumnName or/and PartitioningSchema not provided. " + "Writing to GCS without partition");
return input.apply("Write to Storage with No Partition", FileIO.<GenericRecord>write().withSuffix(outputFileFormat.getFileSuffix()).via(sink).to(gcsPath)).getPerDestinationOutputFilenames().apply("MapFileNames", MapElements.into(TypeDescriptors.strings()).via((SerializableFunction<KV<Void, String>, String>) KV::getValue)).apply(MapElements.via(new SimpleFunction<String, PartitionMetadata>() {
@Override
public PartitionMetadata apply(String path) {
// Partition location is the directory containing the written file.
return PartitionMetadata.builder().setValues(ImmutableList.of("1")).setLocation(withoutFileName(path)).build();
}
}));
}
// Partitioned write: the destination is a List<KV<partitionName, value>>
// derived from the partition column's timestamp, interpreted in the
// schema-derived time zone. partitionColumnValueToMillis and
// partitioningSchema.toPartition are defined elsewhere in this class —
// see their definitions for the exact conversion semantics.
ZoneId zoneId = getZoneId(schema);
return input.apply(FileIO.<List<KV<String, Integer>>, GenericRecord>writeDynamic().by((GenericRecord r) -> partitioningSchema.toPartition(Instant.ofEpochMilli(partitionColumnValueToMillis(r.get(partitionColumnName))).atZone(zoneId))).withDestinationCoder(ListCoder.of(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))).via(sink).to(gcsPath).withNumShards(// must be 1 as we can only have 1 file per Dataplex partition
1).withNaming(p -> Write.defaultNaming(partitionToPath(p), outputFileFormat.getFileSuffix()))).getPerDestinationOutputFilenames().apply(MapElements.via(new SimpleFunction<KV<List<KV<String, Integer>>, String>, PartitionMetadata>() {
@Override
public PartitionMetadata apply(KV<List<KV<String, Integer>>, String> partitionAndPath) {
// Defensive checks: both the partition key and the output path must be present.
if (partitionAndPath.getKey() == null) {
throw new IllegalStateException("Partition is null for path " + partitionAndPath.getValue());
}
if (partitionAndPath.getValue() == null) {
throw new IllegalStateException("Path is null for partition " + partitionAndPath.getKey());
}
// Metadata values are the partition's integer components, stringified;
// the location is the file's parent directory.
return PartitionMetadata.builder().setValues(partitionAndPath.getKey().stream().map(e -> String.valueOf(e.getValue())).collect(toImmutableList())).setLocation(withoutFileName(partitionAndPath.getValue())).build();
}
}));
}
Aggregations