Use of org.apache.beam.sdk.io.common.FileBasedIOITHelper.DeleteFileFn in project beam by apache.
From class AvroIOIT, method writeThenReadAll:
@Test
public void writeThenReadAll() {
  PCollection<String> testFilenames =
      pipeline
          .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
          .apply(
              "Produce text lines",
              ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
          .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
          .setCoder(AvroCoder.of(AVRO_SCHEMA))
          .apply("Collect start time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "writeStart")))
          .apply(
              "Write Avro records to files",
              AvroIO.writeGenericRecords(AVRO_SCHEMA)
                  .to(filenamePrefix)
                  .withOutputFilenames()
                  .withSuffix(".avro"))
          .getPerDestinationOutputFilenames()
          .apply("Collect middle time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "middlePoint")))
          .apply(Values.create());

  PCollection<String> consolidatedHashcode =
      testFilenames
          .apply("Match all files", FileIO.matchAll())
          .apply(
              "Read matches",
              FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
          .apply("Read files", AvroIO.readFilesGenericRecords(AVRO_SCHEMA))
          .apply("Collect end time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "endPoint")))
          .apply("Parse Avro records to Strings", ParDo.of(new ParseAvroRecordsFn()))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()));

  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  testFilenames.apply(
      "Delete test files",
      ParDo.of(new DeleteFileFn())
          .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

  PipelineResult result = pipeline.run();
  result.waitUntilFinish();
  collectAndPublishMetrics(result);
}
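In each of these tests, DeleteFileFn receives the filenames (or a file pattern) emitted upstream and removes the test files once verification has completed. Below is a minimal sketch of such a DoFn, assuming it resolves each element as a glob via Beam's FileSystems API and deletes every match; the actual helper in the Beam repository may differ in detail.

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.transforms.DoFn;

/** Deletes all files matching the incoming filename or glob pattern. */
public static class DeleteFileFn extends DoFn<String, Void> {

  @ProcessElement
  public void processElement(ProcessContext c) throws IOException {
    // Resolve the element (a concrete filename or a glob) to resource ids.
    MatchResult match =
        FileSystems.match(Collections.singletonList(c.element())).get(0);

    Set<ResourceId> resourceIds = new HashSet<>();
    for (MatchResult.Metadata metadata : match.metadata()) {
      resourceIds.add(metadata.resourceId());
    }

    // Remove the matched test files.
    FileSystems.delete(resourceIds);
  }
}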
Use of org.apache.beam.sdk.io.common.FileBasedIOITHelper.DeleteFileFn in project beam by apache.
From class TextIOIT, method writeThenReadAll:
@Test
public void writeThenReadAll() {
  TextIO.TypedWrite<String, Object> write =
      TextIO.write()
          .to(filenamePrefix)
          .withOutputFilenames()
          .withCompression(compressionType);
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }

  PCollection<String> testFilenames =
      pipeline
          .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
          .apply(
              "Produce text lines",
              ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
          .apply(
              "Collect write start time",
              ParDo.of(new TimeMonitor<>(FILEIOIT_NAMESPACE, "startTime")))
          .apply("Write content to files", write)
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(
              "Collect write end time",
              ParDo.of(new TimeMonitor<>(FILEIOIT_NAMESPACE, "middleTime")));

  PCollection<String> consolidatedHashcode =
      testFilenames
          .apply("Match all files", FileIO.matchAll())
          .apply(
              "Read matches",
              FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
          .apply("Read files", TextIO.readFiles())
          .apply(
              "Collect read end time",
              ParDo.of(new TimeMonitor<>(FILEIOIT_NAMESPACE, "endTime")))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()));

  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  testFilenames.apply(
      "Delete test files",
      ParDo.of(new DeleteFileFn())
          .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

  PipelineResult result = pipeline.run();
  result.waitUntilFinish();
  collectAndPublishMetrics(result);
}
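Note the pattern both tests share: consolidatedHashcode is attached to the delete step purely as a side input, which forces the runner to compute the hash (and thus complete the verified read) before DeleteFileFn can run. A stripped-down sketch of the same ordering idiom, where deleteAfter and gate are hypothetical names introduced only for illustration:

import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

// Hypothetical helper: `gate` is consumed only as a side input,
// so deletion cannot start until the gate value has materialized.
static PCollection<Void> deleteAfter(
    PCollection<String> filenames, PCollection<String> gate) {
  PCollectionView<String> gateView = gate.apply(View.asSingleton());
  return filenames.apply(
      "Delete test files",
      ParDo.of(new DeleteFileFn()).withSideInputs(gateView));
}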
Use of org.apache.beam.sdk.io.common.FileBasedIOITHelper.DeleteFileFn in project beam by apache.
From class TFRecordIOIT, method writeThenReadAll:
// TODO: There are two pipelines due to: https://issues.apache.org/jira/browse/BEAM-3267
@Test
public void writeThenReadAll() {
  final TFRecordIO.Write writeTransform =
      TFRecordIO.write()
          .to(filenamePrefix)
          .withCompression(compressionType)
          .withSuffix(".tfrecord");

  writePipeline
      .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
      .apply(
          "Produce text lines",
          ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
      .apply("Transform strings to bytes", MapElements.via(new StringToByteArray()))
      .apply(
          "Record time before writing",
          ParDo.of(new TimeMonitor<>(TFRECORD_NAMESPACE, WRITE_TIME)))
      .apply("Write content to files", writeTransform);

  final PipelineResult writeResult = writePipeline.run();
  writeResult.waitUntilFinish();

  String filenamePattern = createFilenamePattern();

  PCollection<String> consolidatedHashcode =
      readPipeline
          .apply(TFRecordIO.read().from(filenamePattern).withCompression(AUTO))
          .apply(
              "Record time after reading",
              ParDo.of(new TimeMonitor<>(TFRECORD_NAMESPACE, READ_TIME)))
          .apply("Transform bytes to strings", MapElements.via(new ByteArrayToString()))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()))
          .apply(Reshuffle.viaRandomKey());

  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  readPipeline
      .apply(Create.of(filenamePattern))
      .apply(
          "Delete test files",
          ParDo.of(new DeleteFileFn())
              .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

  final PipelineResult readResult = readPipeline.run();
  readResult.waitUntilFinish();
  collectAndPublishMetrics(writeResult, readResult);
}
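TFRecordIO reads and writes raw byte[] records, which is why this test converts Strings to bytes before writing and back again after reading. Plausible sketches of the two SimpleFunctions, assuming a UTF-8 round trip so the read-side hash matches the write side; the actual classes in TFRecordIOIT may differ:

import java.nio.charset.StandardCharsets;
import org.apache.beam.sdk.transforms.SimpleFunction;

/** Encodes each test line as UTF-8 bytes for TFRecordIO, which writes byte[] records. */
static class StringToByteArray extends SimpleFunction<String, byte[]> {
  @Override
  public byte[] apply(String input) {
    return input.getBytes(StandardCharsets.UTF_8);
  }
}

/** Decodes the bytes back to Strings so the consolidated hash is comparable. */
static class ByteArrayToString extends SimpleFunction<byte[], String> {
  @Override
  public String apply(byte[] input) {
    return new String(input, StandardCharsets.UTF_8);
  }
}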