use of org.apache.beam.sdk.io.FileIO.ReadableFile in project beam by apache.
the class TFRecordIOTest method testReadFilesNamed.
@Test
public void testReadFilesNamed() {
readPipeline.enableAbandonedNodeEnforcement(false);
Metadata metadata =
    Metadata.builder()
        .setResourceId(FileSystems.matchNewResource("file", false))
        .setIsReadSeekEfficient(true)
        .setSizeBytes(1024)
        .build();
Create.Values<ReadableFile> create = Create.of(new ReadableFile(metadata, Compression.AUTO));
assertEquals(
    "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
    readPipeline.apply(create).apply(TFRecordIO.readFiles()).getName());
assertEquals(
    "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
    readPipeline.apply(create).apply("MyRead", TFRecordIO.readFiles()).getName());
}
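The test builds a ReadableFile directly from hand-constructed Metadata so that no file system access is needed. In a real pipeline, ReadableFile elements are usually produced by FileIO.match() followed by FileIO.readMatches(); a minimal sketch of that pattern (the file pattern below is a placeholder, not from the test):
Pipeline p = Pipeline.create();
// Each matched file is decompressed as needed and expanded into its TFRecord payloads.
PCollection<byte[]> records =
    p.apply("MatchFiles", FileIO.match().filepattern("gs://my-bucket/data/*.tfrecord"))
        .apply("ReadMatches", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply("ReadTFRecords", TFRecordIO.readFiles());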
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class DLPTextToBigQueryStreaming method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(TokenizePipelineOptions options) {
// Create the pipeline
Pipeline p = Pipeline.create(options);
/*
* Steps:
* 1) Read from the text source continuously at the default interval (e.g. every 30 seconds).
* - Set up a 30-second window to capture the list of files emitted.
* - Group by file name as key and ReadableFile as a value.
* 2) Output each readable file for content processing.
* 3) Split file contents based on batch size for parallel processing.
* 4) Process each split as a DLP table content request to invoke API.
* 5) Convert DLP Table Rows to BQ Table Row.
* 6) Create dynamic table and insert successfully converted records into BQ.
*/
PCollection<KV<String, Iterable<ReadableFile>>> csvFiles =
    p.apply("Poll Input Files",
            FileIO.match()
                .filepattern(options.getInputFilePattern())
                .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
        .apply("Find Pattern Match", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply("Add File Name as Key", WithKeys.of(file -> getFileName(file)))
        .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()))
        .apply("Fixed Window(30 Sec)",
            Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                .triggering(Repeatedly.forever(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.ZERO)))
                .discardingFiredPanes()
                .withAllowedLateness(Duration.ZERO))
        .apply(GroupByKey.create());
PCollection<KV<String, TableRow>> bqDataMap =
    csvFiles
        .apply("File Handler",
            ParDo.of(new DoFn<KV<String, Iterable<ReadableFile>>, KV<String, ReadableFile>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                String fileKey = c.element().getKey();
                c.element().getValue().forEach(file -> c.output(KV.of(fileKey, file)));
              }
            }))
        .apply("Process File Contents",
            ParDo.of(new CSVReader(
                NestedValueProvider.of(
                    options.getBatchSize(),
                    batchSize -> (batchSize != null) ? batchSize : DEFAULT_BATCH_SIZE))))
        .apply("DLP-Tokenization",
            ParDo.of(new DLPTokenizationDoFn(
                options.getDlpProjectId(),
                options.getDeidentifyTemplateName(),
                options.getInspectTemplateName())))
        .apply("Process Tokenized Data", ParDo.of(new TableRowProcessorDoFn()));
// 6) Create dynamic table and insert successfully converted records into BQ.
bqDataMap.apply(
    "Write To BQ",
    BigQueryIO.<KV<String, TableRow>>write()
        .to(new BQDestination(options.getDatasetName(), options.getDlpProjectId()))
        .withFormatFunction(element -> element.getValue())
        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
        .withoutValidation()
        .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
return p.run();
}
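The getFileName helper used in the "Add File Name as Key" step is not shown in this excerpt. A hypothetical sketch of such a helper (the sanitization rule here is an illustrative assumption, not the template's actual implementation):
private static String getFileName(FileIO.ReadableFile file) {
  // Last path component of the matched file, e.g. "orders.csv".
  String name = file.getMetadata().resourceId().getFilename();
  // Drop the extension and replace characters that are awkward in downstream table names.
  String base = name.contains(".") ? name.substring(0, name.lastIndexOf('.')) : name;
  return base.replaceAll("[^A-Za-z0-9_]", "_");
}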
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamIO method expandDataStreamJsonStrings.
public PCollection<FailsafeElement<String, String>> expandDataStreamJsonStrings(PCollection<ReadableFile> datastreamFiles) {
PCollection<FailsafeElement<String, String>> datastreamRecords;
FailsafeElementCoder<String, String> coder =
    FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
if (this.fileType.equals(JSON_SUFFIX)) {
datastreamRecords =
    datastreamFiles
        .apply("FileReadConcurrency",
            Reshuffle.<ReadableFile>viaRandomKey().withNumBuckets(fileReadConcurrency))
        .apply("ReadFiles", TextIO.readFiles())
        .apply("ReshuffleRecords", Reshuffle.viaRandomKey())
        .apply("ParseJsonRecords",
            ParDo.of(
                FormatDatastreamJsonToJson.create()
                    .withStreamName(this.streamName)
                    .withHashColumnValues(this.hashedColumns)
                    .withLowercaseSourceColumns(this.lowercaseSourceColumns)))
        .setCoder(coder);
} else {
SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn =
    FormatDatastreamRecordToJson.create()
        .withStreamName(this.streamName)
        .withHashColumnValues(this.hashedColumns)
        .withLowercaseSourceColumns(this.lowercaseSourceColumns);
datastreamRecords =
    datastreamFiles
        .apply("ReshuffleFiles", Reshuffle.<ReadableFile>viaRandomKey())
        .apply("ParseAvroRows",
            ParDo.of(
                new ReadFileRangesFn<FailsafeElement<String, String>>(
                    new CreateParseSourceFn(parseFn, coder),
                    new ReadFileRangesFn.ReadFileRangesFnExceptionHandler())))
        .setCoder(coder);
}
return datastreamRecords.apply("Reshuffle", Reshuffle.viaRandomKey());
}
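CreateParseSourceFn, passed to ReadFileRangesFn above, is also not shown in this excerpt. A plausible shape, assuming it wraps Beam's AvroSource with the supplied parse function and coder:
private static class CreateParseSourceFn
    implements SerializableFunction<String, FileBasedSource<FailsafeElement<String, String>>> {

  private final SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn;
  private final Coder<FailsafeElement<String, String>> coder;

  CreateParseSourceFn(
      SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn,
      Coder<FailsafeElement<String, String>> coder) {
    this.parseFn = parseFn;
    this.coder = coder;
  }

  @Override
  public FileBasedSource<FailsafeElement<String, String>> apply(String input) {
    // Build an Avro source over the file path and convert each GenericRecord
    // into a FailsafeElement via parseFn.
    return AvroSource.from(input).withParseFn(parseFn, coder);
  }
}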
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class ReadFileRangesFn method process.
@ProcessElement
public void process(ProcessContext c) throws IOException {
ReadableFile file = c.element();
FileBasedSource<T> source =
    CompressedSource.from(createSource.apply(file.getMetadata().resourceId().toString()))
        .withCompression(file.getCompression());
try (BoundedSource.BoundedReader<T> reader = source.createReader(c.getPipelineOptions())) {
for (boolean more = reader.start(); more; more = reader.advance()) {
c.output(reader.getCurrent());
}
} catch (RuntimeException e) {
if (exceptionHandler.apply(file, null, e)) {
throw e;
}
}
}
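The exceptionHandler consulted in the catch block decides whether a read failure is rethrown. Assuming the handler mirrors Beam's ReadAllViaFileBasedSource.ReadFileRangesFnExceptionHandler (apply returns true to rethrow), a hedged sketch of a variant that logs and skips unreadable files instead of failing the bundle:
static class SkipBadFilesHandler extends ReadFileRangesFn.ReadFileRangesFnExceptionHandler {

  private static final Logger LOG = LoggerFactory.getLogger(SkipBadFilesHandler.class);

  @Override
  public boolean apply(FileIO.ReadableFile file, OffsetRange range, Exception e) {
    // Returning false swallows the exception; the file's records are simply dropped.
    LOG.warn("Skipping unreadable file {}", file.getMetadata().resourceId(), e);
    return false;
  }
}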
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project dataflow-pipelines by baeminbo.
the class UrnNotFoundPipeline method main.
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
Pipeline pipeline = Pipeline.create(options);
pipeline
    .apply(FileIO.match()
        .filepattern("gs://apache-beam-samples/shakespeare/*")
        .continuously(Duration.standardMinutes(1), never()))
    .apply(FileIO.readMatches())
    .apply(ParDo.of(new DoFn<ReadableFile, String>() {
      @ProcessElement
      public void processElement(ProcessContext context) throws IOException {
        ReadableFile file = context.element();
        try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(Channels.newInputStream(file.open())))) {
          reader.lines()
              .flatMap(s -> Arrays.stream(s.split("[^\\p{L}]+")))
              .forEach(context::output);
        }
      }
    }))
    .apply(Window.into(FixedWindows.of(Duration.standardSeconds(10))))
    .apply(Count.perElement())
    .apply(ParDo.of(new DoFn<KV<String, Long>, Void>() {
      @ProcessElement
      public void processElement(ProcessContext context, BoundedWindow window) {
        LOG.info("[{}] {}: {}", window, context.element().getKey(), context.element().getValue());
      }
    }));
pipeline.run();
}
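For small files, the line-by-line channel reading in the first DoFn can be replaced with ReadableFile.readFullyAsUTF8String(), which loads the whole file into memory. A minimal sketch of the same word-splitting logic using that method:
ParDo.of(new DoFn<ReadableFile, String>() {
  @ProcessElement
  public void processElement(ProcessContext context) throws IOException {
    // Read the entire file as UTF-8 and emit each word.
    String contents = context.element().readFullyAsUTF8String();
    Arrays.stream(contents.split("[^\\p{L}]+")).forEach(context::output);
  }
})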