use of org.apache.beam.sdk.io.FileIO.ReadableFile in project beam by apache.
the class TFRecordIOTest method testReadFilesNamed.
@Test
public void testReadFilesNamed() {
readPipeline.enableAbandonedNodeEnforcement(false);
Metadata metadata =
    Metadata.builder()
        .setResourceId(FileSystems.matchNewResource("file", false))
        .setIsReadSeekEfficient(true)
        .setSizeBytes(1024)
        .build();
Create.Values<ReadableFile> create = Create.of(new ReadableFile(metadata, Compression.AUTO));
assertEquals(
    "TFRecordIO.ReadFiles/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
    readPipeline.apply(create).apply(TFRecordIO.readFiles()).getName());
assertEquals(
    "MyRead/Read all via FileBasedSource/Read ranges/ParMultiDo(ReadFileRanges).output",
    readPipeline.apply(create).apply("MyRead", TFRecordIO.readFiles()).getName());
}
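The test builds a ReadableFile directly from hand-constructed Metadata so that no file system access is needed. In a real pipeline, ReadableFile elements are usually produced by FileIO.match() followed by FileIO.readMatches(); a minimal sketch of that pattern (the file pattern below is a placeholder, not from the test):
Pipeline p = Pipeline.create();
// Each matched file is decompressed as needed and expanded into its TFRecord payloads.
PCollection<byte[]> records =
    p.apply("MatchFiles", FileIO.match().filepattern("gs://my-bucket/data/*.tfrecord"))
        .apply("ReadMatches", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply("ReadTFRecords", TFRecordIO.readFiles());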
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class DLPTextToBigQueryStreaming method run.
/**
* Runs the pipeline with the supplied options.
*
* @param options The execution parameters to the pipeline.
* @return The result of the pipeline execution.
*/
public static PipelineResult run(TokenizePipelineOptions options) {
// Create the pipeline
Pipeline p = Pipeline.create(options);
/*
* Steps:
* 1) Read from the text source continuously at the default interval (e.g. every 30 seconds).
* - Set up a 30-second window to capture the list of files emitted.
* - Group by file name as key and ReadableFile as a value.
* 2) Output each readable file for content processing.
* 3) Split file contents based on batch size for parallel processing.
* 4) Process each split as a DLP table content request to invoke API.
* 5) Convert DLP Table Rows to BQ Table Row.
* 6) Create dynamic table and insert successfully converted records into BQ.
*/
PCollection<KV<String, Iterable<ReadableFile>>> csvFiles =
    p.apply("Poll Input Files",
            FileIO.match()
                .filepattern(options.getInputFilePattern())
                .continuously(DEFAULT_POLL_INTERVAL, Watch.Growth.never()))
        .apply("Find Pattern Match", FileIO.readMatches().withCompression(Compression.AUTO))
        .apply("Add File Name as Key", WithKeys.of(file -> getFileName(file)))
        .setCoder(KvCoder.of(StringUtf8Coder.of(), ReadableFileCoder.of()))
        .apply("Fixed Window(30 Sec)",
            Window.<KV<String, ReadableFile>>into(FixedWindows.of(WINDOW_INTERVAL))
                .triggering(Repeatedly.forever(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.ZERO)))
                .discardingFiredPanes()
                .withAllowedLateness(Duration.ZERO))
        .apply(GroupByKey.create());
PCollection<KV<String, TableRow>> bqDataMap =
    csvFiles
        .apply("File Handler",
            ParDo.of(new DoFn<KV<String, Iterable<ReadableFile>>, KV<String, ReadableFile>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                String fileKey = c.element().getKey();
                c.element().getValue().forEach(file -> c.output(KV.of(fileKey, file)));
              }
            }))
        .apply("Process File Contents",
            ParDo.of(new CSVReader(
                NestedValueProvider.of(
                    options.getBatchSize(),
                    batchSize -> (batchSize != null) ? batchSize : DEFAULT_BATCH_SIZE))))
        .apply("DLP-Tokenization",
            ParDo.of(new DLPTokenizationDoFn(
                options.getDlpProjectId(),
                options.getDeidentifyTemplateName(),
                options.getInspectTemplateName())))
        .apply("Process Tokenized Data", ParDo.of(new TableRowProcessorDoFn()));
// 6) Create dynamic table and insert successfully converted records into BQ.
bqDataMap.apply(
    "Write To BQ",
    BigQueryIO.<KV<String, TableRow>>write()
        .to(new BQDestination(options.getDatasetName(), options.getDlpProjectId()))
        .withFormatFunction(element -> element.getValue())
        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
        .withoutValidation()
        .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
return p.run();
}
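The getFileName helper used in the "Add File Name as Key" step is not shown in this excerpt. A hypothetical sketch of such a helper (the sanitization rule here is an illustrative assumption, not the template's actual implementation):
private static String getFileName(FileIO.ReadableFile file) {
  // Last path component of the matched file, e.g. "orders.csv".
  String name = file.getMetadata().resourceId().getFilename();
  // Drop the extension and replace characters that are awkward in downstream table names.
  String base = name.contains(".") ? name.substring(0, name.lastIndexOf('.')) : name;
  return base.replaceAll("[^A-Za-z0-9_]", "_");
}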
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class DataStreamIO method expandDataStreamJsonStrings.
public PCollection<FailsafeElement<String, String>> expandDataStreamJsonStrings(PCollection<ReadableFile> datastreamFiles) {
PCollection<FailsafeElement<String, String>> datastreamRecords;
FailsafeElementCoder<String, String> coder =
    FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
if (this.fileType.equals(JSON_SUFFIX)) {
datastreamRecords =
    datastreamFiles
        .apply("FileReadConcurrency",
            Reshuffle.<ReadableFile>viaRandomKey().withNumBuckets(fileReadConcurrency))
        .apply("ReadFiles", TextIO.readFiles())
        .apply("ReshuffleRecords", Reshuffle.viaRandomKey())
        .apply("ParseJsonRecords",
            ParDo.of(
                FormatDatastreamJsonToJson.create()
                    .withStreamName(this.streamName)
                    .withHashColumnValues(this.hashedColumns)
                    .withLowercaseSourceColumns(this.lowercaseSourceColumns)))
        .setCoder(coder);
} else {
SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn =
    FormatDatastreamRecordToJson.create()
        .withStreamName(this.streamName)
        .withHashColumnValues(this.hashedColumns)
        .withLowercaseSourceColumns(this.lowercaseSourceColumns);
datastreamRecords =
    datastreamFiles
        .apply("ReshuffleFiles", Reshuffle.<ReadableFile>viaRandomKey())
        .apply("ParseAvroRows",
            ParDo.of(
                new ReadFileRangesFn<FailsafeElement<String, String>>(
                    new CreateParseSourceFn(parseFn, coder),
                    new ReadFileRangesFn.ReadFileRangesFnExceptionHandler())))
        .setCoder(coder);
}
return datastreamRecords.apply("Reshuffle", Reshuffle.viaRandomKey());
}
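CreateParseSourceFn, passed to ReadFileRangesFn above, is also not shown in this excerpt. A plausible shape, assuming it wraps Beam's AvroSource with the supplied parse function and coder:
private static class CreateParseSourceFn
    implements SerializableFunction<String, FileBasedSource<FailsafeElement<String, String>>> {

  private final SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn;
  private final Coder<FailsafeElement<String, String>> coder;

  CreateParseSourceFn(
      SerializableFunction<GenericRecord, FailsafeElement<String, String>> parseFn,
      Coder<FailsafeElement<String, String>> coder) {
    this.parseFn = parseFn;
    this.coder = coder;
  }

  @Override
  public FileBasedSource<FailsafeElement<String, String>> apply(String input) {
    // Build an Avro source over the file path and convert each GenericRecord
    // into a FailsafeElement via parseFn.
    return AvroSource.from(input).withParseFn(parseFn, coder);
  }
}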
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project DataflowTemplates by GoogleCloudPlatform.
the class ReadFileRangesFn method process.
@ProcessElement
public void process(ProcessContext c) throws IOException {
ReadableFile file = c.element();
FileBasedSource<T> source =
    CompressedSource.from(createSource.apply(file.getMetadata().resourceId().toString()))
        .withCompression(file.getCompression());
try (BoundedSource.BoundedReader<T> reader = source.createReader(c.getPipelineOptions())) {
for (boolean more = reader.start(); more; more = reader.advance()) {
c.output(reader.getCurrent());
}
} catch (RuntimeException e) {
if (exceptionHandler.apply(file, null, e)) {
throw e;
}
}
}
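The exceptionHandler consulted in the catch block decides whether a read failure is rethrown. Assuming the handler mirrors Beam's ReadAllViaFileBasedSource.ReadFileRangesFnExceptionHandler (apply returns true to rethrow), a hedged sketch of a variant that logs and skips unreadable files instead of failing the bundle:
static class SkipBadFilesHandler extends ReadFileRangesFn.ReadFileRangesFnExceptionHandler {

  private static final Logger LOG = LoggerFactory.getLogger(SkipBadFilesHandler.class);

  @Override
  public boolean apply(FileIO.ReadableFile file, OffsetRange range, Exception e) {
    // Returning false swallows the exception; the file's records are simply dropped.
    LOG.warn("Skipping unreadable file {}", file.getMetadata().resourceId(), e);
    return false;
  }
}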
use of org.apache.beam.sdk.io.FileIO.ReadableFile in project dataflow-pipelines by baeminbo.
the class UrnNotFoundPipeline method main.
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
Pipeline pipeline = Pipeline.create(options);
pipeline
    .apply(FileIO.match()
        .filepattern("gs://apache-beam-samples/shakespeare/*")
        .continuously(Duration.standardMinutes(1), never()))
    .apply(FileIO.readMatches())
    .apply(ParDo.of(new DoFn<ReadableFile, String>() {
      @ProcessElement
      public void processElement(ProcessContext context) throws IOException {
        ReadableFile file = context.element();
        try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(Channels.newInputStream(file.open())))) {
          reader.lines()
              .flatMap(s -> Arrays.stream(s.split("[^\\p{L}]+")))
              .forEach(context::output);
        }
      }
    }))
    .apply(Window.into(FixedWindows.of(Duration.standardSeconds(10))))
    .apply(Count.perElement())
    .apply(ParDo.of(new DoFn<KV<String, Long>, Void>() {
      @ProcessElement
      public void processElement(ProcessContext context, BoundedWindow window) {
        LOG.info("[{}] {}: {}", window, context.element().getKey(), context.element().getValue());
      }
    }));
pipeline.run();
}
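For small files, the line-by-line channel reading in the first DoFn can be replaced with ReadableFile.readFullyAsUTF8String(), which loads the whole file into memory. A minimal sketch of the same word-splitting logic using that method:
ParDo.of(new DoFn<ReadableFile, String>() {
  @ProcessElement
  public void processElement(ProcessContext context) throws IOException {
    // Read the entire file as UTF-8 and emit each word.
    String contents = context.element().readFullyAsUTF8String();
    Arrays.stream(contents.split("[^\\p{L}]+")).forEach(context::output);
  }
})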