
Example 76 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the apache/beam project.

From the class FileBasedSourceTest, method testFullyReadSingleFile.

@Test
public void testFullyReadSingleFile() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Generate a small in-memory dataset and write it to a temporary file.
    List<String> data = createStringDataset(3, 50);
    String fileName = "file";
    File file = createFileWithData(fileName, data);
    // Reading the whole file through the test source should return every record, in order.
    TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, null);
    assertEquals(data, readFromSource(source, options));
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), File (java.io.File), Test (org.junit.Test)
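
Beyond PipelineOptionsFactory.create(), options are usually populated from command-line arguments and accessed through a typed sub-interface. The sketch below is illustrative and not part of the Beam test above: MyOptions, its --inputFile flag, and the default value are assumptions, while fromArgs, withValidation, as, @Description, and @Default are standard Beam APIs.

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsSketch {

    // Hypothetical options sub-interface; Beam generates the implementation at runtime.
    public interface MyOptions extends PipelineOptions {
        @Description("Path of the file to read from")
        @Default.String("input.txt")
        String getInputFile();
        void setInputFile(String value);
    }

    public static void main(String[] args) {
        // e.g. args = {"--inputFile=/tmp/input.txt"}
        MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
        System.out.println("Reading from " + options.getInputFile());
    }
}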

Example 77 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the apache/beam project.

From the class FileBasedSourceTest, method testSplitAtFraction.

@Test
public void testSplitAtFraction() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    File file = createFileWithData("file", createStringDataset(3, 100));
    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null);
    // Arguments to the assertions below: source, number of items to read before the
    // split attempt, split fraction, and pipeline options.
    // Shouldn't be able to split while unstarted.
    assertSplitAtFractionFails(source, 0, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 30, 0.7, options);
    assertSplitAtFractionFails(source, 0, 0.0, options);
    assertSplitAtFractionFails(source, 70, 0.3, options);
    assertSplitAtFractionFails(source, 100, 1.0, options);
    assertSplitAtFractionFails(source, 100, 0.99, options);
    assertSplitAtFractionSucceedsAndConsistent(source, 100, 0.995, options);
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata), File (java.io.File), Test (org.junit.Test)
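
For context, the assertions above exercise BoundedReader.splitAtFraction, the hook behind dynamic work rebalancing. A minimal sketch of that contract is below; it assumes some BoundedSource<String> is passed in (for example the TestFileBasedSource above) and is not a substitute for the SourceTestUtils assertions.

import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

static void sketchDynamicSplit(BoundedSource<String> source) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    try (BoundedSource.BoundedReader<String> reader = source.createReader(options)) {
        if (reader.start()) {
            // Ask the reader to hand back everything after 70% of its range.
            BoundedSource<String> residual = reader.splitAtFraction(0.7);
            if (residual != null) {
                // Split accepted: reader.getCurrentSource() (the primary) plus the
                // residual must together cover exactly the original source's records,
                // which is what assertSplitAtFractionSucceedsAndConsistent checks.
            }
            // A null residual means the split was rejected, e.g. because the fraction
            // is already behind the reader's position, as in the *Fails assertions.
        }
    }
}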

Example 78 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the apache/beam project.

From the class FileBasedSourceTest, method testSplitAtFractionExhaustive.

@Test
public void testSplitAtFractionExhaustive() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Smaller file for exhaustive testing.
    File file = createFileWithData("file", createStringDataset(3, 20));
    Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());
    TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null);
    assertSplitAtFractionExhaustive(source, options);
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata), File (java.io.File), Test (org.junit.Test)
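
Conceptually, the exhaustive variant sweeps the split check over many combinations of items-read and split fraction instead of a handful of hand-picked cases, which is why this test uses a smaller 20-record file. The following is a purely illustrative sketch of that idea, not the SourceTestUtils implementation; the step granularity is an assumption.

static void sketchExhaustiveSplitTesting(int totalItems) {
    for (int itemsRead = 0; itemsRead <= totalItems; itemsRead++) {
        for (int step = 0; step <= 20; step++) {
            double fraction = step / 20.0;
            // Attempt splitAtFraction(fraction) after reading itemsRead records and,
            // whenever the split is accepted, verify that the primary plus the
            // residual source still produce exactly the original records.
        }
    }
}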

Example 79 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the apache/beam project.

From the class FileBasedSourceTest, method testFullyReadFilePattern.

@Test
public void testFullyReadFilePattern() throws IOException {
    PipelineOptions options = PipelineOptionsFactory.create();
    // Create three files that match the pattern "file*" and one that does not.
    List<String> data1 = createStringDataset(3, 50);
    File file1 = createFileWithData("file1", data1);
    List<String> data2 = createStringDataset(3, 50);
    createFileWithData("file2", data2);
    List<String> data3 = createStringDataset(3, 50);
    createFileWithData("file3", data3);
    List<String> data4 = createStringDataset(3, 50);
    createFileWithData("otherfile", data4);
    TestFileBasedSource source = new TestFileBasedSource(new File(file1.getParent(), "file*").getPath(), 64, null);
    // Only the matching files should be read; "otherfile" (data4) is excluded.
    List<String> expectedResults = new ArrayList<String>();
    expectedResults.addAll(data1);
    expectedResults.addAll(data2);
    expectedResults.addAll(data3);
    assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray()));
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), ArrayList (java.util.ArrayList), File (java.io.File), Test (org.junit.Test)
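
The same kind of glob can be expanded directly with the FileSystems API that file-based sources use under the hood. A small sketch follows; the local path is illustrative, while FileSystems.match, MatchResult, and Metadata are standard Beam classes.

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

static void sketchGlobExpansion() throws IOException {
    // Register the available file systems (the local file system in this case).
    FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());
    // Expand the glob; a file named "otherfile" in the same directory would not match "file*".
    List<MatchResult> results = FileSystems.match(Collections.singletonList("/tmp/data/file*"));
    for (Metadata m : results.get(0).metadata()) {
        System.out.println(m.resourceId() + " : " + m.sizeBytes() + " bytes");
    }
}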

Example 80 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in the apache/beam project.

From the class MinimalWordCount, method main.

public static void main(String[] args) {
    // Create a PipelineOptions object. This object lets us set various execution
    // options for our pipeline, such as the runner you wish to use. This example
    // will run with the DirectRunner by default, based on the class path configured
    // in its dependencies.
    PipelineOptions options = PipelineOptionsFactory.create();
    // Create the Pipeline object with the options we defined above.
    Pipeline p = Pipeline.create(options);
    // Apply the pipeline's transforms.
    // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
    // of input text files. TextIO.Read returns a PCollection where each element is one line from
    // the input text (a set of Shakespeare's texts).
    // This example reads a public data set consisting of the complete works of Shakespeare.
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
        // Split each line into individual words, dropping empty strings.
        .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
                for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
                    if (!word.isEmpty()) {
                        c.output(word);
                    }
                }
            }
        }))
        // Count the number of times each word occurs.
        .apply(Count.<String>perElement())
        // Format each word and its count into a printable string.
        .apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Long>, String>() {

            @Override
            public String apply(KV<String, Long> input) {
                return input.getKey() + ": " + input.getValue();
            }
        }))
        // Write the formatted results to a set of text files with the prefix "wordcounts".
        .apply(TextIO.write().to("wordcounts"));
    // Run the pipeline.
    p.run().waitUntilFinish();
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline)
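
PipelineOptions is also where the runner is selected, so the same pipeline can target a different runner without code changes. A minimal sketch, assuming it sits inside a main(String[] args) method with the same imports as MinimalWordCount; the flag values in the comment are illustrative.

// Passing, for example,
//   --runner=DataflowRunner --project=my-project --tempLocation=gs://my-bucket/tmp
// on the command line (values illustrative) swaps the runner, while the transforms
// applied to the pipeline stay exactly as above.
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
Pipeline p = Pipeline.create(options);
// ... apply TextIO.read(), ExtractWords, Count, FormatResults, TextIO.write() as above ...
p.run().waitUntilFinish();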

Aggregations

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92
Test (org.junit.Test): 79
File (java.io.File): 26
ArrayList (java.util.ArrayList): 16
Pipeline (org.apache.beam.sdk.Pipeline): 10
Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9
Path (java.nio.file.Path): 6
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6
SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5
KV (org.apache.beam.sdk.values.KV): 5
Matchers.containsString (org.hamcrest.Matchers.containsString): 5
Table (com.google.api.services.bigquery.model.Table): 4
TableReference (com.google.api.services.bigquery.model.TableReference): 4
TableRow (com.google.api.services.bigquery.model.TableRow): 4
HashBasedTable (com.google.common.collect.HashBasedTable): 4
BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3