
Example 46 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project DataflowJavaSDK-examples by GoogleCloudPlatform.

The class MinimalWordCountJava8, method main.

public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.create();
    // To run your pipeline, you need to make the following runner-specific changes:
    //
    // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
    // or FlinkRunner.
    // CHANGE 2/3: Specify runner-required options.
    // For BlockingDataflowRunner, set project and temp location as follows:
    //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
    //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
    //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
    // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
    // for more details.
    //   options.as(FlinkPipelineOptions.class)
    //      .setRunner(FlinkRunner.class);
    Pipeline p = Pipeline.create(options);
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
        .apply(FlatMapElements.into(TypeDescriptors.strings())
            .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
        .apply(Filter.by((String word) -> !word.isEmpty()))
        .apply(Count.<String>perElement())
        .apply(MapElements.into(TypeDescriptors.strings())
            .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
        .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
    p.run().waitUntilFinish();
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline)
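
Note: rather than editing the code as the comments above suggest, runner and project settings are usually supplied as command-line flags. A minimal sketch, assuming the standard PipelineOptionsFactory API; the flag values are placeholders:

public static void main(String[] args) {
    // Parses flags such as --runner=DataflowRunner --project=YOUR_PROJECT
    // --tempLocation=gs://YOUR_BUCKET/tmp into PipelineOptions.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args)
        // Fail fast if a flag is unknown or a required option is missing.
        .withValidation()
        .create();
    Pipeline p = Pipeline.create(options);
    // ... apply the same transforms as above, then run:
    p.run().waitUntilFinish();
}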

Example 47 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class UnboundedReadFromBoundedSourceTest, method testBoundedToUnboundedSourceAdapterCheckpoint.

private <T> void testBoundedToUnboundedSourceAdapterCheckpoint(BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception {
    BoundedToUnboundedSourceAdapter<T> unboundedSource = new BoundedToUnboundedSourceAdapter<>(boundedSource);
    PipelineOptions options = PipelineOptionsFactory.create();
    BoundedToUnboundedSourceAdapter<T>.Reader reader = unboundedSource.createReader(options, null);
    List<T> actual = Lists.newArrayList();
    for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) {
        actual.add(reader.getCurrent());
        // checkpoint every 9 elements
        if (actual.size() % 9 == 0) {
            Checkpoint<T> checkpoint = reader.getCheckpointMark();
            checkpoint.finalizeCheckpoint();
        }
    }
    Checkpoint<T> checkpointDone = reader.getCheckpointMark();
    assertTrue(checkpointDone.getResidualElements() == null || checkpointDone.getResidualElements().isEmpty());
    assertEquals(expectedElements.size(), actual.size());
    assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual));
}
Also used: BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)
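
Beyond the default options created by PipelineOptionsFactory.create() above, pipelines typically declare their own options interface. A minimal sketch, assuming the standard Beam annotations; WordCountOptions and its inputFile flag are hypothetical:

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

/** Hypothetical options interface; Beam generates the implementation at runtime. */
public interface WordCountOptions extends PipelineOptions {
    @Description("Path of the file to read from")
    @Default.String("gs://apache-beam-samples/shakespeare/*")
    String getInputFile();
    void setInputFile(String value);
}

// Registering the interface makes --inputFile=... parseable from args:
// PipelineOptionsFactory.register(WordCountOptions.class);
// WordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(WordCountOptions.class);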

Example 48 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class TestPipeline, method testingPipelineOptions.

/** Creates {@link PipelineOptions} for testing. */
public static PipelineOptions testingPipelineOptions() {
    try {
        @Nullable String beamTestPipelineOptions = System.getProperty(PROPERTY_BEAM_TEST_PIPELINE_OPTIONS);
        PipelineOptions options =
            Strings.isNullOrEmpty(beamTestPipelineOptions)
                ? PipelineOptionsFactory.create()
                : PipelineOptionsFactory.fromArgs(MAPPER.readValue(beamTestPipelineOptions, String[].class))
                    .as(TestPipelineOptions.class);
        options.as(ApplicationNameOptions.class).setAppName(getAppName());
        // If no options were specified, set some reasonable defaults
        if (Strings.isNullOrEmpty(beamTestPipelineOptions)) {
            // If there are no provided options, check to see if a dummy runner should be used.
            String useDefaultDummy = System.getProperty(PROPERTY_USE_DEFAULT_DUMMY_RUNNER);
            if (!Strings.isNullOrEmpty(useDefaultDummy) && Boolean.valueOf(useDefaultDummy)) {
                options.setRunner(CrashingRunner.class);
            }
        }
        options.setStableUniqueNames(CheckEnabled.ERROR);
        FileSystems.setDefaultPipelineOptions(options);
        return options;
    } catch (IOException e) {
        throw new RuntimeException("Unable to instantiate test options from system property " + PROPERTY_BEAM_TEST_PIPELINE_OPTIONS + ":" + System.getProperty(PROPERTY_BEAM_TEST_PIPELINE_OPTIONS), e);
    }
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), IOException (java.io.IOException), ApplicationNameOptions (org.apache.beam.sdk.options.ApplicationNameOptions), Nullable (javax.annotation.Nullable)
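
In test code, testingPipelineOptions() is rarely called directly; the TestPipeline JUnit rule wraps it (TestPipeline.create() obtains its options from this method, so -DbeamTestPipelineOptions=... is picked up automatically). A minimal sketch of typical usage, assuming JUnit 4 and the standard Beam testing API; the test class and its assertion are illustrative:

import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

public class MyTransformTest {
    // The rule handles pipeline construction and enforces that it is run.
    @Rule
    public final transient TestPipeline pipeline = TestPipeline.create();

    @Test
    public void emitsInputUnchanged() {
        PCollection<String> output = pipeline.apply(Create.of("a", "b"));
        PAssert.that(output).containsInAnyOrder("a", "b");
        pipeline.run().waitUntilFinish();
    }
}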

Example 49 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class SparkRunnerDebuggerTest, method debugBatchPipeline.

@Test
public void debugBatchPipeline() {
    PipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setRunner(SparkRunnerDebugger.class);
    Pipeline pipeline = Pipeline.create(options);
    PCollection<String> lines = pipeline.apply(Create.of(Collections.<String>emptyList()).withCoder(StringUtf8Coder.of()));
    PCollection<KV<String, Long>> wordCounts = lines.apply(new WordCount.CountWords());
    wordCounts.apply(GroupByKey.<String, Long>create()).apply(Combine.<String, Long, Long>groupedValues(Sum.ofLongs()));
    PCollection<KV<String, Long>> wordCountsPlusOne = wordCounts.apply(MapElements.via(new PlusOne()));
    PCollectionList.of(wordCounts).and(wordCountsPlusOne).apply(Flatten.<KV<String, Long>>pCollections());
    wordCounts.apply(MapElements.via(new WordCount.FormatAsTextFn())).apply(TextIO.write().to("!!PLACEHOLDER-OUTPUT-DIR!!").withNumShards(3).withSuffix(".txt"));
    final String expectedPipeline =
        "sparkContext.parallelize(Arrays.asList(...))\n"
            + "_.mapPartitions(new org.apache.beam.runners.spark.examples.WordCount$ExtractWordsFn())\n"
            + "_.mapPartitions(new org.apache.beam.sdk.transforms.Count$PerElement$1())\n"
            + "_.combineByKey(..., new org.apache.beam.sdk.transforms.Count$CountFn(), ...)\n"
            + "_.groupByKey()\n"
            + "_.map(new org.apache.beam.sdk.transforms.Sum$SumLongFn())\n"
            + "_.mapPartitions(new org.apache.beam.runners.spark.SparkRunnerDebuggerTest$PlusOne())\n"
            + "sparkContext.union(...)\n"
            + "_.mapPartitions(new org.apache.beam.runners.spark.examples.WordCount$FormatAsTextFn())\n"
            + "_.<org.apache.beam.sdk.io.AutoValue_TextIO_Write>";
    SparkRunnerDebugger.DebugSparkPipelineResult result = (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();
    assertThat("Debug pipeline did not equal expected", result.getDebugString(), Matchers.equalTo(expectedPipeline));
}
Also used: KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), WordCount (org.apache.beam.runners.spark.examples.WordCount), Test (org.junit.Test)
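
The create().as(TestSparkPipelineOptions.class) pattern above works because as() returns a typed view over a single underlying options store rather than a copy. A minimal sketch of that behavior, assuming the standard Beam API (ApplicationNameOptions is the built-in options interface used in the previous example):

// Views created with as() share one underlying store, so a value set
// through one view is visible through any other view of the same options.
PipelineOptions base = PipelineOptionsFactory.create();
base.as(ApplicationNameOptions.class).setAppName("debug-run");
String appName = base.as(ApplicationNameOptions.class).getAppName(); // "debug-run"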

Example 50 with PipelineOptions

Use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.

The class XmlSourceTest, method testSplitAtFraction.

@Test
public void testSplitAtFraction() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    String fileName = "temp.xml";
    List<Train> trains = generateRandomTrainList(100);
    File file = createRandomTrainXML(fileName, trains);
    BoundedSource<Train> fileSource =
        XmlIO.<Train>read()
            .from(file.toPath().toString())
            .withRootElement("trains")
            .withRecordElement("train")
            .withRecordClass(Train.class)
            .withMinBundleSize(10)
            .createSource();
    List<? extends BoundedSource<Train>> splits = fileSource.split(file.length() / 3, null);
    for (BoundedSource<Train> splitSource : splits) {
        int numItems = readEverythingFromReader(splitSource.createReader(null)).size();
        // Should not split while unstarted.
        assertSplitAtFractionFails(splitSource, 0, 0.7, options);
        assertSplitAtFractionSucceedsAndConsistent(splitSource, 1, 0.7, options);
        assertSplitAtFractionSucceedsAndConsistent(splitSource, 15, 0.7, options);
        assertSplitAtFractionFails(splitSource, 0, 0.0, options);
        assertSplitAtFractionFails(splitSource, 20, 0.3, options);
        assertSplitAtFractionFails(splitSource, numItems, 1.0, options);
        // After reading 100 elements we will be approximately at position
        // 0.99 * (endOffset - startOffset), hence trying to split at fraction 0.9 will be
        // unsuccessful.
        assertSplitAtFractionFails(splitSource, numItems, 0.9, options);
        // The following passes since we can always find a fraction that is extremely close to 1 such that
        // the position suggested by the fraction will be larger than the position the reader is at
        // after reading "items - 1" elements.
        // This also passes for "numItemsToReadBeforeSplit = items" if the position at suggested
        // fraction is larger than the position the reader is at after reading all "items" elements
        // (i.e., the start position of the last element). This is true for most cases but will not
        // be true if the reader position is only one less than the end position (i.e., the last element
        // of the bundle starts at the last byte that belongs to the bundle).
        assertSplitAtFractionSucceedsAndConsistent(splitSource, numItems - 1, 0.999, options);
    }
}
Also used: PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), File (java.io.File), Test (org.junit.Test)
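
The assertSplitAtFraction* helpers used above come from org.apache.beam.sdk.testing.SourceTestUtils. When the interesting (element index, fraction) pairs are not known in advance, the same class offers an exhaustive variant; a minimal sketch, assuming the SourceTestUtils API:

import org.apache.beam.sdk.testing.SourceTestUtils;

// Tries splitAtFraction at every element index over a range of fractions,
// asserting that primary plus residual always re-cover the original source.
SourceTestUtils.assertSplitAtFractionExhaustive(splitSource, options);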

Aggregations

PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 92
Test (org.junit.Test): 79
File (java.io.File): 26
ArrayList (java.util.ArrayList): 16
Pipeline (org.apache.beam.sdk.Pipeline): 10
Metadata (org.apache.beam.sdk.io.fs.MatchResult.Metadata): 9
Path (java.nio.file.Path): 6
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString): 6
SerializedPipelineOptions (org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions): 5
KV (org.apache.beam.sdk.values.KV): 5
Matchers.containsString (org.hamcrest.Matchers.containsString): 5
Table (com.google.api.services.bigquery.model.Table): 4
TableReference (com.google.api.services.bigquery.model.TableReference): 4
TableRow (com.google.api.services.bigquery.model.TableRow): 4
HashBasedTable (com.google.common.collect.HashBasedTable): 4
BoundedToUnboundedSourceAdapter (org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter): 4
BigQueryHelpers.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference): 4
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 4
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 3
TableSchema (com.google.api.services.bigquery.model.TableSchema): 3