use of org.apache.beam.sdk.options.PipelineOptions in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class MinimalWordCountJava8 method main.
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  // In order to run your pipeline, you need to make the following runner-specific changes:
  //
  // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
  // or FlinkRunner.
  // CHANGE 2/3: Specify runner-required options.
  // For BlockingDataflowRunner, set the project and temp location as follows:
  //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
  //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
  //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
  //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
  // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
  // for more details.
  //   options.as(FlinkPipelineOptions.class)
  //       .setRunner(FlinkRunner.class);
  Pipeline p = Pipeline.create(options);
  p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
      .apply(FlatMapElements.into(TypeDescriptors.strings())
          .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
      .apply(Filter.by((String word) -> !word.isEmpty()))
      .apply(Count.<String>perElement())
      .apply(MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // CHANGE 3/3: Replace the output location below with a bucket you own.
      .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
  p.run().waitUntilFinish();
}
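In practice the hard-coded PipelineOptionsFactory.create() call is usually replaced by parsing the program's command-line arguments, so the runner-specific changes described in the comments become flags rather than code edits. A minimal sketch using only core SDK calls (the class name is illustrative):
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsFromArgs {
  public static void main(String[] args) {
    // Parses flags such as --runner=DataflowRunner --project=... --tempLocation=...;
    // withValidation() fails fast on missing or malformed required options.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    System.out.println("Runner: " + options.getRunner().getSimpleName());
  }
}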
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class UnboundedReadFromBoundedSourceTest method testBoundedToUnboundedSourceAdapterCheckpoint.
private <T> void testBoundedToUnboundedSourceAdapterCheckpoint(
    BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception {
  BoundedToUnboundedSourceAdapter<T> unboundedSource =
      new BoundedToUnboundedSourceAdapter<>(boundedSource);
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedToUnboundedSourceAdapter<T>.Reader<T> reader = unboundedSource.createReader(options, null);
  List<T> actual = Lists.newArrayList();
  for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) {
    actual.add(reader.getCurrent());
    // Checkpoint every 9 elements.
    if (actual.size() % 9 == 0) {
      Checkpoint<T> checkpoint = reader.getCheckpointMark();
      checkpoint.finalizeCheckpoint();
    }
  }
  Checkpoint<T> checkpointDone = reader.getCheckpointMark();
  assertTrue(
      checkpointDone.getResidualElements() == null
          || checkpointDone.getResidualElements().isEmpty());
  assertEquals(expectedElements.size(), actual.size());
  assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual));
}
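The Checkpoint type finalized above implements Beam's UnboundedSource.CheckpointMark contract: the runner calls finalizeCheckpoint() once the checkpointed elements have been durably committed, which is where a real source would acknowledge them. A minimal sketch of that contract, with an illustrative class name:
import java.io.IOException;
import org.apache.beam.sdk.io.UnboundedSource;

// A no-op mark: a real implementation would record how far the reader got
// and acknowledge the consumed elements when the runner finalizes it.
class NoopCheckpointMark implements UnboundedSource.CheckpointMark {
  @Override
  public void finalizeCheckpoint() throws IOException {
    // Nothing to acknowledge in this sketch.
  }
}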
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class TestPipeline method testingPipelineOptions.
/** Creates {@link PipelineOptions} for testing. */
public static PipelineOptions testingPipelineOptions() {
  try {
    @Nullable String beamTestPipelineOptions =
        System.getProperty(PROPERTY_BEAM_TEST_PIPELINE_OPTIONS);
    PipelineOptions options =
        Strings.isNullOrEmpty(beamTestPipelineOptions)
            ? PipelineOptionsFactory.create()
            : PipelineOptionsFactory.fromArgs(
                    MAPPER.readValue(beamTestPipelineOptions, String[].class))
                .as(TestPipelineOptions.class);
    options.as(ApplicationNameOptions.class).setAppName(getAppName());
    // If no options were specified, set some reasonable defaults.
    if (Strings.isNullOrEmpty(beamTestPipelineOptions)) {
      // If there are no provided options, check whether a dummy runner should be used.
      String useDefaultDummy = System.getProperty(PROPERTY_USE_DEFAULT_DUMMY_RUNNER);
      if (!Strings.isNullOrEmpty(useDefaultDummy) && Boolean.valueOf(useDefaultDummy)) {
        options.setRunner(CrashingRunner.class);
      }
    }
    options.setStableUniqueNames(CheckEnabled.ERROR);
    FileSystems.setDefaultPipelineOptions(options);
    return options;
  } catch (IOException e) {
    throw new RuntimeException(
        "Unable to instantiate test options from system property "
            + PROPERTY_BEAM_TEST_PIPELINE_OPTIONS
            + ": "
            + System.getProperty(PROPERTY_BEAM_TEST_PIPELINE_OPTIONS),
        e);
  }
}
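The property value is a JSON array in which each element is one ordinary --name=value argument; MAPPER above deserializes it to a String[] before parsing. A sketch of driving this from code rather than a -D flag, assuming PROPERTY_BEAM_TEST_PIPELINE_OPTIONS resolves to "beamTestPipelineOptions" as in the Beam source, with an arbitrary temp location:
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.testing.TestPipeline;

public class TestOptionsDemo {
  public static void main(String[] args) {
    // Equivalent to starting the test JVM with
    // -DbeamTestPipelineOptions='["--tempLocation=/tmp/beam-test"]'
    System.setProperty("beamTestPipelineOptions", "[\"--tempLocation=/tmp/beam-test\"]");
    PipelineOptions options = TestPipeline.testingPipelineOptions();
    System.out.println(options.getTempLocation()); // prints /tmp/beam-test
  }
}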
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class SparkRunnerDebuggerTest method debugBatchPipeline.
@Test
public void debugBatchPipeline() {
  PipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setRunner(SparkRunnerDebugger.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> lines =
      pipeline.apply(Create.of(Collections.<String>emptyList()).withCoder(StringUtf8Coder.of()));
  PCollection<KV<String, Long>> wordCounts = lines.apply(new WordCount.CountWords());
  wordCounts
      .apply(GroupByKey.<String, Long>create())
      .apply(Combine.<String, Long, Long>groupedValues(Sum.ofLongs()));
  PCollection<KV<String, Long>> wordCountsPlusOne =
      wordCounts.apply(MapElements.via(new PlusOne()));
  PCollectionList.of(wordCounts)
      .and(wordCountsPlusOne)
      .apply(Flatten.<KV<String, Long>>pCollections());
  wordCounts
      .apply(MapElements.via(new WordCount.FormatAsTextFn()))
      .apply(TextIO.write().to("!!PLACEHOLDER-OUTPUT-DIR!!").withNumShards(3).withSuffix(".txt"));
  final String expectedPipeline =
      "sparkContext.parallelize(Arrays.asList(...))\n"
          + "_.mapPartitions(new org.apache.beam.runners.spark.examples.WordCount$ExtractWordsFn())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Count$PerElement$1())\n"
          + "_.combineByKey(..., new org.apache.beam.sdk.transforms.Count$CountFn(), ...)\n"
          + "_.groupByKey()\n"
          + "_.map(new org.apache.beam.sdk.transforms.Sum$SumLongFn())\n"
          + "_.mapPartitions(new org.apache.beam.runners.spark.SparkRunnerDebuggerTest$PlusOne())\n"
          + "sparkContext.union(...)\n"
          + "_.mapPartitions(new org.apache.beam.runners.spark.examples.WordCount$FormatAsTextFn())\n"
          + "_.<org.apache.beam.sdk.io.AutoValue_TextIO_Write>";
  SparkRunnerDebugger.DebugSparkPipelineResult result =
      (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();
  assertThat(
      "Debug pipeline did not equal expected",
      result.getDebugString(),
      Matchers.equalTo(expectedPipeline));
}
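The create().as(TestSparkPipelineOptions.class) call works because PipelineOptions instances are dynamic proxies over a shared property map: any options interface can be viewed onto the same underlying values with as(). A small sketch with a hypothetical options interface:
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class AsViewDemo {
  // Hypothetical interface: each getter/setter pair becomes a named property.
  public interface MyOptions extends PipelineOptions {
    @Description("Number of output shards to produce.")
    @Default.Integer(3)
    int getShards();
    void setShards(int value);
  }

  public static void main(String[] args) {
    PipelineOptions base = PipelineOptionsFactory.create();
    MyOptions view = base.as(MyOptions.class);
    view.setShards(5);
    // Both views share the same underlying values.
    System.out.println(base.as(MyOptions.class).getShards()); // prints 5
  }
}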
use of org.apache.beam.sdk.options.PipelineOptions in project beam by apache.
the class XmlSourceTest method testSplitAtFraction.
@Test
public void testSplitAtFraction() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  String fileName = "temp.xml";
  List<Train> trains = generateRandomTrainList(100);
  File file = createRandomTrainXML(fileName, trains);
  BoundedSource<Train> fileSource =
      XmlIO.<Train>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();
  List<? extends BoundedSource<Train>> splits = fileSource.split(file.length() / 3, null);
  for (BoundedSource<Train> splitSource : splits) {
    int numItems = readEverythingFromReader(splitSource.createReader(null)).size();
    // Should not split while unstarted.
    assertSplitAtFractionFails(splitSource, 0, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(splitSource, 1, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(splitSource, 15, 0.7, options);
    assertSplitAtFractionFails(splitSource, 0, 0.0, options);
    assertSplitAtFractionFails(splitSource, 20, 0.3, options);
    assertSplitAtFractionFails(splitSource, numItems, 1.0, options);
    // After reading 100 elements we will be approximately at position
    // 0.99 * (endOffset - startOffset), hence trying to split at fraction 0.9 will be
    // unsuccessful.
    assertSplitAtFractionFails(splitSource, numItems, 0.9, options);
    // The following passes since we can always find a fraction extremely close to 1 such that
    // the position suggested by the fraction will be larger than the position the reader is at
    // after reading "items - 1" elements.
    // This also passes for "numItemsToReadBeforeSplit = items" if the position at the suggested
    // fraction is larger than the position the reader is at after reading all "items" elements
    // (i.e., the start position of the last element). This is true in most cases but will not
    // be true if the reader position is only one less than the end position (i.e., the last
    // element of the bundle starts at the last byte that belongs to the bundle).
    assertSplitAtFractionSucceedsAndConsistent(splitSource, numItems - 1, 0.999, options);
  }
}
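The assertSplitAtFraction* helpers used above come from org.apache.beam.sdk.testing.SourceTestUtils. When hand-picking (numItemsToReadBeforeSplit, fraction) pairs as this test does becomes tedious, the same utility class offers an exhaustive variant; a sketch, assuming someSource is the BoundedSource under test:
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.SourceTestUtils;

class ExhaustiveSplitCheck {
  // Tries splitting after every number of items read and at every reachable fraction,
  // checking that primary and residual together still cover the original source.
  static <T> void check(BoundedSource<T> someSource) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    SourceTestUtils.assertSplitAtFractionExhaustive(someSource, options);
  }
}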