Use of org.apache.beam.sdk.Pipeline in project DataflowJavaSDK-examples by GoogleCloudPlatform.
The main method of the UserScore class.
/**
* Run a batch pipeline.
*/
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by command-line flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply("WriteUserScoreSums",
          new WriteToText<KV<String, Integer>>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
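The Options interface referenced above is defined elsewhere in the example. As a rough sketch, assuming only the getter names used in main, it could look like the following; the annotations and descriptions shown here are illustrative, not the exact declarations from UserScore:

import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.Validation;

// Hypothetical sketch of the Options interface used by UserScore.main.
public interface Options extends PipelineOptions {

  @Description("Path to the input file(s) of game events")
  String getInput();
  void setInput(String value);

  @Description("Prefix for the output text files")
  @Validation.Required
  String getOutput();
  void setOutput(String value);
}

PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class) then maps flags such as --input and --output onto these getters and fails fast if a required option is missing.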
Use of org.apache.beam.sdk.Pipeline in projects DataflowJavaSDK and DataflowJavaSDK-examples by GoogleCloudPlatform.
The main method of the StarterPipeline class.
public static void main(String[] args) {
  Pipeline p = Pipeline.create(
      PipelineOptionsFactory.fromArgs(args).withValidation().create());

  p.apply(Create.of("Hello", "World"))
      .apply(MapElements.via(new SimpleFunction<String, String>() {
        @Override
        public String apply(String input) {
          return input.toUpperCase();
        }
      }))
      .apply(ParDo.of(new DoFn<String, Void>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          LOG.info(c.element());
        }
      }));

  p.run();
}
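For comparison, the uppercasing step can be written more compactly with a Java lambda. This is a sketch of an equivalent formulation, not part of the original file; MapElements.into(...) supplies the output type information that the lambda form cannot carry on its own:

import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

// Equivalent to the anonymous SimpleFunction above, written as a lambda.
p.apply(Create.of("Hello", "World"))
    .apply(MapElements.into(TypeDescriptors.strings())
        .via((String word) -> word.toUpperCase()));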
Use of org.apache.beam.sdk.Pipeline in project DataflowJavaSDK-examples by GoogleCloudPlatform.
The main method of the WindowedWordCount class.
public static void main(String[] args) throws IOException {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  final String output = options.getOutput();
  final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
  final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());

  Pipeline pipeline = Pipeline.create(options);

  /*
   * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
   * unbounded input source.
   */
  PCollection<String> input = pipeline
      .apply(TextIO.read().from(options.getInputFile()))
      .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
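  // A sketch (not in the original file) of the unbounded variant that Concept #1
  // alludes to: the same events could instead be read from a Pub/Sub topic, e.g.
  //
  //   PCollection<String> input =
  //       pipeline.apply(PubsubIO.readStrings()
  //           .fromTopic("projects/<your-project>/topics/<your-topic>"));
  //
  // The topic name is a placeholder. Pub/Sub records carry their own timestamps,
  // so the AddTimestampFn step used for the bounded text input would be unneeded.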
  /*
   * Concept #3: Window into fixed windows. The fixed window size for this example
   * defaults to 1 minute (you can change this with a command-line option). See the
   * documentation for more information on how fixed windows work, and for
   * information on the other types of windowing available (e.g., sliding windows).
   */
  PCollection<String> windowedWords = input.apply(
      Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
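  // For comparison (not in the original file), the sliding-window variant the
  // comment mentions could look like the following; the durations are illustrative.
  // Each element would fall into four overlapping two-minute windows, one starting
  // every thirty seconds:
  //
  //   PCollection<String> slidingWords = input.apply(
  //       Window.<String>into(SlidingWindows.of(Duration.standardMinutes(2))
  //           .every(Duration.standardSeconds(30))));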
  /*
   * Concept #4: Re-use our existing CountWords transform, which has no knowledge of
   * windows, over a PCollection containing windowed values. (A sketch of the shape
   * of such a composite transform follows this method.)
   */
  PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
  /*
   * Concept #5: Format the results and write to a sharded file partitioned by window,
   * using a simple ParDo operation. Because there may be failures followed by retries,
   * the writes must be idempotent; the details of writing to files are elided here.
   */
  wordCounts
      .apply(MapElements.via(new WordCount.FormatAsTextFn()))
      .apply(new WriteOneFilePerWindow(output, options.getNumShards()));

  PipelineResult result = pipeline.run();
  try {
    result.waitUntilFinish();
  } catch (Exception exc) {
    result.cancel();
  }
}
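As noted in the Concept #4 comment, here is a rough sketch of the shape of a composite transform such as WordCount.CountWords: it subclasses PTransform and composes other transforms in expand(). Because it only chains per-element and grouping transforms, it works unchanged over windowed input. The body below is a simplification for illustration (ExtractWordsFn stands in for the example's line-splitting DoFn), not the exact code from WordCount:

import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

// Sketch of a composite transform: lines in, per-word counts out.
public static class CountWords
    extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
  @Override
  public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
    return lines
        .apply(ParDo.of(new ExtractWordsFn()))  // split each line into words
        .apply(Count.perElement());             // count occurrences of each word
  }
}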
Use of org.apache.beam.sdk.Pipeline in project DataflowJavaSDK-examples by GoogleCloudPlatform.
The main method of the DebuggingWordCount class.
public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, Long>> filteredWords = p
      .apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new WordCount.CountWords())
      .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));

  /*
   * Concept #3: PAssert is a set of convenient PTransforms in the style of Hamcrest's
   * collection matchers that can be used when writing pipeline-level tests to validate
   * the contents of PCollections. PAssert is best used in unit tests with small data
   * sets, but it is demonstrated here as a teaching tool.
   *
   * Below we verify that the set of filtered words matches our expected counts. Note
   * that PAssert produces no output of its own; successful completion of the pipeline
   * implies that the expectations were met. See
   * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ to learn more
   * about testing your pipeline, and see DebuggingWordCountTest for an example unit
   * test. (A sketch of such a test follows this example.)
   */
  List<KV<String, Long>> expectedResults =
      Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L));
  PAssert.that(filteredWords).containsInAnyOrder(expectedResults);

  p.run().waitUntilFinish();
}
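As mentioned in the comment above, here is a rough sketch of how the PAssert check would sit in a JUnit test using TestPipeline, assuming access to the example's CountWords and FilterTextFn; the class name, inline input, and filter pattern are illustrative, not the actual DebuggingWordCountTest:

import java.util.Arrays;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

// Hypothetical unit test sketch for the filtered word counts.
public class DebuggingWordCountSketchTest {

  @Rule
  public final transient TestPipeline p = TestPipeline.create();

  @Test
  public void testFilteredCounts() {
    // Inline input chosen so that "Flourish" appears 3 times and "stomach" once.
    PCollection<KV<String, Long>> counts =
        p.apply(Create.of("Flourish stomach Flourish", "Flourish"))
            .apply(new WordCount.CountWords())
            .apply(ParDo.of(new FilterTextFn("Flourish|stomach")));

    PAssert.that(counts)
        .containsInAnyOrder(Arrays.asList(KV.of("Flourish", 3L), KV.of("stomach", 1L)));

    p.run().waitUntilFinish();
  }
}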