use of org.apache.beam.sdk.values.KV in project beam by apache.
the class WindowedWordCount method main.
public static void main(String[] args) throws IOException {
  // Parse and validate the command-line flags, then read the settings used below.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  final String destination = options.getOutput();
  final Instant earliest = new Instant(options.getMinTimestampMillis());
  final Instant latest = new Instant(options.getMaxTimestampMillis());
  Pipeline pipeline = Pipeline.create(options);

  // Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or an
  // unbounded input source. Here the lines are read from a text file and then passed through
  // AddTimestampFn, which is configured with the minimum and maximum timestamps.
  PCollection<String> lines = pipeline.apply(TextIO.read().from(options.getInputFile()));
  PCollection<String> input = lines.apply(ParDo.of(new AddTimestampFn(earliest, latest)));

  // Concept #3: Window into fixed windows. The window size, in minutes, comes from a
  // command-line option. See the documentation for more information on how fixed windows
  // work, and on the other types of windowing available (e.g., sliding windows).
  Duration windowLength = Duration.standardMinutes(options.getWindowSize());
  Window<String> fixedWindowing = Window.<String>into(FixedWindows.of(windowLength));
  PCollection<String> windowedWords = input.apply(fixedWindowing);

  // Concept #4: Re-use the existing CountWords transform, which has no knowledge of windows,
  // over a PCollection containing windowed values.
  PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

  // Concept #5: Format the results and write them out with WriteOneFilePerWindow. Because
  // there may be failures followed by retries, the writes must be idempotent; the details of
  // writing to files are elided here.
  PCollection<String> formattedCounts =
      wordCounts.apply(MapElements.via(new WordCount.FormatAsTextFn()));
  formattedCounts.apply(new WriteOneFilePerWindow(destination, options.getNumShards()));

  PipelineResult result = pipeline.run();
  try {
    result.waitUntilFinish();
  } catch (Exception waitFailure) {
    // Any failure while waiting cancels the run; the exception itself is not rethrown.
    result.cancel();
  }
}
use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class HourlyTeamScore method configureOutput.
/**
 * Builds the field-name to extractor mapping that describes how pipeline output is written as
 * text. The map is passed to the {@link WriteToText} constructor to write team score sums,
 * together with the start time of the window each result belongs to.
 *
 * @return a map holding the "team", "total_score" and "window_start" field extractors
 */
protected static Map<String, WriteToText.FieldFn<KV<String, Integer>>> configureOutput() {
  // Key of the element: written under "team".
  WriteToText.FieldFn<KV<String, Integer>> teamName =
      (context, window) -> context.element().getKey();
  // Value of the element: written under "total_score".
  WriteToText.FieldFn<KV<String, Integer>> totalScore =
      (context, window) -> context.element().getValue();
  // Start of the element's window, rendered with the shared formatter.
  WriteToText.FieldFn<KV<String, Integer>> windowStart =
      (context, window) -> fmt.print(((IntervalWindow) window).start());

  Map<String, WriteToText.FieldFn<KV<String, Integer>>> fields = new HashMap<>();
  fields.put("team", teamName);
  fields.put("total_score", totalScore);
  fields.put("window_start", windowStart);
  return fields;
}
use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class UserScore method main.
/**
 * Runs the pipeline as a batch job and waits for it to finish.
 *
 * @param args command-line flags from which the pipeline options are built
 * @throws Exception if building or running the pipeline fails
 */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Configure the pipeline from the command-line flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Final step of the pipeline: writes the score sums as text using the configured fields.
  WriteToText<KV<String, Integer>> writeUserScores =
      new WriteToText<KV<String, Integer>>(options.getOutput(), configureOutput(), false);

  // Read events from a text file, parse them, sum the scores under "user", and write them out.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply("WriteUserScoreSums", writeUserScores);

  // Execute the batch pipeline and block until it completes.
  pipeline.run().waitUntilFinish();
}
use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class WindowedWordCount method main.
public static void main(String[] args) throws IOException {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  final String outputTarget = options.getOutput();
  final Instant lowerBound = new Instant(options.getMinTimestampMillis());
  final Instant upperBound = new Instant(options.getMaxTimestampMillis());
  Pipeline pipeline = Pipeline.create(options);

  // Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or an
  // unbounded input source.
  PCollection<String> input =
      pipeline
          .apply(TextIO.read().from(options.getInputFile()))
          .apply(ParDo.of(new AddTimestampFn(lowerBound, upperBound)));

  // Concept #3: Window into fixed windows. The window size is given in minutes by a
  // command-line option. See the documentation for more information on how fixed windows
  // work, and on the other types of windowing available (e.g., sliding windows).
  PCollection<String> windowedWords =
      input.apply(
          Window.<String>into(
              FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));

  // Concept #4: Re-use the existing CountWords transform, which does not know about windows,
  // on a PCollection containing windowed values.
  PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

  // Concept #5: Format the results and hand them to WriteOneFilePerWindow. Because there may
  // be failures followed by retries, the writes must be idempotent; the details of writing
  // to files are elided here.
  wordCounts
      .apply(MapElements.via(new WordCount.FormatAsTextFn()))
      .apply(new WriteOneFilePerWindow(outputTarget, options.getNumShards()));

  PipelineResult result = pipeline.run();
  try {
    result.waitUntilFinish();
  } catch (Exception runFailure) {
    // A failure while waiting leads to cancellation; the exception is not propagated.
    result.cancel();
  }
}
use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.
the class LeaderBoardTest method testUserScore.
/**
 * Feeds {@link CalculateUserScores} a mix of on-time and late elements and checks the output it
 * emits into the {@link GlobalWindow}. Every element that arrives should count towards the
 * score, even when it arrives later than the maximum allowed lateness.
 */
@Test
public void testUserScore() {
  // Event-time offset used for the one element placed beyond the allowed lateness.
  Duration beyondLateness = (ALLOWED_LATENESS).plus(Duration.standardHours(13));

  TestStream<GameActionInfo> gameEvents =
      TestStream.create(AvroCoder.of(GameActionInfo.class))
          // First batch of elements.
          .addElements(
              event(TestUser.BLUE_ONE, 12, Duration.ZERO),
              event(TestUser.RED_ONE, 3, Duration.ZERO))
          .advanceProcessingTime(Duration.standardMinutes(7))
          // Second batch, added after processing time has moved forward.
          .addElements(
              event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
              event(TestUser.BLUE_TWO, 3, Duration.ZERO),
              event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3)))
          .advanceProcessingTime(Duration.standardMinutes(5))
          // Move the watermark past the allowed lateness before adding further elements.
          .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS).plus(Duration.standardHours(12)))
          .addElements(
              event(TestUser.RED_ONE, 3, Duration.standardMinutes(7)),
              event(TestUser.RED_ONE, 2, beyondLateness))
          .advanceProcessingTime(Duration.standardMinutes(6))
          .addElements(event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(12)))
          .advanceProcessingTime(Duration.standardMinutes(20))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> scores =
      p.apply(gameEvents).apply(new CalculateUserScores(ALLOWED_LATENESS));

  // User scores are emitted in speculative panes in the Global Window. Matching on early
  // panes only keeps out the panes produced when the watermark advances to positive infinity,
  // which will not occur outside of tests.
  PAssert.that(scores)
      .inEarlyGlobalWindowPanes()
      .containsInAnyOrder(
          KV.of(TestUser.BLUE_ONE.getUser(), 15),
          KV.of(TestUser.RED_ONE.getUser(), 7),
          KV.of(TestUser.RED_ONE.getUser(), 12),
          KV.of(TestUser.BLUE_TWO.getUser(), 3),
          KV.of(TestUser.BLUE_TWO.getUser(), 8));

  p.run().waitUntilFinish();
}
Aggregations