
Example 91 with KV

Use of org.apache.beam.sdk.values.KV in project beam by apache.

The class WindowedWordCount, method main:

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    final String output = options.getOutput();
    final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
    final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
    Pipeline pipeline = Pipeline.create(options);
    /*
     * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input =
        pipeline
            .apply(TextIO.read().from(options.getInputFile()))
            .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
    /*
     * Concept #3: Window into fixed windows. The fixed window size for this example defaults to
     * 1 minute (you can change this with a command-line option). See the documentation for more
     * information on how fixed windows work, and for information on the other types of windowing
     * available (e.g., sliding windows).
     */
    PCollection<String> windowedWords =
        input.apply(
            Window.<String>into(
                FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
    /*
     * Concept #4: Re-use our existing CountWords transform that does not have knowledge of
     * windows over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
    /*
     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
     * simple ParDo operation. Because there may be failures followed by retries, the writes must
     * be idempotent, but the details of writing to files are elided here.
     */
    wordCounts
        .apply(MapElements.via(new WordCount.FormatAsTextFn()))
        .apply(new WriteOneFilePerWindow(output, options.getNumShards()));
    PipelineResult result = pipeline.run();
    try {
        result.waitUntilFinish();
    } catch (Exception exc) {
        result.cancel();
    }
}
Also used : ExampleOptions(org.apache.beam.examples.common.ExampleOptions) ExampleBigQueryTableOptions(org.apache.beam.examples.common.ExampleBigQueryTableOptions) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) WriteOneFilePerWindow(org.apache.beam.examples.common.WriteOneFilePerWindow) Instant(org.joda.time.Instant) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) Pipeline(org.apache.beam.sdk.Pipeline)
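
The AddTimestampFn referenced above is not shown in this snippet. As a hedged sketch only (it mirrors the idea of Concept #1, not necessarily the project's exact code), such a DoFn could assign each element a pseudo-random event timestamp between minTimestamp and maxTimestamp:

import java.util.concurrent.ThreadLocalRandom;
import org.apache.beam.sdk.transforms.DoFn;
import org.joda.time.Instant;

// Illustrative sketch: gives bounded text input synthetic event timestamps so that
// the fixed-window logic above has something meaningful to window on.
class AddTimestampFn extends DoFn<String, String> {

    private final Instant minTimestamp;
    private final Instant maxTimestamp;

    AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) {
        this.minTimestamp = minTimestamp;
        this.maxTimestamp = maxTimestamp;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // Pick a random timestamp in [minTimestamp, maxTimestamp).
        Instant randomTimestamp = new Instant(
            ThreadLocalRandom.current().nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis()));
        // Emit the element with the synthetic timestamp instead of the source's default.
        c.outputWithTimestamp(c.element(), randomTimestamp);
    }
}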

Example 92 with KV

Use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.

The class HourlyTeamScore, method configureOutput:

/**
   * Create a map of information that describes how to write pipeline output to text. This map
   * is passed to the {@link WriteToText} constructor to write team score sums and
   * includes information about window start time.
   */
protected static Map<String, WriteToText.FieldFn<KV<String, Integer>>> configureOutput() {
    Map<String, WriteToText.FieldFn<KV<String, Integer>>> config = new HashMap<>();
    config.put("team", (c, w) -> c.element().getKey());
    config.put("total_score", (c, w) -> c.element().getValue());
    config.put("window_start", (c, w) -> {
        IntervalWindow window = (IntervalWindow) w;
        return fmt.print(window.start());
    });
    return config;
}
Also used : WriteToText(com.google.cloud.dataflow.examples.complete.game.utils.WriteToText) HashMap(java.util.HashMap) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
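
The window_start lambda above relies on a Joda-Time formatter named fmt that is defined elsewhere in HourlyTeamScore. A minimal sketch of such a formatter follows; the pattern and time zone here are assumptions for illustration, not necessarily what the project uses:

import org.joda.time.DateTimeZone;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

public class WindowStartFormatSketch {

    // Hypothetical formatter for window start times; pattern and zone are illustrative only.
    static final DateTimeFormatter fmt =
        DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
            .withZone(DateTimeZone.forID("America/Los_Angeles"));

    public static void main(String[] args) {
        // Prints the current instant the same way window.start() is formatted above.
        System.out.println(fmt.print(Instant.now()));
    }
}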

Example 93 with KV

Use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.

The class UserScore, method main:

/**
   * Run a batch pipeline.
   */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
    // Begin constructing a pipeline configured by commandline flags.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);
    // Read events from a text file and parse them.
    pipeline
        .apply(TextIO.read().from(options.getInput()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
        .apply("ExtractUserScore", new ExtractAndSumScore("user"))
        .apply(
            "WriteUserScoreSums",
            new WriteToText<KV<String, Integer>>(options.getOutput(), configureOutput(), false));
    // Run the batch pipeline.
    pipeline.run().waitUntilFinish();
}
Also used : PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline)
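
ExtractAndSumScore is defined elsewhere in the game examples. As a rough sketch of the per-key summation pattern it stands for (the input format and transform below are illustrative, not the project's actual implementation), KV<String, Integer> pairs can be summed per key with Sum.integersPerKey():

import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

// Illustrative sketch: parses "user,score" lines and sums scores per user.
class SumScoresPerKey extends PTransform<PCollection<String>, PCollection<KV<String, Integer>>> {

    @Override
    public PCollection<KV<String, Integer>> expand(PCollection<String> lines) {
        return lines
            .apply(MapElements
                .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                .via((String line) -> {
                    String[] parts = line.split(",");
                    return KV.of(parts[0].trim(), Integer.parseInt(parts[1].trim()));
                }))
            .apply(Sum.integersPerKey());
    }
}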

Example 94 with KV

Use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.

The class WindowedWordCount, method main:

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    final String output = options.getOutput();
    final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
    final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
    Pipeline pipeline = Pipeline.create(options);
    /*
     * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input =
        pipeline
            .apply(TextIO.read().from(options.getInputFile()))
            .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
    /*
     * Concept #3: Window into fixed windows. The fixed window size for this example defaults to
     * 1 minute (you can change this with a command-line option). See the documentation for more
     * information on how fixed windows work, and for information on the other types of windowing
     * available (e.g., sliding windows).
     */
    PCollection<String> windowedWords =
        input.apply(
            Window.<String>into(
                FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
    /*
     * Concept #4: Re-use our existing CountWords transform that does not have knowledge of
     * windows over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
    /*
     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
     * simple ParDo operation. Because there may be failures followed by retries, the writes must
     * be idempotent, but the details of writing to files are elided here.
     */
    wordCounts
        .apply(MapElements.via(new WordCount.FormatAsTextFn()))
        .apply(new WriteOneFilePerWindow(output, options.getNumShards()));
    PipelineResult result = pipeline.run();
    try {
        result.waitUntilFinish();
    } catch (Exception exc) {
        result.cancel();
    }
}
Also used : ExampleBigQueryTableOptions(com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions) ExampleOptions(com.google.cloud.dataflow.examples.common.ExampleOptions) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) WriteOneFilePerWindow(com.google.cloud.dataflow.examples.common.WriteOneFilePerWindow) Instant(org.joda.time.Instant) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) Pipeline(org.apache.beam.sdk.Pipeline)
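
WordCount.FormatAsTextFn is referenced in both WindowedWordCount examples but not shown. A minimal sketch of a formatting function in that spirit (not necessarily the exact WordCount code) maps each KV<String, Long> count to one line of text:

import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;

// Illustrative sketch: formats a per-word count as "word: count" for text output.
class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {

    @Override
    public String apply(KV<String, Long> input) {
        return input.getKey() + ": " + input.getValue();
    }
}

Passed to MapElements.via(...), a function like this produces the PCollection<String> that WriteOneFilePerWindow consumes.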

Example 95 with KV

Use of org.apache.beam.sdk.values.KV in project DataflowJavaSDK-examples by GoogleCloudPlatform.

The class LeaderBoardTest, method testUserScore:

/**
   * A test where elements arrive both on-time and late in {@link CalculateUserScores}, which emits
   * output into the {@link GlobalWindow}. All elements that arrive should be taken into account,
   * even if they arrive later than the maximum allowed lateness.
   */
@Test
public void testUserScore() {
    TestStream<GameActionInfo> infos =
        TestStream.create(AvroCoder.of(GameActionInfo.class))
            .addElements(
                event(TestUser.BLUE_ONE, 12, Duration.ZERO),
                event(TestUser.RED_ONE, 3, Duration.ZERO))
            .advanceProcessingTime(Duration.standardMinutes(7))
            .addElements(
                event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                event(TestUser.BLUE_TWO, 3, Duration.ZERO),
                event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3)))
            .advanceProcessingTime(Duration.standardMinutes(5))
            .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS).plus(Duration.standardHours(12)))
            .addElements(
                event(TestUser.RED_ONE, 3, Duration.standardMinutes(7)),
                event(TestUser.RED_ONE, 2, ALLOWED_LATENESS.plus(Duration.standardHours(13))))
            .advanceProcessingTime(Duration.standardMinutes(6))
            .addElements(event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(12)))
            .advanceProcessingTime(Duration.standardMinutes(20))
            .advanceWatermarkToInfinity();
    PCollection<KV<String, Integer>> userScores = p.apply(infos).apply(new CalculateUserScores(ALLOWED_LATENESS));
    // User scores are emitted in speculative panes in the Global Window - this matcher choice
    // ensures that panes emitted by the watermark advancing to positive infinity are not included,
    // as that will not occur outside of tests
    PAssert.that(userScores)
        .inEarlyGlobalWindowPanes()
        .containsInAnyOrder(
            KV.of(TestUser.BLUE_ONE.getUser(), 15),
            KV.of(TestUser.RED_ONE.getUser(), 7),
            KV.of(TestUser.RED_ONE.getUser(), 12),
            KV.of(TestUser.BLUE_TWO.getUser(), 3),
            KV.of(TestUser.BLUE_TWO.getUser(), 8));
    p.run().waitUntilFinish();
}
Also used : GameActionInfo(com.google.cloud.dataflow.examples.complete.game.UserScore.GameActionInfo) KV(org.apache.beam.sdk.values.KV) CalculateUserScores(com.google.cloud.dataflow.examples.complete.game.LeaderBoard.CalculateUserScores) Test(org.junit.Test)
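
For comparison, here is a small self-contained sketch (not LeaderBoard code; the class and test names are made up) showing the same TestStream plus PAssert pattern applied directly to KV elements:

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.TestStream;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

public class KvTestStreamSketchTest {

    @Rule
    public final transient TestPipeline p = TestPipeline.create();

    @Test
    public void sumsPerKeyFromTestStream() {
        // Two batches of KV elements, then the watermark is advanced to the end of time.
        TestStream<KV<String, Integer>> stream =
            TestStream.create(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))
                .addElements(KV.of("blue", 12), KV.of("red", 3))
                .addElements(KV.of("blue", 3))
                .advanceWatermarkToInfinity();

        PCollection<KV<String, Integer>> sums = p.apply(stream).apply(Sum.integersPerKey());

        // With the default global window and trigger, the final pane holds the per-key sums.
        PAssert.that(sums).containsInAnyOrder(KV.of("blue", 15), KV.of("red", 3));
        p.run().waitUntilFinish();
    }
}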

Aggregations

KV (org.apache.beam.sdk.values.KV): 192
Test (org.junit.Test): 143
Instant (org.joda.time.Instant): 66
Category (org.junit.experimental.categories.Category): 62
Pipeline (org.apache.beam.sdk.Pipeline): 35
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 34
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 33
Matchers.containsString (org.hamcrest.Matchers.containsString): 33
StateSpec (org.apache.beam.sdk.state.StateSpec): 25
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22
ArrayList (java.util.ArrayList): 19
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 19
TupleTag (org.apache.beam.sdk.values.TupleTag): 16
TableRow (com.google.api.services.bigquery.model.TableRow): 15
Map (java.util.Map): 15
ValueState (org.apache.beam.sdk.state.ValueState): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
HashMap (java.util.HashMap): 12
Timer (org.apache.beam.sdk.state.Timer): 12