
Example 66 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class SparkKeyedCombineFn method mergeCombiners.

/**
   * Implements Spark's mergeCombiners function in:
   * <p>
   * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
   * </p>
   */
Iterable<WindowedValue<KV<K, AccumT>>> mergeCombiners(
        Iterable<WindowedValue<KV<K, AccumT>>> a1,
        Iterable<WindowedValue<KV<K, AccumT>>> a2) {
    // concatenate accumulators.
    Iterable<WindowedValue<KV<K, AccumT>>> accumulators = Iterables.concat(a1, a2);
    // sort accumulators, no need to explode since inputs were exploded.
    Iterable<WindowedValue<KV<K, AccumT>>> sortedAccumulators = sortByWindows(accumulators);
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    //--- accumulators iterator, by window order.
    final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedAccumulators.iterator();
    // get the first accumulator and assign it to the current window's accumulators.
    WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
    List<AccumT> currentWindowAccumulators = Lists.newArrayList();
    currentWindowAccumulators.add(currentValue.getValue().getValue());
    // keep track of the timestamps assigned by the TimestampCombiner;
    // createCombiner already combined the timestamps assigned to individual
    // elements, so here we only need to merge the per-accumulator timestamps.
    List<Instant> windowTimestamps = Lists.newArrayList();
    windowTimestamps.add(currentValue.getTimestamp());
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // add to window accumulators.
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            windowTimestamps.add(nextValue.getTimestamp());
        } else {
            // before moving to the next window, add the current accumulation
            // to the output and re-initialize the accumulation.
            // merge the timestamps of all accumulators in this window.
            Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
            // merge accumulators for the (possibly merged) window,
            // transforming a KV<K, Iterable<AccumT>> into a KV<K, AccumT>.
            Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
            WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue = WindowedValue.of(KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
            // applying the actual combiner onto the accumulators.
            AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
            WindowedValue<KV<K, AccumT>> postMergeWindowedValue = preMergeWindowedValue.withValue(KV.of(key, accumulated));
            // emit the accumulated output.
            output.add(postMergeWindowedValue);
            // re-init accumulator, window and timestamps.
            currentWindowAccumulators.clear();
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            currentWindow = nextWindow;
            windowTimestamps.clear();
            windowTimestamps.add(nextValue.getTimestamp());
        }
    }
    // merge the last chunk of accumulators.
    Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
    Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
    WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue = WindowedValue.of(KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
    AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
    WindowedValue<KV<K, AccumT>> postMergeWindowedValue = preMergeWindowedValue.withValue(KV.of(key, accumulated));
    output.add(postMergeWindowedValue);
    return output;
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
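
The isIntersecting and merge helpers used in mergeCombiners are not part of this excerpt. A minimal sketch of what they could look like, assuming they simply delegate to IntervalWindow's intersects and span methods (the bodies below are an illustration, not the Beam source):

private static boolean isIntersecting(IntervalWindow current, IntervalWindow next) {
    // true when the two interval windows overlap (assumption: plain delegation).
    return current.intersects(next);
}

private static IntervalWindow merge(IntervalWindow current, IntervalWindow next) {
    // the smallest interval window covering both inputs.
    return current.span(next);
}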

Example 67 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class SparkKeyedCombineFn method createCombiner.

/**
   * Implements Spark's createCombiner function in:
   * <p>
   * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
   * </p>
   */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
    // sort exploded inputs.
    Iterable<WindowedValue<KV<K, InputT>>> sortedInputs = sortByWindows(wkvi.explodeWindows());
    TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    //--- inputs iterator, by window order.
    final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInputs.iterator();
    WindowedValue<KV<K, InputT>> currentInput = iterator.next();
    BoundedWindow currentWindow = Iterables.getFirst(currentInput.getWindows(), null);
    // first create the accumulator and accumulate first input.
    K key = currentInput.getValue().getKey();
    AccumT accumulator = combineFn.createAccumulator(ctxtForInput(currentInput));
    accumulator = combineFn.addInput(accumulator, currentInput.getValue().getValue(), ctxtForInput(currentInput));
    // keep track of the timestamps assigned by the TimestampCombiner.
    Instant windowTimestamp = timestampCombiner.assign(currentWindow, windowingStrategy.getWindowFn().getOutputTime(currentInput.getTimestamp(), currentWindow));
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, InputT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // keep accumulating and carry on ;-)
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            windowTimestamp = timestampCombiner.combine(windowTimestamp, timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
        } else {
            // moving to the next window, first add the current accumulation to output
            // and initialize the accumulator.
            output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
            // re-init accumulator, window and timestamp.
            accumulator = combineFn.createAccumulator(ctxtForInput(nextValue));
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            currentWindow = nextWindow;
            windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
        }
    }
    // add last accumulator to the output.
    output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
    return output;
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
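
Taken together, createCombiner (Example 67) and mergeCombiners (Example 66) are two ends of Spark's combineByKey contract. A minimal wiring sketch, assuming an input JavaPairRDD keyed by K, a SparkKeyedCombineFn instance named fn, and a mergeValue method with the parameter order shown below (mergeValue and the exact type parameters of SparkKeyedCombineFn are assumptions, not part of this excerpt):

static <K, InputT, AccumT> JavaPairRDD<K, Iterable<WindowedValue<KV<K, AccumT>>>> combinePerKey(
        JavaPairRDD<K, WindowedValue<KV<K, InputT>>> pairs,
        SparkKeyedCombineFn<K, InputT, AccumT, ?> fn) {
    return pairs.<Iterable<WindowedValue<KV<K, AccumT>>>>combineByKey(
        // createCombiner: seed the per-window accumulators from a single input (Example 67).
        input -> fn.createCombiner(input),
        // mergeValue: fold one more input into existing accumulators (signature assumed).
        (accums, input) -> fn.mergeValue(input, accums),
        // mergeCombiners: merge two partial accumulator sets (Example 66).
        (a1, a2) -> fn.mergeCombiners(a1, a2));
}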

Example 68 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class StreamingTransformTranslator method groupByKey.

private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
    return new TransformEvaluator<GroupByKey<K, V>>() {

        @Override
        public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked") UnboundedDataset<KV<K, V>> inputDataset = (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
            List<Integer> streamSources = inputDataset.getStreamSources();
            JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
            @SuppressWarnings("unchecked") final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            @SuppressWarnings("unchecked") final WindowingStrategy<?, W> windowingStrategy = (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();
            //--- coders.
            final WindowedValue.WindowedValueCoder<V> wvCoder = WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
            //--- group by key only.
            JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupedByKeyStream = dStream.transform(new Function<JavaRDD<WindowedValue<KV<K, V>>>, JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>() {

                @Override
                public JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> call(JavaRDD<WindowedValue<KV<K, V>>> rdd) throws Exception {
                    return GroupCombineFunctions.groupByKeyOnly(rdd, coder.getKeyCoder(), wvCoder);
                }
            });
            //--- now group also by window.
            JavaDStream<WindowedValue<KV<K, Iterable<V>>>> outStream = SparkGroupAlsoByWindowViaWindowSet.groupAlsoByWindow(groupedByKeyStream, coder.getKeyCoder(), wvCoder, windowingStrategy, runtimeContext, streamSources);
            context.putDataset(transform, new UnboundedDataset<>(outStream, streamSources));
        }

        @Override
        public String toNativeString() {
            return "groupByKey()";
        }
    };
}
Also used : GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) SparkAssignWindowFn(org.apache.beam.runners.spark.translation.SparkAssignWindowFn) KvCoder(org.apache.beam.sdk.coders.KvCoder) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext)
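
For context, this evaluator translates the SDK-level GroupByKey transform into Spark streaming primitives. A minimal pipeline sketch (an illustration, not taken from the Beam repository) of the kind of windowed GroupByKey it handles, where pipeline is a hypothetical Pipeline created against Spark options:

PCollection<KV<String, Iterable<Integer>>> grouped =
    pipeline
        .apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)))
        // assign fixed one-minute windows before grouping.
        .apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(1))))
        // group values per key and per window; this is what the evaluator above translates.
        .apply(GroupByKey.<String, Integer>create());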

Example 69 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class LeaderBoardTest method testTeamScoresDroppablyLate.

/**
   * A test where elements arrive beyond the maximum allowed lateness. These elements are dropped
   * within {@link CalculateTeamScores} and do not impact the final result.
   */
@Test
public void testTeamScoresDroppablyLate() {
    BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);
    TestStream<GameActionInfo> infos =
        TestStream.create(AvroCoder.of(GameActionInfo.class))
            .addElements(
                event(TestUser.BLUE_ONE, 12, Duration.ZERO),
                event(TestUser.RED_ONE, 3, Duration.ZERO))
            .advanceWatermarkTo(window.maxTimestamp())
            .addElements(
                event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                event(TestUser.BLUE_TWO, 3, Duration.ZERO),
                event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3)))
            .advanceWatermarkTo(
                baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1)))
            .addElements(
                event(TestUser.BLUE_TWO, 3, TEAM_WINDOW_DURATION.minus(Duration.standardSeconds(5))),
                event(TestUser.RED_ONE, 7, Duration.standardMinutes(4)))
            .advanceWatermarkToInfinity();
    PCollection<KV<String, Integer>> teamScores = p.apply(infos).apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));
    String blueTeam = TestUser.BLUE_ONE.getTeam();
    String redTeam = TestUser.RED_ONE.getTeam();
    // Only one on-time pane and no late panes should be emitted
    PAssert.that(teamScores).inWindow(window).containsInAnyOrder(KV.of(redTeam, 7), KV.of(blueTeam, 18));
    // No elements are added before the watermark passes the end of the window plus the allowed
    // lateness, so no refinement should be emitted
    PAssert.that(teamScores).inFinalPane(window).empty();
    p.run().waitUntilFinish();
}
Also used : GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)
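
The drop happens because CalculateTeamScores bounds lateness on its windows. A hedged sketch of the relevant windowing configuration, assuming fixed team-score windows with an allowed-lateness bound (the trigger shown is a simplified placeholder, not the exact LeaderBoard code):

PCollection<GameActionInfo> windowed =
    input.apply(
        Window.<GameActionInfo>into(FixedWindows.of(teamWindowDuration))
            // one on-time pane at the watermark, plus a pane per batch of late arrivals (simplified).
            .triggering(AfterWatermark.pastEndOfWindow()
                .withLateFirings(AfterPane.elementCountAtLeast(1)))
            // elements arriving later than this bound are dropped, as asserted in the test above.
            .withAllowedLateness(allowedLateness)
            .accumulatingFiredPanes());
// downstream (elided): extract KV<team, score> and sum per key, e.g. with Sum.integersPerKey().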

Example 70 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class LeaderBoardTest method testTeamScoresObservablyLate.

/**
   * A test where elements arrive behind the watermark (late data) after the watermark passes the
   * end of the window, but before the maximum allowed lateness. These elements are emitted in a
   * late pane.
   */
@Test
public void testTeamScoresObservablyLate() {
    Instant firstWindowCloses = baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION);
    TestStream<GameActionInfo> createEvents =
        TestStream.create(AvroCoder.of(GameActionInfo.class))
            .advanceWatermarkTo(baseTime)
            .addElements(
                event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)),
                event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)))
            .advanceProcessingTime(Duration.standardMinutes(10))
            .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3)))
            .addElements(
                event(TestUser.RED_ONE, 3, Duration.standardMinutes(1)),
                event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5)))
            .advanceWatermarkTo(firstWindowCloses.minus(Duration.standardMinutes(1)))
            .addElements(
                event(TestUser.RED_TWO, 2, Duration.ZERO),
                event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)),
                event(TestUser.RED_TWO, 3, Duration.standardMinutes(3)))
            .advanceProcessingTime(Duration.standardMinutes(12))
            .addElements(
                event(TestUser.RED_TWO, 9, Duration.standardMinutes(1)),
                event(TestUser.RED_TWO, 1, Duration.standardMinutes(3)))
            .advanceWatermarkToInfinity();
    PCollection<KV<String, Integer>> teamScores = p.apply(createEvents).apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));
    BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);
    String blueTeam = TestUser.BLUE_ONE.getTeam();
    String redTeam = TestUser.RED_ONE.getTeam();
    PAssert.that(teamScores).inWindow(window).satisfies((SerializableFunction<Iterable<KV<String, Integer>>, Void>) input -> {
        assertThat(input, hasItem(KV.of(blueTeam, 11)));
        assertThat(input, hasItem(KV.of(redTeam, 27)));
        return null;
    });
    PAssert.thatMap(teamScores).inOnTimePane(window).isEqualTo(ImmutableMap.<String, Integer>builder().put(redTeam, 7).put(blueTeam, 11).build());
    // No final pane is emitted for the blue team, as all of their updates have been taken into
    // account in earlier panes
    PAssert.that(teamScores).inFinalPane(window).containsInAnyOrder(KV.of(redTeam, 27));
    p.run().waitUntilFinish();
}
Also used : KV(org.apache.beam.sdk.values.KV) GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) PTransform(org.apache.beam.sdk.transforms.PTransform) Assert.assertThat(org.junit.Assert.assertThat) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) ImmutableMap(com.google.common.collect.ImmutableMap) PAssert(org.apache.beam.sdk.testing.PAssert) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) Serializable(java.io.Serializable) Matchers.hasItem(org.hamcrest.Matchers.hasItem) Rule(org.junit.Rule) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) CalculateUserScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateUserScores) TestStream(org.apache.beam.sdk.testing.TestStream) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
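
Both tests build their input through an event(...) helper that is not shown here. A plausible reconstruction, assuming a GameActionInfo(user, team, score, timestampMillis) constructor and the TestUser accessors shown below (all assumptions, not part of this excerpt):

private TimestampedValue<GameActionInfo> event(TestUser user, int score, Duration baseTimeOffset) {
    // timestamp the event relative to the test's base time.
    Instant timestamp = baseTime.plus(baseTimeOffset);
    return TimestampedValue.of(
        new GameActionInfo(user.getUser(), user.getTeam(), score, timestamp.getMillis()),
        timestamp);
}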

Aggregations

KV (org.apache.beam.sdk.values.KV): 192
Test (org.junit.Test): 143
Instant (org.joda.time.Instant): 66
Category (org.junit.experimental.categories.Category): 62
Pipeline (org.apache.beam.sdk.Pipeline): 35
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 34
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 33
Matchers.containsString (org.hamcrest.Matchers.containsString): 33
StateSpec (org.apache.beam.sdk.state.StateSpec): 25
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22
ArrayList (java.util.ArrayList): 19
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 19
TupleTag (org.apache.beam.sdk.values.TupleTag): 16
TableRow (com.google.api.services.bigquery.model.TableRow): 15
Map (java.util.Map): 15
ValueState (org.apache.beam.sdk.state.ValueState): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
HashMap (java.util.HashMap): 12
Timer (org.apache.beam.sdk.state.Timer): 12