Search in sources :

Example 21 with BoundedWindow

use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.

the class SparkGlobalCombineFn method combOp.

/**
 * Implements Spark's combOp function in:
 * <p>
 * {@link org.apache.spark.api.java.JavaRDD#aggregate}.
 * </p>
 *
 * <p>Concatenates both accumulator iterables, sorts them by window and merges every run of
 * accumulators that share a window (or, for merging window functions, whose windows
 * intersect) into a single accumulator for that window.
 *
 * <p>When the window function is merging, windows are cast to {@link IntervalWindow}.
 *
 * @param a1 first iterable of windowed accumulators
 * @param a2 second iterable of windowed accumulators
 * @return one merged accumulator per resulting window, in sorted-window order; an empty list
 *     when both inputs are empty
 */
Iterable<WindowedValue<AccumT>> combOp(Iterable<WindowedValue<AccumT>> a1, Iterable<WindowedValue<AccumT>> a2) {
    // concatenate accumulators.
    Iterable<WindowedValue<AccumT>> accumulators = Iterables.concat(a1, a2);
    // if empty, return an empty accumulators iterable.
    if (!accumulators.iterator().hasNext()) {
        return Lists.newArrayList();
    }
    // sort accumulators, no need to explode since inputs were exploded.
    Iterable<WindowedValue<AccumT>> sortedAccumulators = sortByWindows(accumulators);
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    //--- accumulators iterator, by window order.
    final Iterator<WindowedValue<AccumT>> iterator = sortedAccumulators.iterator();
    // get the first accumulator and assign it to the current window's accumulators.
    WindowedValue<AccumT> currentValue = iterator.next();
    BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
    List<AccumT> currentWindowAccumulators = Lists.newArrayList();
    currentWindowAccumulators.add(currentValue.getValue());
    // keep track of the timestamps assigned by the TimestampCombiner,
    // in createCombiner we already merge the timestamps assigned
    // to individual elements, here we will just merge them.
    List<Instant> windowTimestamps = Lists.newArrayList();
    windowTimestamps.add(currentValue.getTimestamp());
    // accumulate the next windows, or output.
    List<WindowedValue<AccumT>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<AccumT> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // add to window accumulators.
            currentWindowAccumulators.add(nextValue.getValue());
            windowTimestamps.add(nextValue.getTimestamp());
        } else {
            // before moving to the next window, emit the accumulation of the current one.
            output.add(mergeWindowAccumulators(timestampCombiner, currentWindow, currentWindowAccumulators, windowTimestamps));
            // re-init accumulator, window and timestamps.
            currentWindowAccumulators.clear();
            currentWindowAccumulators.add(nextValue.getValue());
            currentWindow = nextWindow;
            windowTimestamps.clear();
            windowTimestamps.add(nextValue.getTimestamp());
        }
    }
    // merge the last chunk of accumulators.
    output.add(mergeWindowAccumulators(timestampCombiner, currentWindow, currentWindowAccumulators, windowTimestamps));
    return output;
}

/**
 * Merges all accumulators collected for a single (possibly merged) window into one windowed
 * accumulator, stamped with the merged timestamp and {@link PaneInfo#NO_FIRING}.
 */
private WindowedValue<AccumT> mergeWindowAccumulators(TimestampCombiner timestampCombiner, BoundedWindow window, List<AccumT> windowAccumulators, List<Instant> timestamps) {
    // merge the timestamps of all accumulators to merge.
    Instant mergedTimestamp = timestampCombiner.merge(window, timestamps);
    // wrap the accumulators as an Iterable<AccumT> value of the window, so that a combine
    // context can be built for it.
    Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(windowAccumulators);
    WindowedValue<Iterable<AccumT>> preMergeWindowedValue = WindowedValue.of(accumsToMerge, mergedTimestamp, window, PaneInfo.NO_FIRING);
    // applying the actual combiner onto the accumulators; this runs before the caller
    // clears the backing list.
    AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
    return preMergeWindowedValue.withValue(accumulated);
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)

Example 22 with BoundedWindow

use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.

the class SparkKeyedCombineFn method mergeCombiners.

/**
 * Implements Spark's mergeCombiners function in:
 * <p>
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 * </p>
 *
 * <p>Concatenates both accumulator iterables, sorts them by window and merges every run of
 * accumulators that share a window (or, for merging window functions, whose windows
 * intersect) into a single keyed accumulator for that window.
 *
 * <p>The key of the first sorted accumulator is used for every output value; presumably all
 * inputs share one key, as they come from a per-key combine (verify against the caller).
 * When the window function is merging, windows are cast to {@link IntervalWindow}.
 *
 * @param a1 first iterable of windowed, keyed accumulators
 * @param a2 second iterable of windowed, keyed accumulators
 * @return one merged keyed accumulator per resulting window, in sorted-window order; an empty
 *     list when both inputs are empty
 */
Iterable<WindowedValue<KV<K, AccumT>>> mergeCombiners(Iterable<WindowedValue<KV<K, AccumT>>> a1, Iterable<WindowedValue<KV<K, AccumT>>> a2) {
    // concatenate accumulators.
    Iterable<WindowedValue<KV<K, AccumT>>> accumulators = Iterables.concat(a1, a2);
    // if empty, return an empty accumulators iterable instead of failing on iterator.next().
    if (!accumulators.iterator().hasNext()) {
        return Lists.newArrayList();
    }
    // sort accumulators, no need to explode since inputs were exploded.
    Iterable<WindowedValue<KV<K, AccumT>>> sortedAccumulators = sortByWindows(accumulators);
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    //--- accumulators iterator, by window order.
    final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedAccumulators.iterator();
    // get the first accumulator and assign it to the current window's accumulators.
    WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
    List<AccumT> currentWindowAccumulators = Lists.newArrayList();
    currentWindowAccumulators.add(currentValue.getValue().getValue());
    // keep track of the timestamps assigned by the TimestampCombiner,
    // in createCombiner we already merge the timestamps assigned
    // to individual elements, here we will just merge them.
    List<Instant> windowTimestamps = Lists.newArrayList();
    windowTimestamps.add(currentValue.getTimestamp());
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // add to window accumulators.
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            windowTimestamps.add(nextValue.getTimestamp());
        } else {
            // before moving to the next window, emit the accumulation of the current one.
            output.add(mergeKeyedWindowAccumulators(key, timestampCombiner, currentWindow, currentWindowAccumulators, windowTimestamps));
            // re-init accumulator, window and timestamps.
            currentWindowAccumulators.clear();
            currentWindowAccumulators.add(nextValue.getValue().getValue());
            currentWindow = nextWindow;
            windowTimestamps.clear();
            windowTimestamps.add(nextValue.getTimestamp());
        }
    }
    // merge the last chunk of accumulators.
    output.add(mergeKeyedWindowAccumulators(key, timestampCombiner, currentWindow, currentWindowAccumulators, windowTimestamps));
    return output;
}

/**
 * Merges all accumulators collected for a single (possibly merged) window into one windowed
 * {@code KV<K, AccumT>}, stamped with the merged timestamp and {@link PaneInfo#NO_FIRING}.
 */
private WindowedValue<KV<K, AccumT>> mergeKeyedWindowAccumulators(K key, TimestampCombiner timestampCombiner, BoundedWindow window, List<AccumT> windowAccumulators, List<Instant> timestamps) {
    // merge the timestamps of all accumulators to merge.
    Instant mergedTimestamp = timestampCombiner.merge(window, timestamps);
    // wrap the accumulators as a KV<K, Iterable<AccumT>> value of the window, so that a
    // combine context can be built for it.
    Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(windowAccumulators);
    WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue = WindowedValue.of(KV.of(key, accumsToMerge), mergedTimestamp, window, PaneInfo.NO_FIRING);
    // applying the actual combiner onto the accumulators, yielding a KV<K, AccumT>; this runs
    // before the caller clears the backing list.
    AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
    return preMergeWindowedValue.withValue(KV.of(key, accumulated));
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)

Example 23 with BoundedWindow

use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.

the class SparkKeyedCombineFn method createCombiner.

/**
 * Implements Spark's createCombiner function in:
 * <p>
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 * </p>
 *
 * <p>Explodes the input into one value per window, sorts those by window and folds each run
 * of values that share a window (or, for merging window functions, whose windows intersect)
 * into a fresh accumulator. The output timestamp of each accumulator is computed with the
 * windowing strategy's {@link TimestampCombiner}.
 *
 * <p>When the window function is merging, windows are cast to {@link IntervalWindow}.
 *
 * @param wkvi the windowed key-value input to start accumulating from
 * @return one keyed accumulator per resulting window, in sorted-window order
 */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
    // sort exploded inputs.
    Iterable<WindowedValue<KV<K, InputT>>> sortedInputs = sortByWindows(wkvi.explodeWindows());
    TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    // fetched once and reused for every output-time computation and the merging check below.
    WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    //--- inputs iterator, by window order.
    final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInputs.iterator();
    // NOTE(review): assumes the exploded input holds at least one value - confirm.
    WindowedValue<KV<K, InputT>> currentInput = iterator.next();
    BoundedWindow currentWindow = Iterables.getFirst(currentInput.getWindows(), null);
    // first create the accumulator and accumulate first input.
    K key = currentInput.getValue().getKey();
    AccumT accumulator = combineFn.createAccumulator(ctxtForInput(currentInput));
    accumulator = combineFn.addInput(accumulator, currentInput.getValue().getValue(), ctxtForInput(currentInput));
    // keep track of the timestamps assigned by the TimestampCombiner.
    Instant windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(currentInput.getTimestamp(), currentWindow));
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowFn.isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, InputT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // keep accumulating and carry on ;-)
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            // the new element's timestamp is assigned against the (possibly just merged) window.
            windowTimestamp = timestampCombiner.combine(windowTimestamp, timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
        } else {
            // moving to the next window, first add the current accumulation to output
            // and initialize the accumulator.
            output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
            // re-init accumulator, window and timestamp.
            accumulator = combineFn.createAccumulator(ctxtForInput(nextValue));
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            currentWindow = nextWindow;
            windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
        }
    }
    // add last accumulator to the output.
    output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
    return output;
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)

Example 24 with BoundedWindow

use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.

the class SparkSideInputReader method get.

/**
 * Returns the side-input value of {@code view} for the side-input window that {@code window}
 * maps to.
 *
 * @param view the side-input view to read; must not be null and must be present in
 *     {@code sideInputs}
 * @param window the main-input window, mapped to a side-input window through the view's
 *     window mapping function
 * @return the result of applying the view's view function to the broadcast values that belong
 *     to the mapped side-input window
 */
@Nullable
@Override
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
    // validate the requested view and make sure a broadcast exists for its tag.
    checkNotNull(view, "The PCollectionView passed to sideInput cannot be null ");
    KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>> broadcastEntry = sideInputs.get(view.getTagInternal());
    checkNotNull(broadcastEntry, "SideInput for view " + view + " is not available.");
    // map the main-input window onto the window of the side input.
    final BoundedWindow targetWindow = view.getWindowMappingFn().getSideInputWindow(window);
    // the tag points to the side-input values of all windows; only the ones that belong to
    // the target window are kept.
    Iterable<WindowedValue<?>> allSideInputs = (Iterable<WindowedValue<?>>) broadcastEntry.getValue().getValue();
    Predicate<WindowedValue<?>> belongsToTargetWindow = new Predicate<WindowedValue<?>>() {

        @Override
        public boolean apply(@Nullable WindowedValue<?> candidate) {
            if (candidate != null) {
                // a single matching window is enough to accept the candidate.
                for (BoundedWindow candidateWindow : candidate.getWindows()) {
                    if (candidateWindow.equals(targetWindow)) {
                        return true;
                    }
                }
            }
            // null candidate, or none of its windows matched.
            return false;
        }
    };
    Iterable<WindowedValue<?>> matchingSideInputs = Iterables.filter(allSideInputs, belongsToTargetWindow);
    return view.getViewFn().apply(matchingSideInputs);
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) Nullable(javax.annotation.Nullable)

Example 25 with BoundedWindow

use of org.apache.beam.sdk.transforms.windowing.BoundedWindow in project beam by apache.

the class LeaderBoardTest method testTeamScoresDroppablyLate.

/**
 * Verifies that elements arriving beyond the maximum allowed lateness are dropped within
 * {@link CalculateTeamScores} and therefore have no effect on the final result.
 */
@Test
public void testTeamScoresDroppablyLate() {
    BoundedWindow teamWindow = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION);
    String redTeam = TestUser.RED_ONE.getTeam();
    String blueTeam = TestUser.BLUE_ONE.getTeam();
    TestStream<GameActionInfo> events = TestStream.create(AvroCoder.of(GameActionInfo.class))
            // added before the watermark reaches the end of the window.
            .addElements(
                    event(TestUser.BLUE_ONE, 12, Duration.ZERO),
                    event(TestUser.RED_ONE, 3, Duration.ZERO))
            .advanceWatermarkTo(teamWindow.maxTimestamp())
            // added after the watermark reached the end of the window.
            .addElements(
                    event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)),
                    event(TestUser.BLUE_TWO, 3, Duration.ZERO),
                    event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3)))
            .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1)))
            // added once the watermark is a minute past the window plus the allowed lateness.
            .addElements(
                    event(TestUser.BLUE_TWO, 3, TEAM_WINDOW_DURATION.minus(Duration.standardSeconds(5))),
                    event(TestUser.RED_ONE, 7, Duration.standardMinutes(4)))
            .advanceWatermarkToInfinity();
    PCollection<KV<String, Integer>> scoresPerTeam = p
            .apply(events)
            .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS));
    // Only one on-time pane and no late panes should be emitted; the expected totals cover
    // the first five events (red 3 + 4, blue 12 + 3 + 3) and exclude the last two.
    PAssert.that(scoresPerTeam)
            .inWindow(teamWindow)
            .containsInAnyOrder(KV.of(redTeam, 7), KV.of(blueTeam, 18));
    // No elements are added before the watermark passes the end of the window plus the allowed
    // lateness, so no refinement should be emitted
    PAssert.that(scoresPerTeam).inFinalPane(teamWindow).empty();
    p.run().waitUntilFinish();
}
Also used : GameActionInfo(org.apache.beam.examples.complete.game.UserScore.GameActionInfo) CalculateTeamScores(org.apache.beam.examples.complete.game.LeaderBoard.CalculateTeamScores) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) KV(org.apache.beam.sdk.values.KV) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test)

Aggregations

BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)54 Instant (org.joda.time.Instant)27 Test (org.junit.Test)26 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)21 KV (org.apache.beam.sdk.values.KV)20 WindowedValue (org.apache.beam.sdk.util.WindowedValue)14 ArrayList (java.util.ArrayList)7 TimerSpec (org.apache.beam.sdk.state.TimerSpec)7 Timer (org.apache.beam.sdk.state.Timer)6 Matchers.containsString (org.hamcrest.Matchers.containsString)6 DoFn (org.apache.beam.sdk.transforms.DoFn)5 StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString)5 ImmutableList (com.google.common.collect.ImmutableList)4 List (java.util.List)4 ValueState (org.apache.beam.sdk.state.ValueState)4 OnTimer (org.apache.beam.sdk.transforms.DoFn.OnTimer)4 TimestampCombiner (org.apache.beam.sdk.transforms.windowing.TimestampCombiner)4 PCollection (org.apache.beam.sdk.values.PCollection)4 TupleTag (org.apache.beam.sdk.values.TupleTag)4 Duration (org.joda.time.Duration)4