Search in sources :

Example 6 with TimestampCombiner

use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.

the class SparkKeyedCombineFn method createCombiner.

/**
   * Implements Spark's createCombiner function in:
   * <p>
   * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
   * </p>
   */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
    // sort exploded inputs.
    Iterable<WindowedValue<KV<K, InputT>>> sortedInputs = sortByWindows(wkvi.explodeWindows());
    TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    //--- inputs iterator, by window order.
    final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInputs.iterator();
    WindowedValue<KV<K, InputT>> currentInput = iterator.next();
    BoundedWindow currentWindow = Iterables.getFirst(currentInput.getWindows(), null);
    // first create the accumulator and accumulate first input.
    K key = currentInput.getValue().getKey();
    AccumT accumulator = combineFn.createAccumulator(ctxtForInput(currentInput));
    accumulator = combineFn.addInput(accumulator, currentInput.getValue().getValue(), ctxtForInput(currentInput));
    // keep track of the timestamps assigned by the TimestampCombiner.
    Instant windowTimestamp = timestampCombiner.assign(currentWindow, windowingStrategy.getWindowFn().getOutputTime(currentInput.getTimestamp(), currentWindow));
    // accumulate the next windows, or output.
    List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
    // if merging, merge overlapping windows, e.g. Sessions.
    final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
    while (iterator.hasNext()) {
        WindowedValue<KV<K, InputT>> nextValue = iterator.next();
        BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
        boolean mergingAndIntersecting = merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
        if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
            if (mergingAndIntersecting) {
                // merge intersecting windows.
                currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
            }
            // keep accumulating and carry on ;-)
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            windowTimestamp = timestampCombiner.combine(windowTimestamp, timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
        } else {
            // moving to the next window, first add the current accumulation to output
            // and initialize the accumulator.
            output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
            // re-init accumulator, window and timestamp.
            accumulator = combineFn.createAccumulator(ctxtForInput(nextValue));
            accumulator = combineFn.addInput(accumulator, nextValue.getValue().getValue(), ctxtForInput(nextValue));
            currentWindow = nextWindow;
            windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
        }
    }
    // add last accumulator to the output.
    output.add(WindowedValue.of(KV.of(key, accumulator), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
    return output;
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow)
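
For reference, the assign and combine calls that createCombiner keeps chaining can be exercised on their own. A minimal, self-contained sketch (the class name TimestampCombinerDemo and the literal timestamps are made up for illustration):

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.joda.time.Instant;

public class TimestampCombinerDemo {
    public static void main(String[] args) {
        // A ten-minute window: [0, 600000).
        IntervalWindow window = new IntervalWindow(new Instant(0), new Instant(600000));
        // END_OF_WINDOW assigns every element the window's maximum timestamp,
        // so combining assigned timestamps is trivial: they are all identical.
        Instant endOfWindow = TimestampCombiner.END_OF_WINDOW.assign(window, new Instant(3000));
        // LATEST keeps the element timestamp on assign and takes the maximum on combine,
        // which is exactly the running windowTimestamp pattern in the snippet above.
        Instant first = TimestampCombiner.LATEST.assign(window, new Instant(3000));
        Instant second = TimestampCombiner.LATEST.assign(window, new Instant(7000));
        Instant combined = TimestampCombiner.LATEST.combine(first, second);
        // endOfWindow is window.maxTimestamp() (00:09:59.999); combined is 00:00:07.000.
        System.out.println(endOfWindow + " " + combined);
    }
}

Swapping the strategy's TimestampCombiner therefore only changes what windowTimestamp ends up being; the accumulation logic is unaffected.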

Example 7 with TimestampCombiner

use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.

the class HashingFlinkCombineRunner method combine.

@Override
public void combine(FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner, WindowingStrategy<Object, W> windowingStrategy, SideInputReader sideInputReader, PipelineOptions options, Iterable<WindowedValue<KV<K, InputT>>> elements, Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();
    // Flink Iterable can be iterated over only once.
    List<WindowedValue<KV<K, InputT>>> inputs = new ArrayList<>();
    Iterables.addAll(inputs, elements);
    Set<W> windows = collectWindows(inputs);
    Map<W, W> windowToMergeResult = mergeWindows(windowingStrategy, windows);
    // Combine all windowed values into a per-window map.
    Map<W, Tuple2<AccumT, Instant>> mapState = new HashMap<>();
    Iterator<WindowedValue<KV<K, InputT>>> iterator = inputs.iterator();
    WindowedValue<KV<K, InputT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    do {
        for (BoundedWindow w : currentValue.getWindows()) {
            @SuppressWarnings("unchecked") W currentWindow = (W) w;
            W mergedWindow = windowToMergeResult.get(currentWindow);
            mergedWindow = mergedWindow == null ? currentWindow : mergedWindow;
            Set<W> singletonW = Collections.singleton(mergedWindow);
            Tuple2<AccumT, Instant> accumAndInstant = mapState.get(mergedWindow);
            if (accumAndInstant == null) {
                AccumT accumT = flinkCombiner.firstInput(key, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
                Instant windowTimestamp = timestampCombiner.assign(mergedWindow, windowFn.getOutputTime(currentValue.getTimestamp(), mergedWindow));
                accumAndInstant = new Tuple2<>(accumT, windowTimestamp);
                mapState.put(mergedWindow, accumAndInstant);
            } else {
                accumAndInstant.f0 = flinkCombiner.addInput(key, accumAndInstant.f0, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
                accumAndInstant.f1 = timestampCombiner.combine(accumAndInstant.f1, timestampCombiner.assign(mergedWindow, windowingStrategy.getWindowFn().getOutputTime(currentValue.getTimestamp(), mergedWindow)));
            }
        }
        if (iterator.hasNext()) {
            currentValue = iterator.next();
        } else {
            break;
        }
    } while (true);
    // Output the final value of combiners
    for (Map.Entry<W, Tuple2<AccumT, Instant>> entry : mapState.entrySet()) {
        AccumT accumulator = entry.getValue().f0;
        Instant windowTimestamp = entry.getValue().f1;
        out.collect(WindowedValue.of(KV.of(key, flinkCombiner.extractOutput(key, accumulator, options, sideInputReader, Collections.singleton(entry.getKey()))), windowTimestamp, entry.getKey(), PaneInfo.NO_FIRING));
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Tuple2(org.apache.flink.api.java.tuple.Tuple2) HashMap(java.util.HashMap) Map(java.util.Map)
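
The method depends on two private helpers, collectWindows and mergeWindows, that are not shown here. A rough sketch of what they could look like, assuming they sit in the same runner class (so K, InputT and W are its type parameters) and relying on the standard WindowFn.mergeWindows contract; the real Flink runner has its own implementation:

// Hypothetical helper sketches; the real HashingFlinkCombineRunner defines its own versions.
private Set<W> collectWindows(Iterable<WindowedValue<KV<K, InputT>>> values) {
    Set<W> windows = new HashSet<>();
    for (WindowedValue<KV<K, InputT>> value : values) {
        for (BoundedWindow untyped : value.getWindows()) {
            @SuppressWarnings("unchecked")
            W window = (W) untyped;
            windows.add(window);
        }
    }
    return windows;
}

private Map<W, W> mergeWindows(WindowingStrategy<Object, W> strategy, Set<W> windows)
        throws Exception {
    final Map<W, W> windowToMergeResult = new HashMap<>();
    WindowFn<Object, W> windowFn = strategy.getWindowFn();
    if (windowFn.isNonMerging()) {
        // Nothing merges; callers fall back to the original window on a null lookup.
        return windowToMergeResult;
    }
    windowFn.mergeWindows(
        windowFn.new MergeContext() {
            @Override
            public Collection<W> windows() {
                return windows;
            }

            @Override
            public void merge(Collection<W> toBeMerged, W mergeResult) {
                for (W window : toBeMerged) {
                    windowToMergeResult.put(window, mergeResult);
                }
            }
        });
    return windowToMergeResult;
}

Pre-computing the window-to-merged-window map is what lets the hashing runner process elements in any order, in contrast to the sorted approaches in the other examples.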

Example 8 with TimestampCombiner

use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.

the class SortingFlinkCombineRunner method combine.

@Override
public void combine(FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner, WindowingStrategy<Object, W> windowingStrategy, SideInputReader sideInputReader, PipelineOptions options, Iterable<WindowedValue<KV<K, InputT>>> elements, Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = (TimestampCombiner) windowingStrategy.getTimestampCombiner();
    WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();
    // Gather all elements so that we can sort them; they have to fit into memory.
    // This seems imprudent, but it is correct for now.
    List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
    for (WindowedValue<KV<K, InputT>> inputValue : elements) {
        for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
            sortedInput.add(exploded);
        }
    }
    Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {

        @Override
        public int compare(WindowedValue<KV<K, InputT>> o1, WindowedValue<KV<K, InputT>> o2) {
            return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp().compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
        }
    });
    if (!windowingStrategy.getWindowFn().isNonMerging()) {
        // merge windows, we have to do it in an extra pre-processing step and
        // can't do it as we go since the window of early elements would not
        // be correct when calling the CombineFn
        mergeWindow(sortedInput);
    }
    // iterate over the elements that are sorted by window timestamp
    final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
    // create accumulator using the first elements key
    WindowedValue<KV<K, InputT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    W currentWindow = (W) Iterables.getOnlyElement(currentValue.getWindows());
    InputT firstValue = currentValue.getValue().getValue();
    AccumT accumulator = flinkCombiner.firstInput(key, firstValue, options, sideInputReader, currentValue.getWindows());
    // we use this to keep track of the timestamps assigned by the TimestampCombiner
    Instant windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(currentValue.getTimestamp(), currentWindow));
    while (iterator.hasNext()) {
        WindowedValue<KV<K, InputT>> nextValue = iterator.next();
        W nextWindow = (W) Iterables.getOnlyElement(nextValue.getWindows());
        if (currentWindow.equals(nextWindow)) {
            // continue accumulating and merge windows
            InputT value = nextValue.getValue().getValue();
            accumulator = flinkCombiner.addInput(key, accumulator, value, options, sideInputReader, currentValue.getWindows());
            windowTimestamp = timestampCombiner.combine(windowTimestamp, timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
        } else {
            // emit the value that we currently have
            out.collect(WindowedValue.of(KV.of(key, flinkCombiner.extractOutput(key, accumulator, options, sideInputReader, currentValue.getWindows())), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
            currentWindow = nextWindow;
            currentValue = nextValue;
            InputT value = nextValue.getValue().getValue();
            accumulator = flinkCombiner.firstInput(key, value, options, sideInputReader, currentValue.getWindows());
            windowTimestamp = timestampCombiner.assign(currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
        }
    }
    // emit the final accumulator
    out.collect(WindowedValue.of(KV.of(key, flinkCombiner.extractOutput(key, accumulator, options, sideInputReader, currentValue.getWindows())), windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
}
Also used : TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue)
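
The call to mergeWindow(sortedInput) refers to a private pre-processing pass that is not shown. A sketch of what it could do, assuming interval windows and that each exploded element carries exactly one window (the real SortingFlinkCombineRunner has its own version):

// Hypothetical sketch: collapse runs of intersecting IntervalWindows in the sorted list
// into their spanning window, rewriting each element in place.
private void mergeWindow(List<WindowedValue<KV<K, InputT>>> sorted) {
    int groupStart = 0;
    IntervalWindow currentWindow =
        (IntervalWindow) Iterables.getOnlyElement(sorted.get(0).getWindows());
    for (int i = 1; i < sorted.size(); i++) {
        IntervalWindow nextWindow =
            (IntervalWindow) Iterables.getOnlyElement(sorted.get(i).getWindows());
        if (currentWindow.intersects(nextWindow)) {
            // Still the same merge group; widen the window to span both.
            currentWindow = currentWindow.span(nextWindow);
        } else {
            // Group ended: retrofit the merged window onto every element of the group.
            for (int j = groupStart; j < i; j++) {
                WindowedValue<KV<K, InputT>> value = sorted.get(j);
                sorted.set(j,
                    WindowedValue.of(
                        value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
            }
            groupStart = i;
            currentWindow = nextWindow;
        }
    }
    // Retrofit the last group.
    for (int j = groupStart; j < sorted.size(); j++) {
        WindowedValue<KV<K, InputT>> value = sorted.get(j);
        sorted.set(j,
            WindowedValue.of(
                value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
    }
}

With session-style windows of equal gap duration, the sort by maxTimestamp keeps each merge group contiguous, which is why a single forward pass is enough.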

Aggregations

TimestampCombiner (org.apache.beam.sdk.transforms.windowing.TimestampCombiner) - 8 usages
Instant (org.joda.time.Instant) - 7 usages
WindowedValue (org.apache.beam.sdk.util.WindowedValue) - 6 usages
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) - 5 usages
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow) - 4 usages
KV (org.apache.beam.sdk.values.KV) - 4 usages
ArrayList (java.util.ArrayList) - 1 usage
HashMap (java.util.HashMap) - 1 usage
Map (java.util.Map) - 1 usage
StateNamespace (org.apache.beam.runners.core.StateNamespace) - 1 usage
StateNamespaceForTest (org.apache.beam.runners.core.StateNamespaceForTest) - 1 usage
SdkFunctionSpec (org.apache.beam.sdk.common.runner.v1.RunnerApi.SdkFunctionSpec) - 1 usage
WatermarkHoldState (org.apache.beam.sdk.state.WatermarkHoldState) - 1 usage
Trigger (org.apache.beam.sdk.transforms.windowing.Trigger) - 1 usage
ClosingBehavior (org.apache.beam.sdk.transforms.windowing.Window.ClosingBehavior) - 1 usage
AccumulationMode (org.apache.beam.sdk.values.WindowingStrategy.AccumulationMode) - 1 usage
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) - 1 usage
Duration (org.joda.time.Duration) - 1 usage
Test (org.junit.Test) - 1 usage