Use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.
The class SparkKeyedCombineFn, method createCombiner.
/**
 * Spark {@code createCombiner} step used by:
 * <p>
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 * </p>
 *
 * <p>The input is exploded into one value per window and passed through {@code sortByWindows}.
 * Consecutive values that share a window (or, for merging window functions, whose windows
 * intersect) are folded into a single accumulator; one output is produced per resulting window.
 *
 * @param wkvi the windowed key/value input to start accumulating from
 * @return one windowed accumulator per resulting window, each with {@code PaneInfo.NO_FIRING}
 */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
  // One value per window, in the order established by sortByWindows.
  Iterable<WindowedValue<KV<K, InputT>>> explodedAndSorted =
      sortByWindows(wkvi.explodeWindows());
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();

  final Iterator<WindowedValue<KV<K, InputT>>> remaining = explodedAndSorted.iterator();
  WindowedValue<KV<K, InputT>> head = remaining.next();
  BoundedWindow activeWindow = Iterables.getFirst(head.getWindows(), null);

  // Seed the accumulator with the first input; its key is reused for every output.
  K key = head.getValue().getKey();
  AccumT running = combineFn.createAccumulator(ctxtForInput(head));
  running = combineFn.addInput(running, head.getValue().getValue(), ctxtForInput(head));

  // Timestamp tracked for the window currently being accumulated.
  Instant headOutputTime = windowFn.getOutputTime(head.getTimestamp(), activeWindow);
  Instant activeTimestamp = timestampCombiner.assign(activeWindow, headOutputTime);

  List<WindowedValue<KV<K, AccumT>>> results = Lists.newArrayList();
  // Merging window functions (e.g. Sessions) fold intersecting windows together.
  final boolean merging = !windowFn.isNonMerging();

  while (remaining.hasNext()) {
    WindowedValue<KV<K, InputT>> candidate = remaining.next();
    BoundedWindow candidateWindow = Iterables.getOnlyElement(candidate.getWindows());
    boolean overlapsWhileMerging =
        merging
            && isIntersecting((IntervalWindow) activeWindow, (IntervalWindow) candidateWindow);

    if (!overlapsWhileMerging && !candidateWindow.equals(activeWindow)) {
      // A different window starts here: flush what we have, then restart from this value.
      results.add(
          WindowedValue.of(
              KV.of(key, running), activeTimestamp, activeWindow, PaneInfo.NO_FIRING));
      running = combineFn.createAccumulator(ctxtForInput(candidate));
      running =
          combineFn.addInput(running, candidate.getValue().getValue(), ctxtForInput(candidate));
      activeWindow = candidateWindow;
      activeTimestamp =
          timestampCombiner.assign(
              activeWindow, windowFn.getOutputTime(candidate.getTimestamp(), activeWindow));
      continue;
    }

    if (overlapsWhileMerging) {
      // NOTE(review): the running timestamp was assigned against the pre-merge window;
      // confirm that is what the configured TimestampCombiner expects.
      activeWindow = merge((IntervalWindow) activeWindow, (IntervalWindow) candidateWindow);
    }

    // Same (or merged) window: keep folding values and timestamps.
    running =
        combineFn.addInput(running, candidate.getValue().getValue(), ctxtForInput(candidate));
    Instant candidateTimestamp =
        timestampCombiner.assign(
            activeWindow, windowFn.getOutputTime(candidate.getTimestamp(), activeWindow));
    activeTimestamp = timestampCombiner.combine(activeTimestamp, candidateTimestamp);
  }

  // Flush the accumulator of the last window.
  results.add(
      WindowedValue.of(KV.of(key, running), activeTimestamp, activeWindow, PaneInfo.NO_FIRING));
  return results;
}
Use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.
The class HashingFlinkCombineRunner, method combine.
/**
 * Combines the given elements per window using a hash map keyed by (merged) window, then emits
 * one combined output per window.
 *
 * <p>The key of the first element is used for every combiner call and every output; presumably
 * the caller has already grouped {@code elements} by key (verify against the call site). Outputs
 * are emitted in {@link HashMap} iteration order. If {@code elements} is empty, nothing is emitted.
 *
 * @param flinkCombiner performs the first-input / add-input / extract-output steps
 * @param windowingStrategy supplies the window function and the timestamp combiner
 * @param sideInputReader forwarded unchanged to {@code flinkCombiner}
 * @param options forwarded unchanged to {@code flinkCombiner}
 * @param elements the values to combine; buffered in memory because it is traversed only once
 * @param out receives one {@code PaneInfo.NO_FIRING} windowed result per window
 * @throws Exception declared by the signature; the helpers and combiner calls may throw
 */
@Override
public void combine(FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner, WindowingStrategy<Object, W> windowingStrategy, SideInputReader sideInputReader, PipelineOptions options, Iterable<WindowedValue<KV<K, InputT>>> elements, Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();

  // Flink Iterable can be iterated over only once, so buffer it first.
  List<WindowedValue<KV<K, InputT>>> inputs = new ArrayList<>();
  Iterables.addAll(inputs, elements);
  if (inputs.isEmpty()) {
    // Nothing to combine and no key to emit under.
    return;
  }

  Set<W> windows = collectWindows(inputs);
  Map<W, W> windowToMergeResult = mergeWindows(windowingStrategy, windows);

  // Per-window state: f0 is the accumulator, f1 the combined output timestamp.
  Map<W, Tuple2<AccumT, Instant>> mapState = new HashMap<>();
  K key = inputs.get(0).getValue().getKey();

  for (WindowedValue<KV<K, InputT>> input : inputs) {
    InputT value = input.getValue().getValue();
    for (BoundedWindow w : input.getWindows()) {
      @SuppressWarnings("unchecked")
      W window = (W) w;
      // Windows without a merge result stand for themselves.
      W mergedWindow = windowToMergeResult.get(window);
      if (mergedWindow == null) {
        mergedWindow = window;
      }
      Set<W> singletonW = Collections.singleton(mergedWindow);

      Tuple2<AccumT, Instant> accumAndInstant = mapState.get(mergedWindow);
      if (accumAndInstant == null) {
        // First value seen for this window: start a fresh accumulator and timestamp.
        AccumT accumT =
            flinkCombiner.firstInput(key, value, options, sideInputReader, singletonW);
        Instant windowTimestamp =
            timestampCombiner.assign(
                mergedWindow, windowFn.getOutputTime(input.getTimestamp(), mergedWindow));
        mapState.put(mergedWindow, new Tuple2<>(accumT, windowTimestamp));
      } else {
        accumAndInstant.f0 =
            flinkCombiner.addInput(
                key, accumAndInstant.f0, value, options, sideInputReader, singletonW);
        accumAndInstant.f1 =
            timestampCombiner.combine(
                accumAndInstant.f1,
                timestampCombiner.assign(
                    mergedWindow, windowFn.getOutputTime(input.getTimestamp(), mergedWindow)));
      }
    }
  }

  // Output the final value of each window's combiner.
  for (Map.Entry<W, Tuple2<AccumT, Instant>> entry : mapState.entrySet()) {
    W window = entry.getKey();
    AccumT accumulator = entry.getValue().f0;
    Instant windowTimestamp = entry.getValue().f1;
    OutputT result =
        flinkCombiner.extractOutput(
            key, accumulator, options, sideInputReader, Collections.singleton(window));
    out.collect(WindowedValue.of(KV.of(key, result), windowTimestamp, window, PaneInfo.NO_FIRING));
  }
}
Use of org.apache.beam.sdk.transforms.windowing.TimestampCombiner in project beam by apache.
The class SortingFlinkCombineRunner, method combine.
/**
 * Combines the given elements per window by exploding them into one value per window, sorting
 * those values by their window's maximum timestamp, and folding each run of equal windows into
 * a single output.
 *
 * <p>For merging window functions, {@code mergeWindow} is applied to the sorted list before any
 * combining starts. The key of the first sorted element is used for every combiner call and every
 * output; presumably the caller has already grouped {@code elements} by key (verify against the
 * call site). If {@code elements} yields no values, nothing is emitted.
 *
 * @param flinkCombiner performs the first-input / add-input / extract-output steps
 * @param windowingStrategy supplies the window function and the timestamp combiner
 * @param sideInputReader forwarded unchanged to {@code flinkCombiner}
 * @param options forwarded unchanged to {@code flinkCombiner}
 * @param elements the values to combine; all of them are buffered in memory for sorting
 * @param out receives one {@code PaneInfo.NO_FIRING} windowed result per distinct window
 * @throws Exception declared by the signature; the helpers and combiner calls may throw
 */
@Override
public void combine(FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner, WindowingStrategy<Object, W> windowingStrategy, SideInputReader sideInputReader, PipelineOptions options, Iterable<WindowedValue<KV<K, InputT>>> elements, Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();

  // Get all elements so that we can sort them; this has to fit into memory.
  // This seems very imprudent, but is correct, for now.
  List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
  for (WindowedValue<KV<K, InputT>> inputValue : elements) {
    Iterables.addAll(sortedInput, inputValue.explodeWindows());
  }
  if (sortedInput.isEmpty()) {
    // Nothing to combine and no key to emit under.
    return;
  }

  Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
    @Override
    public int compare(WindowedValue<KV<K, InputT>> o1, WindowedValue<KV<K, InputT>> o2) {
      return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
          .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
    }
  });

  if (!windowFn.isNonMerging()) {
    // Merge windows in an extra pre-processing step: it can't be done as we go, since the
    // window of early elements would not be correct when calling the CombineFn.
    mergeWindow(sortedInput);
  }

  // Iterate over the elements, which are sorted by window timestamp.
  final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();

  // Create the accumulator using the first element's key.
  WindowedValue<KV<K, InputT>> currentValue = iterator.next();
  K key = currentValue.getValue().getKey();
  @SuppressWarnings("unchecked")
  W currentWindow = (W) Iterables.getOnlyElement(currentValue.getWindows());
  AccumT accumulator =
      flinkCombiner.firstInput(
          key, currentValue.getValue().getValue(), options, sideInputReader,
          currentValue.getWindows());

  // Keeps track of the timestamps assigned by the TimestampCombiner.
  Instant windowTimestamp =
      timestampCombiner.assign(
          currentWindow, windowFn.getOutputTime(currentValue.getTimestamp(), currentWindow));

  while (iterator.hasNext()) {
    WindowedValue<KV<K, InputT>> nextValue = iterator.next();
    @SuppressWarnings("unchecked")
    W nextWindow = (W) Iterables.getOnlyElement(nextValue.getWindows());
    InputT value = nextValue.getValue().getValue();

    if (currentWindow.equals(nextWindow)) {
      // Same window: continue accumulating and combine the timestamps.
      accumulator =
          flinkCombiner.addInput(
              key, accumulator, value, options, sideInputReader, currentValue.getWindows());
      windowTimestamp =
          timestampCombiner.combine(
              windowTimestamp,
              timestampCombiner.assign(
                  currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow)));
    } else {
      // New window: emit the value that we currently have, then restart from this element.
      out.collect(
          WindowedValue.of(
              KV.of(
                  key,
                  flinkCombiner.extractOutput(
                      key, accumulator, options, sideInputReader, currentValue.getWindows())),
              windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
      currentWindow = nextWindow;
      currentValue = nextValue;
      accumulator =
          flinkCombiner.firstInput(
              key, value, options, sideInputReader, currentValue.getWindows());
      windowTimestamp =
          timestampCombiner.assign(
              currentWindow, windowFn.getOutputTime(nextValue.getTimestamp(), currentWindow));
    }
  }

  // Emit the final accumulator.
  out.collect(
      WindowedValue.of(
          KV.of(
              key,
              flinkCombiner.extractOutput(
                  key, accumulator, options, sideInputReader, currentValue.getWindows())),
          windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
}
Aggregations