use of org.apache.beam.sdk.transforms.windowing.IntervalWindow in project beam by apache.
the class DoFnOperatorTest method testStateGCForStatefulFn.
/**
 * Checks that keyed state written by a stateful {@link DoFn} is garbage collected once the
 * watermark advances past the window's GC time, and that the user's event-time timer still fires
 * (and can read the state) before that happens.
 */
@Test
public void testStateGCForStatefulFn() throws Exception {
  final String timerId = "boo";
  final String stateId = "dazzle";
  final int offset = 5000;
  final int timerOutput = 4093;

  WindowingStrategy<Object, IntervalWindow> windowingStrategy =
      WindowingStrategy.of(FixedWindows.of(new Duration(10)))
          .withAllowedLateness(Duration.ZERO);

  // Per element: arm a timer at the end of the window, remember the key in state and emit
  // the value shifted by `offset`. When the timer fires: emit (stored key, timerOutput).
  DoFn<KV<String, Integer>, KV<String, Integer>> statefulFn =
      new DoFn<KV<String, Integer>, KV<String, Integer>>() {

        @TimerId(timerId)
        private final TimerSpec spec = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @StateId(stateId)
        private final StateSpec<ValueState<String>> stateSpec =
            StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void processElement(
            ProcessContext c,
            @TimerId(timerId) Timer endOfWindowTimer,
            @StateId(stateId) ValueState<String> keyState,
            BoundedWindow w) {
          endOfWindowTimer.set(w.maxTimestamp());
          KV<String, Integer> element = c.element();
          keyState.write(element.getKey());
          c.output(KV.of(element.getKey(), element.getValue() + offset));
        }

        @OnTimer(timerId)
        public void onTimer(OnTimerContext c, @StateId(stateId) ValueState<String> keyState) {
          c.output(KV.of(keyState.read(), timerOutput));
        }
      };

  WindowedValue.FullWindowedValueCoder<KV<String, Integer>> inputCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()),
          windowingStrategy.getWindowFn().windowCoder());

  TupleTag<KV<String, Integer>> mainOutput = new TupleTag<>("main-output");

  DoFnOperator<KV<String, Integer>, KV<String, Integer>, WindowedValue<KV<String, Integer>>>
      operator =
          new DoFnOperator<>(
              statefulFn,
              "stepName",
              inputCoder,
              mainOutput,
              Collections.<TupleTag<?>>emptyList(),
              new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<String, Integer>>>(),
              windowingStrategy,
              new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
              Collections.<PCollectionView<?>>emptyList(), /* side inputs */
              PipelineOptionsFactory.as(FlinkPipelineOptions.class),
              StringUtf8Coder.of());

  // The harness partitions keyed state by the KV key of each incoming element.
  KeySelector<WindowedValue<KV<String, Integer>>, String> byKvKey =
      new KeySelector<WindowedValue<KV<String, Integer>>, String>() {
        @Override
        public String getKey(WindowedValue<KV<String, Integer>> value) throws Exception {
          return value.getValue().getKey();
        }
      };

  KeyedOneInputStreamOperatorTestHarness<
          String, WindowedValue<KV<String, Integer>>, WindowedValue<KV<String, Integer>>>
      harness =
          new KeyedOneInputStreamOperatorTestHarness<>(
              operator, byKvKey, new CoderTypeInformation<>(StringUtf8Coder.of()));

  harness.open();
  harness.processWatermark(0);

  // nothing has been processed yet, so no keyed state may exist
  assertEquals(0, harness.numKeyedStateEntries());

  IntervalWindow window = new IntervalWindow(new Instant(0), Duration.millis(10));

  harness.processElement(
      new StreamRecord<>(
          WindowedValue.of(KV.of("key1", 5), new Instant(1), window, PaneInfo.NO_FIRING)));
  harness.processElement(
      new StreamRecord<>(
          WindowedValue.of(KV.of("key2", 7), new Instant(3), window, PaneInfo.NO_FIRING)));

  assertThat(
      this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(harness.getOutput()),
      contains(
          WindowedValue.of(KV.of("key1", 5 + offset), new Instant(1), window, PaneInfo.NO_FIRING),
          WindowedValue.of(
              KV.of("key2", 7 + offset), new Instant(3), window, PaneInfo.NO_FIRING)));

  // one state entry per key
  assertEquals(2, harness.numKeyedStateEntries());

  harness.getOutput().clear();

  // Advancing the watermark this far should trigger both the window.maxTimestamp() user timer
  // and the GC timer; the assertions below verify the GC timer fires after the user timer.
  long gcWatermark =
      window
          .maxTimestamp()
          .plus(windowingStrategy.getAllowedLateness())
          .plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS)
          .getMillis();
  harness.processWatermark(gcWatermark);

  assertThat(
      this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(harness.getOutput()),
      contains(
          WindowedValue.of(KV.of("key1", timerOutput), new Instant(9), window, PaneInfo.NO_FIRING),
          WindowedValue.of(
              KV.of("key2", timerOutput), new Instant(9), window, PaneInfo.NO_FIRING)));

  // ensure the state was garbage collected
  assertEquals(0, harness.numKeyedStateEntries());

  harness.close();
}
use of org.apache.beam.sdk.transforms.windowing.IntervalWindow in project beam by apache.
the class SparkGlobalCombineFn method createAccumulator.
/**
 * Builds the initial windowed accumulators for a single input element.
 *
 * <p>The input is exploded into one value per window and ordered with {@code sortByWindows}.
 * Consecutive values that share a window - or, when the window function is merging, whose
 * windows intersect - are folded into the same accumulator; every other window change closes
 * the running accumulator and starts a new one.
 *
 * @param input the windowed input element
 * @return the accumulators, one per resulting window, in the order the windows were visited
 */
private Iterable<WindowedValue<AccumT>> createAccumulator(WindowedValue<InputT> input) {
  final Iterable<WindowedValue<InputT>> byWindow = sortByWindows(input.explodeWindows());
  final TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  final WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  // merging window functions (e.g. Sessions) require intersecting windows to be merged.
  final boolean merging = !windowFn.isNonMerging();

  final Iterator<WindowedValue<InputT>> remaining = byWindow.iterator();

  // seed the running window, accumulator and timestamp from the first value.
  final WindowedValue<InputT> first = remaining.next();
  BoundedWindow window = Iterables.getFirst(first.getWindows(), null);
  AccumT acc = combineFn.createAccumulator(ctxtForInput(first));
  acc = combineFn.addInput(acc, first.getValue(), ctxtForInput(first));
  Instant timestamp =
      timestampCombiner.assign(window, windowFn.getOutputTime(first.getTimestamp(), window));

  final List<WindowedValue<AccumT>> result = Lists.newArrayList();

  while (remaining.hasNext()) {
    final WindowedValue<InputT> next = remaining.next();
    final BoundedWindow nextWindow = Iterables.getOnlyElement(next.getWindows());
    // the IntervalWindow casts are only evaluated for merging window functions.
    final boolean overlaps =
        merging && isIntersecting((IntervalWindow) window, (IntervalWindow) nextWindow);

    if (!overlaps && !nextWindow.equals(window)) {
      // a different window starts: emit what was accumulated and start over from this value.
      result.add(WindowedValue.of(acc, timestamp, window, PaneInfo.NO_FIRING));
      acc = combineFn.createAccumulator(ctxtForInput(next));
      acc = combineFn.addInput(acc, next.getValue(), ctxtForInput(next));
      window = nextWindow;
      timestamp =
          timestampCombiner.assign(window, windowFn.getOutputTime(next.getTimestamp(), window));
      continue;
    }

    if (overlaps) {
      // grow the running window so that it covers the intersecting one.
      window = merge((IntervalWindow) window, (IntervalWindow) nextWindow);
    }
    // same (possibly merged) window: keep accumulating.
    acc = combineFn.addInput(acc, next.getValue(), ctxtForInput(next));
    timestamp =
        timestampCombiner.merge(
            window, timestamp, windowFn.getOutputTime(next.getTimestamp(), window));
  }

  // emit the accumulator that was still open when the input ran out.
  result.add(WindowedValue.of(acc, timestamp, window, PaneInfo.NO_FIRING));
  return result;
}
use of org.apache.beam.sdk.transforms.windowing.IntervalWindow in project beam by apache.
the class SparkGlobalCombineFn method combOp.
/**
 * Implements Spark's combOp function in:
 * <p>
 * {@link org.apache.spark.api.java.JavaRDD#aggregate}.
 * </p>
 *
 * <p>Both inputs are concatenated and ordered with {@code sortByWindows}. Accumulators that
 * share a window - or, for merging window functions, whose windows intersect - are merged into
 * a single accumulator by the combine function.
 *
 * @param a1 first group of windowed accumulators
 * @param a2 second group of windowed accumulators
 * @return one merged accumulator per resulting window; empty when both inputs are empty
 */
Iterable<WindowedValue<AccumT>> combOp(Iterable<WindowedValue<AccumT>> a1, Iterable<WindowedValue<AccumT>> a2) {
  final Iterable<WindowedValue<AccumT>> all = Iterables.concat(a1, a2);
  final List<WindowedValue<AccumT>> merged = Lists.newArrayList();
  // nothing to combine: hand back an empty result.
  if (!all.iterator().hasNext()) {
    return merged;
  }

  // no need to explode windows here, the inputs were already exploded.
  final Iterable<WindowedValue<AccumT>> byWindow = sortByWindows(all);
  final TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  // merging window functions (e.g. Sessions) require intersecting windows to be merged.
  final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();

  final Iterator<WindowedValue<AccumT>> remaining = byWindow.iterator();

  // the first accumulator opens the first window.
  final WindowedValue<AccumT> head = remaining.next();
  BoundedWindow window = Iterables.getFirst(head.getWindows(), null);
  final List<AccumT> pendingAccums = Lists.newArrayList();
  pendingAccums.add(head.getValue());
  // timestamps were already combined per element upstream; here they are only merged.
  final List<Instant> pendingTimestamps = Lists.newArrayList();
  pendingTimestamps.add(head.getTimestamp());

  while (remaining.hasNext()) {
    final WindowedValue<AccumT> next = remaining.next();
    final BoundedWindow nextWindow = Iterables.getOnlyElement(next.getWindows());
    // the IntervalWindow casts are only evaluated for merging window functions.
    final boolean overlaps =
        merging && isIntersecting((IntervalWindow) window, (IntervalWindow) nextWindow);

    if (overlaps) {
      // grow the running window so that it covers the intersecting one.
      window = merge((IntervalWindow) window, (IntervalWindow) nextWindow);
    } else if (!nextWindow.equals(window)) {
      // a different window starts: emit the finished one and reset the pending state.
      merged.add(
          mergeWindowAccumulators(timestampCombiner, window, pendingTimestamps, pendingAccums));
      pendingAccums.clear();
      pendingTimestamps.clear();
      window = nextWindow;
    }
    // in every case the current value belongs to the (possibly new) running window.
    pendingAccums.add(next.getValue());
    pendingTimestamps.add(next.getTimestamp());
  }

  // emit the window that was still open when the input ran out.
  merged.add(mergeWindowAccumulators(timestampCombiner, window, pendingTimestamps, pendingAccums));
  return merged;
}

/**
 * Merges the accumulators collected for one window into a single windowed accumulator whose
 * timestamp is the merge of the collected timestamps.
 */
private WindowedValue<AccumT> mergeWindowAccumulators(TimestampCombiner timestampCombiner, BoundedWindow window, List<Instant> timestamps, List<AccumT> accumulators) {
  final Instant mergedTimestamp = timestampCombiner.merge(window, timestamps);
  final Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(accumulators);
  // wrap the raw accumulators so the combine context is built for the right window.
  final WindowedValue<Iterable<AccumT>> preMerge =
      WindowedValue.of(accumsToMerge, mergedTimestamp, window, PaneInfo.NO_FIRING);
  final AccumT accumulated = combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMerge));
  return preMerge.withValue(accumulated);
}
use of org.apache.beam.sdk.transforms.windowing.IntervalWindow in project beam by apache.
the class SparkKeyedCombineFn method mergeCombiners.
/**
 * Implements Spark's mergeCombiners function in:
 * <p>
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 * </p>
 *
 * <p>Both inputs are concatenated and ordered with {@code sortByWindows}. Accumulators that
 * share a window - or, for merging window functions, whose windows intersect - are merged into
 * a single accumulator by the combine function. The key of the first accumulator is used for
 * every output value (presumably all inputs share one key, as they come from combineByKey -
 * confirm against the caller).
 *
 * @param a1 first group of windowed, keyed accumulators
 * @param a2 second group of windowed, keyed accumulators
 * @return one merged accumulator per resulting window; empty when both inputs are empty
 */
Iterable<WindowedValue<KV<K, AccumT>>> mergeCombiners(Iterable<WindowedValue<KV<K, AccumT>>> a1, Iterable<WindowedValue<KV<K, AccumT>>> a2) {
  // concatenate accumulators.
  Iterable<WindowedValue<KV<K, AccumT>>> accumulators = Iterables.concat(a1, a2);
  // accumulate the next windows, or output.
  List<WindowedValue<KV<K, AccumT>>> output = Lists.newArrayList();
  // if empty, return an empty accumulators iterable instead of failing on iterator.next().
  if (!accumulators.iterator().hasNext()) {
    return output;
  }
  // sort accumulators, no need to explode since inputs were exploded.
  Iterable<WindowedValue<KV<K, AccumT>>> sortedAccumulators = sortByWindows(accumulators);
  TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  //--- accumulators iterator, by window order.
  final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedAccumulators.iterator();
  // get the first accumulator and assign it to the current window's accumulators.
  WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
  K key = currentValue.getValue().getKey();
  BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
  List<AccumT> currentWindowAccumulators = Lists.newArrayList();
  currentWindowAccumulators.add(currentValue.getValue().getValue());
  // keep track of the timestamps assigned by the TimestampCombiner,
  // in createCombiner we already merge the timestamps assigned
  // to individual elements, here we will just merge them.
  List<Instant> windowTimestamps = Lists.newArrayList();
  windowTimestamps.add(currentValue.getTimestamp());
  // if merging, merge overlapping windows, e.g. Sessions.
  final boolean merging = !windowingStrategy.getWindowFn().isNonMerging();
  while (iterator.hasNext()) {
    WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
    BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
    // the IntervalWindow casts are only evaluated for merging window functions.
    boolean mergingAndIntersecting =
        merging && isIntersecting((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
    if (mergingAndIntersecting || nextWindow.equals(currentWindow)) {
      if (mergingAndIntersecting) {
        // merge intersecting windows.
        currentWindow = merge((IntervalWindow) currentWindow, (IntervalWindow) nextWindow);
      }
      // add to window accumulators.
      currentWindowAccumulators.add(nextValue.getValue().getValue());
      windowTimestamps.add(nextValue.getTimestamp());
    } else {
      // before moving to the next window,
      // add the current accumulation to the output and initialize the accumulation.
      // merge the timestamps of all accumulators to merge.
      Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
      // merge accumulators.
      // transforming a KV<K, Iterable<AccumT>> into a KV<K, AccumT>
      // for the (possibly merged) window.
      Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
      WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue =
          WindowedValue.of(
              KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
      // applying the actual combiner onto the accumulators.
      AccumT accumulated =
          combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
      WindowedValue<KV<K, AccumT>> postMergeWindowedValue =
          preMergeWindowedValue.withValue(KV.of(key, accumulated));
      // emit the accumulated output.
      output.add(postMergeWindowedValue);
      // re-init accumulator, window and timestamps.
      currentWindowAccumulators.clear();
      currentWindowAccumulators.add(nextValue.getValue().getValue());
      currentWindow = nextWindow;
      windowTimestamps.clear();
      windowTimestamps.add(nextValue.getTimestamp());
    }
  }
  // merge the last chunk of accumulators.
  Instant mergedTimestamp = timestampCombiner.merge(currentWindow, windowTimestamps);
  Iterable<AccumT> accumsToMerge = Iterables.unmodifiableIterable(currentWindowAccumulators);
  WindowedValue<KV<K, Iterable<AccumT>>> preMergeWindowedValue =
      WindowedValue.of(
          KV.of(key, accumsToMerge), mergedTimestamp, currentWindow, PaneInfo.NO_FIRING);
  AccumT accumulated =
      combineFn.mergeAccumulators(accumsToMerge, ctxtForInput(preMergeWindowedValue));
  WindowedValue<KV<K, AccumT>> postMergeWindowedValue =
      preMergeWindowedValue.withValue(KV.of(key, accumulated));
  output.add(postMergeWindowedValue);
  return output;
}
use of org.apache.beam.sdk.transforms.windowing.IntervalWindow in project beam by apache.
the class SparkKeyedCombineFn method createCombiner.
/**
 * Implements Spark's createCombiner function in:
 * <p>
 * {@link org.apache.spark.rdd.PairRDDFunctions#combineByKey}.
 * </p>
 *
 * <p>The input is exploded into one value per window and ordered with {@code sortByWindows}.
 * Consecutive values that share a window - or, when the window function is merging, whose
 * windows intersect - are folded into the same accumulator; every other window change closes
 * the running accumulator and starts a new one. All outputs carry the key of the input.
 *
 * @param wkvi the windowed key/value input element
 * @return the keyed accumulators, one per resulting window, in the order windows were visited
 */
Iterable<WindowedValue<KV<K, AccumT>>> createCombiner(WindowedValue<KV<K, InputT>> wkvi) {
  final Iterable<WindowedValue<KV<K, InputT>>> byWindow = sortByWindows(wkvi.explodeWindows());
  final TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
  final WindowFn<?, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  // merging window functions (e.g. Sessions) require intersecting windows to be merged.
  final boolean merging = !windowFn.isNonMerging();

  final Iterator<WindowedValue<KV<K, InputT>>> remaining = byWindow.iterator();

  // seed the running window, key, accumulator and timestamp from the first value.
  final WindowedValue<KV<K, InputT>> first = remaining.next();
  BoundedWindow window = Iterables.getFirst(first.getWindows(), null);
  final K key = first.getValue().getKey();
  AccumT acc = combineFn.createAccumulator(ctxtForInput(first));
  acc = combineFn.addInput(acc, first.getValue().getValue(), ctxtForInput(first));
  Instant timestamp =
      timestampCombiner.assign(window, windowFn.getOutputTime(first.getTimestamp(), window));

  final List<WindowedValue<KV<K, AccumT>>> result = Lists.newArrayList();

  while (remaining.hasNext()) {
    final WindowedValue<KV<K, InputT>> next = remaining.next();
    final BoundedWindow nextWindow = Iterables.getOnlyElement(next.getWindows());
    // the IntervalWindow casts are only evaluated for merging window functions.
    final boolean overlaps =
        merging && isIntersecting((IntervalWindow) window, (IntervalWindow) nextWindow);

    if (!overlaps && !nextWindow.equals(window)) {
      // a different window starts: emit what was accumulated and start over from this value.
      result.add(WindowedValue.of(KV.of(key, acc), timestamp, window, PaneInfo.NO_FIRING));
      acc = combineFn.createAccumulator(ctxtForInput(next));
      acc = combineFn.addInput(acc, next.getValue().getValue(), ctxtForInput(next));
      window = nextWindow;
      timestamp =
          timestampCombiner.assign(window, windowFn.getOutputTime(next.getTimestamp(), window));
      continue;
    }

    if (overlaps) {
      // grow the running window so that it covers the intersecting one.
      window = merge((IntervalWindow) window, (IntervalWindow) nextWindow);
    }
    // same (possibly merged) window: keep accumulating.
    acc = combineFn.addInput(acc, next.getValue().getValue(), ctxtForInput(next));
    final Instant assigned =
        timestampCombiner.assign(window, windowFn.getOutputTime(next.getTimestamp(), window));
    timestamp = timestampCombiner.combine(timestamp, assigned);
  }

  // emit the accumulator that was still open when the input ran out.
  result.add(WindowedValue.of(KV.of(key, acc), timestamp, window, PaneInfo.NO_FIRING));
  return result;
}
Aggregations