Use of org.apache.beam.runners.core.StateInternals in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce.
@Override
public void reduce(
    Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out)
    throws Exception {
  RuntimeContext runtimeContext = getRuntimeContext();
  DoFnRunners.OutputManager outputManager;
  if (outputMap.size() == 1) {
    outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
  } else {
    // It has some additional outputs.
    outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
  }
  final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
  // Get the first value; we need this for initializing the state internals with the key.
  // We are guaranteed to have a first value, otherwise reduce() would not have been called.
  WindowedValue<KV<K, V>> currentValue = iterator.next();
  final K key = currentValue.getValue().getKey();
  final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and advance
  // time to the end after processing all elements.
  final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
  DoFnRunner<KV<K, V>, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          serializedOptions.getPipelineOptions(),
          dofn,
          new FlinkSideInputReader(sideInputs, runtimeContext),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          new FlinkNoOpStepContext() {
            @Override
            public StateInternals stateInternals() {
              return stateInternals;
            }

            @Override
            public TimerInternals timerInternals() {
              return timerInternals;
            }
          },
          windowingStrategy);
  if (serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class).getEnableMetrics()) {
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
  }
  doFnRunner.startBundle();
  doFnRunner.processElement(currentValue);
  while (iterator.hasNext()) {
    currentValue = iterator.next();
    doFnRunner.processElement(currentValue);
  }
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(timerInternals, doFnRunner);
  doFnRunner.finishBundle();
}
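The snippet ends by calling a fireEligibleTimers helper that is not shown. A minimal sketch of what such a helper could look like, written as a member of the same class (so it reuses the class's K, V, OutputT type parameters and imports) and assuming the Beam 2.x InMemoryTimerInternals removal API plus the older four-argument DoFnRunner.onTimer; the actual method in FlinkStatefulDoFnFunction may differ in detail:

// Sketch only: drain timers made eligible by the watermark/clock advances above.
private void fireEligibleTimers(
    InMemoryTimerInternals timerInternals, DoFnRunner<KV<K, V>, OutputT> runner)
    throws Exception {
  boolean hasFired;
  do {
    hasFired = false;
    TimerInternals.TimerData timer;
    // Event-time timers now behind the input watermark (advanced to infinity above).
    while ((timer = timerInternals.removeNextEventTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner);
    }
    // Processing-time timers now behind the advanced processing-time clock.
    while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner);
    }
    while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner);
    }
    // Firing a timer may set further timers, so loop until a full pass fires nothing.
  } while (hasFired);
}

private void fireTimer(
    TimerInternals.TimerData timer, DoFnRunner<KV<K, V>, OutputT> runner) {
  // Assumes timers were set in a window namespace, as stateful DoFns do.
  BoundedWindow window =
      ((StateNamespaces.WindowNamespace<?>) timer.getNamespace()).getWindow();
  runner.onTimer(timer.getTimerId(), window, timer.getTimestamp(), timer.getDomain());
}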
Use of org.apache.beam.runners.core.StateInternals in project beam by apache.
The class SparkGroupAlsoByWindowViaOutputBufferFn, method call.
@Override
public Iterable<WindowedValue<KV<K, Iterable<InputT>>>> call(
    WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>> windowedValue) throws Exception {
  K key = windowedValue.getValue().getKey();
  Iterable<WindowedValue<InputT>> values = windowedValue.getValue().getValue();
  // ------ based on GroupAlsoByWindowsViaOutputBufferDoFn ------ //
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and emulate the
  // watermark, knowing that we have all data and it is in timestamp order.
  InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  StateInternals stateInternals = stateInternalsFactory.stateInternalsForKey(key);
  GABWOutputWindowedValue<K, InputT> outputter = new GABWOutputWindowedValue<>();
  ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner =
      new ReduceFnRunner<>(
          key,
          windowingStrategy,
          ExecutableTriggerStateMachine.create(
              TriggerStateMachines.stateMachineForTrigger(
                  TriggerTranslation.toProto(windowingStrategy.getTrigger()))),
          stateInternals,
          timerInternals,
          outputter,
          new UnsupportedSideInputReader("GroupAlsoByWindow"),
          reduceFn,
          runtimeContext.getPipelineOptions());
  // Process the grouped values.
  reduceFnRunner.processElements(values);
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(timerInternals, reduceFnRunner);
  reduceFnRunner.persist();
  return outputter.getOutputs();
}
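In the GroupAlsoByWindow variants, fireEligibleTimers feeds a ReduceFnRunner rather than a DoFnRunner. ReduceFnRunner accepts a batch of timers at once through onTimers(Iterable&lt;TimerData&gt;), so a plausible sketch (again assuming the InMemoryTimerInternals removal API; the real helper may differ) collects the eligible timers per pass:

// Sketch only: batch up eligible timers and hand them to the ReduceFnRunner.
private void fireEligibleTimers(
    InMemoryTimerInternals timerInternals,
    ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner)
    throws Exception {
  List<TimerInternals.TimerData> timers = new ArrayList<>();
  while (true) {
    TimerInternals.TimerData timer;
    while ((timer = timerInternals.removeNextEventTimer()) != null) {
      timers.add(timer);
    }
    while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
      timers.add(timer);
    }
    while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
      timers.add(timer);
    }
    if (timers.isEmpty()) {
      return; // A full pass produced no eligible timers; we are done.
    }
    // onTimers may set new timers (e.g. for window garbage collection), hence the outer loop.
    reduceFnRunner.onTimers(timers);
    timers.clear();
  }
}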
Use of org.apache.beam.runners.core.StateInternals in project beam by apache.
The class GroupAlsoByWindowViaOutputBufferFn, method call.
@Override
public Iterator<WindowedValue<KV<K, Iterable<InputT>>>> call(
    K key, Iterator<WindowedValue<KV<K, InputT>>> iterator) throws Exception {
  // We have to materialize the Iterator because ReduceFnRunner.processElements expects
  // to have all elements available in order to merge windows between each other.
  // Possible OOM, even though the Spark framework spills to disk if a given group is too
  // large to fit in memory.
  ArrayList<WindowedValue<InputT>> values = new ArrayList<>();
  while (iterator.hasNext()) {
    WindowedValue<KV<K, InputT>> wv = iterator.next();
    values.add(wv.withValue(wv.getValue().getValue()));
  }
  // ------ based on GroupAlsoByWindowsViaOutputBufferDoFn ------ //
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and emulate the
  // watermark, knowing that we have all data and it is in timestamp order.
  InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  StateInternals stateInternals = stateInternalsFactory.stateInternalsForKey(key);
  GABWOutputWindowedValue<K, InputT> outputter = new GABWOutputWindowedValue<>();
  ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner =
      new ReduceFnRunner<>(
          key,
          windowingStrategy,
          ExecutableTriggerStateMachine.create(
              TriggerStateMachines.stateMachineForTrigger(
                  TriggerTranslation.toProto(windowingStrategy.getTrigger()))),
          stateInternals,
          timerInternals,
          outputter,
          new UnsupportedSideInputReader("GroupAlsoByWindow"),
          reduceFn,
          options.get());
  // Process the grouped values.
  reduceFnRunner.processElements(values);
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(timerInternals, reduceFnRunner);
  reduceFnRunner.persist();
  return outputter.getOutputs().iterator();
}
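All three GroupAlsoByWindow variants buffer their results through a GABWOutputWindowedValue whose definition is not shown. Judging from how it is used (getOutputs() after persist()), it is an OutputWindowedValue implementation that collects into a list. A minimal sketch along those lines, with the class name taken from the snippets; the actual class in the Spark runner may differ:

// Sketch only: buffers the windowed grouping results so call() can return them.
static class GABWOutputWindowedValue<K, V>
    implements OutputWindowedValue<KV<K, Iterable<V>>> {
  private final List<WindowedValue<KV<K, Iterable<V>>>> outputs = new ArrayList<>();

  @Override
  public void outputWindowedValue(
      KV<K, Iterable<V>> output,
      Instant timestamp,
      Collection<? extends BoundedWindow> windows,
      PaneInfo pane) {
    outputs.add(WindowedValue.of(output, timestamp, windows, pane));
  }

  @Override
  public <AdditionalOutputT> void outputWindowedValue(
      TupleTag<AdditionalOutputT> tag,
      AdditionalOutputT output,
      Instant timestamp,
      Collection<? extends BoundedWindow> windows,
      PaneInfo pane) {
    // GroupAlsoByWindow produces a single main output, so side outputs are not expected.
    throw new UnsupportedOperationException("GroupAlsoByWindow should not use side outputs.");
  }

  Iterable<WindowedValue<KV<K, Iterable<V>>>> getOutputs() {
    return outputs;
  }
}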
Use of org.apache.beam.runners.core.StateInternals in project beam by apache.
The class SparkGroupAlsoByWindowViaOutputBufferFn, method call.
@Override
public Iterator<WindowedValue<KV<K, Iterable<InputT>>>> call(
    KV<K, Iterable<WindowedValue<InputT>>> kv) throws Exception {
  K key = kv.getKey();
  Iterable<WindowedValue<InputT>> values = kv.getValue();
  // ------ based on GroupAlsoByWindowsViaOutputBufferDoFn ------ //
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and emulate the
  // watermark, knowing that we have all data and it is in timestamp order.
  InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  StateInternals stateInternals = stateInternalsFactory.stateInternalsForKey(key);
  GABWOutputWindowedValue<K, InputT> outputter = new GABWOutputWindowedValue<>();
  ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner =
      new ReduceFnRunner<>(
          key,
          windowingStrategy,
          ExecutableTriggerStateMachine.create(
              TriggerStateMachines.stateMachineForTrigger(
                  TriggerTranslation.toProto(windowingStrategy.getTrigger()))),
          stateInternals,
          timerInternals,
          outputter,
          new UnsupportedSideInputReader("GroupAlsoByWindow"),
          reduceFn,
          options.get());
  // Process the grouped values.
  reduceFnRunner.processElements(values);
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(timerInternals, reduceFnRunner);
  reduceFnRunner.persist();
  return outputter.getOutputs().iterator();
}
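The stateInternalsFactory used by the Spark variants abstracts where per-key state lives. For a batch pipeline, where each key is visited exactly once, one plausible implementation simply wraps InMemoryStateInternals.forKey, mirroring what the Flink snippets do inline. The class name below is hypothetical, not taken from the Beam codebase:

// Hypothetical factory: per-key in-memory state, sufficient for single-pass batch execution.
static class InMemoryStateInternalsFactory<K>
    implements StateInternalsFactory<K>, Serializable {
  @Override
  public StateInternals stateInternalsForKey(K key) {
    return InMemoryStateInternals.forKey(key);
  }
}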
Use of org.apache.beam.runners.core.StateInternals in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce.
@Override
public void reduce(
    Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<RawUnionValue>> out)
    throws Exception {
  RuntimeContext runtimeContext = getRuntimeContext();
  DoFnRunners.OutputManager outputManager;
  if (outputMap.size() == 1) {
    outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
  } else {
    // It has some additional outputs.
    outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager(out, outputMap);
  }
  final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
  // Get the first value; we need this for initializing the state internals with the key.
  // We are guaranteed to have a first value, otherwise reduce() would not have been called.
  WindowedValue<KV<K, V>> currentValue = iterator.next();
  final K key = currentValue.getValue().getKey();
  final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and advance
  // time to the end after processing all elements.
  final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  final Set<BoundedWindow> windowsSeen = new HashSet<>();
  List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
  DoFnRunner<KV<K, V>, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          serializedOptions.get(),
          dofn,
          new FlinkSideInputReader(sideInputs, runtimeContext),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          new FlinkNoOpStepContext() {
            @Override
            public StateInternals stateInternals() {
              return stateInternals;
            }

            @Override
            public TimerInternals timerInternals() {
              return timerInternals;
            }
          },
          inputCoder,
          outputCoderMap,
          windowingStrategy,
          doFnSchemaInformation,
          sideInputMapping);
  FlinkPipelineOptions pipelineOptions = serializedOptions.get().as(FlinkPipelineOptions.class);
  if (!pipelineOptions.getDisableMetrics()) {
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, metricContainer);
  }
  doFnRunner.startBundle();
  doFnRunner.processElement(currentValue);
  if (usesOnWindowExpiration) {
    windowsSeen.addAll(currentValue.getWindows());
  }
  while (iterator.hasNext()) {
    currentValue = iterator.next();
    if (usesOnWindowExpiration) {
      windowsSeen.addAll(currentValue.getWindows());
    }
    doFnRunner.processElement(currentValue);
  }
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(key, timerInternals, doFnRunner);
  if (usesOnWindowExpiration) {
    for (BoundedWindow window : windowsSeen) {
      doFnRunner.onWindowExpiration(
          window, window.maxTimestamp().minus(Duration.millis(1)), key);
    }
  }
  doFnRunner.finishBundle();
}
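Unlike the first Flink snippet, this newer variant passes the key into fireEligibleTimers, because later DoFnRunner.onTimer overloads also take the key, a timer family id, and an output timestamp. A sketch of the corresponding per-timer dispatch under that assumption; the exact onTimer signature varies across Beam releases, so treat this as illustrative:

// Sketch only: dispatch a single timer on a recent DoFnRunner.onTimer signature.
private void fireTimer(
    K key, TimerInternals.TimerData timer, DoFnRunner<KV<K, V>, OutputT> runner) {
  // Assumes timers were set in a window namespace, as stateful DoFns do.
  BoundedWindow window =
      ((StateNamespaces.WindowNamespace<?>) timer.getNamespace()).getWindow();
  runner.onTimer(
      timer.getTimerId(),
      timer.getTimerFamilyId(),
      key,
      window,
      timer.getTimestamp(),
      timer.getOutputTimestamp(),
      timer.getDomain());
}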