Use of org.apache.beam.runners.core.InMemoryStateInternals in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce.
@Override
public void reduce(
    Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out)
    throws Exception {
  RuntimeContext runtimeContext = getRuntimeContext();
  DoFnRunners.OutputManager outputManager;
  if (outputMap.size() == 1) {
    outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
  } else {
    // it has some additional outputs
    outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
  }
  final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
  // Get the first value; we need this for initializing the state internals with the key.
  // We are guaranteed to have a first value, otherwise reduce() would not have been called.
  WindowedValue<KV<K, V>> currentValue = iterator.next();
  final K key = currentValue.getValue().getKey();
  final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and advance
  // time to the end after processing all elements.
  final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
  DoFnRunner<KV<K, V>, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          serializedOptions.getPipelineOptions(),
          dofn,
          new FlinkSideInputReader(sideInputs, runtimeContext),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          new FlinkNoOpStepContext() {
            @Override
            public StateInternals stateInternals() {
              return stateInternals;
            }

            @Override
            public TimerInternals timerInternals() {
              return timerInternals;
            }
          },
          windowingStrategy);
  if (serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class).getEnableMetrics()) {
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
  }
  doFnRunner.startBundle();
  doFnRunner.processElement(currentValue);
  while (iterator.hasNext()) {
    currentValue = iterator.next();
    doFnRunner.processElement(currentValue);
  }
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(timerInternals, doFnRunner);
  doFnRunner.finishBundle();
}
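For context, here is a minimal, self-contained sketch of the pattern this snippet relies on: per-key state created with InMemoryStateInternals.forKey, plus an InMemoryTimerInternals whose input watermark is pushed to the end of time so every event-time timer becomes eligible. The class name, the "sum" state tag, and the main method are illustrative, not part of the Beam codebase.

import org.apache.beam.runners.core.InMemoryStateInternals;
import org.apache.beam.runners.core.InMemoryTimerInternals;
import org.apache.beam.runners.core.StateNamespaces;
import org.apache.beam.runners.core.StateTags;
import org.apache.beam.runners.core.TimerInternals.TimerData;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;

public class InMemoryStateSketch {
  public static void main(String[] args) throws Exception {
    // State internals are scoped to a single key, just like in reduce() above.
    InMemoryStateInternals<String> state = InMemoryStateInternals.forKey("someKey");
    ValueState<Integer> sum =
        state.state(StateNamespaces.global(), StateTags.value("sum", VarIntCoder.of()));
    sum.write(42);

    // Batch-style timer handling: advance the watermark to the end of time,
    // then drain every event-time timer that became eligible.
    InMemoryTimerInternals timers = new InMemoryTimerInternals();
    timers.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
    TimerData eligible;
    while ((eligible = timers.removeNextEventTimer()) != null) {
      // A runner would hand each timer to the DoFnRunner here, which is
      // what fireEligibleTimers(...) does in the snippet above.
      System.out.println("eligible timer: " + eligible);
    }
  }
}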
Use of org.apache.beam.runners.core.InMemoryStateInternals in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce (a later revision of the same method).
@Override
public void reduce(
    Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<RawUnionValue>> out)
    throws Exception {
  RuntimeContext runtimeContext = getRuntimeContext();
  DoFnRunners.OutputManager outputManager;
  if (outputMap.size() == 1) {
    outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
  } else {
    // it has some additional outputs
    outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager(out, outputMap);
  }
  final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
  // Get the first value; we need this for initializing the state internals with the key.
  // We are guaranteed to have a first value, otherwise reduce() would not have been called.
  WindowedValue<KV<K, V>> currentValue = iterator.next();
  final K key = currentValue.getValue().getKey();
  final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
  // Used with batch, we know that all the data is available for this key. We can't use the
  // timer manager from the context because it doesn't exist. So we create one and advance
  // time to the end after processing all elements.
  final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());
  final Set<BoundedWindow> windowsSeen = new HashSet<>();
  List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
  DoFnRunner<KV<K, V>, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          serializedOptions.get(),
          dofn,
          new FlinkSideInputReader(sideInputs, runtimeContext),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          new FlinkNoOpStepContext() {
            @Override
            public StateInternals stateInternals() {
              return stateInternals;
            }

            @Override
            public TimerInternals timerInternals() {
              return timerInternals;
            }
          },
          inputCoder,
          outputCoderMap,
          windowingStrategy,
          doFnSchemaInformation,
          sideInputMapping);
  FlinkPipelineOptions pipelineOptions = serializedOptions.get().as(FlinkPipelineOptions.class);
  if (!pipelineOptions.getDisableMetrics()) {
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, metricContainer);
  }
  doFnRunner.startBundle();
  doFnRunner.processElement(currentValue);
  if (usesOnWindowExpiration) {
    windowsSeen.addAll(currentValue.getWindows());
  }
  while (iterator.hasNext()) {
    currentValue = iterator.next();
    if (usesOnWindowExpiration) {
      windowsSeen.addAll(currentValue.getWindows());
    }
    doFnRunner.processElement(currentValue);
  }
  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  fireEligibleTimers(key, timerInternals, doFnRunner);
  if (usesOnWindowExpiration) {
    for (BoundedWindow window : windowsSeen) {
      doFnRunner.onWindowExpiration(window, window.maxTimestamp().minus(Duration.millis(1)), key);
    }
  }
  doFnRunner.finishBundle();
}
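The usesOnWindowExpiration bookkeeping above exists to honor DoFns that declare an @OnWindowExpiration callback: the runner must invoke it for every window it saw before discarding that window's state. A hypothetical user DoFn of that shape, purely for illustration (the class and the "sum" state id are not from the Beam codebase), might look like:

import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.state.StateSpec;
import org.apache.beam.sdk.state.StateSpecs;
import org.apache.beam.sdk.state.ValueState;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

class SumPerKeyFn extends DoFn<KV<String, Integer>, Integer> {
  @StateId("sum")
  private final StateSpec<ValueState<Integer>> sumSpec = StateSpecs.value(VarIntCoder.of());

  @ProcessElement
  public void process(ProcessContext ctx, @StateId("sum") ValueState<Integer> sum) {
    Integer current = sum.read();
    sum.write((current == null ? 0 : current) + ctx.element().getValue());
  }

  // The presence of this method is what makes usesOnWindowExpiration true in
  // the reduce() above: the running sum is flushed when each window expires.
  @OnWindowExpiration
  public void onExpiry(@StateId("sum") ValueState<Integer> sum, OutputReceiver<Integer> out) {
    Integer total = sum.read();
    if (total != null) {
      out.output(total);
    }
  }
}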
Use of org.apache.beam.runners.core.InMemoryStateInternals in project beam by apache.
The class ExecutableStageDoFnOperatorTest, method testCacheTokenHandling.
@Test
public void testCacheTokenHandling() throws Exception {
  InMemoryStateInternals test = InMemoryStateInternals.forKey("test");
  KeyedStateBackend<ByteBuffer> stateBackend = FlinkStateInternalsTest.createStateBackend();
  ExecutableStageDoFnOperator.BagUserStateFactory<Integer, GlobalWindow> bagUserStateFactory =
      new ExecutableStageDoFnOperator.BagUserStateFactory<>(
          test, stateBackend, NoopLock.get(), null);
  ByteString key1 = ByteString.copyFrom("key1", Charsets.UTF_8);
  ByteString key2 = ByteString.copyFrom("key2", Charsets.UTF_8);
  Map<String, Map<String, ProcessBundleDescriptors.BagUserStateSpec>> userStateMapMock =
      Mockito.mock(Map.class);
  Map<String, ProcessBundleDescriptors.BagUserStateSpec> transformMap = Mockito.mock(Map.class);
  final String userState1 = "userstate1";
  ProcessBundleDescriptors.BagUserStateSpec bagUserStateSpec1 = mockBagUserState(userState1);
  when(transformMap.get(userState1)).thenReturn(bagUserStateSpec1);
  final String userState2 = "userstate2";
  ProcessBundleDescriptors.BagUserStateSpec bagUserStateSpec2 = mockBagUserState(userState2);
  when(transformMap.get(userState2)).thenReturn(bagUserStateSpec2);
  when(userStateMapMock.get(anyString())).thenReturn(transformMap);
  when(processBundleDescriptor.getBagUserStateSpecs()).thenReturn(userStateMapMock);
  StateRequestHandler stateRequestHandler =
      StateRequestHandlers.forBagUserStateHandlerFactory(
          processBundleDescriptor, bagUserStateFactory);
  // For user state, the cache token is valid for the lifetime of the operator.
  final BeamFnApi.ProcessBundleRequest.CacheToken expectedCacheToken =
      Iterables.getOnlyElement(stateRequestHandler.getCacheTokens());
  // Make a request to generate the initial cache token.
  stateRequestHandler.handle(getRequest(key1, userState1));
  BeamFnApi.ProcessBundleRequest.CacheToken returnedCacheToken =
      Iterables.getOnlyElement(stateRequestHandler.getCacheTokens());
  assertThat(returnedCacheToken.hasUserState(), is(true));
  assertThat(returnedCacheToken, is(expectedCacheToken));
  List<RequestGenerator> generators =
      Arrays.asList(
          ExecutableStageDoFnOperatorTest::getRequest,
          ExecutableStageDoFnOperatorTest::getAppend,
          ExecutableStageDoFnOperatorTest::getClear);
  for (RequestGenerator req : generators) {
    // For every state read, the token remains unchanged.
    stateRequestHandler.handle(req.makeRequest(key1, userState1));
    assertThat(
        Iterables.getOnlyElement(stateRequestHandler.getCacheTokens()), is(expectedCacheToken));
    // The token is still valid for another key in the same key range.
    stateRequestHandler.handle(req.makeRequest(key2, userState1));
    assertThat(
        Iterables.getOnlyElement(stateRequestHandler.getCacheTokens()), is(expectedCacheToken));
    // The token is still valid for another state cell in the same key range.
    stateRequestHandler.handle(req.makeRequest(key2, userState2));
    assertThat(
        Iterables.getOnlyElement(stateRequestHandler.getCacheTokens()), is(expectedCacheToken));
  }
}
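getRequest, getAppend, and getClear are helpers defined elsewhere in this test class. For orientation, a bag-user-state get request like the one they construct might look roughly as follows. This is a sketch against the Fn API protos: the transform id is illustrative, field names follow recent Beam releases, and Beam itself uses a vendored copy of protobuf rather than com.google.protobuf directly.

import com.google.protobuf.ByteString;
import org.apache.beam.model.fnexecution.v1.BeamFnApi.StateGetRequest;
import org.apache.beam.model.fnexecution.v1.BeamFnApi.StateKey;
import org.apache.beam.model.fnexecution.v1.BeamFnApi.StateRequest;

static StateRequest getRequestSketch(ByteString key, String userStateId) {
  return StateRequest.newBuilder()
      .setStateKey(
          StateKey.newBuilder()
              .setBagUserState(
                  StateKey.BagUserState.newBuilder()
                      .setTransformId("transform") // illustrative transform id
                      .setUserStateId(userStateId)
                      .setKey(key)
                      .setWindow(ByteString.EMPTY)))
      .setGet(StateGetRequest.newBuilder())
      .build();
}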
Use of org.apache.beam.runners.core.InMemoryStateInternals in project beam by apache.
The class MultiDoFnFunction, method call.
@Override
public Iterator<Tuple2<TupleTag<?>, WindowedValue<?>>> call(Iterator<WindowedValue<InputT>> iter)
    throws Exception {
  if (!wasSetupCalled && iter.hasNext()) {
    DoFnInvokers.tryInvokeSetupFor(doFn, options.get());
    wasSetupCalled = true;
  }
  DoFnOutputManager outputManager = new DoFnOutputManager();
  final InMemoryTimerInternals timerInternals;
  final StepContext context;
  // Stateful ParDo is currently only implemented in batch mode.
  Object key = null;
  if (stateful) {
    if (iter.hasNext()) {
      WindowedValue<InputT> currentValue = iter.next();
      key = ((KV) currentValue.getValue()).getKey();
      iter = Iterators.concat(Iterators.singletonIterator(currentValue), iter);
    }
    final InMemoryStateInternals<?> stateInternals = InMemoryStateInternals.forKey(key);
    timerInternals = new InMemoryTimerInternals();
    context =
        new StepContext() {
          @Override
          public StateInternals stateInternals() {
            return stateInternals;
          }

          @Override
          public TimerInternals timerInternals() {
            return timerInternals;
          }
        };
  } else {
    timerInternals = null;
    context = new SparkProcessContext.NoOpStepContext();
  }
  final DoFnRunner<InputT, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          options.get(),
          doFn,
          CachedSideInputReader.of(new SparkSideInputReader(sideInputs)),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          context,
          inputCoder,
          outputCoders,
          windowingStrategy,
          doFnSchemaInformation,
          sideInputMapping);
  DoFnRunnerWithMetrics<InputT, OutputT> doFnRunnerWithMetrics =
      new DoFnRunnerWithMetrics<>(stepName, doFnRunner, metricsAccum);
  return new SparkProcessContext<>(
          doFn,
          doFnRunnerWithMetrics,
          outputManager,
          key,
          stateful ? new TimerDataIterator(timerInternals) : Collections.emptyIterator())
      .processPartition(iter)
      .iterator();
}
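The stateful branch above uses a small "peek, then push back" idiom: it consumes the first element to learn the key, then reattaches it with Iterators.concat so the DoFnRunner still sees the complete partition. Extracted into a generic helper, it looks like this (a hypothetical utility, assuming Guava on the classpath; Beam itself uses a vendored copy):

import com.google.common.collect.Iterators;
import java.util.Iterator;
import java.util.function.Consumer;

final class IteratorPeek {
  // Consume the first element (e.g. to extract its key), then return an
  // iterator that still yields the full original sequence.
  static <T> Iterator<T> peekFirst(Iterator<T> iter, Consumer<T> onFirst) {
    if (!iter.hasNext()) {
      return iter;
    }
    T first = iter.next();
    onFirst.accept(first);
    return Iterators.concat(Iterators.singletonIterator(first), iter);
  }
}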