Use of org.apache.beam.sdk.values.KV in project beam by apache.
The class DoFnOperatorTest, method testStateGCForStatefulFn:
@Test
public void testStateGCForStatefulFn() throws Exception {

  WindowingStrategy<Object, IntervalWindow> windowingStrategy =
      WindowingStrategy.of(FixedWindows.of(new Duration(10)))
          .withAllowedLateness(Duration.ZERO);

  final String timerId = "boo";
  final String stateId = "dazzle";

  final int offset = 5000;
  final int timerOutput = 4093;

  DoFn<KV<String, Integer>, KV<String, Integer>> fn =
      new DoFn<KV<String, Integer>, KV<String, Integer>>() {

        @TimerId(timerId)
        private final TimerSpec spec = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @StateId(stateId)
        private final StateSpec<ValueState<String>> stateSpec =
            StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void processElement(
            ProcessContext context,
            @TimerId(timerId) Timer timer,
            @StateId(stateId) ValueState<String> state,
            BoundedWindow window) {
          timer.set(window.maxTimestamp());
          state.write(context.element().getKey());
          context.output(
              KV.of(context.element().getKey(), context.element().getValue() + offset));
        }

        @OnTimer(timerId)
        public void onTimer(OnTimerContext context, @StateId(stateId) ValueState<String> state) {
          context.output(KV.of(state.read(), timerOutput));
        }
      };

  WindowedValue.FullWindowedValueCoder<KV<String, Integer>> windowedValueCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()),
          windowingStrategy.getWindowFn().windowCoder());

  TupleTag<KV<String, Integer>> outputTag = new TupleTag<>("main-output");

  DoFnOperator<KV<String, Integer>, KV<String, Integer>, WindowedValue<KV<String, Integer>>>
      doFnOperator =
          new DoFnOperator<>(
              fn,
              "stepName",
              windowedValueCoder,
              outputTag,
              Collections.<TupleTag<?>>emptyList(),
              new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<String, Integer>>>(),
              windowingStrategy,
              new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
              Collections.<PCollectionView<?>>emptyList(), /* side inputs */
              PipelineOptionsFactory.as(FlinkPipelineOptions.class),
              StringUtf8Coder.of());

  KeyedOneInputStreamOperatorTestHarness<
          String, WindowedValue<KV<String, Integer>>, WindowedValue<KV<String, Integer>>>
      testHarness =
          new KeyedOneInputStreamOperatorTestHarness<>(
              doFnOperator,
              new KeySelector<WindowedValue<KV<String, Integer>>, String>() {
                @Override
                public String getKey(WindowedValue<KV<String, Integer>> kvWindowedValue)
                    throws Exception {
                  return kvWindowedValue.getValue().getKey();
                }
              },
              new CoderTypeInformation<>(StringUtf8Coder.of()));

  testHarness.open();

  testHarness.processWatermark(0);

  assertEquals(0, testHarness.numKeyedStateEntries());

  IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));

  testHarness.processElement(
      new StreamRecord<>(
          WindowedValue.of(KV.of("key1", 5), new Instant(1), window1, PaneInfo.NO_FIRING)));
  testHarness.processElement(
      new StreamRecord<>(
          WindowedValue.of(KV.of("key2", 7), new Instant(3), window1, PaneInfo.NO_FIRING)));

  assertThat(
      this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(
          WindowedValue.of(KV.of("key1", 5 + offset), new Instant(1), window1, PaneInfo.NO_FIRING),
          WindowedValue.of(KV.of("key2", 7 + offset), new Instant(3), window1, PaneInfo.NO_FIRING)));

  assertEquals(2, testHarness.numKeyedStateEntries());

  testHarness.getOutput().clear();

  // this should trigger both the window.maxTimestamp() timer and the GC timer
  // this tests that the GC timer fires after the user timer
  testHarness.processWatermark(
      window1.maxTimestamp()
          .plus(windowingStrategy.getAllowedLateness())
          .plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS)
          .getMillis());

  assertThat(
      this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(
          WindowedValue.of(KV.of("key1", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING),
          WindowedValue.of(KV.of("key2", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING)));

  // ensure the state was garbage collected
  assertEquals(0, testHarness.numKeyedStateEntries());

  testHarness.close();
}
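The assertions above rely on a stripStreamRecordFromWindowedValue helper that is defined elsewhere in DoFnOperatorTest and not shown in this snippet. A minimal sketch of what such a helper could look like (the body below is an assumption for illustration, not the actual Beam code):

// Hypothetical sketch: unwrap the Flink StreamRecords collected by the test
// harness into the Beam WindowedValues they carry, so Hamcrest matchers can
// compare them directly.
@SuppressWarnings("unchecked")
private <T> Iterable<WindowedValue<T>> stripStreamRecordFromWindowedValue(
    Iterable<Object> output) {
  List<WindowedValue<T>> result = new ArrayList<>();
  for (Object o : output) {
    if (o instanceof StreamRecord) {
      result.add(((StreamRecord<WindowedValue<T>>) o).getValue());
    }
  }
  return result;
}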
Use of org.apache.beam.sdk.values.KV in project beam by apache.
The class UnboundedSourceWrapper, method initializeState:
@Override
public void initializeState(FunctionInitializationContext context) throws Exception {
  if (checkpointCoder == null) {
    // no checkpoint coder available in this source
    return;
  }

  OperatorStateStore stateStore = context.getOperatorStateStore();
  CoderTypeInformation<KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>>
      typeInformation = (CoderTypeInformation) new CoderTypeInformation<>(checkpointCoder);
  stateForCheckpoint =
      stateStore.getOperatorState(
          new ListStateDescriptor<>(
              DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME,
              typeInformation.createSerializer(new ExecutionConfig())));

  if (context.isRestored()) {
    isRestored = true;
    LOG.info("Restored state in the UnboundedSourceWrapper.");
  } else {
    LOG.info("No restored state for UnboundedSourceWrapper.");
  }
}
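Note that nothing is read from stateForCheckpoint here; the restored (source, checkpoint mark) pairs are consumed later, when the wrapper opens its readers. A hedged sketch of that consumption step, with localSplitSources, localReaders, and pipelineOptions assumed as fields of the wrapper for illustration:

// Sketch under assumed field names: recreate one reader per restored split,
// resuming each UnboundedSource from the CheckpointMarkT it was snapshotted with.
if (isRestored) {
  for (KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> restored
      : stateForCheckpoint.get()) {
    localSplitSources.add(restored.getKey());
    localReaders.add(restored.getKey().createReader(pipelineOptions, restored.getValue()));
  }
}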
Use of org.apache.beam.sdk.values.KV in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce:
@Override
public void reduce(Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out)
    throws Exception {
  RuntimeContext runtimeContext = getRuntimeContext();

  DoFnRunners.OutputManager outputManager;
  if (outputMap.size() == 1) {
    outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
  } else {
    // the DoFn has additional outputs
    outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
  }

  final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();

  // Get the first value; we need it to initialize the state internals with the key.
  // We are guaranteed to have a first value, otherwise reduce() would not have been called.
  WindowedValue<KV<K, V>> currentValue = iterator.next();
  final K key = currentValue.getValue().getKey();
  final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);

  // Used in batch mode, where we know that all the data for this key is available. We can't
  // use the timer manager from the context because it doesn't exist, so we create one and
  // advance time to the end after processing all elements.
  final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
  timerInternals.advanceProcessingTime(Instant.now());
  timerInternals.advanceSynchronizedProcessingTime(Instant.now());

  List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());

  DoFnRunner<KV<K, V>, OutputT> doFnRunner =
      DoFnRunners.simpleRunner(
          serializedOptions.getPipelineOptions(),
          dofn,
          new FlinkSideInputReader(sideInputs, runtimeContext),
          outputManager,
          mainOutputTag,
          additionalOutputTags,
          new FlinkNoOpStepContext() {
            @Override
            public StateInternals stateInternals() {
              return stateInternals;
            }

            @Override
            public TimerInternals timerInternals() {
              return timerInternals;
            }
          },
          windowingStrategy);

  if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
  }

  doFnRunner.startBundle();

  doFnRunner.processElement(currentValue);
  while (iterator.hasNext()) {
    currentValue = iterator.next();
    doFnRunner.processElement(currentValue);
  }

  // Finish any pending windows by advancing the input watermark to infinity.
  timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);

  // Finally, advance the processing time to infinity to fire any timers.
  timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
  timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);

  fireEligibleTimers(timerInternals, doFnRunner);

  doFnRunner.finishBundle();
}
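The fireEligibleTimers call just before finishBundle drains the timers that became eligible once the watermark and processing time were advanced to infinity; its body is not shown in this snippet. A sketch of such a drain loop, using the removeNext* accessors that InMemoryTimerInternals provides, where fireTimer is assumed to dispatch the timer to the runner's onTimer handling:

// Sketch: repeatedly drain event-time, processing-time, and synchronized
// processing-time timers, since firing one timer may set another.
private void fireEligibleTimers(
    InMemoryTimerInternals timerInternals, DoFnRunner<KV<K, V>, OutputT> runner) throws Exception {
  boolean hasFired;
  do {
    hasFired = false;
    TimerInternals.TimerData timer;
    while ((timer = timerInternals.removeNextEventTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner); // assumed helper dispatching to the runner
    }
    while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner);
    }
    while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
      hasFired = true;
      fireTimer(timer, runner);
    }
  } while (hasFired);
}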
Use of org.apache.beam.sdk.values.KV in project beam by apache.
The class SortingFlinkCombineRunner, method mergeWindow:
/**
 * Merges windows in place in the input list. This assumes that the list of elements is
 * sorted by window-end timestamp.
 */
private void mergeWindow(List<WindowedValue<KV<K, InputT>>> elements) {
  int currentStart = 0;
  IntervalWindow currentWindow =
      (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());

  for (int i = 1; i < elements.size(); i++) {
    WindowedValue<KV<K, InputT>> nextValue = elements.get(i);
    IntervalWindow nextWindow =
        (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
    if (currentWindow.intersects(nextWindow)) {
      // the windows overlap: extend the current merged window
      currentWindow = currentWindow.span(nextWindow);
    } else {
      // retrofit the merged window to all elements back to "currentStart"
      for (int j = i - 1; j >= currentStart; j--) {
        WindowedValue<KV<K, InputT>> value = elements.get(j);
        elements.set(
            j,
            WindowedValue.of(
                value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
      }
      currentStart = i;
      currentWindow = nextWindow;
    }
  }
  if (currentStart < elements.size() - 1) {
    // we have to retrofit the last batch
    for (int j = elements.size() - 1; j >= currentStart; j--) {
      WindowedValue<KV<K, InputT>> value = elements.get(j);
      elements.set(
          j,
          WindowedValue.of(
              value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
    }
  }
}
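For intuition on the merge condition: two IntervalWindows merge when they overlap, and span produces the smallest window covering both. An illustrative example with made-up window bounds:

IntervalWindow a = new IntervalWindow(new Instant(0), new Instant(10));
IntervalWindow b = new IntervalWindow(new Instant(5), new Instant(20));
IntervalWindow c = new IntervalWindow(new Instant(25), new Instant(30));

a.intersects(b); // true  -> mergeWindow extends the current window to a.span(b)
a.span(b);       // [0, 20), the smallest window covering both a and b
a.intersects(c); // false -> mergeWindow retrofits the merged window onto the
                 //          finished group and starts a new group at c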
Use of org.apache.beam.sdk.values.KV in project beam by apache.
The class HIFIOElasticIT, method testHifIOWithElastic:
/**
 * This test reads data from the Elasticsearch instance and verifies whether the data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() throws SecurityException, IOException {
  // The expected hash code was computed once at insertion time and is hardcoded here.
  final long expectedRowCount = 1000L;
  String expectedHashCode = "42e254c8689050ed0a617ff5e80ea392";

  Configuration conf = getConfiguration(options);
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));

  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
  PAssert.thatSingleton(count).isEqualTo(expectedRowCount);

  // Verify the output values using a checksum comparison.
  PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
  PCollection<String> textValues = values.apply(transformFunc);
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);

  pipeline.run().waitUntilFinish();
}
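The transformFunc applied above is defined elsewhere in HIFIOElasticIT; it maps each LinkedMapWritable row to a string representation that HashingFn can checksum. A hypothetical stand-in with the same shape (the real transform may serialize the row differently):

// Hypothetical stand-in for transformFunc: a PTransform from LinkedMapWritable
// rows to their string form, suitable for the checksum comparison above.
MapElements<LinkedMapWritable, String> transformFunc =
    MapElements.via(
        new SimpleFunction<LinkedMapWritable, String>() {
          @Override
          public String apply(LinkedMapWritable input) {
            return input.toString();
          }
        });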