use of org.apache.beam.sdk.values.KV in project beam by apache.
the class StatefulParDoEvaluatorFactoryTest method testUnprocessedElements.
/**
* A test that explicitly delays a side input so that the main input will have to be reprocessed,
* testing that {@code finishBundle()} re-assembles the GBK outputs correctly.
*/
@Test
public void testUnprocessedElements() throws Exception {
// To test the factory, first we set up a pipeline and then we use the constructed
// pipeline to create the right parameters to pass to the factory
final String stateId = "my-state-id";
// For consistency, window it into FixedWindows. Actually we will fabricate an input bundle.
PCollection<KV<String, Integer>> mainInput = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))));
final PCollectionView<List<Integer>> sideInput = pipeline.apply("Create side input", Create.of(42)).apply("Window side input", Window.<Integer>into(FixedWindows.of(Duration.millis(10)))).apply("View side input", View.<Integer>asList());
TupleTag<Integer> mainOutput = new TupleTag<>();
PCollection<Integer> produced = mainInput.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(ParDo.of(new DoFn<KV<String, Integer>, Integer>() {
@StateId(stateId)
private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());
@ProcessElement
public void process(ProcessContext c) {
}
}).withSideInputs(sideInput).withOutputTags(mainOutput, TupleTagList.empty()))).get(mainOutput).setCoder(VarIntCoder.of());
StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory(mockEvaluationContext);
// This will be the stateful ParDo from the expansion
AppliedPTransform<PCollection<KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
// Then there will be a digging down to the step context to get the state internals
when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
when(mockExecutionContext.getStepContext(anyString())).thenReturn(mockStepContext);
when(mockEvaluationContext.createBundle(Matchers.<PCollection<Integer>>any())).thenReturn(mockUncommittedBundle);
when(mockStepContext.getTimerUpdate()).thenReturn(TimerUpdate.empty());
// And digging to check whether the window is ready
when(mockEvaluationContext.createSideInputReader(anyList())).thenReturn(mockSideInputReader);
when(mockSideInputReader.isReady(Matchers.<PCollectionView<?>>any(), Matchers.<BoundedWindow>any())).thenReturn(false);
IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
// A single bundle with some elements in the global window; it should register cleanup for the
// global window state merely by having the evaluator created. The cleanup logic does not
// depend on the window.
String key = "hello";
WindowedValue<KV<String, Integer>> firstKv = WindowedValue.of(KV.of(key, 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING);
WindowedValue<KeyedWorkItem<String, KV<String, Integer>>> gbkOutputElement = firstKv.withValue(KeyedWorkItems.elementsWorkItem("hello", ImmutableList.of(firstKv, firstKv.withValue(KV.of(key, 13)), firstKv.withValue(KV.of(key, 15)))));
CommittedBundle<KeyedWorkItem<String, KV<String, Integer>>> inputBundle = BUNDLE_FACTORY.createBundle((PCollection<KeyedWorkItem<String, KV<String, Integer>>>) Iterables.getOnlyElement(producingTransform.getInputs().values())).add(gbkOutputElement).commit(Instant.now());
TransformEvaluator<KeyedWorkItem<String, KV<String, Integer>>> evaluator = factory.forApplication(producingTransform, inputBundle);
evaluator.processElement(gbkOutputElement);
// This should push back every element as a KV<String, Iterable<Integer>>
// in the appropriate window. Since the keys are equal they are single-threaded
TransformResult<KeyedWorkItem<String, KV<String, Integer>>> result = evaluator.finishBundle();
List<Integer> pushedBackInts = new ArrayList<>();
for (WindowedValue<? extends KeyedWorkItem<String, KV<String, Integer>>> unprocessedElement : result.getUnprocessedElements()) {
assertThat(Iterables.getOnlyElement(unprocessedElement.getWindows()), equalTo((BoundedWindow) firstWindow));
assertThat(unprocessedElement.getValue().key(), equalTo("hello"));
for (WindowedValue<KV<String, Integer>> windowedKv : unprocessedElement.getValue().elementsIterable()) {
pushedBackInts.add(windowedKv.getValue().getValue());
}
}
assertThat(pushedBackInts, containsInAnyOrder(1, 13, 15));
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class SideInputContainerTest method getAfterWriteReturnsPaneInWindow.
@Test
public void getAfterWriteReturnsPaneInWindow() throws Exception {
WindowedValue<KV<String, Integer>> one = WindowedValue.of(KV.of("one", 1), new Instant(1L), FIRST_WINDOW, PaneInfo.ON_TIME_AND_ONLY_FIRING);
WindowedValue<KV<String, Integer>> two = WindowedValue.of(KV.of("two", 2), new Instant(20L), FIRST_WINDOW, PaneInfo.ON_TIME_AND_ONLY_FIRING);
container.write(mapView, ImmutableList.<WindowedValue<?>>of(one, two));
Map<String, Integer> viewContents = container.createReaderForViews(ImmutableList.<PCollectionView<?>>of(mapView)).get(mapView, FIRST_WINDOW);
assertThat(viewContents, hasEntry("one", 1));
assertThat(viewContents, hasEntry("two", 2));
assertThat(viewContents.size(), is(2));
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class SideInputContainerTest method finishDoesNotOverwriteWrittenElements.
@Test
public void finishDoesNotOverwriteWrittenElements() throws Exception {
WindowedValue<KV<String, Integer>> one = WindowedValue.of(KV.of("one", 1), new Instant(1L), SECOND_WINDOW, PaneInfo.createPane(true, false, Timing.EARLY));
WindowedValue<KV<String, Integer>> two = WindowedValue.of(KV.of("two", 2), new Instant(20L), SECOND_WINDOW, PaneInfo.createPane(true, false, Timing.EARLY));
container.write(mapView, ImmutableList.<WindowedValue<?>>of(one, two));
immediatelyInvokeCallback(mapView, SECOND_WINDOW);
Map<String, Integer> viewContents = container.createReaderForViews(ImmutableList.<PCollectionView<?>>of(mapView)).get(mapView, SECOND_WINDOW);
assertThat(viewContents, hasEntry("one", 1));
assertThat(viewContents, hasEntry("two", 2));
assertThat(viewContents.size(), is(2));
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class WatermarkManagerTest method updateWatermarkWithLateData.
/**
* Demonstrates that updateWatermarks in the presence of late data is monotonic.
*/
@Test
public void updateWatermarkWithLateData() {
Instant sourceWatermark = new Instant(1_000_000L);
CommittedBundle<Integer> createdBundle = timestampedBundle(createdInts, TimestampedValue.of(1, sourceWatermark), TimestampedValue.of(2, new Instant(1234L)));
manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), null, Collections.<CommittedBundle<?>>singleton(createdBundle)), sourceWatermark);
CommittedBundle<KV<String, Integer>> keyBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 1), sourceWatermark), TimestampedValue.of(KV.of("MyKey", 2), new Instant(1234L)));
// Finish processing the on-time data. The watermarks should progress to be equal to the source
manager.updateWatermarks(createdBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(keyBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
manager.refreshAll();
TransformWatermarks onTimeWatermarks = manager.getWatermarks(graph.getProducer(keyed));
assertThat(onTimeWatermarks.getInputWatermark(), equalTo(sourceWatermark));
assertThat(onTimeWatermarks.getOutputWatermark(), equalTo(sourceWatermark));
CommittedBundle<Integer> lateDataBundle = timestampedBundle(createdInts, TimestampedValue.of(3, new Instant(-1000L)));
// the late data arrives in a downstream PCollection after its watermark has advanced past it;
// we don't advance the watermark past the current watermark until we've consumed the late data
manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), createdBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(lateDataBundle)), new Instant(2_000_000L));
manager.refreshAll();
TransformWatermarks bufferedLateWm = manager.getWatermarks(graph.getProducer(createdInts));
assertThat(bufferedLateWm.getOutputWatermark(), equalTo(new Instant(2_000_000L)));
// The input watermark should be held to its previous value (not advanced due to late data; not
// moved backwards in the presence of watermarks due to monotonicity).
TransformWatermarks lateDataBufferedWatermark = manager.getWatermarks(graph.getProducer(keyed));
assertThat(lateDataBufferedWatermark.getInputWatermark(), not(earlierThan(sourceWatermark)));
assertThat(lateDataBufferedWatermark.getOutputWatermark(), not(earlierThan(sourceWatermark)));
CommittedBundle<KV<String, Integer>> lateKeyedBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 3), new Instant(-1000L)));
manager.updateWatermarks(lateDataBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), lateDataBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()), Collections.<CommittedBundle<?>>singleton(lateKeyedBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
manager.refreshAll();
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class WatermarkManagerTest method updateWatermarkWithUnprocessedElements.
@Test
public void updateWatermarkWithUnprocessedElements() {
WindowedValue<Integer> first = WindowedValue.valueInGlobalWindow(1);
WindowedValue<Integer> second = WindowedValue.timestampedValueInGlobalWindow(2, new Instant(-1000L));
WindowedValue<Integer> third = WindowedValue.timestampedValueInGlobalWindow(3, new Instant(1234L));
CommittedBundle<Integer> createdBundle = bundleFactory.createBundle(createdInts).add(first).add(second).add(third).commit(clock.now());
manager.updateWatermarks(null, TimerUpdate.empty(), result(graph.getProducer(createdInts), null, Collections.<CommittedBundle<?>>singleton(createdBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
CommittedBundle<KV<String, Integer>> keyBundle = timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 1), BoundedWindow.TIMESTAMP_MIN_VALUE));
manager.updateWatermarks(createdBundle, TimerUpdate.empty(), result(graph.getProducer(keyed), createdBundle.withElements(ImmutableList.of(second, third)), Collections.<CommittedBundle<?>>singleton(keyBundle)), BoundedWindow.TIMESTAMP_MAX_VALUE);
TransformWatermarks keyedWatermarks = manager.getWatermarks(graph.getProducer(keyed));
// the unprocessed second and third are readded to pending
assertThat(keyedWatermarks.getInputWatermark(), not(laterThan(new Instant(-1000L))));
}
Aggregations