Search in sources :

Example 11 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class StatefulParDoEvaluatorFactoryTest method testUnprocessedElements.

/**
 * A test that explicitly delays a side input so that the main input will have to be reprocessed,
 * testing that {@code finishBundle()} re-assembles the GBK outputs correctly.
 *
 * <p>The mocked side input reader always reports "not ready", so every element fed to the
 * evaluator is expected to come back from {@code finishBundle()} as an unprocessed element for
 * the same key and in the same window it was supplied in.
 */
@Test
public void testUnprocessedElements() throws Exception {
    // To test the factory, first we set up a pipeline and then we use the constructed
    // pipeline to create the right parameters to pass to the factory
    final String stateId = "my-state-id";
    // For consistency, window it into FixedWindows. Actually we will fabricate an input bundle.
    PCollection<KV<String, Integer>> mainInput = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))));
    final PCollectionView<List<Integer>> sideInput = pipeline.apply("Create side input", Create.of(42)).apply("Window side input", Window.<Integer>into(FixedWindows.of(Duration.millis(10)))).apply("View side input", View.<Integer>asList());
    TupleTag<Integer> mainOutput = new TupleTag<>();
    PCollection<Integer> produced = mainInput.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(ParDo.of(new DoFn<KV<String, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void process(ProcessContext c) {
            // Intentionally empty: this test only checks the elements that are pushed back,
            // so the DoFn never needs to produce output.
        }
    }).withSideInputs(sideInput).withOutputTags(mainOutput, TupleTagList.empty()))).get(mainOutput).setCoder(VarIntCoder.of());
    // Diamond instead of a raw type so the assignment is checked by the compiler.
    StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory<>(mockEvaluationContext);
    // This will be the stateful ParDo from the expansion
    AppliedPTransform<PCollection<KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
    // Then there will be a digging down to the step context to get the state internals
    when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
    when(mockExecutionContext.getStepContext(anyString())).thenReturn(mockStepContext);
    when(mockEvaluationContext.createBundle(Matchers.<PCollection<Integer>>any())).thenReturn(mockUncommittedBundle);
    when(mockStepContext.getTimerUpdate()).thenReturn(TimerUpdate.empty());
    // And digging to check whether the window is ready. The reader always answers "not ready",
    // which is what forces the main input to be pushed back.
    when(mockEvaluationContext.createSideInputReader(anyList())).thenReturn(mockSideInputReader);
    when(mockSideInputReader.isReady(Matchers.<PCollectionView<?>>any(), Matchers.<BoundedWindow>any())).thenReturn(false);
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
    // A single bundle holding one keyed work item with three elements (values 1, 13 and 15),
    // all for the same key and all in firstWindow.
    String key = "hello";
    WindowedValue<KV<String, Integer>> firstKv = WindowedValue.of(KV.of(key, 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING);
    WindowedValue<KeyedWorkItem<String, KV<String, Integer>>> gbkOutputElement = firstKv.withValue(KeyedWorkItems.elementsWorkItem(key, ImmutableList.of(firstKv, firstKv.withValue(KV.of(key, 13)), firstKv.withValue(KV.of(key, 15)))));
    CommittedBundle<KeyedWorkItem<String, KV<String, Integer>>> inputBundle = BUNDLE_FACTORY.createBundle((PCollection<KeyedWorkItem<String, KV<String, Integer>>>) Iterables.getOnlyElement(producingTransform.getInputs().values())).add(gbkOutputElement).commit(Instant.now());
    TransformEvaluator<KeyedWorkItem<String, KV<String, Integer>>> evaluator = factory.forApplication(producingTransform, inputBundle);
    evaluator.processElement(gbkOutputElement);
    // Finishing the bundle should push back every element as part of an unprocessed
    // KeyedWorkItem for the same key, in the window it arrived in.
    TransformResult<KeyedWorkItem<String, KV<String, Integer>>> result = evaluator.finishBundle();
    List<Integer> pushedBackInts = new ArrayList<>();
    for (WindowedValue<? extends KeyedWorkItem<String, KV<String, Integer>>> unprocessedElement : result.getUnprocessedElements()) {
        assertThat(Iterables.getOnlyElement(unprocessedElement.getWindows()), equalTo((BoundedWindow) firstWindow));
        assertThat(unprocessedElement.getValue().key(), equalTo(key));
        for (WindowedValue<KV<String, Integer>> windowedKv : unprocessedElement.getValue().elementsIterable()) {
            pushedBackInts.add(windowedKv.getValue().getValue());
        }
    }
    // Order is not significant; only the set of pushed-back values is checked.
    assertThat(pushedBackInts, containsInAnyOrder(1, 13, 15));
}
Also used : ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.anyString(org.mockito.Matchers.anyString) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) List(java.util.List) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Matchers.anyList(org.mockito.Matchers.anyList) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) PCollection(org.apache.beam.sdk.values.PCollection) ValueState(org.apache.beam.sdk.state.ValueState) StatefulParDo(org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo) Test(org.junit.Test)

Example 12 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class SideInputContainerTest method getAfterWriteReturnsPaneInWindow.

// Writes two elements to mapView in FIRST_WINDOW, then reads the view back for that window
// and checks that the resulting map holds exactly those two entries.
@Test
public void getAfterWriteReturnsPaneInWindow() throws Exception {
    // Both elements share the window and pane; only key, value and timestamp differ.
    WindowedValue<KV<String, Integer>> firstEntry =
        WindowedValue.of(KV.of("one", 1), new Instant(1L), FIRST_WINDOW, PaneInfo.ON_TIME_AND_ONLY_FIRING);
    WindowedValue<KV<String, Integer>> secondEntry =
        WindowedValue.of(KV.of("two", 2), new Instant(20L), FIRST_WINDOW, PaneInfo.ON_TIME_AND_ONLY_FIRING);

    ImmutableList<WindowedValue<?>> written = ImmutableList.<WindowedValue<?>>of(firstEntry, secondEntry);
    container.write(mapView, written);

    ImmutableList<PCollectionView<?>> requestedViews = ImmutableList.<PCollectionView<?>>of(mapView);
    Map<String, Integer> contents =
        container.createReaderForViews(requestedViews).get(mapView, FIRST_WINDOW);

    assertThat(contents, hasEntry("one", 1));
    assertThat(contents, hasEntry("two", 2));
    assertThat(contents.size(), is(2));
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 13 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class SideInputContainerTest method finishDoesNotOverwriteWrittenElements.

// Writes two early-pane elements to mapView in SECOND_WINDOW, invokes the callback helper for
// that window, and checks that the view still holds exactly the two written entries.
@Test
public void finishDoesNotOverwriteWrittenElements() throws Exception {
    // Same pane parameters for both elements: createPane(true, false, Timing.EARLY).
    PaneInfo earlyPane = PaneInfo.createPane(true, false, Timing.EARLY);
    WindowedValue<KV<String, Integer>> firstEntry =
        WindowedValue.of(KV.of("one", 1), new Instant(1L), SECOND_WINDOW, earlyPane);
    WindowedValue<KV<String, Integer>> secondEntry =
        WindowedValue.of(KV.of("two", 2), new Instant(20L), SECOND_WINDOW, earlyPane);

    ImmutableList<WindowedValue<?>> written = ImmutableList.<WindowedValue<?>>of(firstEntry, secondEntry);
    container.write(mapView, written);

    // NOTE(review): presumably this fires the window's completion callback right away;
    // confirm against the helper's definition.
    immediatelyInvokeCallback(mapView, SECOND_WINDOW);

    ImmutableList<PCollectionView<?>> requestedViews = ImmutableList.<PCollectionView<?>>of(mapView);
    Map<String, Integer> contents =
        container.createReaderForViews(requestedViews).get(mapView, SECOND_WINDOW);

    assertThat(contents, hasEntry("one", 1));
    assertThat(contents, hasEntry("two", 2));
    assertThat(contents.size(), is(2));
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 14 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WatermarkManagerTest method updateWatermarkWithLateData.

/**
 * Demonstrates that updateWatermarks in the presence of late data is monotonic: once the
 * watermarks of the keyed transform have reached the source watermark, a later bundle carrying
 * an element with an earlier timestamp must not move them backwards.
 */
@Test
public void updateWatermarkWithLateData() {
    Instant srcWatermark = new Instant(1_000_000L);
    Instant onTimeStamp = new Instant(1234L);
    Instant lateStamp = new Instant(-1000L);
    Instant advancedSrcWatermark = new Instant(2_000_000L);

    CommittedBundle<Integer> intsBundle =
        timestampedBundle(
            createdInts,
            TimestampedValue.of(1, srcWatermark),
            TimestampedValue.of(2, onTimeStamp));
    manager.updateWatermarks(
        null,
        TimerUpdate.empty(),
        result(
            graph.getProducer(createdInts),
            null,
            Collections.<CommittedBundle<?>>singleton(intsBundle)),
        srcWatermark);

    CommittedBundle<KV<String, Integer>> keyedBundle =
        timestampedBundle(
            keyed,
            TimestampedValue.of(KV.of("MyKey", 1), srcWatermark),
            TimestampedValue.of(KV.of("MyKey", 2), onTimeStamp));
    // Finish processing the on-time data; the keyed watermarks should then equal the source
    // watermark.
    manager.updateWatermarks(
        intsBundle,
        TimerUpdate.empty(),
        result(
            graph.getProducer(keyed),
            intsBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()),
            Collections.<CommittedBundle<?>>singleton(keyedBundle)),
        BoundedWindow.TIMESTAMP_MAX_VALUE);
    manager.refreshAll();
    TransformWatermarks keyedOnTime = manager.getWatermarks(graph.getProducer(keyed));
    assertThat(keyedOnTime.getInputWatermark(), equalTo(srcWatermark));
    assertThat(keyedOnTime.getOutputWatermark(), equalTo(srcWatermark));

    CommittedBundle<Integer> lateIntsBundle =
        timestampedBundle(createdInts, TimestampedValue.of(3, lateStamp));
    // The late data arrives in a downstream PCollection after its watermark has advanced past
    // it; the watermark is not advanced beyond its current value until the late data has been
    // consumed.
    manager.updateWatermarks(
        null,
        TimerUpdate.empty(),
        result(
            graph.getProducer(createdInts),
            intsBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()),
            Collections.<CommittedBundle<?>>singleton(lateIntsBundle)),
        advancedSrcWatermark);
    manager.refreshAll();
    TransformWatermarks createdAfterLate = manager.getWatermarks(graph.getProducer(createdInts));
    assertThat(createdAfterLate.getOutputWatermark(), equalTo(advancedSrcWatermark));

    // The keyed watermarks must not be earlier than their previous value: they are neither
    // advanced because of the late data nor moved backwards, since watermarks are monotonic.
    TransformWatermarks keyedAfterLate = manager.getWatermarks(graph.getProducer(keyed));
    assertThat(keyedAfterLate.getInputWatermark(), not(earlierThan(srcWatermark)));
    assertThat(keyedAfterLate.getOutputWatermark(), not(earlierThan(srcWatermark)));

    // Consume the late bundle; no further assertions follow this final update.
    CommittedBundle<KV<String, Integer>> lateKeyedBundle =
        timestampedBundle(keyed, TimestampedValue.of(KV.of("MyKey", 3), lateStamp));
    manager.updateWatermarks(
        lateIntsBundle,
        TimerUpdate.empty(),
        result(
            graph.getProducer(keyed),
            lateIntsBundle.withElements(Collections.<WindowedValue<Integer>>emptyList()),
            Collections.<CommittedBundle<?>>singleton(lateKeyedBundle)),
        BoundedWindow.TIMESTAMP_MAX_VALUE);
    manager.refreshAll();
}
Also used : TransformWatermarks(org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks) ReadableInstant(org.joda.time.ReadableInstant) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Example 15 with KV

use of org.apache.beam.sdk.values.KV in project beam by apache.

the class WatermarkManagerTest method updateWatermarkWithUnprocessedElements.

// Commits a bundle of three ints, reports two of them as unprocessed when the keyed transform
// consumes the bundle, and checks that the keyed input watermark is no later than the earliest
// unprocessed timestamp (-1000).
@Test
public void updateWatermarkWithUnprocessedElements() {
    Instant earliestUnprocessed = new Instant(-1000L);
    WindowedValue<Integer> consumed = WindowedValue.valueInGlobalWindow(1);
    WindowedValue<Integer> pendingEarly =
        WindowedValue.timestampedValueInGlobalWindow(2, earliestUnprocessed);
    WindowedValue<Integer> pendingLate =
        WindowedValue.timestampedValueInGlobalWindow(3, new Instant(1234L));

    CommittedBundle<Integer> intsBundle =
        bundleFactory
            .createBundle(createdInts)
            .add(consumed)
            .add(pendingEarly)
            .add(pendingLate)
            .commit(clock.now());
    manager.updateWatermarks(
        null,
        TimerUpdate.empty(),
        result(
            graph.getProducer(createdInts),
            null,
            Collections.<CommittedBundle<?>>singleton(intsBundle)),
        BoundedWindow.TIMESTAMP_MAX_VALUE);

    CommittedBundle<KV<String, Integer>> keyedBundle =
        timestampedBundle(
            keyed, TimestampedValue.of(KV.of("MyKey", 1), BoundedWindow.TIMESTAMP_MIN_VALUE));
    // Only the first element is treated as consumed; the other two are handed back as
    // unprocessed.
    manager.updateWatermarks(
        intsBundle,
        TimerUpdate.empty(),
        result(
            graph.getProducer(keyed),
            intsBundle.withElements(ImmutableList.of(pendingEarly, pendingLate)),
            Collections.<CommittedBundle<?>>singleton(keyedBundle)),
        BoundedWindow.TIMESTAMP_MAX_VALUE);

    TransformWatermarks keyedWm = manager.getWatermarks(graph.getProducer(keyed));
    // The two unprocessed elements are re-added to pending, so they hold the input watermark.
    assertThat(keyedWm.getInputWatermark(), not(laterThan(earliestUnprocessed)));
}
Also used : TransformWatermarks(org.apache.beam.runners.direct.WatermarkManager.TransformWatermarks) ReadableInstant(org.joda.time.ReadableInstant) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Test(org.junit.Test)

Aggregations

KV (org.apache.beam.sdk.values.KV)192 Test (org.junit.Test)143 Instant (org.joda.time.Instant)66 Category (org.junit.experimental.categories.Category)62 Pipeline (org.apache.beam.sdk.Pipeline)35 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)34 StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString)33 Matchers.containsString (org.hamcrest.Matchers.containsString)33 StateSpec (org.apache.beam.sdk.state.StateSpec)25 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)22 ArrayList (java.util.ArrayList)19 WindowedValue (org.apache.beam.sdk.util.WindowedValue)19 TupleTag (org.apache.beam.sdk.values.TupleTag)16 TableRow (com.google.api.services.bigquery.model.TableRow)15 Map (java.util.Map)15 ValueState (org.apache.beam.sdk.state.ValueState)15 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)12 HashMap (java.util.HashMap)12 Timer (org.apache.beam.sdk.state.Timer)12