Search in sources :

Example 1 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class DataflowPipelineTranslatorTest method testBatchStatefulParDoTranslation.

/**
 * Smoke test that fails fast if batch translation of a stateful {@code ParDo} breaks.
 *
 * <p>Builds a one-element keyed pipeline whose {@code DoFn} declares a value state cell it never
 * touches, translates it with streaming disabled, and checks the kind of each of the four
 * resulting steps plus the keyed-state property of the last one.
 */
@Test
public void testBatchStatefulParDoTranslation() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    options.setStreaming(false);
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);

    // Main output tag, declared as an anonymous subclass of TupleTag.
    TupleTag<Integer> mainOutputTag = new TupleTag<Integer>() {
    };

    // The DoFn only needs to declare state; its element processing is intentionally empty.
    DoFn<KV<Integer, Integer>, Integer> statefulFn = new DoFn<KV<Integer, Integer>, Integer>() {

        @StateId("unused")
        final StateSpec<ValueState<Integer>> stateSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void process(ProcessContext c) {
            // noop
        }
    };
    pipeline.apply(Create.of(KV.of(1, 1)))
        .apply(ParDo.of(statefulFn).withOutputTags(mainOutputTag, TupleTagList.empty()));

    runner.replaceTransforms(pipeline);
    Job job = translator.translate(pipeline, runner, Collections.<DataflowPackage>emptyList()).getJob();

    // Expected job layout, in step order:
    //   0. ParallelRead (Create)
    //   1. ParDo(ReifyWVs)
    //   2. GroupByKeyAndSortValuesOnly
    //   3. A ParDo over grouped and sorted KVs that is executed via ungrouping service-side
    String[] expectedKinds = {"ParallelRead", "ParallelDo", "GroupByKey", "ParallelDo"};
    List<Step> steps = job.getSteps();
    assertEquals(expectedKinds.length, steps.size());
    for (int i = 0; i < expectedKinds.length; i++) {
        assertEquals(expectedKinds[i], steps.get(i).getKind());
    }

    // The final (stateful) step must not be flagged as using keyed state in this batch job.
    String usesKeyedState = (String) steps.get(3).getProperties().get(PropertyNames.USES_KEYED_STATE);
    assertThat(usesKeyedState, not(equalTo("true")));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) Step(com.google.api.services.dataflow.model.Step) Pipeline(org.apache.beam.sdk.Pipeline) ValueState(org.apache.beam.sdk.state.ValueState) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Test(org.junit.Test)

Example 2 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class PipelineTest method testTupleInjectionTransform.

/**
 * Checks that applying a {@code TupleInjectionTransform} to a {@code PCollection} yields a
 * {@code PCollectionTuple} from which the original elements can be read back via the tag.
 */
@Test
@Category(ValidatesRunner.class)
public void testTupleInjectionTransform() throws Exception {
    TupleTag<Integer> injectedTag = new TupleTag<Integer>();

    // Create the four input elements and wrap them into a tuple under injectedTag.
    PCollectionTuple injected = pipeline
        .apply(Create.<Integer>of(1, 2, 3, 4))
        .apply("ProjectTag", new TupleInjectionTransform<Integer>(injectedTag));

    PAssert.that(injected.get(injectedTag)).containsInAnyOrder(1, 2, 3, 4);
    pipeline.run();
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 3 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class PipelineTest method testTupleProjectionTransform.

/**
 * Checks that applying a {@code TupleProjectionTransform} to a {@code PCollectionTuple} yields
 * the {@code PCollection} that was stored in the tuple under the given tag.
 */
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
    TupleTag<Integer> projectedTag = new TupleTag<Integer>();
    PCollection<Integer> source = pipeline.apply(Create.<Integer>of(1, 2, 3, 4));

    // Wrap the source collection in a single-entry tuple, then pull it back out by tag.
    PCollectionTuple wrapped = PCollectionTuple.of(projectedTag, source);
    PCollection<Integer> projected =
        wrapped.apply("ProjectTag", new TupleProjectionTransform<Integer>(projectedTag));

    PAssert.that(projected).containsInAnyOrder(1, 2, 3, 4);
    pipeline.run();
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 4 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class TransformHierarchyTest method visitAfterReplace.

/**
 * Tests that visiting the {@link TransformHierarchy} after replacing nodes does not visit any
 * of the original nodes or inaccessible values but does visit all of the replacement nodes,
 * new inaccessible replacement values, and the original output values.
 *
 * <p>The test records an "Upstream" and an "Original" node by hand, replaces "Original" with a
 * composite wrapping a multi-output {@code ParDo}, remaps the replacement's main output onto the
 * original output value, and finally walks the hierarchy with a visitor that collects every
 * composite node, primitive node and value it is shown.
 */
@Test
public void visitAfterReplace() {
    // Remember the node that is current before anything is pushed; it is expected among the
    // visited composites at the end.
    Node root = hierarchy.getCurrent();
    // The transform that will later be replaced.
    final SingleOutput<Long, Long> originalParDo = ParDo.of(new DoFn<Long, Long>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            ctxt.output(ctxt.element() + 1L);
        }
    });
    GenerateSequence genUpstream = GenerateSequence.from(0);
    // Apply the transforms through the pipeline to obtain the PCollection values that are
    // handed to the hierarchy below.
    PCollection<Long> upstream = pipeline.apply(genUpstream);
    PCollection<Long> output = upstream.apply("Original", originalParDo);
    // Record the "Upstream" node: push, finish input, set output, pop.
    Node upstreamNode = hierarchy.pushNode("Upstream", pipeline.begin(), genUpstream);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(upstream);
    hierarchy.popNode();
    // Record the "Original" node in the same way; this is the node that gets replaced.
    Node original = hierarchy.pushNode("Original", upstream, originalParDo);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(output);
    hierarchy.popNode();
    // Replacement: a multi-output ParDo with the same element logic, whose main output is
    // tagged with `longs`.
    final TupleTag<Long> longs = new TupleTag<>();
    final MultiOutput<Long, Long> replacementParDo = ParDo.of(new DoFn<Long, Long>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            ctxt.output(ctxt.element() + 1L);
        }
    }).withOutputTags(longs, TupleTagList.empty());
    // Composite that applies the multi-output ParDo and exposes only its main output.
    PTransform<PCollection<Long>, PCollection<Long>> replacementComposite = new PTransform<PCollection<Long>, PCollection<Long>>() {

        @Override
        public PCollection<Long> expand(PCollection<Long> input) {
            return input.apply("Contained", replacementParDo).get(longs);
        }
    };
    PCollectionTuple replacementOutput = upstream.apply("Contained", replacementParDo);
    // Swap the original node for the composite. The push that follows is named
    // "Original/Contained", so it is presumably recorded as a child of the composite node
    // (i.e. replaceNode leaves the replacement as the current node) — TODO confirm.
    Node compositeNode = hierarchy.replaceNode(original, upstream, replacementComposite);
    Node replacementParNode = hierarchy.pushNode("Original/Contained", upstream, replacementParDo);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(replacementOutput);
    hierarchy.popNode();
    // Set the main output of the replacement ParDo as the output of the node that is current
    // after the pop (presumably the composite).
    hierarchy.setOutput(replacementOutput.get(longs));
    // The replacement tuple is expected to expand to exactly one tagged value;
    // getOnlyElement fails otherwise.
    Entry<TupleTag<?>, PValue> replacementLongs = Iterables.getOnlyElement(replacementOutput.expand().entrySet());
    // Map the replacement's main output onto the original `output` value.
    hierarchy.replaceOutputs(Collections.<PValue, ReplacementOutput>singletonMap(replacementOutput.get(longs), ReplacementOutput.of(TaggedPValue.ofExpandedValue(output), TaggedPValue.of(replacementLongs.getKey(), replacementLongs.getValue()))));
    hierarchy.popNode();
    // Collect every node handed to the visitor, split by composite vs. primitive.
    final Set<Node> visitedCompositeNodes = new HashSet<>();
    final Set<Node> visitedPrimitiveNodes = new HashSet<>();
    Set<PValue> visitedValues = hierarchy.visit(new Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
            visitedCompositeNodes.add(node);
            return CompositeBehavior.ENTER_TRANSFORM;
        }

        @Override
        public void visitPrimitiveTransform(Node node) {
            visitedPrimitiveNodes.add(node);
        }
    });
    /*
     Final Graph:
     Upstream -> Upstream.out -> Composite -> (ReplacementParDo -> OriginalParDo.out)
     */
    // Only the root and the replacement composite are entered; the replaced "Original" node
    // must not appear in either set.
    assertThat(visitedCompositeNodes, containsInAnyOrder(root, compositeNode));
    assertThat(visitedPrimitiveNodes, containsInAnyOrder(upstreamNode, replacementParNode));
    // The visited values are the upstream value and the original output value.
    assertThat(visitedValues, Matchers.<PValue>containsInAnyOrder(upstream, output));
}
Also used : Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) TupleTag(org.apache.beam.sdk.values.TupleTag) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) PValue(org.apache.beam.sdk.values.PValue) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) PCollection(org.apache.beam.sdk.values.PCollection) DoFn(org.apache.beam.sdk.transforms.DoFn) Defaults(org.apache.beam.sdk.Pipeline.PipelineVisitor.Defaults) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) PTransform(org.apache.beam.sdk.transforms.PTransform) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 5 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class StatefulParDoEvaluatorFactoryTest method windowCleanupScheduled.

/**
 * Verifies that creating a stateful ParDo evaluator for a bundle schedules a cleanup callback
 * (via {@code scheduleAfterWindowExpiration}) for each window present in the bundle, and that
 * running a captured callback clears the state written for its window.
 */
@Test
public void windowCleanupScheduled() throws Exception {
    // To test the factory, first we set up a pipeline and then we use the constructed
    // pipeline to create the right parameters to pass to the factory
    final String stateId = "my-state-id";
    // For consistency, window it into FixedWindows. Actually we will fabricate an input bundle.
    PCollection<KV<String, Integer>> input = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))));
    TupleTag<Integer> mainOutput = new TupleTag<>();
    // Stateful DoFn with a single String value state cell and an empty process method; only the
    // state declaration matters for this test.
    PCollection<Integer> produced = input.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(ParDo.of(new DoFn<KV<String, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void process(ProcessContext c) {
            // Intentionally empty.
        }
    }).withOutputTags(mainOutput, TupleTagList.empty()))).get(mainOutput).setCoder(VarIntCoder.of());
    StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory(mockEvaluationContext);
    // Look up the applied transform that produced `produced`; the unchecked raw cast narrows it
    // to the StatefulParDo application type the factory is called with below.
    AppliedPTransform<PCollection<? extends KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
    // Then there will be a digging down to the step context to get the state internals
    when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
    when(mockExecutionContext.getStepContext(anyString())).thenReturn(mockStepContext);
    // Two interval windows, [0, 9) and [10, 19), each with its own state namespace.
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
    IntervalWindow secondWindow = new IntervalWindow(new Instant(10), new Instant(19));
    StateNamespace firstWindowNamespace = StateNamespaces.window(IntervalWindow.getCoder(), firstWindow);
    StateNamespace secondWindowNamespace = StateNamespaces.window(IntervalWindow.getCoder(), secondWindow);
    // State tag built from the same id and spec type that the DoFn above declares.
    StateTag<ValueState<String>> tag = StateTags.tagForSpec(stateId, StateSpecs.value(StringUtf8Coder.of()));
    // Set up non-empty state. We don't mock + verify calls to clear() but instead
    // check that state is actually empty. We mustn't care how it is accomplished.
    stateInternals.state(firstWindowNamespace, tag).write("first");
    stateInternals.state(secondWindowNamespace, tag).write("second");
    // A single bundle with one element in each of the two interval windows; merely creating the
    // evaluator is expected to register a cleanup callback for the state of each window
    // (checked by the two verify calls below).
    CommittedBundle<KV<String, Integer>> inputBundle = BUNDLE_FACTORY.createBundle(input).add(WindowedValue.of(KV.of("hello", 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING)).add(WindowedValue.of(KV.of("hello", 2), new Instant(11), secondWindow, PaneInfo.NO_FIRING)).commit(Instant.now());
    // Merely creating the evaluator should suffice to register the cleanup callback
    factory.forApplication(producingTransform, inputBundle);
    ArgumentCaptor<Runnable> argumentCaptor = ArgumentCaptor.forClass(Runnable.class);
    verify(mockEvaluationContext).scheduleAfterWindowExpiration(eq(producingTransform), eq(firstWindow), Mockito.<WindowingStrategy<?, ?>>any(), argumentCaptor.capture());
    // Should actually clear the state for the first window
    argumentCaptor.getValue().run();
    assertThat(stateInternals.state(firstWindowNamespace, tag).read(), nullValue());
    // The second window's state must be untouched by the first window's cleanup.
    assertThat(stateInternals.state(secondWindowNamespace, tag).read(), equalTo("second"));
    verify(mockEvaluationContext).scheduleAfterWindowExpiration(eq(producingTransform), eq(secondWindow), Mockito.<WindowingStrategy<?, ?>>any(), argumentCaptor.capture());
    // Should actually clear the state for the second window
    argumentCaptor.getValue().run();
    assertThat(stateInternals.state(secondWindowNamespace, tag).read(), nullValue());
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.anyString(org.mockito.Matchers.anyString) StateSpec(org.apache.beam.sdk.state.StateSpec) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) StateNamespace(org.apache.beam.runners.core.StateNamespace) PCollection(org.apache.beam.sdk.values.PCollection) DoFn(org.apache.beam.sdk.transforms.DoFn) ValueState(org.apache.beam.sdk.state.ValueState) StatefulParDo(org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo) Test(org.junit.Test)

Aggregations

TupleTag (org.apache.beam.sdk.values.TupleTag)69 Test (org.junit.Test)44 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)28 KV (org.apache.beam.sdk.values.KV)17 PCollection (org.apache.beam.sdk.values.PCollection)17 Instant (org.joda.time.Instant)14 WindowedValue (org.apache.beam.sdk.util.WindowedValue)13 PValue (org.apache.beam.sdk.values.PValue)13 Category (org.junit.experimental.categories.Category)13 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)10 Map (java.util.Map)9 AppliedPTransform (org.apache.beam.sdk.runners.AppliedPTransform)9 DoFn (org.apache.beam.sdk.transforms.DoFn)9 StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString)9 PCollectionView (org.apache.beam.sdk.values.PCollectionView)9 Matchers.containsString (org.hamcrest.Matchers.containsString)9 ImmutableMap (com.google.common.collect.ImmutableMap)6 Pipeline (org.apache.beam.sdk.Pipeline)5 ImmutableList (com.google.common.collect.ImmutableList)4 ArrayList (java.util.ArrayList)4