Search in sources :

Example 1 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class StatefulParDoEvaluatorFactoryTest method windowCleanupScheduled.

@Test
public void windowCleanupScheduled() throws Exception {
    // To test the factory, first we set up a pipeline and then we use the constructed
    // pipeline to create the right parameters to pass to the factory
    final String stateId = "my-state-id";
    // For consistency, window it into FixedWindows. Actually we will fabricate an input bundle.
    PCollection<KV<String, Integer>> input = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))));
    TupleTag<Integer> mainOutput = new TupleTag<>();
    PCollection<Integer> produced = input.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(ParDo.of(new DoFn<KV<String, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void process(ProcessContext c) {
        }
    }).withOutputTags(mainOutput, TupleTagList.empty()))).get(mainOutput).setCoder(VarIntCoder.of());
    StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory(mockEvaluationContext);
    AppliedPTransform<PCollection<? extends KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
    // Then there will be a digging down to the step context to get the state internals
    when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
    when(mockExecutionContext.getStepContext(anyString())).thenReturn(mockStepContext);
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
    IntervalWindow secondWindow = new IntervalWindow(new Instant(10), new Instant(19));
    StateNamespace firstWindowNamespace = StateNamespaces.window(IntervalWindow.getCoder(), firstWindow);
    StateNamespace secondWindowNamespace = StateNamespaces.window(IntervalWindow.getCoder(), secondWindow);
    StateTag<ValueState<String>> tag = StateTags.tagForSpec(stateId, StateSpecs.value(StringUtf8Coder.of()));
    // Set up non-empty state. We don't mock + verify calls to clear() but instead
    // check that state is actually empty. We musn't care how it is accomplished.
    stateInternals.state(firstWindowNamespace, tag).write("first");
    stateInternals.state(secondWindowNamespace, tag).write("second");
    // A single bundle with some elements in the global window; it should register cleanup for the
    // global window state merely by having the evaluator created. The cleanup logic does not
    // depend on the window.
    CommittedBundle<KV<String, Integer>> inputBundle = BUNDLE_FACTORY.createBundle(input).add(WindowedValue.of(KV.of("hello", 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING)).add(WindowedValue.of(KV.of("hello", 2), new Instant(11), secondWindow, PaneInfo.NO_FIRING)).commit(Instant.now());
    // Merely creating the evaluator should suffice to register the cleanup callback
    factory.forApplication(producingTransform, inputBundle);
    ArgumentCaptor<Runnable> argumentCaptor = ArgumentCaptor.forClass(Runnable.class);
    verify(mockEvaluationContext).scheduleAfterWindowExpiration(eq(producingTransform), eq(firstWindow), Mockito.<WindowingStrategy<?, ?>>any(), argumentCaptor.capture());
    // Should actually clear the state for the first window
    argumentCaptor.getValue().run();
    assertThat(stateInternals.state(firstWindowNamespace, tag).read(), nullValue());
    assertThat(stateInternals.state(secondWindowNamespace, tag).read(), equalTo("second"));
    verify(mockEvaluationContext).scheduleAfterWindowExpiration(eq(producingTransform), eq(secondWindow), Mockito.<WindowingStrategy<?, ?>>any(), argumentCaptor.capture());
    // Should actually clear the state for the second window
    argumentCaptor.getValue().run();
    assertThat(stateInternals.state(secondWindowNamespace, tag).read(), nullValue());
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.anyString(org.mockito.Matchers.anyString) StateSpec(org.apache.beam.sdk.state.StateSpec) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) StateNamespace(org.apache.beam.runners.core.StateNamespace) PCollection(org.apache.beam.sdk.values.PCollection) DoFn(org.apache.beam.sdk.transforms.DoFn) ValueState(org.apache.beam.sdk.state.ValueState) StatefulParDo(org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo) Test(org.junit.Test)

Example 2 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ViewOverrideFactoryTest method replacementGetViewReturnsOriginal.

@Test
public void replacementGetViewReturnsOriginal() {
    final PCollection<Integer> ints = p.apply("CreateContents", Create.of(1, 2, 3));
    final PCollectionView<List<Integer>> view = PCollectionViews.listView(ints, WindowingStrategy.globalDefault(), ints.getCoder());
    PTransformReplacement<PCollection<Integer>, PCollectionView<List<Integer>>> replacement = factory.getReplacementTransform(AppliedPTransform.<PCollection<Integer>, PCollectionView<List<Integer>>, CreatePCollectionView<Integer, List<Integer>>>of("foo", ints.expand(), view.expand(), CreatePCollectionView.<Integer, List<Integer>>of(view), p));
    ints.apply(replacement.getTransform());
    final AtomicBoolean writeViewVisited = new AtomicBoolean();
    p.traverseTopologically(new PipelineVisitor.Defaults() {

        @Override
        public void visitPrimitiveTransform(Node node) {
            if (node.getTransform() instanceof WriteView) {
                assertThat("There should only be one WriteView primitive in the graph", writeViewVisited.getAndSet(true), is(false));
                PCollectionView replacementView = ((WriteView) node.getTransform()).getView();
                assertThat(replacementView, Matchers.<PCollectionView>theInstance(view));
                assertThat(node.getInputs().entrySet(), hasSize(1));
            }
        }
    });
    assertThat(writeViewVisited.get(), is(true));
}
Also used : WriteView(org.apache.beam.runners.direct.ViewOverrideFactory.WriteView) Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) PCollection(org.apache.beam.sdk.values.PCollection) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) PCollectionView(org.apache.beam.sdk.values.PCollectionView) CreatePCollectionView(org.apache.beam.sdk.transforms.View.CreatePCollectionView) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor) List(java.util.List) Test(org.junit.Test)

Example 3 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class SdkComponentsTest method translatePipeline.

@Test
public void translatePipeline() {
    BigEndianLongCoder customCoder = BigEndianLongCoder.of();
    PCollection<Long> elems = pipeline.apply(GenerateSequence.from(0L).to(207L));
    PCollection<Long> counted = elems.apply(Count.<Long>globally()).setCoder(customCoder);
    PCollection<Long> windowed = counted.apply(Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7))).triggering(AfterWatermark.pastEndOfWindow().withEarlyFirings(AfterPane.elementCountAtLeast(19))).accumulatingFiredPanes().withAllowedLateness(Duration.standardMinutes(3L)));
    final WindowingStrategy<?, ?> windowedStrategy = windowed.getWindowingStrategy();
    PCollection<KV<String, Long>> keyed = windowed.apply(WithKeys.<String, Long>of("foo"));
    PCollection<KV<String, Iterable<Long>>> grouped = keyed.apply(GroupByKey.<String, Long>create());
    final RunnerApi.Pipeline pipelineProto = SdkComponents.translatePipeline(pipeline);
    pipeline.traverseTopologically(new PipelineVisitor.Defaults() {

        Set<Node> transforms = new HashSet<>();

        Set<PCollection<?>> pcollections = new HashSet<>();

        Set<Equivalence.Wrapper<? extends Coder<?>>> coders = new HashSet<>();

        Set<WindowingStrategy<?, ?>> windowingStrategies = new HashSet<>();

        @Override
        public void leaveCompositeTransform(Node node) {
            if (node.isRootNode()) {
                assertThat("Unexpected number of PTransforms", pipelineProto.getComponents().getTransformsCount(), equalTo(transforms.size()));
                assertThat("Unexpected number of PCollections", pipelineProto.getComponents().getPcollectionsCount(), equalTo(pcollections.size()));
                assertThat("Unexpected number of Coders", pipelineProto.getComponents().getCodersCount(), equalTo(coders.size()));
                assertThat("Unexpected number of Windowing Strategies", pipelineProto.getComponents().getWindowingStrategiesCount(), equalTo(windowingStrategies.size()));
            } else {
                transforms.add(node);
            }
        }

        @Override
        public void visitPrimitiveTransform(Node node) {
            transforms.add(node);
        }

        @Override
        public void visitValue(PValue value, Node producer) {
            if (value instanceof PCollection) {
                PCollection pc = (PCollection) value;
                pcollections.add(pc);
                addCoders(pc.getCoder());
                windowingStrategies.add(pc.getWindowingStrategy());
                addCoders(pc.getWindowingStrategy().getWindowFn().windowCoder());
            }
        }

        private void addCoders(Coder<?> coder) {
            coders.add(Equivalence.<Coder<?>>identity().wrap(coder));
            if (coder instanceof StructuredCoder) {
                for (Coder<?> component : ((StructuredCoder<?>) coder).getComponents()) {
                    addCoders(component);
                }
            }
        }
    });
}
Also used : Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) RunnerApi(org.apache.beam.sdk.common.runner.v1.RunnerApi) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) HashSet(java.util.HashSet) Coder(org.apache.beam.sdk.coders.Coder) SetCoder(org.apache.beam.sdk.coders.SetCoder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) KvCoder(org.apache.beam.sdk.coders.KvCoder) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) StructuredCoder(org.apache.beam.sdk.coders.StructuredCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) KV(org.apache.beam.sdk.values.KV) PValue(org.apache.beam.sdk.values.PValue) PCollection(org.apache.beam.sdk.values.PCollection) StructuredCoder(org.apache.beam.sdk.coders.StructuredCoder) Test(org.junit.Test)

Example 4 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ViewEvaluatorFactory method createEvaluator.

private <InT, OuT> TransformEvaluator<Iterable<InT>> createEvaluator(final AppliedPTransform<PCollection<Iterable<InT>>, PCollectionView<OuT>, WriteView<InT, OuT>> application) {
    PCollection<Iterable<InT>> input = (PCollection<Iterable<InT>>) Iterables.getOnlyElement(application.getInputs().values());
    final PCollectionViewWriter<InT, OuT> writer = context.createPCollectionViewWriter(input, (PCollectionView<OuT>) Iterables.getOnlyElement(application.getOutputs().values()));
    return new TransformEvaluator<Iterable<InT>>() {

        private final List<WindowedValue<InT>> elements = new ArrayList<>();

        @Override
        public void processElement(WindowedValue<Iterable<InT>> element) {
            for (InT input : element.getValue()) {
                elements.add(element.withValue(input));
            }
        }

        @Override
        public TransformResult<Iterable<InT>> finishBundle() {
            writer.add(elements);
            Builder resultBuilder = StepTransformResult.withoutHold(application);
            if (!elements.isEmpty()) {
                resultBuilder = resultBuilder.withAdditionalOutput(OutputType.PCOLLECTION_VIEW);
            }
            return resultBuilder.build();
        }
    };
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Builder(org.apache.beam.runners.direct.StepTransformResult.Builder) List(java.util.List) ArrayList(java.util.ArrayList)

Example 5 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class TransformTranslator method flattenPColl.

private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Collection<PValue> pcs = context.getInputs(transform).values();
            JavaRDD<WindowedValue<T>> unionRDD;
            if (pcs.size() == 0) {
                unionRDD = context.getSparkContext().emptyRDD();
            } else {
                JavaRDD<WindowedValue<T>>[] rdds = new JavaRDD[pcs.size()];
                int index = 0;
                for (PValue pc : pcs) {
                    checkArgument(pc instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", pc, pc.getClass().getSimpleName());
                    rdds[index] = ((BoundedDataset<T>) context.borrowDataset(pc)).getRDD();
                    index++;
                }
                unionRDD = context.getSparkContext().union(rdds);
            }
            context.putDataset(transform, new BoundedDataset<>(unionRDD));
        }

        @Override
        public String toNativeString() {
            return "sparkContext.union(...)";
        }
    };
}
Also used : Flatten(org.apache.beam.sdk.transforms.Flatten) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection)198 Test (org.junit.Test)133 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)61 KV (org.apache.beam.sdk.values.KV)61 Map (java.util.Map)59 List (java.util.List)58 Rule (org.junit.Rule)57 RunWith (org.junit.runner.RunWith)54 PAssert (org.apache.beam.sdk.testing.PAssert)52 Instant (org.joda.time.Instant)46 Duration (org.joda.time.Duration)45 JUnit4 (org.junit.runners.JUnit4)45 ParDo (org.apache.beam.sdk.transforms.ParDo)44 TupleTag (org.apache.beam.sdk.values.TupleTag)42 Pipeline (org.apache.beam.sdk.Pipeline)41 Create (org.apache.beam.sdk.transforms.Create)41 ArrayList (java.util.ArrayList)40 Serializable (java.io.Serializable)39 PTransform (org.apache.beam.sdk.transforms.PTransform)37 Row (org.apache.beam.sdk.values.Row)37