Search in sources :

Example 61 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ParDoTest method testParDoWithTaggedOutputName.

/**
 * Checks the names assigned to the outputs of a multi-output {@code ParDo} applied as
 * "MyParDo": every output, including the declared-but-unwritten one, is expected to be
 * named {@code "MyParDo." + tag id}.
 *
 * <p>The pipeline is not run in this test; abandoned-node enforcement is switched off up front.
 */
@Test
public void testParDoWithTaggedOutputName() {
    pipeline.enableAbandonedNodeEnforcement(false);

    // All tags are created as anonymous subclasses with explicit ids.
    TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
    TupleTag<String> additionalOutputTag1 = new TupleTag<String>("output1") {};
    TupleTag<String> additionalOutputTag2 = new TupleTag<String>("output2") {};
    TupleTag<String> additionalOutputTag3 = new TupleTag<String>("output3") {};
    TupleTag<String> additionalOutputTagUnwritten = new TupleTag<String>("unwrittenOutput") {};

    // The DoFn receives no side inputs and only tags 1-3; the "unwrittenOutput" tag is not passed to it.
    TestDoFn fn = new TestDoFn(
            Arrays.<PCollectionView<Integer>>asList(),
            Arrays.asList(additionalOutputTag1, additionalOutputTag2, additionalOutputTag3));

    // Additional outputs are declared in a different order than they are handed to the DoFn.
    TupleTagList declaredAdditionalTags = TupleTagList.of(additionalOutputTag3)
            .and(additionalOutputTag1)
            .and(additionalOutputTagUnwritten)
            .and(additionalOutputTag2);

    PCollectionTuple outputs = pipeline
            .apply(Create.of(Arrays.asList(3, -42, 666)))
            .setName("MyInput")
            .apply("MyParDo", ParDo.of(fn).withOutputTags(mainOutputTag, declaredAdditionalTags));

    assertEquals("MyParDo.main", outputs.get(mainOutputTag).getName());
    assertEquals("MyParDo.output1", outputs.get(additionalOutputTag1).getName());
    assertEquals("MyParDo.output2", outputs.get(additionalOutputTag2).getName());
    assertEquals("MyParDo.output3", outputs.get(additionalOutputTag3).getName());
    assertEquals("MyParDo.unwrittenOutput", outputs.get(additionalOutputTagUnwritten).getName());
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) Test(org.junit.Test)

Example 62 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ParDoTest method testMainOutputUnregisteredExplicitCoder.

/**
 * Runs a two-output {@code ParDo} whose main output carries {@code TestDummy} elements,
 * setting a {@code TestDummyCoder} explicitly on the main output before the pipeline runs.
 */
@Test
@Category(NeedsRunner.class)
public void testMainOutputUnregisteredExplicitCoder() {
    // The main tag is a plain TupleTag instance; only the additional tag is an anonymous subclass.
    TupleTag<TestDummy> mainOutputTag = new TupleTag<TestDummy>("unregisteredMain");
    TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("additionalOutput") {};
    TupleTagList additionalOutputs = TupleTagList.of(additionalOutputTag);

    PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));
    PCollectionTuple outputTuple = input.apply(
            ParDo.of(new MainOutputDummyFn(additionalOutputTag))
                    .withOutputTags(mainOutputTag, additionalOutputs));

    // Supply the coder for the main output by hand, then execute.
    outputTuple.get(mainOutputTag).setCoder(new TestDummyCoder());
    pipeline.run();
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 63 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ParDoTest method testParDoWithTaggedOutput.

/**
 * Runs a multi-output {@code ParDo} over three integers and asserts the contents of the
 * main output, of each of the three additional outputs handed to the {@code TestDoFn},
 * and that the declared-but-unwritten output is empty.
 */
@Test
@Category(ValidatesRunner.class)
public void testParDoWithTaggedOutput() {
    List<Integer> inputs = Arrays.asList(3, -42, 666);

    // All tags are created as anonymous subclasses with explicit ids.
    TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
    TupleTag<String> additionalOutputTag1 = new TupleTag<String>("additional1") {};
    TupleTag<String> additionalOutputTag2 = new TupleTag<String>("additional2") {};
    TupleTag<String> additionalOutputTag3 = new TupleTag<String>("additional3") {};
    TupleTag<String> additionalOutputTagUnwritten = new TupleTag<String>("unwrittenOutput") {};

    // The DoFn receives no side inputs and only tags 1-3; the "unwrittenOutput" tag is not passed to it.
    TestDoFn fn = new TestDoFn(
            Arrays.<PCollectionView<Integer>>asList(),
            Arrays.asList(additionalOutputTag1, additionalOutputTag2, additionalOutputTag3));

    // Additional outputs are declared in a different order than they are handed to the DoFn.
    TupleTagList declaredAdditionalTags = TupleTagList.of(additionalOutputTag3)
            .and(additionalOutputTag1)
            .and(additionalOutputTagUnwritten)
            .and(additionalOutputTag2);

    PCollectionTuple outputs = pipeline
            .apply(Create.of(inputs))
            .apply(ParDo.of(fn).withOutputTags(mainOutputTag, declaredAdditionalTags));

    PAssert.that(outputs.get(mainOutputTag))
            .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs));
    PAssert.that(outputs.get(additionalOutputTag1))
            .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag1));
    PAssert.that(outputs.get(additionalOutputTag2))
            .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag2));
    PAssert.that(outputs.get(additionalOutputTag3))
            .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag3));
    PAssert.that(outputs.get(additionalOutputTagUnwritten)).empty();

    pipeline.run();
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 64 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ApexParDoOperator method setup.

/**
 * Prepares this operator for processing: resolves the side-input reader, maps additional
 * output tags to their ports, builds the {@code DoFnRunner} chain, invokes the DoFn's setup
 * hook, and, when the DoFn is a {@code ProcessFn}, injects the state/timer factories and
 * the process-element invoker it needs.
 *
 * @param context operator context supplied by the caller; it is not read in this method
 */
@Override
public void setup(OperatorContext context) {
    this.traceTuples = ApexStreamTuple.Logging.isDebugEnabled(pipelineOptions.get(), this);
    // Start with the null reader; switch to a real handler only when side inputs exist.
    SideInputReader sideInputReader = NullSideInputReader.of(sideInputs);
    if (!sideInputs.isEmpty()) {
        sideInputHandler = new SideInputHandler(sideInputs, sideInputStateInternals);
        sideInputReader = sideInputHandler;
    }
    // Pair each additional output tag with the port at the same index.
    for (int i = 0; i < additionalOutputTags.size(); i++) {
        @SuppressWarnings("unchecked") DefaultOutputPort<ApexStreamTuple<?>> port = (DefaultOutputPort<ApexStreamTuple<?>>) additionalOutputPorts[i];
        additionalOutputPortMapping.put(additionalOutputTags.get(i), port);
    }
    // Step context that exposes this operator's current-key state and timer internals.
    NoOpStepContext stepContext = new NoOpStepContext() {

        @Override
        public StateInternals stateInternals() {
            return currentKeyStateInternals;
        }

        @Override
        public TimerInternals timerInternals() {
            return currentKeyTimerInternals;
        }
    };
    // Base runner; this operator is passed as the fourth argument to simpleRunner.
    DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(pipelineOptions.get(), doFn, sideInputReader, this, mainOutputTag, additionalOutputTags, stepContext, windowingStrategy);
    doFnInvoker = DoFnInvokers.invokerFor(doFn);
    doFnInvoker.invokeSetup();
    // With state internals present, wrap the base runner in the stateful runner,
    // giving it a cleanup timer and a state cleaner.
    if (this.currentKeyStateInternals != null) {
        StatefulDoFnRunner.CleanupTimer cleanupTimer = new StatefulDoFnRunner.TimeInternalsCleanupTimer(stepContext.timerInternals(), windowingStrategy);
        @SuppressWarnings({ "rawtypes" }) Coder windowCoder = windowingStrategy.getWindowFn().windowCoder();
        @SuppressWarnings({ "unchecked" }) StatefulDoFnRunner.StateCleaner<?> stateCleaner = new StatefulDoFnRunner.StateInternalsStateCleaner<>(doFn, stepContext.stateInternals(), windowCoder);
        doFnRunner = DoFnRunners.defaultStatefulDoFnRunner(doFn, doFnRunner, windowingStrategy, cleanupTimer, stateCleaner);
    }
    // NOTE(review): sideInputHandler is only assigned in this method when sideInputs is
    // non-empty; whether a null handler is acceptable to create(...) is not visible here.
    pushbackDoFnRunner = SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
    if (doFn instanceof ProcessFn) {
        // NOTE(review): currentKeyStateInternals is dereferenced here without the null check
        // used above; presumably it is always set when the DoFn is a ProcessFn - confirm.
        @SuppressWarnings("unchecked") StateInternalsFactory<String> stateInternalsFactory = (StateInternalsFactory<String>) this.currentKeyStateInternals.getFactory();
        @SuppressWarnings({ "rawtypes", "unchecked" }) ProcessFn<InputT, OutputT, Object, RestrictionTracker<Object>> splittableDoFn = (ProcessFn) doFn;
        splittableDoFn.setStateInternalsFactory(stateInternalsFactory);
        // The factory ignores the key and always returns the operator's current-key timer internals.
        TimerInternalsFactory<String> timerInternalsFactory = new TimerInternalsFactory<String>() {

            @Override
            public TimerInternals timerInternalsForKey(String key) {
                return currentKeyTimerInternals;
            }
        };
        splittableDoFn.setTimerInternalsFactory(timerInternalsFactory);
        // Both callbacks below forward to this operator's output(tag, value) with a freshly
        // built WindowedValue; the tag-less overload uses mainOutputTag.
        // NOTE(review): the scheduled executor created in the call below is not stored in
        // this method, so no shutdown path is visible here - confirm it is released elsewhere.
        // NOTE(review): 10000 and Duration.standardSeconds(10) are presumably the per-call
        // output-count and time bounds of the invoker - confirm against its constructor.
        splittableDoFn.setProcessElementInvoker(new OutputAndTimeBoundedSplittableProcessElementInvoker<>(doFn, pipelineOptions.get(), new OutputWindowedValue<OutputT>() {

            @Override
            public void outputWindowedValue(OutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
                output(mainOutputTag, WindowedValue.of(output, timestamp, windows, pane));
            }

            @Override
            public <AdditionalOutputT> void outputWindowedValue(TupleTag<AdditionalOutputT> tag, AdditionalOutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
                output(tag, WindowedValue.of(output, timestamp, windows, pane));
            }
        }, sideInputReader, Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()), 10000, Duration.standardSeconds(10)));
    }
}
Also used : RestrictionTracker(org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker) ApexStreamTuple(org.apache.beam.runners.apex.translation.utils.ApexStreamTuple) ProcessFn(org.apache.beam.runners.core.SplittableParDoViaKeyedWorkItems.ProcessFn) SideInputHandler(org.apache.beam.runners.core.SideInputHandler) TupleTag(org.apache.beam.sdk.values.TupleTag) SideInputReader(org.apache.beam.runners.core.SideInputReader) NullSideInputReader(org.apache.beam.runners.core.NullSideInputReader) NoOpStepContext(org.apache.beam.runners.apex.translation.utils.NoOpStepContext) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) DefaultOutputPort(com.datatorrent.api.DefaultOutputPort) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) ListCoder(org.apache.beam.sdk.coders.ListCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) OutputWindowedValue(org.apache.beam.runners.core.OutputWindowedValue) TimerInternalsFactory(org.apache.beam.runners.core.TimerInternalsFactory) Instant(org.joda.time.Instant) StateInternalsFactory(org.apache.beam.runners.core.StateInternalsFactory) Collection(java.util.Collection)

Example 65 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class TransformTranslator method parDo.

/**
 * Creates the evaluator for {@code ParDo.MultiOutput} transforms.
 *
 * <p>The evaluator runs the DoFn over the input RDD to produce one pair RDD keyed by output
 * tag, then registers, for every declared output, a dataset holding only the values whose
 * key matches that output's tag. DoFns with state or timer declarations are routed through
 * {@code statefulParDoTransform}; all others use {@code mapPartitionsToPair}.
 *
 * @param <InputT> element type of the transform's input
 * @param <OutputT> element type of the transform's main output
 * @return a new evaluator instance
 */
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        @SuppressWarnings("unchecked")
        public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
            String fullStepName = context.getCurrentTransform().getFullName();
            DoFn<InputT, OutputT> fn = transform.getFn();
            rejectSplittable(fn);

            JavaRDD<WindowedValue<InputT>> inputRdd =
                    ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            WindowingStrategy<?, ?> strategy = context.getInput(transform).getWindowingStrategy();
            Accumulator<NamedAggregators> aggregators = AggregatorsAccumulator.getInstance();
            Accumulator<MetricsContainerStepMap> metrics = MetricsAccumulator.getInstance();

            // A DoFn counts as stateful when it declares at least one state cell or timer.
            DoFnSignature fnSignature = DoFnSignatures.getSignature(transform.getFn().getClass());
            boolean isStateful =
                    !fnSignature.stateDeclarations().isEmpty()
                            || !fnSignature.timerDeclarations().isEmpty();

            MultiDoFnFunction<InputT, OutputT> partitionFn = new MultiDoFnFunction<>(
                    aggregators,
                    metrics,
                    fullStepName,
                    fn,
                    context.getRuntimeContext(),
                    transform.getMainOutputTag(),
                    transform.getAdditionalOutputTags().getAll(),
                    TranslationUtils.getSideInputs(transform.getSideInputs(), context),
                    strategy,
                    isStateful);

            JavaPairRDD<TupleTag<?>, WindowedValue<?>> taggedValues;
            if (!isStateful) {
                taggedValues = inputRdd.mapPartitionsToPair(partitionFn);
            } else {
                // Based on the fact that the signature is stateful, DoFnSignatures ensures
                // that it is also keyed
                taggedValues = statefulParDoTransform(
                        (KvCoder) context.getInput(transform).getCoder(),
                        strategy.getWindowFn().windowCoder(),
                        (JavaRDD) inputRdd,
                        (MultiDoFnFunction) partitionFn);
            }

            Map<TupleTag<?>, PValue> declaredOutputs = context.getOutputs(transform);
            if (declaredOutputs.size() > 1) {
                // cache the RDD if we're going to filter it more than once.
                taggedValues.cache();
            }

            for (Map.Entry<TupleTag<?>, PValue> entry : declaredOutputs.entrySet()) {
                TupleTag<?> tag = entry.getKey();
                PValue target = entry.getValue();
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> matching =
                        taggedValues.filter(new TranslationUtils.TupleTagFilter(tag));
                // Object is the best we can do since different outputs can have different tags
                JavaRDD<WindowedValue<Object>> untypedValues =
                        (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) matching.values();
                context.putDataset(target, new BoundedDataset<>(untypedValues));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) NamedAggregators(org.apache.beam.runners.spark.aggregators.NamedAggregators) KvCoder(org.apache.beam.sdk.coders.KvCoder) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) ParDo(org.apache.beam.sdk.transforms.ParDo) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Map(java.util.Map) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Aggregations

TupleTag (org.apache.beam.sdk.values.TupleTag): 67; Test (org.junit.Test): 44; PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 27; KV (org.apache.beam.sdk.values.KV): 16; PCollection (org.apache.beam.sdk.values.PCollection): 15; Instant (org.joda.time.Instant): 14; WindowedValue (org.apache.beam.sdk.util.WindowedValue): 13; PValue (org.apache.beam.sdk.values.PValue): 13; Category (org.junit.experimental.categories.Category): 13; IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 10; AppliedPTransform (org.apache.beam.sdk.runners.AppliedPTransform): 9; DoFn (org.apache.beam.sdk.transforms.DoFn): 9; StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString): 9; PCollectionView (org.apache.beam.sdk.values.PCollectionView): 9; Matchers.containsString (org.hamcrest.Matchers.containsString): 9; Map (java.util.Map): 8; ImmutableMap (com.google.common.collect.ImmutableMap): 6; Pipeline (org.apache.beam.sdk.Pipeline): 5; ImmutableList (com.google.common.collect.ImmutableList): 4; ArrayList (java.util.ArrayList): 4