Search in sources :

Example 31 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class StreamingTransformTranslator method parDo.

/**
 * Builds the streaming evaluator for a multi-output {@link ParDo}.
 *
 * <p>The evaluator rejects splittable DoFns and hands the DoFn to {@code rejectStateAndTimers},
 * runs the DoFn over every micro-batch RDD of the input DStream via {@link MultiDoFnFunction},
 * and then registers one filtered DStream per output tag with the evaluation context.
 */
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> fn = transform.getFn();
            checkArgument(
                !DoFnSignatures.signatureForDoFn(fn).processElement().isSplittable(),
                "Splittable DoFn not yet supported in streaming mode: %s",
                fn);
            rejectStateAndTimers(fn);

            // Everything captured by the per-batch lambda below is resolved up front.
            final SerializablePipelineOptions serializableOptions = context.getSerializableOptions();
            final SparkPCollectionView viewRegistry = context.getPViews();
            final WindowingStrategy<?, ?> inputWindowing = context.getInput(transform).getWindowingStrategy();
            Coder<InputT> elementCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> codersByTag = context.getOutputCoders();

            @SuppressWarnings("unchecked")
            UnboundedDataset<InputT> inputDataset = (UnboundedDataset<InputT>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<InputT>> inputStream = inputDataset.getDStream();

            final DoFnSchemaInformation schemaInfo = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            final Map<String, PCollectionView<?>> sideInputsById = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            final String fullStepName = context.getCurrentTransform().getFullName();

            // Apply the DoFn to each batch; every emitted element is keyed by its output tag.
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = inputStream.transformToPair(batch -> {
                final MetricsContainerStepMapAccumulator metrics = MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> broadcastSideInputs =
                    TranslationUtils.getSideInputs(
                        transform.getSideInputs().values(),
                        JavaSparkContext.fromSparkContext(batch.context()),
                        viewRegistry);
                return batch.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metrics,
                        fullStepName,
                        fn,
                        serializableOptions,
                        transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(),
                        elementCoder,
                        codersByTag,
                        broadcastSideInputs,
                        inputWindowing,
                        false,
                        schemaInfo,
                        sideInputsById));
            });

            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // Caching can cause Serialization, we need to code to bytes
                // more details in https://issues.apache.org/jira/browse/BEAM-2669
                Map<TupleTag<?>, Coder<WindowedValue<?>>> windowedCoders = TranslationUtils.getTupleTagCoders(outputs);
                all = all
                    .mapToPair(TranslationUtils.getTupleTagEncodeFunction(windowedCoders))
                    .cache()
                    .mapToPair(TranslationUtils.getTupleTagDecodeFunction(windowedCoders));
            }

            // Split the tagged stream into one dataset per declared output.
            for (Map.Entry<TupleTag<?>, PCollection<?>> taggedOutput : outputs.entrySet()) {
                @SuppressWarnings("unchecked")
                JavaPairDStream<TupleTag<?>, WindowedValue<?>> onlyThisTag =
                    all.filter(new TranslationUtils.TupleTagFilter(taggedOutput.getKey()));
                // Object is the best we can do since different outputs can have different tags
                @SuppressWarnings("unchecked")
                JavaDStream<WindowedValue<Object>> untypedValues =
                    (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(onlyThisTag);
                context.putDataset(
                    taggedOutput.getValue(),
                    new UnboundedDataset<>(untypedValues, inputDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) KV(org.apache.beam.sdk.values.KV) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) PCollection(org.apache.beam.sdk.values.PCollection) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Example 32 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class CreateStreamTest method testMultiOutputParDo.

/**
 * Exercises a {@link ParDo} with a main and an additional output in a streaming pipeline.
 *
 * <p>Serves as the regression test for https://issues.apache.org/jira/browse/BEAM-2029, because
 * {@link org.apache.beam.sdk.testing.ValidatesRunner} tests do not currently run for Spark runner
 * in streaming mode.
 */
@Test
public void testMultiOutputParDo() throws IOException {
    Instant epoch = new Instant(0);
    CreateStream<Integer> stream =
        CreateStream.of(VarIntCoder.of(), batchDuration())
            .emptyBatch()
            .advanceWatermarkForNextBatch(epoch.plus(Duration.standardMinutes(5)))
            .nextBatch(
                TimestampedValue.of(1, epoch),
                TimestampedValue.of(2, epoch),
                TimestampedValue.of(3, epoch))
            .advanceNextBatchWatermarkToInfinity();
    PCollection<Integer> numbers = p.apply(stream);

    final TupleTag<Integer> mainTag = new TupleTag<>();
    final TupleTag<Integer> additionalTag = new TupleTag<>();

    // Emits each element unchanged on the main output and element + 1 on the additional output.
    DoFn<Integer, Integer> fanOutFn = new DoFn<Integer, Integer>() {

        @SuppressWarnings("unused")
        @ProcessElement
        public void process(ProcessContext c) {
            Integer value = c.element();
            c.output(value);
            c.output(additionalTag, value + 1);
        }
    };
    PCollectionTuple tagged =
        numbers.apply(ParDo.of(fanOutFn).withOutputTags(mainTag, TupleTagList.of(additionalTag)));

    PCollection<Integer> mainOutput = tagged.get(mainTag).setCoder(VarIntCoder.of());
    PCollection<Integer> additionalOutput = tagged.get(additionalTag).setCoder(VarIntCoder.of());
    PAssert.that(mainOutput).containsInAnyOrder(1, 2, 3);
    PAssert.that(additionalOutput).containsInAnyOrder(2, 3, 4);
    p.run();
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Instant(org.joda.time.Instant) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) StreamingTest(org.apache.beam.runners.spark.StreamingTest) Test(org.junit.Test)

Example 33 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class DoFnFunction method prepareSerialization.

/**
 * Prepares the DoFnFunction class so it can be serialized properly. The DoFn, coders, windowing
 * strategies and tag collections are converted into protobuf messages, byte arrays and tag-id
 * keyed collections, which are later converted back into the proper classes during
 * deserialization.
 *
 * @throws IllegalStateException if an {@link IOException} is raised while building the
 *     serialized forms; failing here avoids leaving this instance with only part of its
 *     serialized fields populated
 */
private void prepareSerialization() {
    SdkComponents components = SdkComponents.create();
    components.registerEnvironment(Environments.createOrGetDefaultEnvironment(pipelineOptions.as(PortablePipelineOptions.class)));
    this.serializedOptions = new SerializablePipelineOptions(pipelineOptions).toString();
    doFnwithEx = ParDoTranslation.translateDoFn(this.doFn, mainOutput, sideInputMapping, doFnSchemaInformation, components);
    doFnwithExBytes = doFnwithEx.getPayload().toByteArray();
    outputCodersBytes = new HashMap<>();
    try {
        coderBytes = SerializableUtils.serializeToByteArray(inputCoder);
        windowStrategyProto = WindowingStrategyTranslation.toMessageProto(windowingStrategy, components);
        windowBytes = windowStrategyProto.toByteArray();
        // Output coders are keyed by tag id rather than by the TupleTag object itself.
        for (Map.Entry<TupleTag<?>, Coder<?>> entry : outputCoders.entrySet()) {
            outputCodersBytes.put(entry.getKey().getId(), SerializableUtils.serializeToByteArray(entry.getValue()));
        }
        sideInputBytes = new HashMap<>();
        for (Map.Entry<TupleTag<?>, WindowingStrategy<?, ?>> entry : sideInputs.entrySet()) {
            // NOTE(review): this reuses the windowStrategyProto field, so after the loop the field
            // holds the last side input's strategy rather than the main one. windowBytes was
            // already captured above, so the main strategy's bytes are unaffected.
            windowStrategyProto = WindowingStrategyTranslation.toMessageProto(entry.getValue(), components);
            sideInputBytes.put(entry.getKey().getId(), windowStrategyProto.toByteArray());
        }
        serializedSideOutputs = new ArrayList<>();
        for (TupleTag<?> sideOutput : sideOutputs) {
            serializedSideOutputs.add(sideOutput.getId());
        }
        serializedOutputMap = new HashMap<>();
        for (Map.Entry<TupleTag<?>, Integer> entry : outputMap.entrySet()) {
            serializedOutputMap.put(entry.getKey().getId(), entry.getValue());
        }
    } catch (IOException e) {
        // Previously this was only logged at INFO with the bare message, which dropped the stack
        // trace and left the remaining serialized fields unset. Surface it with the cause instead.
        throw new IllegalStateException("Failed to prepare DoFnFunction for serialization", e);
    }
}
Also used : Coder(org.apache.beam.sdk.coders.Coder) TupleTag(org.apache.beam.sdk.values.TupleTag) IOException(java.io.IOException) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) HashMap(java.util.HashMap) Map(java.util.Map)

Example 34 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class Partition method expand.

// ///////////////////////////////////////////////////////////////////////////
/**
 * Applies the partitioning DoFn as a multi-output {@link ParDo} and gathers the collection for
 * each of its output tags, in tag-list order, into a {@link PCollectionList}. Every partition is
 * given the input collection's coder.
 */
@Override
public PCollectionList<T> expand(PCollection<T> in) {
    final TupleTagList partitionTags = partitionDoFn.getOutputTags();
    // The main output uses an anonymous TupleTag<Void>; only the tags in partitionTags are
    // read back from the result below.
    PCollectionTuple partitioned =
        in.apply(
            ParDo.of(partitionDoFn)
                .withOutputTags(new TupleTag<Void>() {
                }, partitionTags)
                .withSideInputs(partitionDoFn.getSideInputs()));

    PCollectionList<T> partitions = PCollectionList.empty(in.getPipeline());
    Coder<T> elementCoder = in.getCoder();
    for (TupleTag<?> rawTag : partitionTags.getAll()) {
        // All the tuple tags are actually TupleTag<T>
        // And all the collections are actually PCollection<T>
        @SuppressWarnings("unchecked")
        TupleTag<T> partitionTag = (TupleTag<T>) rawTag;
        PCollection<T> partition = partitioned.get(partitionTag).setCoder(elementCoder);
        partitions = partitions.and(partition);
    }
    return partitions;
}
Also used : TupleTagList(org.apache.beam.sdk.values.TupleTagList) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) TupleTag(org.apache.beam.sdk.values.TupleTag)

Example 35 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class PipelineTest method testTupleProjectionTransform.

/**
 * Checks that a transform applied to a {@link PCollectionTuple} can project out the collection
 * stored under a single tag.
 */
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
    TupleTag<Integer> projectedTag = new TupleTag<>();
    PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4));
    PCollectionTuple singleton = PCollectionTuple.of(projectedTag, numbers);

    TupleProjectionTransform<Integer> projection = new TupleProjectionTransform<>(projectedTag);
    PCollection<Integer> projected = singleton.apply("ProjectTag", projection);

    PAssert.that(projected).containsInAnyOrder(1, 2, 3, 4);
    pipeline.run();
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Aggregations

TupleTag (org.apache.beam.sdk.values.TupleTag)185 Test (org.junit.Test)100 WindowedValue (org.apache.beam.sdk.util.WindowedValue)54 KV (org.apache.beam.sdk.values.KV)54 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)49 PCollection (org.apache.beam.sdk.values.PCollection)42 DoFn (org.apache.beam.sdk.transforms.DoFn)32 Instant (org.joda.time.Instant)32 SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions)30 Map (java.util.Map)29 Pipeline (org.apache.beam.sdk.Pipeline)29 PCollectionView (org.apache.beam.sdk.values.PCollectionView)29 HashMap (java.util.HashMap)27 Coder (org.apache.beam.sdk.coders.Coder)26 StreamRecordStripper.stripStreamRecordFromWindowedValue (org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue)25 Matchers.containsString (org.hamcrest.Matchers.containsString)25 List (java.util.List)24 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)23 KvCoder (org.apache.beam.sdk.coders.KvCoder)22 KeyedOneInputStreamOperatorTestHarness (org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness)22