Search in sources :

Example 1 with SparkPCollectionView

use of org.apache.beam.runners.spark.translation.SparkPCollectionView in project beam by apache.

the class StreamingTransformTranslator method parDo.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            checkArgument(!DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(), "Splittable DoFn not yet supported in streaming mode: %s", doFn);
            rejectStateAndTimers(doFn);
            final SerializablePipelineOptions options = context.getSerializableOptions();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
            @SuppressWarnings("unchecked") UnboundedDataset<InputT> unboundedDataset = (UnboundedDataset<InputT>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            final Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs().values(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(metricsAccum, stepName, doFn, options, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), inputCoder, outputCoders, sideInputs, windowingStrategy, false, doFnSchemaInformation, sideInputMapping));
            });
            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // Caching can cause Serialization, we need to code to bytes
                // more details in https://issues.apache.org/jira/browse/BEAM-2669
                Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap = TranslationUtils.getTupleTagCoders(outputs);
                all = all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap)).cache().mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
            }
            for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked") JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                @SuppressWarnings("unchecked") JavaDStream<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) KV(org.apache.beam.sdk.values.KV) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) PCollection(org.apache.beam.sdk.values.PCollection) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Example 2 with SparkPCollectionView

use of org.apache.beam.runners.spark.translation.SparkPCollectionView in project beam by apache.

the class StreamingTransformTranslator method combineGrouped.

private static <K, InputT, OutputT> TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>> combineGrouped() {
    return new TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>>() {

        @Override
        public void evaluate(final Combine.GroupedValues<K, InputT, OutputT> transform, EvaluationContext context) {
            // get the applied combine function.
            PCollection<? extends KV<K, ? extends Iterable<InputT>>> input = context.getInput(transform);
            final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
            @SuppressWarnings("unchecked") final CombineWithContext.CombineFnWithContext<InputT, ?, OutputT> fn = (CombineWithContext.CombineFnWithContext<InputT, ?, OutputT>) CombineFnUtil.toFnWithContext(transform.getFn());
            @SuppressWarnings("unchecked") UnboundedDataset<KV<K, Iterable<InputT>>> unboundedDataset = (UnboundedDataset<KV<K, Iterable<InputT>>>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> dStream = unboundedDataset.getDStream();
            final SerializablePipelineOptions options = context.getSerializableOptions();
            final SparkPCollectionView pviews = context.getPViews();
            JavaDStream<WindowedValue<KV<K, OutputT>>> outStream = dStream.transform(rdd -> {
                SparkCombineFn<KV<K, InputT>, InputT, ?, OutputT> combineFnWithContext = SparkCombineFn.keyed(fn, options, TranslationUtils.getSideInputs(transform.getSideInputs(), new JavaSparkContext(rdd.context()), pviews), windowingStrategy);
                return rdd.map(new TranslationUtils.CombineGroupedValues<>(combineFnWithContext));
            });
            context.putDataset(transform, new UnboundedDataset<>(outStream, unboundedDataset.getStreamSources()));
        }

        @Override
        public String toNativeString() {
            return "map(new <fn>())";
        }
    };
}
Also used : Combine(org.apache.beam.sdk.transforms.Combine) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) CombineWithContext(org.apache.beam.sdk.transforms.CombineWithContext) KV(org.apache.beam.sdk.values.KV) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Aggregations

SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions)2 EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)2 SparkPCollectionView (org.apache.beam.runners.spark.translation.SparkPCollectionView)2 TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator)2 TranslationUtils (org.apache.beam.runners.spark.translation.TranslationUtils)2 WindowedValue (org.apache.beam.sdk.util.WindowedValue)2 KV (org.apache.beam.sdk.values.KV)2 HashMap (java.util.HashMap)1 Map (java.util.Map)1 SplittableParDo (org.apache.beam.runners.core.construction.SplittableParDo)1 MetricsContainerStepMapAccumulator (org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator)1 Coder (org.apache.beam.sdk.coders.Coder)1 KvCoder (org.apache.beam.sdk.coders.KvCoder)1 Combine (org.apache.beam.sdk.transforms.Combine)1 CombineWithContext (org.apache.beam.sdk.transforms.CombineWithContext)1 DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation)1 ParDo (org.apache.beam.sdk.transforms.ParDo)1 PCollection (org.apache.beam.sdk.values.PCollection)1 PCollectionView (org.apache.beam.sdk.values.PCollectionView)1 TupleTag (org.apache.beam.sdk.values.TupleTag)1