Search in sources :

Example 1 with SparkRuntimeContext

Use of org.apache.beam.runners.spark.translation.SparkRuntimeContext in project beam by apache.

In the class StreamingTransformTranslator, method groupByKey.

/**
 * Creates the evaluator for {@link GroupByKey} on an unbounded dataset.
 *
 * <p>The evaluator first groups each RDD of the input stream by key only, then groups the
 * result by window, and finally registers the resulting stream as the output dataset of the
 * transform, carrying over the stream sources of the input dataset.
 */
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
    return new TransformEvaluator<GroupByKey<K, V>>() {

        @Override
        public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
            // Input dataset, its source ids and the underlying DStream.
            @SuppressWarnings("unchecked")
            UnboundedDataset<KV<K, V>> source =
                    (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
            List<Integer> sourceIds = source.getStreamSources();
            JavaDStream<WindowedValue<KV<K, V>>> inputStream = source.getDStream();

            // Coder of the input KVs; captured by the per-RDD function below.
            @SuppressWarnings("unchecked")
            final KvCoder<K, V> kvCoder =
                    (KvCoder<K, V>) context.getInput(transform).getCoder();
            final SparkRuntimeContext sparkRuntimeContext = context.getRuntimeContext();

            // Windowing of the input collection.
            @SuppressWarnings("unchecked")
            final WindowingStrategy<?, W> strategy =
                    (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked")
            final WindowFn<Object, W> fn = (WindowFn<Object, W>) strategy.getWindowFn();

            // Coder for the windowed values, built from the value coder and the window coder.
            final WindowedValue.WindowedValueCoder<V> windowedValueCoder =
                    WindowedValue.FullWindowedValueCoder.of(
                            kvCoder.getValueCoder(), fn.windowCoder());

            // Step 1: group by key only, RDD by RDD.
            JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> keyGrouped =
                    inputStream.transform(
                            new Function<
                                    JavaRDD<WindowedValue<KV<K, V>>>,
                                    JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>() {

                                @Override
                                public JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> call(
                                        JavaRDD<WindowedValue<KV<K, V>>> batch) throws Exception {
                                    return GroupCombineFunctions.groupByKeyOnly(
                                            batch, kvCoder.getKeyCoder(), windowedValueCoder);
                                }
                            });

            // Step 2: group the per-key values by window as well.
            JavaDStream<WindowedValue<KV<K, Iterable<V>>>> windowGrouped =
                    SparkGroupAlsoByWindowViaWindowSet.groupAlsoByWindow(
                            keyGrouped,
                            kvCoder.getKeyCoder(),
                            windowedValueCoder,
                            strategy,
                            sparkRuntimeContext,
                            sourceIds);

            context.putDataset(transform, new UnboundedDataset<>(windowGrouped, sourceIds));
        }

        @Override
        public String toNativeString() {
            return "groupByKey()";
        }
    };
}
Also used : GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) SparkAssignWindowFn(org.apache.beam.runners.spark.translation.SparkAssignWindowFn) KvCoder(org.apache.beam.sdk.coders.KvCoder) KV(org.apache.beam.sdk.values.KV) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext)

Example 2 with SparkRuntimeContext

Use of org.apache.beam.runners.spark.translation.SparkRuntimeContext in project beam by apache.

In the class StreamingTransformTranslator, method parDo.

/**
 * Creates the evaluator for {@link ParDo.MultiOutput} on an unbounded dataset.
 *
 * <p>The evaluator applies the transform's {@link DoFn} to every RDD of the input stream via
 * {@code mapPartitionsToPair}, producing a single stream of (tag, value) pairs. It then
 * filters that stream once per output tag and registers each filtered stream as the dataset
 * of the corresponding output, carrying over the stream sources of the input dataset.
 */
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            // NOTE(review): presumably these reject DoFns using features this translator
            // does not support (splittable DoFns, state and timers) - confirm in the helpers.
            rejectSplittable(doFn);
            rejectStateAndTimers(doFn);
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked")
            UnboundedDataset<InputT> unboundedDataset = ((UnboundedDataset<InputT>) context.borrowDataset(transform));
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(new Function<JavaRDD<WindowedValue<InputT>>, JavaPairRDD<TupleTag<?>, WindowedValue<?>>>() {

                @Override
                public JavaPairRDD<TupleTag<?>, WindowedValue<?>> call(JavaRDD<WindowedValue<InputT>> rdd) throws Exception {
                    // Accumulators and side inputs are looked up inside call(), i.e. once per RDD.
                    final Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
                    final Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
                    final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                    // NOTE(review): the meaning of the trailing 'false' flag is defined by
                    // MultiDoFnFunction and is not visible here - confirm before changing.
                    return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(aggAccum, metricsAccum, stepName, doFn, runtimeContext, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), sideInputs, windowingStrategy, false));
                }
            });
            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the DStream if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked")
                JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags
                @SuppressWarnings("unchecked")
                JavaDStream<WindowedValue<Object>> values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsAccumulator(org.apache.beam.runners.spark.metrics.MetricsAccumulator) AggregatorsAccumulator(org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator) Accumulator(org.apache.spark.Accumulator) TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SideInputBroadcast(org.apache.beam.runners.spark.util.SideInputBroadcast) MultiDoFnFunction(org.apache.beam.runners.spark.translation.MultiDoFnFunction) PValue(org.apache.beam.sdk.values.PValue) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) ParDo(org.apache.beam.sdk.transforms.ParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Example 3 with SparkRuntimeContext

Use of org.apache.beam.runners.spark.translation.SparkRuntimeContext in project beam by apache.

In the class StreamingTransformTranslator, method combineGrouped.

/**
 * Creates the evaluator for {@link Combine.GroupedValues} on an unbounded dataset.
 *
 * <p>The evaluator converts the transform's combine function to its with-context form, wraps
 * it per RDD in a {@link SparkKeyedCombineFn} together with the side inputs and windowing
 * strategy, maps every element of the RDD through it, and registers the resulting stream as
 * the output dataset, carrying over the stream sources of the input dataset.
 */
private static <K, InputT, OutputT> TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>> combineGrouped() {
    return new TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>>() {

        @Override
        public void evaluate(final Combine.GroupedValues<K, InputT, OutputT> transform, EvaluationContext context) {
            // Windowing strategy of the already-grouped input collection.
            PCollection<? extends KV<K, ? extends Iterable<InputT>>> grouped = context.getInput(transform);
            final WindowingStrategy<?, ?> strategy = grouped.getWindowingStrategy();

            // The applied combine function, in its with-context form.
            @SuppressWarnings("unchecked")
            final CombineWithContext.CombineFnWithContext<InputT, ?, OutputT> combineFn =
                    (CombineWithContext.CombineFnWithContext<InputT, ?, OutputT>)
                            CombineFnUtil.toFnWithContext(transform.getFn());

            @SuppressWarnings("unchecked")
            UnboundedDataset<KV<K, Iterable<InputT>>> source =
                    ((UnboundedDataset<KV<K, Iterable<InputT>>>) context.borrowDataset(transform));
            JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> inputStream = source.getDStream();

            final SparkRuntimeContext sparkRuntimeContext = context.getRuntimeContext();
            final SparkPCollectionView views = context.getPViews();

            JavaDStream<WindowedValue<KV<K, OutputT>>> combined =
                    inputStream.transform(
                            new Function<
                                    JavaRDD<WindowedValue<KV<K, Iterable<InputT>>>>,
                                    JavaRDD<WindowedValue<KV<K, OutputT>>>>() {

                                @Override
                                public JavaRDD<WindowedValue<KV<K, OutputT>>> call(
                                        JavaRDD<WindowedValue<KV<K, Iterable<InputT>>>> batch) throws Exception {
                                    // Java-API wrapper around the Spark context of this RDD.
                                    JavaSparkContext jsc = new JavaSparkContext(batch.context());
                                    SparkKeyedCombineFn<K, InputT, ?, OutputT> keyedFn =
                                            new SparkKeyedCombineFn<>(
                                                    combineFn,
                                                    sparkRuntimeContext,
                                                    TranslationUtils.getSideInputs(transform.getSideInputs(), jsc, views),
                                                    strategy);
                                    return batch.map(new TranslationUtils.CombineGroupedValues<>(keyedFn));
                                }
                            });

            context.putDataset(transform, new UnboundedDataset<>(combined, source.getStreamSources()));
        }

        @Override
        public String toNativeString() {
            return "map(new <fn>())";
        }
    };
}
Also used : Combine(org.apache.beam.sdk.transforms.Combine) SparkKeyedCombineFn(org.apache.beam.runners.spark.translation.SparkKeyedCombineFn) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) CombineWithContext(org.apache.beam.sdk.transforms.CombineWithContext) KV(org.apache.beam.sdk.values.KV) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Aggregations

EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)3 SparkRuntimeContext (org.apache.beam.runners.spark.translation.SparkRuntimeContext)3 TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator)3 WindowedValue (org.apache.beam.sdk.util.WindowedValue)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 SparkPCollectionView (org.apache.beam.runners.spark.translation.SparkPCollectionView)2 KV (org.apache.beam.sdk.values.KV)2 WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy)2 ImmutableMap (com.google.common.collect.ImmutableMap)1 Map (java.util.Map)1 MetricsContainerStepMap (org.apache.beam.runners.core.metrics.MetricsContainerStepMap)1 AggregatorsAccumulator (org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator)1 MetricsAccumulator (org.apache.beam.runners.spark.metrics.MetricsAccumulator)1 MultiDoFnFunction (org.apache.beam.runners.spark.translation.MultiDoFnFunction)1 SparkAssignWindowFn (org.apache.beam.runners.spark.translation.SparkAssignWindowFn)1 SparkKeyedCombineFn (org.apache.beam.runners.spark.translation.SparkKeyedCombineFn)1 TranslationUtils (org.apache.beam.runners.spark.translation.TranslationUtils)1 SideInputBroadcast (org.apache.beam.runners.spark.util.SideInputBroadcast)1 KvCoder (org.apache.beam.sdk.coders.KvCoder)1 Combine (org.apache.beam.sdk.transforms.Combine)1