
Example 1 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.

From the class StreamingTransformTranslator, method groupByKey(). The evaluator borrows the input UnboundedDataset from the context, groups the windowed DStream first by key only and then also by window, and registers the grouped stream back with putDataset().

private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
    return new TransformEvaluator<GroupByKey<K, V>>() {

        @Override
        public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked") UnboundedDataset<KV<K, V>> inputDataset = (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
            List<Integer> streamSources = inputDataset.getStreamSources();
            JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
            @SuppressWarnings("unchecked") final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            @SuppressWarnings("unchecked") final WindowingStrategy<?, W> windowingStrategy = (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();
            //--- coders.
            final WindowedValue.WindowedValueCoder<V> wvCoder = WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
            //--- group by key only.
            JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupedByKeyStream = dStream.transform(new Function<JavaRDD<WindowedValue<KV<K, V>>>, JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>() {

                @Override
                public JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> call(JavaRDD<WindowedValue<KV<K, V>>> rdd) throws Exception {
                    return GroupCombineFunctions.groupByKeyOnly(rdd, coder.getKeyCoder(), wvCoder);
                }
            });
            //--- now group also by window.
            JavaDStream<WindowedValue<KV<K, Iterable<V>>>> outStream = SparkGroupAlsoByWindowViaWindowSet.groupAlsoByWindow(groupedByKeyStream, coder.getKeyCoder(), wvCoder, windowingStrategy, runtimeContext, streamSources);
            context.putDataset(transform, new UnboundedDataset<>(outStream, streamSources));
        }

        @Override
        public String toNativeString() {
            return "groupByKey()";
        }
    };
}
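All of the streaming evaluators on this page follow the same shape: borrow the input dataset from the EvaluationContext, derive a new DStream from it, and hand the result back with putDataset(). Below is a minimal sketch of that pattern using only the context methods visible in these examples; the identity "evaluator" itself is hypothetical and not part of Beam.

private static <T> TransformEvaluator<PTransform<PCollection<T>, PCollection<T>>> identity() {
    return new TransformEvaluator<PTransform<PCollection<T>, PCollection<T>>>() {

        @Override
        public void evaluate(PTransform<PCollection<T>, PCollection<T>> transform, EvaluationContext context) {
            // Borrow the dataset that the upstream evaluator registered for this transform's input.
            @SuppressWarnings("unchecked")
            UnboundedDataset<T> input = (UnboundedDataset<T>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<T>> dStream = input.getDStream();
            // A real evaluator would transform dStream here; the identity passes it through unchanged.
            context.putDataset(transform, new UnboundedDataset<>(dStream, input.getStreamSources()));
        }

        @Override
        public String toNativeString() {
            return "identity()";
        }
    };
}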

Example 2 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.

From the class StreamingTransformTranslator, method flattenPColl(). The evaluator collects the DStreams of all input PCollections, wrapping any bounded input as a single-batch queue stream, unifies them, and registers the union as a new UnboundedDataset.

private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Map<TupleTag<?>, PCollection<?>> pcs = context.getInputs(transform);
            // Since this is a streaming pipeline, at least one of the PCollections to "flatten" is
            // unbounded, meaning it represents a DStream.
            // So we could end up with an unbounded unified DStream.
            final List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
            final List<Integer> streamingSources = new ArrayList<>();
            for (PValue pv : pcs.values()) {
                checkArgument(pv instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", pv, pv.getClass().getSimpleName());
                PCollection<T> pcol = (PCollection<T>) pv;
                Dataset dataset = context.borrowDataset(pcol);
                if (dataset instanceof UnboundedDataset) {
                    UnboundedDataset<T> unboundedDataset = (UnboundedDataset<T>) dataset;
                    streamingSources.addAll(unboundedDataset.getStreamSources());
                    dStreams.add(unboundedDataset.getDStream());
                } else {
                    // create a single RDD stream.
                    Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
                    q.offer(((BoundedDataset) dataset).getRDD());
                    // TODO (BEAM-10789): this is not recoverable from checkpoint!
                    JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
                    dStreams.add(dStream);
                }
            }
            // start by unifying streams into a single stream.
            JavaDStream<WindowedValue<T>> unifiedStreams = SparkCompat.joinStreams(context.getStreamingContext(), dStreams);
            context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
        }

        @Override
        public String toNativeString() {
            return "streamingContext.union(...)";
        }
    };
}
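For context, a pipeline fragment whose Flatten step would be handed to this evaluator on the streaming Spark runner might look as follows; the method and variable names are illustrative, not taken from Beam.

// Illustrative pipeline fragment: flattening two streaming PCollections.
// Uses org.apache.beam.sdk.values.PCollectionList and org.apache.beam.sdk.transforms.Flatten.
static PCollection<String> flattenTwo(PCollection<String> first, PCollection<String> second) {
    return PCollectionList.of(first).and(second).apply(Flatten.pCollections());
}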

Example 3 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.

From the class StreamingTransformTranslator, method parDo(). The evaluator applies a multi-output DoFn to each partition of the input DStream, caches the tagged output when a transform has more than one output, and registers one UnboundedDataset per output tag.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            checkArgument(!DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(), "Splittable DoFn not yet supported in streaming mode: %s", doFn);
            rejectStateAndTimers(doFn);
            final SerializablePipelineOptions options = context.getSerializableOptions();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
            @SuppressWarnings("unchecked") UnboundedDataset<InputT> unboundedDataset = (UnboundedDataset<InputT>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            final Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs().values(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(metricsAccum, stepName, doFn, options, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), inputCoder, outputCoders, sideInputs, windowingStrategy, false, doFnSchemaInformation, sideInputMapping));
            });
            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // Caching can trigger serialization, so we need to encode the values to bytes first;
                // more details in https://issues.apache.org/jira/browse/BEAM-2669
                Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap = TranslationUtils.getTupleTagCoders(outputs);
                all = all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap)).cache().mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
            }
            for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked") JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                @SuppressWarnings("unchecked") JavaDStream<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
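The per-tag handling at the end of the evaluator (one UnboundedDataset per TupleTag) corresponds to pipelines that apply ParDo with additional output tags. A hedged example of such a pipeline step follows; all names (input, mainTag, errorTag) are purely illustrative.

// Illustrative multi-output ParDo; translating this kind of step is what the evaluator above does.
final TupleTag<String> mainTag = new TupleTag<String>() {};
final TupleTag<String> errorTag = new TupleTag<String>() {};
PCollectionTuple results = input.apply(
    ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void process(@Element String value, MultiOutputReceiver out) {
            if (value.isEmpty()) {
                out.get(errorTag).output(value);  // route empty values to the additional output
            } else {
                out.get(mainTag).output(value);   // everything else goes to the main output
            }
        }
    }).withOutputTags(mainTag, TupleTagList.of(errorTag)));
PCollection<String> mainOutput = results.get(mainTag);
PCollection<String> errors = results.get(errorTag);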

Example 4 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.

From the class SparkRunnerStreamingContextFactory, method call(). The factory validates the streaming options, creates a JavaStreamingContext, builds an EvaluationContext around it, translates the pipeline by traversing it with the streaming translator, and checkpoints the resulting context.

@Override
public JavaStreamingContext call() throws Exception {
    LOG.info("Creating a new Spark Streaming Context");
    // validate unbounded read properties.
    checkArgument(options.getMinReadTimeMillis() < options.getBatchIntervalMillis(), "Minimum read time has to be less than batch time.");
    checkArgument(options.getReadTimePercentage() > 0 && options.getReadTimePercentage() < 1, "Read time percentage is bound to (0, 1).");
    SparkPipelineTranslator translator = new StreamingTransformTranslator.Translator(new TransformTranslator.Translator());
    Duration batchDuration = new Duration(options.getBatchIntervalMillis());
    LOG.info("Setting Spark streaming batchDuration to {} msec", batchDuration.milliseconds());
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, batchDuration);
    // We must first init accumulators since translators expect them to be instantiated.
    SparkRunner.initAccumulators(options, jsc);
    // do not need to create a MetricsPusher instance here because it is called in SparkRunner.run()
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    // update cache candidates
    SparkRunner.updateCacheCandidates(pipeline, translator, ctxt);
    pipeline.traverseTopologically(new SparkRunner.Evaluator(translator, ctxt));
    ctxt.computeOutputs();
    checkpoint(jssc, checkpointDir);
    return jssc;
}
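The factory's call() method matches Spark's Function0<JavaStreamingContext> contract, so it is presumably handed to JavaStreamingContext.getOrCreate, which recovers a checkpointed context when one exists and otherwise invokes the factory. A hedged sketch of that wiring; the checkpoint path and the way contextFactory is obtained are assumptions, not shown on this page.

// Assumed wiring: reuse a checkpointed streaming context if present, otherwise
// build a fresh one through the factory's call() method above.
String checkpointPath = "/tmp/beam-spark-checkpoint";                  // illustrative path
JavaStreamingContext jssc =
    JavaStreamingContext.getOrCreate(checkpointPath, contextFactory);  // contextFactory: an instance of the factory above
jssc.start();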

Example 5 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.

From the class StreamingTransformTranslator, method createFromQueue(). The evaluator converts the pre-baked batches of a CreateStream into a queue of RDDs, builds an input DStream from that queue, registers the dataset with the context, and installs the transform's pre-baked watermarks.

private static <T> TransformEvaluator<CreateStream<T>> createFromQueue() {
    return new TransformEvaluator<CreateStream<T>>() {

        @Override
        public void evaluate(CreateStream<T> transform, EvaluationContext context) {
            Coder<T> coder = context.getOutput(transform).getCoder();
            JavaStreamingContext jssc = context.getStreamingContext();
            Queue<Iterable<TimestampedValue<T>>> values = transform.getBatches();
            WindowedValue.FullWindowedValueCoder<T> windowCoder = WindowedValue.FullWindowedValueCoder.of(coder, GlobalWindow.Coder.INSTANCE);
            // create the DStream from queue.
            Queue<JavaRDD<WindowedValue<T>>> rddQueue = new LinkedBlockingQueue<>();
            for (Iterable<TimestampedValue<T>> tv : values) {
                Iterable<WindowedValue<T>> windowedValues = Iterables.transform(tv, new com.google.common.base.Function<TimestampedValue<T>, WindowedValue<T>>() {

                    @Override
                    public WindowedValue<T> apply(@Nonnull TimestampedValue<T> timestampedValue) {
                        return WindowedValue.of(timestampedValue.getValue(), timestampedValue.getTimestamp(), GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
                    }
                });
                JavaRDD<WindowedValue<T>> rdd = jssc.sparkContext().parallelize(CoderHelpers.toByteArrays(windowedValues, windowCoder)).map(CoderHelpers.fromByteFunction(windowCoder));
                rddQueue.offer(rdd);
            }
            JavaInputDStream<WindowedValue<T>> inputDStream = jssc.queueStream(rddQueue, true);
            UnboundedDataset<T> unboundedDataset = new UnboundedDataset<T>(inputDStream, Collections.singletonList(inputDStream.inputDStream().id()));
            // add pre-baked Watermarks for the pre-baked batches.
            Queue<GlobalWatermarkHolder.SparkWatermarks> times = transform.getTimes();
            GlobalWatermarkHolder.addAll(ImmutableMap.of(unboundedDataset.getStreamSources().get(0), times));
            context.putDataset(transform, unboundedDataset);
        }

        @Override
        public String toNativeString() {
            return "streamingContext.queueStream(...)";
        }
    };
}
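The Spark primitive doing the work here is queueStream: with oneAtATime set to true, each queued RDD is emitted as exactly one micro-batch, which is how the pre-baked CreateStream batches become a DStream. A stripped-down, Beam-free illustration follows; all names and values are made up for the example.

// Plain Spark sketch of the queueStream mechanism used above.
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("queue-stream-demo");
JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
Queue<JavaRDD<Integer>> rddQueue = new LinkedBlockingQueue<>();
rddQueue.offer(jssc.sparkContext().parallelize(Arrays.asList(1, 2, 3)));
rddQueue.offer(jssc.sparkContext().parallelize(Arrays.asList(4, 5, 6)));
// true => exactly one queued RDD per batch interval, preserving the original batching.
JavaInputDStream<Integer> batches = jssc.queueStream(rddQueue, true);
batches.print();
jssc.start();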
