Search in sources :

Example 1 with Flatten

Use of org.apache.beam.sdk.transforms.Flatten in the project beam by apache.

From the class StreamingTransformTranslator, the method flattenPColl:

/**
 * Returns a {@link TransformEvaluator} for {@link Flatten.PCollections} in a streaming
 * pipeline: unifies the input PCollections (DStreams and/or bounded RDDs) into a single
 * unbounded DStream.
 */
private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Map<TupleTag<?>, PValue> pcs = context.getInputs(transform);
            // Fail fast with a clear message: with no inputs there is nothing to union below,
            // and dStreams.remove(0) would otherwise throw an opaque IndexOutOfBoundsException.
            // (The batch translator handles the empty case explicitly; streaming cannot build
            // an empty DStream as easily, so reject it up front.)
            checkArgument(!pcs.isEmpty(), "Flatten requires at least one input PCollection in a streaming pipeline.");
            // since this is a streaming pipeline, at least one of the PCollections to "flatten" is
            // unbounded, meaning it represents a DStream.
            // So we could end up with an unbounded unified DStream.
            final List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
            final List<Integer> streamingSources = new ArrayList<>();
            for (PValue pv : pcs.values()) {
                checkArgument(pv instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", pv, pv.getClass().getSimpleName());
                PCollection<T> pcol = (PCollection<T>) pv;
                Dataset dataset = context.borrowDataset(pcol);
                if (dataset instanceof UnboundedDataset) {
                    // Already a DStream: reuse it and remember which streaming sources feed it.
                    UnboundedDataset<T> unboundedDataset = (UnboundedDataset<T>) dataset;
                    streamingSources.addAll(unboundedDataset.getStreamSources());
                    dStreams.add(unboundedDataset.getDStream());
                } else {
                    // Bounded input: wrap the RDD as a single-batch queue stream so it can be
                    // unioned with the unbounded inputs.
                    Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
                    q.offer(((BoundedDataset) dataset).getRDD());
                    //TODO: this is not recoverable from checkpoint!
                    JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
                    dStreams.add(dStream);
                }
            }
            // start by unifying streams into a single stream.
            JavaDStream<WindowedValue<T>> unifiedStreams = context.getStreamingContext().union(dStreams.remove(0), dStreams);
            context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
        }

        @Override
        public String toNativeString() {
            return "streamingContext.union(...)";
        }
    };
}
Also used : Dataset(org.apache.beam.runners.spark.translation.Dataset) BoundedDataset(org.apache.beam.runners.spark.translation.BoundedDataset) Flatten(org.apache.beam.sdk.transforms.Flatten) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) PValue(org.apache.beam.sdk.values.PValue) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) WindowedValue(org.apache.beam.sdk.util.WindowedValue) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext)

Example 2 with Flatten

Use of org.apache.beam.sdk.transforms.Flatten in the project beam by apache.

From the class TransformTranslator, the method flattenPColl:

/**
 * Returns a {@link TransformEvaluator} for {@link Flatten.PCollections} in a batch
 * pipeline: unions all input bounded RDDs into one RDD (empty input yields an empty RDD).
 */
private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Collection<PValue> inputs = context.getInputs(transform).values();
            final JavaRDD<WindowedValue<T>> flattened;
            if (inputs.isEmpty()) {
                // Flattening zero inputs produces an empty PCollection.
                flattened = context.getSparkContext().emptyRDD();
            } else {
                // Collect each input's underlying RDD, then union them in one call.
                JavaRDD<WindowedValue<T>>[] inputRdds = new JavaRDD[inputs.size()];
                int i = 0;
                for (PValue input : inputs) {
                    checkArgument(input instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", input, input.getClass().getSimpleName());
                    inputRdds[i++] = ((BoundedDataset<T>) context.borrowDataset(input)).getRDD();
                }
                flattened = context.getSparkContext().union(inputRdds);
            }
            context.putDataset(transform, new BoundedDataset<>(flattened));
        }

        @Override
        public String toNativeString() {
            return "sparkContext.union(...)";
        }
    };
}
Also used : Flatten(org.apache.beam.sdk.transforms.Flatten) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Aggregations

Flatten (org.apache.beam.sdk.transforms.Flatten)2 WindowedValue (org.apache.beam.sdk.util.WindowedValue)2 PCollection (org.apache.beam.sdk.values.PCollection)2 PValue (org.apache.beam.sdk.values.PValue)2 JavaRDD (org.apache.spark.api.java.JavaRDD)2 ArrayList (java.util.ArrayList)1 LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue)1 BoundedDataset (org.apache.beam.runners.spark.translation.BoundedDataset)1 Dataset (org.apache.beam.runners.spark.translation.Dataset)1 EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)1 TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator)1 TupleTag (org.apache.beam.sdk.values.TupleTag)1 JavaDStream (org.apache.spark.streaming.api.java.JavaDStream)1