Example 36 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class SparkSideInputReader, method get.

@Nullable
@Override
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
    //--- validate sideInput.
    checkNotNull(view, "The PCollectionView passed to sideInput cannot be null");
    KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>> windowedBroadcastHelper = sideInputs.get(view.getTagInternal());
    checkNotNull(windowedBroadcastHelper, "SideInput for view " + view + " is not available.");
    //--- sideInput window
    final BoundedWindow sideInputWindow = view.getWindowMappingFn().getSideInputWindow(window);
    //--- match the appropriate sideInput window.
    // a tag points to all matching sideInputs, that is, across all windows.
    // now that we've obtained the appropriate sideInputWindow, all that's left is to filter by it.
    @SuppressWarnings("unchecked")
    Iterable<WindowedValue<?>> availableSideInputs = (Iterable<WindowedValue<?>>) windowedBroadcastHelper.getValue().getValue();
    Iterable<WindowedValue<?>> sideInputForWindow = Iterables.filter(availableSideInputs, new Predicate<WindowedValue<?>>() {

        @Override
        public boolean apply(@Nullable WindowedValue<?> sideInputCandidate) {
            if (sideInputCandidate == null) {
                return false;
            }
            // first match of a sideInputWindow to the elementWindow is good enough.
            for (BoundedWindow sideInputCandidateWindow : sideInputCandidate.getWindows()) {
                if (sideInputCandidateWindow.equals(sideInputWindow)) {
                    return true;
                }
            }
            // no match found.
            return false;
        }
    });
    return view.getViewFn().apply(sideInputForWindow);
}
Also used: WindowedValue (org.apache.beam.sdk.util.WindowedValue), BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow), WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy), Nullable (javax.annotation.Nullable)
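
The essence of this reader is the window-matching step: the element's window is first mapped onto the side input's windowing via getWindowMappingFn(), and the broadcast values are then filtered down to those that live in the mapped window. Below is a minimal sketch of just that filtering step, using Guava's Iterables.contains in place of the explicit loop; WindowFilter is a hypothetical helper name, not Beam API.

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.util.WindowedValue;

final class WindowFilter {

    // Keeps only the windowed values whose window set contains the mapped side-input window.
    static Iterable<WindowedValue<?>> forWindow(
            Iterable<WindowedValue<?>> candidates, final BoundedWindow sideInputWindow) {
        return Iterables.filter(candidates, new Predicate<WindowedValue<?>>() {

            @Override
            public boolean apply(WindowedValue<?> candidate) {
                // A value may belong to several windows; one match is enough.
                return candidate != null
                    && Iterables.contains(candidate.getWindows(), sideInputWindow);
            }
        });
    }
}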

Example 37 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class StreamingTransformTranslator, method flattenPColl.

private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Map<TupleTag<?>, PValue> pcs = context.getInputs(transform);
            // since this is a streaming pipeline, at least one of the PCollections to "flatten" is
            // unbounded, meaning it represents a DStream, so we could end up with an unbounded,
            // unified DStream.
            final List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
            final List<Integer> streamingSources = new ArrayList<>();
            for (PValue pv : pcs.values()) {
                checkArgument(pv instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", pv, pv.getClass().getSimpleName());
                PCollection<T> pcol = (PCollection<T>) pv;
                Dataset dataset = context.borrowDataset(pcol);
                if (dataset instanceof UnboundedDataset) {
                    UnboundedDataset<T> unboundedDataset = (UnboundedDataset<T>) dataset;
                    streamingSources.addAll(unboundedDataset.getStreamSources());
                    dStreams.add(unboundedDataset.getDStream());
                } else {
                    // create a single RDD stream.
                    Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
                    q.offer(((BoundedDataset) dataset).getRDD());
                    //TODO: this is not recoverable from checkpoint!
                    JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
                    dStreams.add(dStream);
                }
            }
            // start by unifying streams into a single stream.
            JavaDStream<WindowedValue<T>> unifiedStreams = context.getStreamingContext().union(dStreams.remove(0), dStreams);
            context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
        }

        @Override
        public String toNativeString() {
            return "streamingContext.union(...)";
        }
    };
}
Also used: Dataset (org.apache.beam.runners.spark.translation.Dataset), BoundedDataset (org.apache.beam.runners.spark.translation.BoundedDataset), Flatten (org.apache.beam.sdk.transforms.Flatten), ArrayList (java.util.ArrayList), TupleTag (org.apache.beam.sdk.values.TupleTag), PValue (org.apache.beam.sdk.values.PValue), JavaDStream (org.apache.spark.streaming.api.java.JavaDStream), LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue), TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator), JavaRDD (org.apache.spark.api.java.JavaRDD), PCollection (org.apache.beam.sdk.values.PCollection), WindowedValue (org.apache.beam.sdk.util.WindowedValue), EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)
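
The else branch is the notable part of this evaluator: a bounded Dataset is bridged into the streaming world as a single-RDD DStream via queueStream. A minimal sketch of that bridge in isolation follows; BoundedToStream and asStream are hypothetical names, and, as the TODO above warns, a queue-backed stream is not recoverable from a checkpoint.

import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

final class BoundedToStream {

    // Wraps a bounded RDD as a DStream whose backing queue holds exactly one RDD.
    static <T> JavaDStream<WindowedValue<T>> asStream(
            JavaStreamingContext jssc, JavaRDD<WindowedValue<T>> rdd) {
        Queue<JavaRDD<WindowedValue<T>>> queue = new LinkedBlockingQueue<>();
        queue.offer(rdd);
        return jssc.queueStream(queue);
    }
}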

Example 38 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class StreamingTransformTranslator, method parDo.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            rejectSplittable(doFn);
            rejectStateAndTimers(doFn);
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") UnboundedDataset<InputT> unboundedDataset = ((UnboundedDataset<InputT>) context.borrowDataset(transform));
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(new Function<JavaRDD<WindowedValue<InputT>>, JavaPairRDD<TupleTag<?>, WindowedValue<?>>>() {

                @Override
                public JavaPairRDD<TupleTag<?>, WindowedValue<?>> call(JavaRDD<WindowedValue<InputT>> rdd) throws Exception {
                    final Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
                    final Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
                    final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                    return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(aggAccum, metricsAccum, stepName, doFn, runtimeContext, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), sideInputs, windowingStrategy, false));
                }
            });
            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the DStream if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked") JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                @SuppressWarnings("unchecked") JavaDStream<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used: MetricsAccumulator (org.apache.beam.runners.spark.metrics.MetricsAccumulator), AggregatorsAccumulator (org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator), Accumulator (org.apache.spark.Accumulator), TupleTag (org.apache.beam.sdk.values.TupleTag), JavaDStream (org.apache.spark.streaming.api.java.JavaDStream), WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy), WindowedValue (org.apache.beam.sdk.util.WindowedValue), SparkRuntimeContext (org.apache.beam.runners.spark.translation.SparkRuntimeContext), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), SideInputBroadcast (org.apache.beam.runners.spark.util.SideInputBroadcast), MultiDoFnFunction (org.apache.beam.runners.spark.translation.MultiDoFnFunction), PValue (org.apache.beam.sdk.values.PValue), TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator), JavaRDD (org.apache.spark.api.java.JavaRDD), TranslationUtils (org.apache.beam.runners.spark.translation.TranslationUtils), ParDo (org.apache.beam.sdk.transforms.ParDo), EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext), MetricsContainerStepMap (org.apache.beam.runners.core.metrics.MetricsContainerStepMap), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), SparkPCollectionView (org.apache.beam.runners.spark.translation.SparkPCollectionView)
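
Each output is carved out of the unified (tag, value) stream by a per-tag filter, which is also why the stream is cached whenever there is more than one output to filter. A stand-in for TranslationUtils.TupleTagFilter could look like the following; TagFilter is a hypothetical name, and the real implementation lives in TranslationUtils.

import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

final class TagFilter implements Function<Tuple2<TupleTag<?>, WindowedValue<?>>, Boolean> {

    private final TupleTag<?> tag;

    TagFilter(TupleTag<?> tag) {
        this.tag = tag;
    }

    @Override
    public Boolean call(Tuple2<TupleTag<?>, WindowedValue<?>> kv) {
        // Keep only the pairs emitted under this output's tag.
        return tag.equals(kv._1());
    }
}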

Example 39 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class SparkPCollectionView, method createBroadcastHelper.

private SideInputBroadcast createBroadcastHelper(PCollectionView<?> view, JavaSparkContext context) {
    Tuple2<byte[], Coder<Iterable<WindowedValue<?>>>> tuple2 = pviews.get(view);
    SideInputBroadcast helper = SideInputBroadcast.create(tuple2._1, tuple2._2);
    helper.broadcast(context);
    broadcastHelperMap.put(view, helper);
    return helper;
}
Also used: Coder (org.apache.beam.sdk.coders.Coder), WindowedValue (org.apache.beam.sdk.util.WindowedValue), SideInputBroadcast (org.apache.beam.runners.spark.util.SideInputBroadcast)
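
The Tuple2 read from pviews pairs the view's materialized contents, already encoded to bytes on the driver, with the coder needed to decode them again on executors. A hedged sketch of the producer side of that map follows; ViewEncoder and encodeView are assumed names, not the actual SparkPCollectionView wiring.

import org.apache.beam.runners.spark.coders.CoderHelpers;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.util.WindowedValue;
import scala.Tuple2;

final class ViewEncoder {

    // Encode the side input's windowed values once, and keep the coder so that
    // the broadcast bytes can be decoded on each executor.
    static Tuple2<byte[], Coder<Iterable<WindowedValue<?>>>> encodeView(
            Iterable<WindowedValue<?>> values, Coder<Iterable<WindowedValue<?>>> coder) {
        return new Tuple2<>(CoderHelpers.toByteArray(values, coder), coder);
    }
}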

Example 40 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class TransformTranslator, method combineGlobally.

private static <InputT, AccumT, OutputT> TransformEvaluator<Combine.Globally<InputT, OutputT>> combineGlobally() {
    return new TransformEvaluator<Combine.Globally<InputT, OutputT>>() {

        @Override
        public void evaluate(Combine.Globally<InputT, OutputT> transform, EvaluationContext context) {
            final PCollection<InputT> input = context.getInput(transform);
            final Coder<InputT> iCoder = context.getInput(transform).getCoder();
            final Coder<OutputT> oCoder = context.getOutput(transform).getCoder();
            final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
            @SuppressWarnings("unchecked") final CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT> combineFn = (CombineWithContext.CombineFnWithContext<InputT, AccumT, OutputT>) CombineFnUtil.toFnWithContext(transform.getFn());
            final WindowedValue.FullWindowedValueCoder<OutputT> wvoCoder = WindowedValue.FullWindowedValueCoder.of(oCoder, windowingStrategy.getWindowFn().windowCoder());
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            final boolean hasDefault = transform.isInsertDefault();
            final SparkGlobalCombineFn<InputT, AccumT, OutputT> sparkCombineFn = new SparkGlobalCombineFn<>(combineFn, runtimeContext, TranslationUtils.getSideInputs(transform.getSideInputs(), context), windowingStrategy);
            final Coder<AccumT> aCoder;
            try {
                aCoder = combineFn.getAccumulatorCoder(runtimeContext.getCoderRegistry(), iCoder);
            } catch (CannotProvideCoderException e) {
                throw new IllegalStateException("Could not determine coder for accumulator", e);
            }
            @SuppressWarnings("unchecked") JavaRDD<WindowedValue<InputT>> inRdd = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            JavaRDD<WindowedValue<OutputT>> outRdd;
            Optional<Iterable<WindowedValue<AccumT>>> maybeAccumulated = GroupCombineFunctions.combineGlobally(inRdd, sparkCombineFn, iCoder, aCoder, windowingStrategy);
            if (maybeAccumulated.isPresent()) {
                Iterable<WindowedValue<OutputT>> output = sparkCombineFn.extractOutput(maybeAccumulated.get());
                outRdd = context.getSparkContext().parallelize(CoderHelpers.toByteArrays(output, wvoCoder)).map(CoderHelpers.fromByteFunction(wvoCoder));
            } else {
                // handle an empty input RDD: Spark will not run on an empty RDD,
                // so the combine above is naturally skipped.
                JavaSparkContext jsc = new JavaSparkContext(inRdd.context());
                if (hasDefault) {
                    OutputT defaultValue = combineFn.defaultValue();
                    outRdd = jsc.parallelize(Lists.newArrayList(CoderHelpers.toByteArray(defaultValue, oCoder))).map(CoderHelpers.fromByteFunction(oCoder)).map(WindowingHelpers.<OutputT>windowFunction());
                } else {
                    outRdd = jsc.emptyRDD();
                }
            }
            context.putDataset(transform, new BoundedDataset<>(outRdd));
        }

        @Override
        public String toNativeString() {
            return "aggregate(..., new <fn>(), ...)";
        }
    };
}
Also used: FluentIterable (com.google.common.collect.FluentIterable), Combine (org.apache.beam.sdk.transforms.Combine), WindowedValue (org.apache.beam.sdk.util.WindowedValue), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), CombineWithContext (org.apache.beam.sdk.transforms.CombineWithContext), CannotProvideCoderException (org.apache.beam.sdk.coders.CannotProvideCoderException)
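
The hasDefault branch is what makes a pipeline like the one below behave sensibly: an empty input combined with Sum.integersGlobally() still yields the CombineFn's default value (0 for Sum) rather than an empty result. This is a minimal sketch, assuming the Spark runner is on the classpath and selected with --runner=SparkRunner.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.PCollection;

public class CombineGloballyDemo {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        // Non-empty input: the combine runs through the accumulate/extract path.
        PCollection<Integer> sum =
            p.apply("SomeInts", Create.of(1, 2, 3)).apply("Sum", Sum.integersGlobally());
        // Empty input: insertDefault kicks in and the default value 0 is parallelized.
        PCollection<Integer> defaultSum =
            p.apply("NoInts", Create.empty(VarIntCoder.of())).apply("SumEmpty", Sum.integersGlobally());
        p.run().waitUntilFinish();
    }
}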

Aggregations

WindowedValue (org.apache.beam.sdk.util.WindowedValue): 89 usages
Test (org.junit.Test): 53
Instant (org.joda.time.Instant): 47
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 36
KV (org.apache.beam.sdk.values.KV): 19
ArrayList (java.util.ArrayList): 17
WindowMatchers.isSingleWindowedValue (org.apache.beam.runners.core.WindowMatchers.isSingleWindowedValue): 17
WindowMatchers.isWindowedValue (org.apache.beam.runners.core.WindowMatchers.isWindowedValue): 17
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 17
Matchers.emptyIterable (org.hamcrest.Matchers.emptyIterable): 16
TupleTag (org.apache.beam.sdk.values.TupleTag): 13
JavaRDD (org.apache.spark.api.java.JavaRDD): 8
ByteString (com.google.protobuf.ByteString): 7
BeamFnApi (org.apache.beam.fn.v1.BeamFnApi): 7
ThrowingConsumer (org.apache.beam.fn.harness.fn.ThrowingConsumer): 6
IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord): 6
TimestampCombiner (org.apache.beam.sdk.transforms.windowing.TimestampCombiner): 6
CloseableThrowingConsumer (org.apache.beam.fn.harness.fn.CloseableThrowingConsumer): 5
MetricsContainerImpl (org.apache.beam.runners.core.metrics.MetricsContainerImpl): 5
EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext): 5