
Example 61 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class GroupCombineFunctions, method groupByKeyOnly.

/**
   * An implementation of
   * {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
   * for the Spark runner.
   */
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(
        JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {
    // we use coders to convert objects in the PCollection to byte arrays, so they
    // can be transferred over the network for the shuffle.
    JavaPairRDD<ByteArray, byte[]> pairRDD =
        rdd.map(new ReifyTimestampsAndWindowsFunction<K, V>())
            .map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction())
            .mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction())
            .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
    // use a default parallelism HashPartitioner.
    Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());
    // using mapPartitions allows to preserve the partitioner
    // and avoid unnecessary shuffle downstream.
    return pairRDD
        .groupByKey(partitioner)
        .mapPartitionsToPair(
            TranslationUtils.pairFunctionToPairFlatMapFunction(
                CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)),
            true)
        .mapPartitions(TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true)
        .mapPartitions(
            TranslationUtils.functionToFlatMapFunction(
                WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()),
            true);
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) ByteArray(org.apache.beam.runners.spark.util.ByteArray) HashPartitioner(org.apache.spark.HashPartitioner) Partitioner(org.apache.spark.Partitioner)
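
The byte-array conversion above exists so that elements can cross the network during the shuffle. A minimal, self-contained sketch of the same idea, independent of the Spark runner's CoderHelpers (the class name WindowedValueRoundTrip and the String payload are assumptions for illustration only): a WindowedValue is encoded to bytes with a FullWindowedValueCoder and decoded back via CoderUtils.

import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;

public class WindowedValueRoundTrip {
    public static void main(String[] args) throws Exception {
        // coder for a WindowedValue<String> in the global window
        WindowedValue.FullWindowedValueCoder<String> coder =
            WindowedValue.FullWindowedValueCoder.of(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE);
        WindowedValue<String> original = WindowedValue.valueInGlobalWindow("hello");
        // encode to the byte[] form that would travel over the shuffle, then decode it back
        byte[] bytes = CoderUtils.encodeToByteArray(coder, original);
        WindowedValue<String> restored = CoderUtils.decodeFromByteArray(coder, bytes);
        System.out.println(restored.getValue() + " @ " + restored.getTimestamp());
    }
}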

Example 62 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class SparkUnboundedSource, method read.

public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
        JavaStreamingContext jssc,
        SparkRuntimeContext rc,
        UnboundedSource<T, CheckpointMarkT> source,
        String stepName) {
    SparkPipelineOptions options = rc.getPipelineOptions().as(SparkPipelineOptions.class);
    Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
    SourceDStream<T, CheckpointMarkT> sourceDStream = new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);
    JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
        JavaPairInputDStream$.MODULE$.fromInputDStream(
            sourceDStream,
            JavaSparkContext$.MODULE$.<Source<T>>fakeClassTag(),
            JavaSparkContext$.MODULE$.<CheckpointMarkT>fakeClassTag());
    // call mapWithState to read from a checkpointable source.
    JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream =
        inputDStream.mapWithState(
            StateSpec.function(StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                .numPartitions(sourceDStream.getNumPartitions()));
    // set checkpoint duration for read stream, if set.
    checkpointStream(mapWithStateDStream, options);
    // report the number of input elements for this InputDStream to the InputInfoTracker.
    int id = inputDStream.inputDStream().id();
    JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());
    // register ReadReportDStream to report information related to this read.
    new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName).register();
    // output the actual (deserialized) stream.
    WindowedValue.FullWindowedValueCoder<T> coder =
        WindowedValue.FullWindowedValueCoder.of(source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
    JavaDStream<WindowedValue<T>> readUnboundedStream =
        mapWithStateDStream
            .flatMap(new Tuple2byteFlatMapFunction())
            .map(CoderHelpers.fromByteFunction(coder));
    return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
Also used : UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) Source(org.apache.beam.sdk.io.Source) SparkPipelineOptions(org.apache.beam.runners.spark.SparkPipelineOptions) UnboundedDataset(org.apache.beam.runners.spark.translation.streaming.UnboundedDataset) Tuple2(scala.Tuple2) WindowedValue(org.apache.beam.sdk.util.WindowedValue)
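
The decoded stream at the end of read carries each element as a WindowedValue<T> in the global window. Purely as an illustration of that shape (the WrapReadElement class and its wrap helper are hypothetical, not part of the runner), a raw element and its read time map onto a WindowedValue like this:

import org.apache.beam.sdk.util.WindowedValue;
import org.joda.time.Instant;

public class WrapReadElement {
    // hypothetical helper: represent a raw element and its read time the way the
    // decoded unbounded stream does, i.e. timestamped in the global window
    public static <T> WindowedValue<T> wrap(T element, Instant readTime) {
        return WindowedValue.timestampedValueInGlobalWindow(element, readTime);
    }

    public static void main(String[] args) {
        WindowedValue<String> wv = wrap("record-1", Instant.now());
        System.out.println(wv.getValue() + " @ " + wv.getTimestamp() + " in " + wv.getWindows());
    }
}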

Example 63 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class TransformTranslator, method readBounded.

private static <T> TransformEvaluator<Read.Bounded<T>> readBounded() {
    return new TransformEvaluator<Read.Bounded<T>>() {

        @Override
        public void evaluate(Read.Bounded<T> transform, EvaluationContext context) {
            String stepName = context.getCurrentTransform().getFullName();
            final JavaSparkContext jsc = context.getSparkContext();
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            // create an RDD from a BoundedSource.
            JavaRDD<WindowedValue<T>> input = new SourceRDD.Bounded<>(jsc.sc(), transform.getSource(), runtimeContext, stepName).toJavaRDD();
            // cache to avoid re-evaluation of the source by Spark's lazy DAG evaluation.
            context.putDataset(transform, new BoundedDataset<>(input.cache()));
        }

        @Override
        public String toNativeString() {
            return "sparkContext.<readFrom(<source>)>()";
        }
    };
}
Also used : SourceRDD(org.apache.beam.runners.spark.io.SourceRDD) Read(org.apache.beam.sdk.io.Read) WindowedValue(org.apache.beam.sdk.util.WindowedValue) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
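
Every element in the cached RDD above is a WindowedValue<T>. A minimal sketch of constructing such values by hand (illustrative only; the BuildWindowedValues class, the IntervalWindow, and the timestamps are assumptions, not what SourceRDD.Bounded actually produces):

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.util.WindowedValue;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class BuildWindowedValues {
    public static void main(String[] args) {
        Instant start = new Instant(0);
        // a one-minute window starting at the epoch
        IntervalWindow window = new IntervalWindow(start, start.plus(Duration.standardMinutes(1)));
        // a value stamped inside that window, with no pane firing information
        WindowedValue<Integer> wv =
            WindowedValue.of(42, start.plus(Duration.standardSeconds(30)), window, PaneInfo.NO_FIRING);
        System.out.println(wv.getValue() + " in " + wv.getWindows() + " at " + wv.getTimestamp());
    }
}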

Example 64 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class TransformTranslator, method parDo.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        @SuppressWarnings("unchecked")
        public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
            String stepName = context.getCurrentTransform().getFullName();
            DoFn<InputT, OutputT> doFn = transform.getFn();
            rejectSplittable(doFn);
            JavaRDD<WindowedValue<InputT>> inRDD = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
            Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
            JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
            DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
            boolean stateful = signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
            MultiDoFnFunction<InputT, OutputT> multiDoFnFunction =
                new MultiDoFnFunction<>(
                    aggAccum,
                    metricsAccum,
                    stepName,
                    doFn,
                    context.getRuntimeContext(),
                    transform.getMainOutputTag(),
                    transform.getAdditionalOutputTags().getAll(),
                    TranslationUtils.getSideInputs(transform.getSideInputs(), context),
                    windowingStrategy,
                    stateful);
            if (stateful) {
                // Based on the fact that the signature is stateful, DoFnSignatures ensures
                // that it is also keyed
                all = statefulParDoTransform(
                    (KvCoder) context.getInput(transform).getCoder(),
                    windowingStrategy.getWindowFn().windowCoder(),
                    (JavaRDD) inRDD,
                    (MultiDoFnFunction) multiDoFnFunction);
            } else {
                all = inRDD.mapPartitionsToPair(multiDoFnFunction);
            }
            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the RDD if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags
                JavaRDD<WindowedValue<Object>> values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
                context.putDataset(output.getValue(), new BoundedDataset<>(values));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) NamedAggregators(org.apache.beam.runners.spark.aggregators.NamedAggregators) KvCoder(org.apache.beam.sdk.coders.KvCoder) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) ParDo(org.apache.beam.sdk.transforms.ParDo) Map(java.util.Map) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)
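
The per-output loop above keys every element by its TupleTag and filters the tagged RDD once per output. As a standalone illustration of that tag comparison (FilterByTag, the two tags, and the in-memory list are assumptions; the Spark translation uses TranslationUtils.TupleTagFilter on a JavaPairRDD instead):

import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;

public class FilterByTag {
    public static void main(String[] args) {
        TupleTag<String> mainTag = new TupleTag<String>() {};
        TupleTag<String> errorTag = new TupleTag<String>() {};
        // tagged outputs as (tag, windowed value) pairs, the shape a multi-output ParDo produces
        List<Map.Entry<TupleTag<?>, WindowedValue<?>>> all = Arrays.asList(
            new SimpleEntry<TupleTag<?>, WindowedValue<?>>(mainTag, WindowedValue.valueInGlobalWindow("ok")),
            new SimpleEntry<TupleTag<?>, WindowedValue<?>>(errorTag, WindowedValue.valueInGlobalWindow("boom")));
        // keep only the main output, mirroring all.filter(new TupleTagFilter(tag))
        all.stream()
            .filter(e -> e.getKey().equals(mainTag))
            .forEach(e -> System.out.println(e.getValue().getValue()));
    }
}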

Example 65 with WindowedValue

Use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.

The class HashingFlinkCombineRunner, method combine.

@Override
public void combine(
        FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner,
        WindowingStrategy<Object, W> windowingStrategy,
        SideInputReader sideInputReader,
        PipelineOptions options,
        Iterable<WindowedValue<KV<K, InputT>>> elements,
        Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
    @SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
    WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();
    // Flink Iterable can be iterated over only once.
    List<WindowedValue<KV<K, InputT>>> inputs = new ArrayList<>();
    Iterables.addAll(inputs, elements);
    Set<W> windows = collectWindows(inputs);
    Map<W, W> windowToMergeResult = mergeWindows(windowingStrategy, windows);
    // Combine all windowed values into a map keyed by (merged) window.
    Map<W, Tuple2<AccumT, Instant>> mapState = new HashMap<>();
    Iterator<WindowedValue<KV<K, InputT>>> iterator = inputs.iterator();
    WindowedValue<KV<K, InputT>> currentValue = iterator.next();
    K key = currentValue.getValue().getKey();
    do {
        for (BoundedWindow w : currentValue.getWindows()) {
            @SuppressWarnings("unchecked") W currentWindow = (W) w;
            W mergedWindow = windowToMergeResult.get(currentWindow);
            mergedWindow = mergedWindow == null ? currentWindow : mergedWindow;
            Set<W> singletonW = Collections.singleton(mergedWindow);
            Tuple2<AccumT, Instant> accumAndInstant = mapState.get(mergedWindow);
            if (accumAndInstant == null) {
                AccumT accumT = flinkCombiner.firstInput(key, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
                Instant windowTimestamp = timestampCombiner.assign(mergedWindow, windowFn.getOutputTime(currentValue.getTimestamp(), mergedWindow));
                accumAndInstant = new Tuple2<>(accumT, windowTimestamp);
                mapState.put(mergedWindow, accumAndInstant);
            } else {
                accumAndInstant.f0 = flinkCombiner.addInput(
                    key, accumAndInstant.f0, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
                accumAndInstant.f1 = timestampCombiner.combine(
                    accumAndInstant.f1,
                    timestampCombiner.assign(
                        mergedWindow,
                        windowingStrategy.getWindowFn().getOutputTime(currentValue.getTimestamp(), mergedWindow)));
            }
        }
        if (iterator.hasNext()) {
            currentValue = iterator.next();
        } else {
            break;
        }
    } while (true);
    // Output the final values of the combiners.
    for (Map.Entry<W, Tuple2<AccumT, Instant>> entry : mapState.entrySet()) {
        AccumT accumulator = entry.getValue().f0;
        Instant windowTimestamp = entry.getValue().f1;
        out.collect(
            WindowedValue.of(
                KV.of(
                    key,
                    flinkCombiner.extractOutput(
                        key, accumulator, options, sideInputReader, Collections.singleton(entry.getKey()))),
                windowTimestamp,
                entry.getKey(),
                PaneInfo.NO_FIRING));
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Map(java.util.Map)
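
The final loop emits one combined WindowedValue per key and merged window, with a timestamp produced by the TimestampCombiner. A small sketch of that emission in isolation (EmitCombinedValue, the key, the count, and the one-minute window are illustrative assumptions):

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class EmitCombinedValue {
    public static void main(String[] args) {
        Instant start = new Instant(0);
        IntervalWindow window = new IntervalWindow(start, start.plus(Duration.standardMinutes(1)));
        // END_OF_WINDOW assigns the output timestamp to the window's maximum timestamp
        Instant outputTs = TimestampCombiner.END_OF_WINDOW.assign(window, start.plus(Duration.standardSeconds(10)));
        // emit the combined result for this key and window, mirroring the collector call above
        WindowedValue<KV<String, Long>> result =
            WindowedValue.of(KV.of("user-1", 3L), outputTs, window, PaneInfo.NO_FIRING);
        System.out.println(result.getValue() + " at " + result.getTimestamp());
    }
}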

Aggregations

WindowedValue (org.apache.beam.sdk.util.WindowedValue) 89
Test (org.junit.Test) 53
Instant (org.joda.time.Instant) 47
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow) 36
KV (org.apache.beam.sdk.values.KV) 19
ArrayList (java.util.ArrayList) 17
WindowMatchers.isSingleWindowedValue (org.apache.beam.runners.core.WindowMatchers.isSingleWindowedValue) 17
WindowMatchers.isWindowedValue (org.apache.beam.runners.core.WindowMatchers.isWindowedValue) 17
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) 17
Matchers.emptyIterable (org.hamcrest.Matchers.emptyIterable) 16
TupleTag (org.apache.beam.sdk.values.TupleTag) 13
JavaRDD (org.apache.spark.api.java.JavaRDD) 8
ByteString (com.google.protobuf.ByteString) 7
BeamFnApi (org.apache.beam.fn.v1.BeamFnApi) 7
ThrowingConsumer (org.apache.beam.fn.harness.fn.ThrowingConsumer) 6
IsmRecord (org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) 6
TimestampCombiner (org.apache.beam.sdk.transforms.windowing.TimestampCombiner) 6
CloseableThrowingConsumer (org.apache.beam.fn.harness.fn.CloseableThrowingConsumer) 5
MetricsContainerImpl (org.apache.beam.runners.core.metrics.MetricsContainerImpl) 5
EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext) 5