use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
the class GroupCombineFunctions method groupByKeyOnly.
/**
* An implementation of
* {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
* for the Spark runner.
*/
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(
    JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {
// we use coders to convert objects in the PCollection to byte arrays, so they
// can be transferred over the network for the shuffle.
JavaPairRDD<ByteArray, byte[]> pairRDD =
    rdd.map(new ReifyTimestampsAndWindowsFunction<K, V>())
        .map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction())
        .mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction())
        .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
// use a HashPartitioner with the default parallelism.
Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());
// using mapPartitions below preserves the partitioner and avoids an unnecessary shuffle downstream.
return pairRDD.groupByKey(partitioner)
    .mapPartitionsToPair(
        TranslationUtils.pairFunctionToPairFlatMapFunction(
            CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)), true)
    .mapPartitions(TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true)
    .mapPartitions(
        TranslationUtils.functionToFlatMapFunction(
            WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()), true);
}
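The heart of groupByKeyOnly is the coder round trip: Spark shuffles by hashCode/equals on the key, so keys and windowed values are encoded to byte arrays with their Beam coders before groupByKey and decoded again afterwards. A minimal, standalone sketch of that encode/decode step using the SDK's CoderUtils (illustrative only; the runner's CoderHelpers wraps the same idea into Spark functions):
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.CoderUtils;

public class CoderRoundTrip {
  public static void main(String[] args) throws Exception {
    StringUtf8Coder coder = StringUtf8Coder.of();
    // Encode the key to a byte[] so it can be hashed and compared consistently during the shuffle.
    byte[] encodedKey = CoderUtils.encodeToByteArray(coder, "user-42");
    // After the shuffle, decode the bytes back into the original key.
    String decodedKey = CoderUtils.decodeFromByteArray(coder, encodedKey);
    System.out.println(decodedKey); // user-42
  }
}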
use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
the class SparkUnboundedSource method read.
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc, SparkRuntimeContext rc,
    UnboundedSource<T, CheckpointMarkT> source, String stepName) {
SparkPipelineOptions options = rc.getPipelineOptions().as(SparkPipelineOptions.class);
Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
SourceDStream<T, CheckpointMarkT> sourceDStream = new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);
JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
    JavaPairInputDStream$.MODULE$.fromInputDStream(
        sourceDStream,
        JavaSparkContext$.MODULE$.<Source<T>>fakeClassTag(),
        JavaSparkContext$.MODULE$.<CheckpointMarkT>fakeClassTag());
// call mapWithState to read from a checkpointable source.
JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>> mapWithStateDStream =
    inputDStream.mapWithState(
        StateSpec.function(StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
            .numPartitions(sourceDStream.getNumPartitions()));
// set checkpoint duration for read stream, if set.
checkpointStream(mapWithStateDStream, options);
// report the number of input elements for this InputDStream to the InputInfoTracker.
int id = inputDStream.inputDStream().id();
JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());
// register ReadReportDStream to report information related to this read.
new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName).register();
// output the actual (deserialized) stream.
WindowedValue.FullWindowedValueCoder<T> coder = WindowedValue.FullWindowedValueCoder.of(source.getDefaultOutputCoder(), GlobalWindow.Coder.INSTANCE);
JavaDStream<WindowedValue<T>> readUnboundedStream = mapWithStateDStream.flatMap(new Tuple2byteFlatMapFunction()).map(CoderHelpers.fromByteFunction(coder));
return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
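The final map is a plain decode step: elements come out of mapWithState as byte arrays, and a FullWindowedValueCoder (the source's output coder paired with the GlobalWindow coder) rebuilds each WindowedValue<T>. A hedged, standalone sketch of that round trip, with StringUtf8Coder standing in for the source's output coder:
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;

public class WindowedValueRoundTrip {
  public static void main(String[] args) throws Exception {
    WindowedValue.FullWindowedValueCoder<String> coder =
        WindowedValue.FullWindowedValueCoder.of(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE);
    // Encode a value in the global window, as the mapWithState stage would before handing bytes to Spark.
    byte[] bytes = CoderUtils.encodeToByteArray(coder, WindowedValue.valueInGlobalWindow("event-1"));
    // Decode it back into a WindowedValue<String>, as the final map(...) of read() does.
    WindowedValue<String> decoded = CoderUtils.decodeFromByteArray(coder, bytes);
    System.out.println(decoded.getValue()); // event-1
  }
}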
use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
the class TransformTranslator method readBounded.
private static <T> TransformEvaluator<Read.Bounded<T>> readBounded() {
return new TransformEvaluator<Read.Bounded<T>>() {
@Override
public void evaluate(Read.Bounded<T> transform, EvaluationContext context) {
String stepName = context.getCurrentTransform().getFullName();
final JavaSparkContext jsc = context.getSparkContext();
final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
// create an RDD from a BoundedSource.
JavaRDD<WindowedValue<T>> input = new SourceRDD.Bounded<>(jsc.sc(), transform.getSource(), runtimeContext, stepName).toJavaRDD();
// cache to avoid re-evaluation of the source by Spark's lazy DAG evaluation.
context.putDataset(transform, new BoundedDataset<>(input.cache()));
}
@Override
public String toNativeString() {
return "sparkContext.<readFrom(<source>)>()";
}
};
}
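The cache() call matters because Spark evaluates RDDs lazily and would otherwise re-run the BoundedSource read for every downstream action. A plain Spark sketch of the same effect (the file path is hypothetical and only for illustration):
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CacheExample {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "cache-example");
    // Without cache(), each action below would re-read the file from scratch.
    JavaRDD<String> lines = jsc.textFile("/tmp/input.txt").cache(); // hypothetical path
    long total = lines.count();                               // first action materializes and caches the RDD
    long nonEmpty = lines.filter(s -> !s.isEmpty()).count();  // reuses the cached partitions
    System.out.println(total + " lines, " + nonEmpty + " non-empty");
    jsc.stop();
  }
}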
use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
the class TransformTranslator method parDo.
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {
@Override
@SuppressWarnings("unchecked")
public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
String stepName = context.getCurrentTransform().getFullName();
DoFn<InputT, OutputT> doFn = transform.getFn();
rejectSplittable(doFn);
JavaRDD<WindowedValue<InputT>> inRDD = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
boolean stateful = signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
MultiDoFnFunction<InputT, OutputT> multiDoFnFunction =
    new MultiDoFnFunction<>(
        aggAccum,
        metricsAccum,
        stepName,
        doFn,
        context.getRuntimeContext(),
        transform.getMainOutputTag(),
        transform.getAdditionalOutputTags().getAll(),
        TranslationUtils.getSideInputs(transform.getSideInputs(), context),
        windowingStrategy,
        stateful);
if (stateful) {
// Based on the fact that the signature is stateful, DoFnSignatures ensures
// that it is also keyed
all = statefulParDoTransform(
    (KvCoder) context.getInput(transform).getCoder(),
    windowingStrategy.getWindowFn().windowCoder(),
    (JavaRDD) inRDD,
    (MultiDoFnFunction) multiDoFnFunction);
} else {
all = inRDD.mapPartitionsToPair(multiDoFnFunction);
}
Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
if (outputs.size() > 1) {
// cache the RDD if we're going to filter it more than once.
all.cache();
}
for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
// Object is the best we can do since different outputs can have different tags
JavaRDD<WindowedValue<Object>> values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
context.putDataset(output.getValue(), new BoundedDataset<>(values));
}
}
@Override
public String toNativeString() {
return "mapPartitions(new <fn>())";
}
};
}
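All outputs of the ParDo travel through a single pair RDD keyed by TupleTag, and each output is recovered by filtering on its tag, which is why the RDD is cached when there is more than one output. A simplified standalone sketch of that fan-out, with plain String keys standing in for TupleTags (purely illustrative):
import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class TagFanOut {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "tag-fanout");
    // One RDD holding elements for two "outputs", keyed by their tag.
    JavaPairRDD<String, Integer> all = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("main", 1), new Tuple2<>("side", 2), new Tuple2<>("main", 3)));
    all.cache(); // cached because it is filtered more than once, as in parDo()
    JavaRDD<Integer> main = all.filter(t -> t._1().equals("main")).values();
    JavaRDD<Integer> side = all.filter(t -> t._1().equals("side")).values();
    System.out.println(main.collect() + " / " + side.collect());
    jsc.stop();
  }
}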
use of org.apache.beam.sdk.util.WindowedValue in project beam by apache.
the class HashingFlinkCombineRunner method combine.
@Override
public void combine(
    FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner,
    WindowingStrategy<Object, W> windowingStrategy,
    SideInputReader sideInputReader,
    PipelineOptions options,
    Iterable<WindowedValue<KV<K, InputT>>> elements,
    Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
@SuppressWarnings("unchecked") TimestampCombiner timestampCombiner = windowingStrategy.getTimestampCombiner();
WindowFn<Object, W> windowFn = windowingStrategy.getWindowFn();
// a Flink Iterable can be iterated only once, so copy the elements into a list first.
List<WindowedValue<KV<K, InputT>>> inputs = new ArrayList<>();
Iterables.addAll(inputs, elements);
Set<W> windows = collectWindows(inputs);
Map<W, W> windowToMergeResult = mergeWindows(windowingStrategy, windows);
// Combine all windowed values into a per-window map of (accumulator, timestamp).
Map<W, Tuple2<AccumT, Instant>> mapState = new HashMap<>();
Iterator<WindowedValue<KV<K, InputT>>> iterator = inputs.iterator();
WindowedValue<KV<K, InputT>> currentValue = iterator.next();
K key = currentValue.getValue().getKey();
do {
for (BoundedWindow w : currentValue.getWindows()) {
@SuppressWarnings("unchecked") W currentWindow = (W) w;
W mergedWindow = windowToMergeResult.get(currentWindow);
mergedWindow = mergedWindow == null ? currentWindow : mergedWindow;
Set<W> singletonW = Collections.singleton(mergedWindow);
Tuple2<AccumT, Instant> accumAndInstant = mapState.get(mergedWindow);
if (accumAndInstant == null) {
AccumT accumT = flinkCombiner.firstInput(key, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
Instant windowTimestamp = timestampCombiner.assign(mergedWindow, windowFn.getOutputTime(currentValue.getTimestamp(), mergedWindow));
accumAndInstant = new Tuple2<>(accumT, windowTimestamp);
mapState.put(mergedWindow, accumAndInstant);
} else {
accumAndInstant.f0 = flinkCombiner.addInput(
    key, accumAndInstant.f0, currentValue.getValue().getValue(), options, sideInputReader, singletonW);
accumAndInstant.f1 = timestampCombiner.combine(
    accumAndInstant.f1,
    timestampCombiner.assign(
        mergedWindow,
        windowingStrategy.getWindowFn().getOutputTime(currentValue.getTimestamp(), mergedWindow)));
}
}
if (iterator.hasNext()) {
currentValue = iterator.next();
} else {
break;
}
} while (true);
// Output the final value of combiners
for (Map.Entry<W, Tuple2<AccumT, Instant>> entry : mapState.entrySet()) {
AccumT accumulator = entry.getValue().f0;
Instant windowTimestamp = entry.getValue().f1;
out.collect(WindowedValue.of(
    KV.of(key, flinkCombiner.extractOutput(
        key, accumulator, options, sideInputReader, Collections.singleton(entry.getKey()))),
    windowTimestamp,
    entry.getKey(),
    PaneInfo.NO_FIRING));
}
}
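The body of combine is a firstInput / addInput / extractOutput lifecycle keyed by merged window: the first element seen for a window creates the accumulator and its timestamp, later elements are folded in, and the final loop emits one combined value per window. A plain-Java sketch of that per-window accumulation pattern (a simple sum, no Beam or Flink types):
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PerWindowCombine {
  public static void main(String[] args) {
    // (window, value) pairs standing in for WindowedValue<KV<K, InputT>> elements.
    List<String[]> elements = Arrays.asList(
        new String[] {"w1", "3"}, new String[] {"w2", "5"}, new String[] {"w1", "4"});
    Map<String, Integer> accumulators = new HashMap<>();
    for (String[] e : elements) {
      String window = e[0];
      int value = Integer.parseInt(e[1]);
      Integer accum = accumulators.get(window);
      if (accum == null) {
        accumulators.put(window, value);         // firstInput: create the accumulator
      } else {
        accumulators.put(window, accum + value); // addInput: fold the value in
      }
    }
    // extractOutput: emit one result per window, as the final for-loop in combine() does.
    accumulators.forEach((window, sum) -> System.out.println(window + " -> " + sum));
  }
}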