Use of org.apache.beam.runners.spark.translation.SparkPCollectionView in project beam by apache.
The parDo method of the StreamingTransformTranslator class.
private static <InputT, OutputT>
    TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {
    @Override
    public void evaluate(
        final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
      final DoFn<InputT, OutputT> doFn = transform.getFn();
      checkArgument(
          !DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(),
          "Splittable DoFn not yet supported in streaming mode: %s",
          doFn);
      rejectStateAndTimers(doFn);

      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();
      final WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
      Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();

      @SuppressWarnings("unchecked")
      UnboundedDataset<InputT> unboundedDataset =
          (UnboundedDataset<InputT>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();

      final DoFnSchemaInformation doFnSchemaInformation =
          ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
      final Map<String, PCollectionView<?>> sideInputMapping =
          ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
      final String stepName = context.getCurrentTransform().getFullName();

      JavaPairDStream<TupleTag<?>, WindowedValue<?>> all =
          dStream.transformToPair(
              rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum =
                    MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>>
                    sideInputs =
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs().values(),
                            JavaSparkContext.fromSparkContext(rdd.context()),
                            pviews);
                return rdd.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metricsAccum,
                        stepName,
                        doFn,
                        options,
                        transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(),
                        inputCoder,
                        outputCoders,
                        sideInputs,
                        windowingStrategy,
                        false,
                        doFnSchemaInformation,
                        sideInputMapping));
              });
      Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // Caching forces serialization, so encode values to bytes before cache()
        // and decode afterwards; see https://issues.apache.org/jira/browse/BEAM-2669.
        Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap =
            TranslationUtils.getTupleTagCoders(outputs);
        all =
            all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap))
                .cache()
                .mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
      }
      for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
        @SuppressWarnings("unchecked")
        JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        @SuppressWarnings("unchecked")
        // Object is the best we can do since different outputs can have different tags
        JavaDStream<WindowedValue<Object>> values =
            (JavaDStream<WindowedValue<Object>>)
                (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
        context.putDataset(
            output.getValue(),
            new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
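For orientation, here is a minimal sketch of the pipeline shape this evaluator translates: a streaming ParDo.MultiOutput with a side input, which is the case where the SparkPCollectionView registry (pviews above) supplies the per-micro-batch broadcasts. The class name, tags, and the cutoffView parameter are illustrative assumptions, not part of the Beam sources.

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputParDoSketch {
  // Anonymous subclasses so Beam can capture the output element types.
  static final TupleTag<String> MAIN = new TupleTag<String>() {};
  static final TupleTag<String> LONG_WORDS = new TupleTag<String>() {};

  static PCollectionTuple splitByLength(
      PCollection<String> words, PCollectionView<Integer> cutoffView) {
    return words.apply(
        "SplitByLength",
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void process(ProcessContext c) {
                    // The side input is resolved from the broadcast that
                    // TranslationUtils.getSideInputs(...) materializes above.
                    int cutoff = c.sideInput(cutoffView);
                    if (c.element().length() > cutoff) {
                      c.output(LONG_WORDS, c.element());
                    } else {
                      c.output(c.element());
                    }
                  }
                })
            .withSideInputs(cutoffView)
            .withOutputTags(MAIN, TupleTagList.of(LONG_WORDS)));
  }
}

Because this sketch produces more than one output, its translation would also take the encode/cache/decode detour guarded by outputs.size() > 1 above.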
Use of org.apache.beam.runners.spark.translation.SparkPCollectionView in project beam by apache.
The combineGrouped method of the StreamingTransformTranslator class.
private static <K, InputT, OutputT>
    TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>> combineGrouped() {
  return new TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>>() {
    @Override
    public void evaluate(
        final Combine.GroupedValues<K, InputT, OutputT> transform, EvaluationContext context) {
      // Get the applied combine function.
      PCollection<? extends KV<K, ? extends Iterable<InputT>>> input = context.getInput(transform);
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, ?, OutputT> fn =
          (CombineWithContext.CombineFnWithContext<InputT, ?, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());

      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, Iterable<InputT>>> unboundedDataset =
          (UnboundedDataset<KV<K, Iterable<InputT>>>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> dStream = unboundedDataset.getDStream();

      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();

      JavaDStream<WindowedValue<KV<K, OutputT>>> outStream =
          dStream.transform(
              rdd -> {
                SparkCombineFn<KV<K, InputT>, InputT, ?, OutputT> combineFnWithContext =
                    SparkCombineFn.keyed(
                        fn,
                        options,
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs(),
                            new JavaSparkContext(rdd.context()),
                            pviews),
                        windowingStrategy);
                return rdd.map(new TranslationUtils.CombineGroupedValues<>(combineFnWithContext));
              });

      context.putDataset(
          transform, new UnboundedDataset<>(outStream, unboundedDataset.getStreamSources()));
    }

    @Override
    public String toNativeString() {
      return "map(new <fn>())";
    }
  };
}
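As a companion sketch (again hypothetical, not from the Beam sources), Combine.GroupedValues is the transform you get when a GroupByKey is followed by a per-group combine; the evaluator above maps the keyed SparkCombineFn over each grouped value. The class and method names below are illustrative.

import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class CombineGroupedSketch {
  // Sums the values of each key in a streaming PCollection of KV pairs.
  static PCollection<KV<String, Long>> sumPerKey(PCollection<KV<String, Long>> counts) {
    return counts
        // KV<String, Long> -> KV<String, Iterable<Long>>
        .apply(GroupByKey.create())
        // Combine.GroupedValues: the transform the evaluator above translates.
        .apply(Combine.groupedValues(Sum.ofLongs()));
  }
}

Note that the evaluator wraps a plain CombineFn via CombineFnUtil.toFnWithContext, so a side-input-free combine like Sum.ofLongs() still flows through the CombineFnWithContext path.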