Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
From the class StreamingTransformTranslator, method parDo().
private static <InputT, OutputT>
    TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

    @Override
    public void evaluate(
        final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
      final DoFn<InputT, OutputT> doFn = transform.getFn();
      checkArgument(
          !DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(),
          "Splittable DoFn not yet supported in streaming mode: %s",
          doFn);
      rejectStateAndTimers(doFn);
      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();
      final WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
      Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();

      @SuppressWarnings("unchecked")
      UnboundedDataset<InputT> unboundedDataset =
          (UnboundedDataset<InputT>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();

      final DoFnSchemaInformation doFnSchemaInformation =
          ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
      final Map<String, PCollectionView<?>> sideInputMapping =
          ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
      final String stepName = context.getCurrentTransform().getFullName();

      JavaPairDStream<TupleTag<?>, WindowedValue<?>> all =
          dStream.transformToPair(
              rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum =
                    MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>>
                    sideInputs =
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs().values(),
                            JavaSparkContext.fromSparkContext(rdd.context()),
                            pviews);
                return rdd.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metricsAccum, stepName, doFn, options, transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(), inputCoder, outputCoders,
                        sideInputs, windowingStrategy, false, doFnSchemaInformation,
                        sideInputMapping));
              });

      Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // Caching can trigger serialization, so we need to encode to bytes first;
        // more details in https://issues.apache.org/jira/browse/BEAM-2669
        Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap =
            TranslationUtils.getTupleTagCoders(outputs);
        all =
            all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap))
                .cache()
                .mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
      }

      for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
        @SuppressWarnings("unchecked")
        JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        // Object is the best we can do since different outputs can have different tags.
        @SuppressWarnings("unchecked")
        JavaDStream<WindowedValue<Object>> values =
            (JavaDStream<WindowedValue<Object>>)
                (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
        context.putDataset(
            output.getValue(),
            new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
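The routing above hinges on the tag-keyed pair stream: every element is paired with the TupleTag of the output it belongs to, and each downstream PCollection is materialized by filtering on its own tag. A minimal sketch of that predicate (a standalone illustration, not the actual TranslationUtils.TupleTagFilter):

import java.util.Map;
import org.apache.beam.sdk.values.TupleTag;

// Keeps a (tag, value) pair only when the tag matches the output currently
// being materialized, mirroring the all.filter(...) step above.
class TupleTagFilterSketch<V> {
  private final TupleTag<?> tag;

  TupleTagFilterSketch(TupleTag<?> tag) {
    this.tag = tag;
  }

  boolean accept(Map.Entry<TupleTag<?>, V> taggedValue) {
    return tag.equals(taggedValue.getKey());
  }
}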
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
From the class CreateStreamTest, method testMultiOutputParDo().
/**
 * Test multiple output {@link ParDo} in streaming pipelines. This is currently needed as a test
 * for https://issues.apache.org/jira/browse/BEAM-2029, since {@link
 * org.apache.beam.sdk.testing.ValidatesRunner} tests do not currently run for the Spark runner
 * in streaming mode.
 */
@Test
public void testMultiOutputParDo() throws IOException {
  Instant instant = new Instant(0);
  CreateStream<Integer> source1 =
      CreateStream.of(VarIntCoder.of(), batchDuration())
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(5)))
          .nextBatch(
              TimestampedValue.of(1, instant),
              TimestampedValue.of(2, instant),
              TimestampedValue.of(3, instant))
          .advanceNextBatchWatermarkToInfinity();

  PCollection<Integer> inputs = p.apply(source1);

  final TupleTag<Integer> mainTag = new TupleTag<>();
  final TupleTag<Integer> additionalTag = new TupleTag<>();

  PCollectionTuple outputs =
      inputs.apply(
          ParDo.of(
                  new DoFn<Integer, Integer>() {
                    @SuppressWarnings("unused")
                    @ProcessElement
                    public void process(ProcessContext context) {
                      Integer element = context.element();
                      context.output(element);
                      context.output(additionalTag, element + 1);
                    }
                  })
              .withOutputTags(mainTag, TupleTagList.of(additionalTag)));

  PCollection<Integer> output1 = outputs.get(mainTag).setCoder(VarIntCoder.of());
  PCollection<Integer> output2 = outputs.get(additionalTag).setCoder(VarIntCoder.of());

  PAssert.that(output1).containsInAnyOrder(1, 2, 3);
  PAssert.that(output2).containsInAnyOrder(2, 3, 4);

  p.run();
}
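A detail worth noting: because mainTag and additionalTag are created with plain new TupleTag<>(), the element type is erased, which is why the test must call setCoder(VarIntCoder.of()) on each output. A hedged alternative (an illustration, not part of the test above) is to create the tags as anonymous subclasses so the type is captured and coder inference can succeed:

// Anonymous subclasses capture the Integer type argument, so Beam can infer
// the coder without the explicit setCoder(VarIntCoder.of()) calls above.
final TupleTag<Integer> mainTag = new TupleTag<Integer>() {};
final TupleTag<Integer> additionalTag = new TupleTag<Integer>() {};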
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
From the class DoFnFunction, method prepareSerialization().
/**
 * Prepares the DoFnFunction class so it can be serialized properly. This involves converting the
 * relevant fields to protobufs and byte arrays, which are converted back into the proper classes
 * during deserialization.
 */
private void prepareSerialization() {
  SdkComponents components = SdkComponents.create();
  components.registerEnvironment(
      Environments.createOrGetDefaultEnvironment(
          pipelineOptions.as(PortablePipelineOptions.class)));
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions).toString();
  doFnwithEx =
      ParDoTranslation.translateDoFn(
          this.doFn, mainOutput, sideInputMapping, doFnSchemaInformation, components);
  doFnwithExBytes = doFnwithEx.getPayload().toByteArray();
  outputCodersBytes = new HashMap<>();
  try {
    coderBytes = SerializableUtils.serializeToByteArray(inputCoder);
    windowStrategyProto =
        WindowingStrategyTranslation.toMessageProto(windowingStrategy, components);
    windowBytes = windowStrategyProto.toByteArray();
    for (Map.Entry<TupleTag<?>, Coder<?>> entry : outputCoders.entrySet()) {
      outputCodersBytes.put(
          entry.getKey().getId(), SerializableUtils.serializeToByteArray(entry.getValue()));
    }
    sideInputBytes = new HashMap<>();
    for (Map.Entry<TupleTag<?>, WindowingStrategy<?, ?>> entry : sideInputs.entrySet()) {
      windowStrategyProto =
          WindowingStrategyTranslation.toMessageProto(entry.getValue(), components);
      sideInputBytes.put(entry.getKey().getId(), windowStrategyProto.toByteArray());
    }
    serializedSideOutputs = new ArrayList<>();
    for (TupleTag<?> sideOutput : sideOutputs) {
      serializedSideOutputs.add(sideOutput.getId());
    }
    serializedOutputMap = new HashMap<>();
    for (Map.Entry<TupleTag<?>, Integer> entry : outputMap.entrySet()) {
      serializedOutputMap.put(entry.getKey().getId(), entry.getValue());
    }
  } catch (IOException e) {
    LOG.info(e.getMessage());
  }
}
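The byte arrays produced here are consumed by the matching deserialization step. A minimal round-trip sketch of the coder half of that contract, assuming VarIntCoder as a stand-in for the real input coder (class and variable names are illustrative):

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.util.SerializableUtils;

public class CoderRoundTripSketch {
  public static void main(String[] args) {
    Coder<Integer> coder = VarIntCoder.of();
    // Serialize on the driver side, as prepareSerialization() does for inputCoder...
    byte[] coderBytes = SerializableUtils.serializeToByteArray(coder);
    // ...and restore after shipping; the description string only appears in error messages.
    Coder<?> restored =
        (Coder<?>) SerializableUtils.deserializeFromByteArray(coderBytes, "inputCoder");
    System.out.println(restored); // VarIntCoder
  }
}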
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
From the class Partition, method expand().
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();

  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>,
    // and all the collections are actually PCollection<T>.
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
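For context, a minimal usage sketch of Partition (illustrative, not from the surrounding file); the partitionDoFn above is what routes each element to one of the generated tags:

// Fan a PCollection out into 3 branches by remainder; each branch of the
// returned PCollectionList is backed by one of the TupleTags built in expand().
PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6));
PCollectionList<Integer> byRemainder =
    input.apply(Partition.of(3, (element, numPartitions) -> element % numPartitions));
PCollection<Integer> remainderZero = byRemainder.get(0); // contains 3 and 6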
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
From the class PipelineTest, method testTupleProjectionTransform().
/**
* Tests that Pipeline supports pulling an element out of a tuple as a transform.
*/
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4));

  TupleTag<Integer> tag = new TupleTag<>();
  PCollectionTuple tuple = PCollectionTuple.of(tag, input);

  PCollection<Integer> output = tuple.apply("ProjectTag", new TupleProjectionTransform<>(tag));

  PAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
  pipeline.run();
}
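TupleProjectionTransform is defined elsewhere in PipelineTest; a hedged sketch of what such a projection transform can look like (reconstructed for illustration, not the test's actual code):

// A PTransform that pulls the PCollection for a single tag out of a PCollectionTuple.
static class TupleProjectionSketch<T> extends PTransform<PCollectionTuple, PCollection<T>> {
  private final TupleTag<T> tag;

  TupleProjectionSketch(TupleTag<T> tag) {
    this.tag = tag;
  }

  @Override
  public PCollection<T> expand(PCollectionTuple input) {
    return input.get(tag);
  }
}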