Use of org.apache.beam.sdk.values.PCollectionTuple in project beam by apache.
From the class ParDoTest, method testParDoWithTaggedOutput.
@Test
@Category(ValidatesRunner.class)
public void testParDoWithTaggedOutput() {
  List<Integer> inputs = Arrays.asList(3, -42, 666);
  TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  TupleTag<String> additionalOutputTag1 = new TupleTag<String>("additional1") {};
  TupleTag<String> additionalOutputTag2 = new TupleTag<String>("additional2") {};
  TupleTag<String> additionalOutputTag3 = new TupleTag<String>("additional3") {};
  TupleTag<String> additionalOutputTagUnwritten = new TupleTag<String>("unwrittenOutput") {};
  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(inputs))
          .apply(
              ParDo.of(
                      new TestDoFn(
                          Arrays.<PCollectionView<Integer>>asList(),
                          Arrays.asList(
                              additionalOutputTag1, additionalOutputTag2, additionalOutputTag3)))
                  .withOutputTags(
                      mainOutputTag,
                      TupleTagList.of(additionalOutputTag3)
                          .and(additionalOutputTag1)
                          .and(additionalOutputTagUnwritten)
                          .and(additionalOutputTag2)));
  PAssert.that(outputs.get(mainOutputTag))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs));
  PAssert.that(outputs.get(additionalOutputTag1))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag1));
  PAssert.that(outputs.get(additionalOutputTag2))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag2));
  PAssert.that(outputs.get(additionalOutputTag3))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).fromOutput(additionalOutputTag3));
  PAssert.that(outputs.get(additionalOutputTagUnwritten)).empty();
  pipeline.run();
}
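TestDoFn is defined elsewhere in ParDoTest, so the routing of elements to the tags is not visible above. As a rough sketch of the pattern (FanOutFn and its output strings are illustrative assumptions, not the real TestDoFn): plain output(value) goes to the main output, while output(tag, value) goes to the matching additional output.

static class FanOutFn extends DoFn<Integer, String> {  // hypothetical name
  private final List<TupleTag<String>> additionalTags;

  FanOutFn(List<TupleTag<String>> additionalTags) {
    this.additionalTags = additionalTags;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // Plain output() feeds the PCollection registered under the main output tag.
    c.output("processing: " + c.element());
    // output(tag, value) feeds the PCollection registered under that tag; the
    // order in which the tags were listed in the TupleTagList does not matter.
    for (TupleTag<String> tag : additionalTags) {
      c.output(tag, "processing: " + c.element() + " to " + tag.getId());
    }
  }
}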
Use of org.apache.beam.sdk.values.PCollectionTuple in project beam by apache.
From the class StreamingWriteTables, method expand.
@Override
public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
  // A naive implementation would be to simply stream data directly to BigQuery.
  // However, this could occasionally lead to duplicated data, e.g., when
  // a VM that runs this code is restarted and the code is re-run.
  // This implementation mitigates that risk by relying on BigQuery's built-in
  // best-effort deduplication mechanism. To use it, each input TableRow is
  // tagged with a generated unique id, which is then passed to BigQuery and
  // used to ignore duplicates.
  // We create 50 keys per BigQuery table to generate output on. This is few
  // enough that we get good batching into BigQuery's insert calls, and enough
  // that we can max out the streaming insert quota.
  PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged =
      input
          .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable(50)))
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowJsonCoder.of()))
          .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds()));
  // To prevent the same TableRow from being processed more than once with
  // regenerated, different unique ids, this implementation relies on
  // "checkpointing", which is achieved as a side effect of having
  // StreamingWriteFn immediately follow a GBK, performed by Reshuffle.
  TupleTag<Void> mainOutputTag = new TupleTag<>("mainOutput");
  TupleTag<TableRow> failedInsertsTag = new TupleTag<>("failedInserts");
  PCollectionTuple tuple =
      tagged
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
          .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
          .apply(
              "GlobalWindow",
              Window.<KV<ShardedKey<String>, TableRowInfo>>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes())
          .apply(
              "StreamingWrite",
              ParDo.of(new StreamingWriteFn(bigQueryServices, retryPolicy, failedInsertsTag))
                  .withOutputTags(mainOutputTag, TupleTagList.of(failedInsertsTag)));
  PCollection<TableRow> failedInserts = tuple.get(failedInsertsTag);
  failedInserts.setCoder(TableRowJsonCoder.of());
  return WriteResult.in(input.getPipeline(), failedInsertsTag, failedInserts);
}
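Callers of this transform never see the PCollectionTuple; they receive the WriteResult. A minimal sketch of how a pipeline might drain those failed inserts (the names rows and tableSpec are assumptions, and the sketch assumes the WriteResult.getFailedInserts() accessor that pairs with the WriteResult.in call above):

WriteResult result =
    rows.apply(
        BigQueryIO.writeTableRows()
            .to(tableSpec)
            .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()));
result
    .getFailedInserts()
    .apply(
        "LogFailedRows",
        ParDo.of(
            new DoFn<TableRow, Void>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                // A real pipeline would likely route these to a dead-letter sink.
                System.err.println("Failed insert: " + c.element());
              }
            }));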
Use of org.apache.beam.sdk.values.PCollectionTuple in project beam by apache.
From the class TransformHierarchyTest, method replaceWithCompositeSucceeds.
@Test
public void replaceWithCompositeSucceeds() {
  final SingleOutput<Long, Long> originalParDo =
      ParDo.of(
          new DoFn<Long, Long>() {
            @ProcessElement
            public void processElement(ProcessContext ctxt) {
              ctxt.output(ctxt.element() + 1L);
            }
          });
  GenerateSequence genUpstream = GenerateSequence.from(0);
  PCollection<Long> upstream = pipeline.apply(genUpstream);
  PCollection<Long> output = upstream.apply("Original", originalParDo);
  hierarchy.pushNode("Upstream", pipeline.begin(), genUpstream);
  hierarchy.finishSpecifyingInput();
  hierarchy.setOutput(upstream);
  hierarchy.popNode();
  TransformHierarchy.Node original = hierarchy.pushNode("Original", upstream, originalParDo);
  hierarchy.finishSpecifyingInput();
  hierarchy.setOutput(output);
  hierarchy.popNode();
  final TupleTag<Long> longs = new TupleTag<>();
  final MultiOutput<Long, Long> replacementParDo =
      ParDo.of(
              new DoFn<Long, Long>() {
                @ProcessElement
                public void processElement(ProcessContext ctxt) {
                  ctxt.output(ctxt.element() + 1L);
                }
              })
          .withOutputTags(longs, TupleTagList.empty());
  PTransform<PCollection<Long>, PCollection<Long>> replacementComposite =
      new PTransform<PCollection<Long>, PCollection<Long>>() {
        @Override
        public PCollection<Long> expand(PCollection<Long> input) {
          return input.apply("Contained", replacementParDo).get(longs);
        }
      };
  PCollectionTuple replacementOutput = upstream.apply("Contained", replacementParDo);
  Node compositeNode = hierarchy.replaceNode(original, upstream, replacementComposite);
  Node replacementParNode = hierarchy.pushNode("Original/Contained", upstream, replacementParDo);
  hierarchy.finishSpecifyingInput();
  hierarchy.setOutput(replacementOutput);
  hierarchy.popNode();
  hierarchy.setOutput(replacementOutput.get(longs));
  Entry<TupleTag<?>, PValue> replacementLongs =
      Iterables.getOnlyElement(replacementOutput.expand().entrySet());
  hierarchy.replaceOutputs(
      Collections.<PValue, ReplacementOutput>singletonMap(
          replacementOutput.get(longs),
          ReplacementOutput.of(
              TaggedPValue.ofExpandedValue(output),
              TaggedPValue.of(replacementLongs.getKey(), replacementLongs.getValue()))));
  assertThat(
      replacementParNode.getOutputs().keySet(),
      Matchers.<TupleTag<?>>contains(replacementLongs.getKey()));
  assertThat(replacementParNode.getOutputs().values(), Matchers.<PValue>contains(output));
  assertThat(
      compositeNode.getOutputs().keySet(),
      equalTo(replacementOutput.get(longs).expand().keySet()));
  assertThat(compositeNode.getOutputs().values(), Matchers.<PValue>contains(output));
  hierarchy.popNode();
}
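The Iterables.getOnlyElement(...) call above works because a PCollectionTuple expands to a Map from each registered TupleTag to its tagged value, and replacementParDo registered exactly one tag. A short sketch of that relationship, reusing names from the test:

PCollectionTuple tuple = upstream.apply("Contained", replacementParDo);
// expand() views the tuple as a Map with one entry per registered tag.
Map<TupleTag<?>, PValue> taggedValues = tuple.expand();
// get(tag) returns the same underlying PCollection, statically typed by the tag.
PCollection<Long> onlyOutput = tuple.get(longs);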
Use of org.apache.beam.sdk.values.PCollectionTuple in project beam by apache.
From the class ParDoTest, method testParDoWithOnlyTaggedOutput.
@Test
@Category(ValidatesRunner.class)
public void testParDoWithOnlyTaggedOutput() {
  List<Integer> inputs = Arrays.asList(3, -42, 666);
  final TupleTag<Void> mainOutputTag = new TupleTag<Void>("main") {};
  final TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("additional") {};
  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(inputs))
          .apply(
              ParDo.of(
                      new DoFn<Integer, Void>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          c.output(additionalOutputTag, c.element());
                        }
                      })
                  .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
  PAssert.that(outputs.get(mainOutputTag)).empty();
  PAssert.that(outputs.get(additionalOutputTag)).containsInAnyOrder(inputs);
  pipeline.run();
}
Use of org.apache.beam.sdk.values.PCollectionTuple in project beam by apache.
From the class ParDoTest, method testMultiOutputParDoWithSideInputsIsCumulative.
@Test
@Category(ValidatesRunner.class)
public void testMultiOutputParDoWithSideInputsIsCumulative() {
  List<Integer> inputs = Arrays.asList(3, -42, 666);
  final TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  final TupleTag<Void> additionalOutputTag = new TupleTag<Void>("output") {};
  PCollectionView<Integer> sideInput1 =
      pipeline
          .apply("CreateSideInput1", Create.of(11))
          .apply("ViewSideInput1", View.<Integer>asSingleton());
  PCollectionView<Integer> sideInputUnread =
      pipeline
          .apply("CreateSideInputUnread", Create.of(-3333))
          .apply("ViewSideInputUnread", View.<Integer>asSingleton());
  PCollectionView<Integer> sideInput2 =
      pipeline
          .apply("CreateSideInput2", Create.of(222))
          .apply("ViewSideInput2", View.<Integer>asSingleton());
  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(inputs))
          .apply(
              ParDo.of(
                      new TestDoFn(
                          Arrays.asList(sideInput1, sideInput2), Arrays.<TupleTag<String>>asList()))
                  .withSideInputs(sideInput1)
                  .withSideInputs(sideInputUnread)
                  .withSideInputs(sideInput2)
                  .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
  PAssert.that(outputs.get(mainOutputTag))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).andSideInputs(11, 222));
  pipeline.run();
}
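As with the earlier examples, TestDoFn's side-input reads are not shown. A minimal sketch of the pattern (the inline DoFn and its output format are illustrative assumptions): a view attached with withSideInputs(...) is read inside processElement via sideInput(view), and repeated withSideInputs calls accumulate, which is the cumulative behavior the test verifies.

PCollectionView<Integer> view =
    pipeline.apply(Create.of(11)).apply(View.<Integer>asSingleton());
PCollection<String> described =
    pipeline
        .apply(Create.of(1, 2, 3))
        .apply(
            ParDo.of(
                    new DoFn<Integer, String>() {
                      @ProcessElement
                      public void processElement(ProcessContext c) {
                        // sideInput(view) returns the materialized singleton value (11 here).
                        c.output(c.element() + " with side input " + c.sideInput(view));
                      }
                    })
                .withSideInputs(view));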