use of org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult in project beam by apache.
the class OutputDeduplicatorTest method unchangedWithNoDuplicates.
@Test
public void unchangedWithNoDuplicates() {
/* When all the PCollections are produced by only one transform or stage, the result should be
* empty/identical to the input.
*
* Pipeline:
* /-> one -> .out \
* red -> .out -> -> blue -> .out
* \-> two -> .out /
*/
PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("two", two).putPcollections(twoOut.getUniqueName(), twoOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one)), ImmutableList.of(PipelineNode.pCollection(oneOut.getUniqueName(), oneOut)), DEFAULT_WIRE_CODER_SETTINGS);
ExecutableStage twoStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("two", two)), ImmutableList.of(PipelineNode.pCollection(twoOut.getUniqueName(), twoOut)), DEFAULT_WIRE_CODER_SETTINGS);
PTransformNode redTransform = PipelineNode.pTransform("red", red);
PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage), ImmutableList.of(redTransform, blueTransform));
assertThat(result.getDeduplicatedComponents(), equalTo(components));
assertThat(result.getDeduplicatedStages().keySet(), empty());
assertThat(result.getDeduplicatedTransforms().keySet(), empty());
assertThat(result.getIntroducedTransforms(), empty());
}
use of org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult in project beam by apache.
the class OutputDeduplicatorTest method multipleDuplicatesInStages.
@Test
public void multipleDuplicatesInStages() {
/* A stage that produces multiple duplicates should have them all synthesized.
*
* Original Pipeline:
* red -> .out ---> one -> .out -----\
* \ -> shared.out
* \--> two -> .out ----|
* \ -> otherShared -> .out
* \-> three --> .out /
*
* Fused Pipeline:
* -> .out [-> one -> .out -> shared -> .out] \
* / -> blue -> .out
* | -> shared -> .out] /
* red -> .out [-> two -> .out |
* | -> otherShared -> .out]
* \
* -> .out [-> three -> .out -> otherShared -> .out]
*
* Deduplicated Pipeline:
* [-> one -> .out -> shared -> .out:0] --\
* | -> shared -> .out -> blue -> .out
* | -> shared -> .out:1] /
* red -> .out [-> two -> .out |
* | -> otherShared -> .out:0] --\
* | -> otherShared -> .out
* [-> three -> .out -> otherShared -> .out:1] ---/
*/
PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
PCollection threeOut = PCollection.newBuilder().setUniqueName("three.out").build();
PTransform three = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", threeOut.getUniqueName()).build();
PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
PCollection otherSharedOut = PCollection.newBuilder().setUniqueName("shared.out2").build();
PTransform otherShared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("multi", threeOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("out", otherSharedOut.getUniqueName()).build();
PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("two", two).putPcollections(twoOut.getUniqueName(), twoOut).putTransforms("multi", three).putPcollections(threeOut.getUniqueName(), threeOut).putTransforms("shared", shared).putPcollections(sharedOut.getUniqueName(), sharedOut).putTransforms("otherShared", otherShared).putPcollections(otherSharedOut.getUniqueName(), otherSharedOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
ExecutableStage multiStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("multi", three), PipelineNode.pTransform("shared", shared), PipelineNode.pTransform("otherShared", otherShared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut), PipelineNode.pCollection(otherSharedOut.getUniqueName(), otherSharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
ExecutableStage twoStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("two", two), PipelineNode.pTransform("otherShared", otherShared)), ImmutableList.of(PipelineNode.pCollection(otherSharedOut.getUniqueName(), otherSharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
PTransformNode redTransform = PipelineNode.pTransform("red", red);
PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage, multiStage), ImmutableList.of(redTransform, blueTransform));
assertThat(result.getIntroducedTransforms(), hasSize(2));
assertThat(result.getDeduplicatedStages().keySet(), containsInAnyOrder(multiStage, oneStage, twoStage));
assertThat(result.getDeduplicatedTransforms().keySet(), empty());
Collection<String> introducedIds = result.getIntroducedTransforms().stream().flatMap(pt -> pt.getTransform().getInputsMap().values().stream()).collect(Collectors.toList());
String[] stageOutputs = result.getDeduplicatedStages().values().stream().flatMap(s -> s.getOutputPCollections().stream().map(PCollectionNode::getId)).toArray(String[]::new);
assertThat(introducedIds, containsInAnyOrder(stageOutputs));
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introducedIds.toArray(new String[0])));
assertThat(result.getDeduplicatedComponents().getTransformsMap().entrySet(), hasItems(result.getIntroducedTransforms().stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform)).entrySet().toArray(new Map.Entry[0])));
}
use of org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult in project beam by apache.
the class OutputDeduplicatorTest method duplicateOverStagesAndTransforms.
@Test
public void duplicateOverStagesAndTransforms() {
/* When both a stage and a runner-executed transform produce a PCollection, all should be
* replaced with synthetic flattens.
* original graph:
* --> one -> .out \
* red -> .out | -> shared -> .out
* --------------> /
*
* fused graph:
* --> [one -> .out -> shared ->] .out
* red -> .out |
* ------------------> shared --> .out
*
* deduplicated graph:
* --> [one -> .out -> shared ->] .out:0 \
* red -> .out | -> shared -> .out
* -----------------> shared:0 -> .out:1 /
*/
PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("red", redOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("shared", shared).putPcollections(sharedOut.getUniqueName(), sharedOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
PTransformNode sharedTransform = PipelineNode.pTransform("shared", shared);
ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one), sharedTransform), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
PTransformNode redTransform = PipelineNode.pTransform("red", red);
PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage), ImmutableList.of(redTransform, blueTransform, sharedTransform));
assertThat(result.getIntroducedTransforms(), hasSize(1));
PTransformNode introduced = getOnlyElement(result.getIntroducedTransforms());
assertThat(introduced.getTransform().getOutputsMap().size(), equalTo(1));
assertThat(getOnlyElement(introduced.getTransform().getOutputsMap().values()), equalTo(sharedOut.getUniqueName()));
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introduced.getTransform().getInputsMap().values().toArray(new String[0])));
assertThat(result.getDeduplicatedStages().keySet(), hasSize(1));
assertThat(result.getDeduplicatedTransforms().keySet(), containsInAnyOrder("shared"));
List<String> introducedOutputs = new ArrayList<>();
introducedOutputs.addAll(result.getDeduplicatedTransforms().get("shared").getTransform().getOutputsMap().values());
introducedOutputs.addAll(result.getDeduplicatedStages().get(oneStage).getOutputPCollections().stream().map(PCollectionNode::getId).collect(Collectors.toList()));
assertThat(introduced.getTransform().getInputsMap().values(), containsInAnyOrder(introducedOutputs.toArray(new String[0])));
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introducedOutputs.toArray(new String[0])));
assertThat(result.getDeduplicatedComponents().getTransformsMap(), hasEntry(introduced.getId(), introduced.getTransform()));
}
use of org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult in project beam by apache.
the class GreedyPipelineFuser method fusePipeline.
/**
* Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
*
* <p>The input is the initial collection of siblings sets which will be fused into {@link
* ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
* PTransform), where the PTransform consumes input elements on a per-element basis from the
* PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
* siblings which can execute within a single {@link ExecutableStage}, determined by {@link
* GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
*
* <p>While a pending sibling set exists:
*
* <ul>
* <li>Retrieve a pending sibling set from the front of the queue.
* <li>If the pending sibling set has already been created, continue. Each materialized {@link
* PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
* {@link PTransformNode} may only be present in a single stage rooted at a single {@link
* PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
* multiple times.
* <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
* of the stage
* <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
* consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
* <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
* them to the queue of sibling sets.
* </ul>
*/
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
Set<ExecutableStage> stages = new LinkedHashSet<>();
Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
while (!pendingSiblingSets.isEmpty()) {
// Only introduce new PCollection consumers. Not performing this introduces potential
// duplicate paths through the pipeline.
Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
if (siblingSet.isEmpty()) {
LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
continue;
}
// Create the stage with these siblings as the initial consuming transforms
ExecutableStage stage = fuseSiblings(siblingSet);
// don't place them in multiple stages.
for (CollectionConsumer sibling : siblingSet) {
consumedCollectionsAndTransforms.put(sibling, stage);
}
stages.add(stage);
for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
// Get all of the descendant consumers of each materialized PCollection, and add them to the
// queue of pending siblings.
DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
pendingSiblingSets.addAll(siblings);
}
}
// TODO: Figure out where to store this.
DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
// as can compatible producers/consumers if a PCollection is only materialized once.
return FusedPipeline.of(deduplicated.getDeduplicatedComponents(), stages.stream().map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage)).map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs).collect(Collectors.toSet()), Sets.union(deduplicated.getIntroducedTransforms(), unfusedTransforms.stream().map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform)).collect(Collectors.toSet())), requirements);
}
use of org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult in project beam by apache.
the class OutputDeduplicatorTest method duplicateOverStages.
@Test
public void duplicateOverStages() {
/* When multiple stages and a runner-executed transform produce a PCollection, all should be
* replaced with synthetic flattens.
* original graph:
* --> one -> .out \
* red -> .out | -> shared -> .out -> blue -> .out
* --> two -> .out /
*
* fused graph:
* --> [one -> .out -> shared ->] .out
* red -> .out | (shared.out) -> blue -> .out
* --> [two -> .out -> shared ->] .out
*
* deduplicated graph:
* --> [one -> .out -> shared ->] .out:0 \
* red -> .out | -> shared -> .out -> blue ->.out
* --> [two -> .out -> shared ->] .out:1 /
*/
PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("two", two).putPcollections(twoOut.getUniqueName(), twoOut).putTransforms("shared", shared).putPcollections(sharedOut.getUniqueName(), sharedOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
ExecutableStage twoStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("two", two), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
PTransformNode redTransform = PipelineNode.pTransform("red", red);
PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage), ImmutableList.of(redTransform, blueTransform));
assertThat(result.getIntroducedTransforms(), hasSize(1));
PTransformNode introduced = getOnlyElement(result.getIntroducedTransforms());
assertThat(introduced.getTransform().getOutputsMap().size(), equalTo(1));
assertThat(getOnlyElement(introduced.getTransform().getOutputsMap().values()), equalTo(sharedOut.getUniqueName()));
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introduced.getTransform().getInputsMap().values().toArray(new String[0])));
assertThat(result.getDeduplicatedStages().keySet(), hasSize(2));
List<String> stageOutputs = result.getDeduplicatedStages().values().stream().flatMap(stage -> stage.getOutputPCollections().stream().map(PCollectionNode::getId)).collect(Collectors.toList());
assertThat(stageOutputs, containsInAnyOrder(introduced.getTransform().getInputsMap().values().toArray()));
assertThat(result.getDeduplicatedTransforms().keySet(), empty());
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(stageOutputs.toArray(new String[0])));
assertThat(result.getDeduplicatedComponents().getTransformsMap(), hasEntry(introduced.getId(), introduced.getTransform()));
}
Aggregations