use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.
the class GreedyPipelineFuser method getRootConsumers.
private DescendantConsumers getRootConsumers(PTransformNode rootNode) {
checkArgument(rootNode.getTransform().getInputsCount() == 0, "Transform %s is not at the root of the graph (consumes %s)", rootNode.getId(), rootNode.getTransform().getInputsMap());
checkArgument(!pipeline.getEnvironment(rootNode).isPresent(), "%s requires all root nodes to be runner-implemented %s or %s primitives, " + "but transform %s executes in environment %s", GreedyPipelineFuser.class.getSimpleName(), PTransformTranslation.IMPULSE_TRANSFORM_URN, PTransformTranslation.READ_TRANSFORM_URN, rootNode.getId(), pipeline.getEnvironment(rootNode));
Set<PTransformNode> unfused = new HashSet<>();
unfused.add(rootNode);
NavigableSet<CollectionConsumer> environmentNodes = new TreeSet<>();
// Walk down until the first environments are found, and fuse them as appropriate.
for (PCollectionNode output : pipeline.getOutputPCollections(rootNode)) {
DescendantConsumers descendants = getDescendantConsumers(output);
unfused.addAll(descendants.getUnfusedNodes());
environmentNodes.addAll(descendants.getFusibleConsumers());
}
return DescendantConsumers.of(unfused, environmentNodes);
}
use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.
the class CreateExecutableStageNodeFunction method getEnvironmentFromPTransform.
private Environment getEnvironmentFromPTransform(RunnerApi.Components components, Set<PTransformNode> sdkTransforms) {
RehydratedComponents sdkComponents = RehydratedComponents.forComponents(components);
Environment env = null;
for (PTransformNode pTransformNode : sdkTransforms) {
env = Environments.getEnvironment(pTransformNode.getTransform(), sdkComponents).orElse(null);
if (env != null) {
break;
}
}
return env;
}
use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.
the class GreedyStageFuser method forGrpcPortRead.
/**
* Returns an {@link ExecutableStage} where the initial {@link PTransformNode PTransform} is a
* Remote gRPC Port Read, reading elements from the materialized {@link PCollectionNode
* PCollection}.
*
* @param initialNodes the initial set of sibling transforms to fuse into this node. All of the
* transforms must consume the {@code inputPCollection} on a per-element basis, and must all
* be mutually compatible.
*/
public static ExecutableStage forGrpcPortRead(QueryablePipeline pipeline, PCollectionNode inputPCollection, Set<PTransformNode> initialNodes) {
checkArgument(!initialNodes.isEmpty(), "%s must contain at least one %s.", GreedyStageFuser.class.getSimpleName(), PTransformNode.class.getSimpleName());
// Choose the environment from an arbitrary node. The initial nodes may not be empty for this
// subgraph to make any sense, there has to be at least one processor node
// (otherwise the stage is gRPC Read -> gRPC Write, which doesn't do anything).
Environment environment = getStageEnvironment(pipeline, initialNodes);
ImmutableSet.Builder<PTransformNode> fusedTransforms = ImmutableSet.builder();
fusedTransforms.addAll(initialNodes);
Set<SideInputReference> sideInputs = new LinkedHashSet<>();
Set<UserStateReference> userStates = new LinkedHashSet<>();
Set<TimerReference> timers = new LinkedHashSet<>();
Set<PCollectionNode> fusedCollections = new LinkedHashSet<>();
Set<PCollectionNode> materializedPCollections = new LinkedHashSet<>();
Queue<PCollectionNode> fusionCandidates = new ArrayDeque<>();
for (PTransformNode initialConsumer : initialNodes) {
fusionCandidates.addAll(pipeline.getOutputPCollections(initialConsumer));
sideInputs.addAll(pipeline.getSideInputs(initialConsumer));
userStates.addAll(pipeline.getUserStates(initialConsumer));
timers.addAll(pipeline.getTimers(initialConsumer));
}
while (!fusionCandidates.isEmpty()) {
PCollectionNode candidate = fusionCandidates.poll();
if (fusedCollections.contains(candidate) || materializedPCollections.contains(candidate)) {
// This should generally mean we get to a Flatten via multiple paths through the graph and
// we've already determined what to do with the output.
LOG.debug("Skipping fusion candidate {} because it is {} in this {}", candidate, fusedCollections.contains(candidate) ? "fused" : "materialized", ExecutableStage.class.getSimpleName());
continue;
}
PCollectionFusibility fusibility = canFuse(pipeline, candidate, environment, fusedCollections);
switch(fusibility) {
case MATERIALIZE:
materializedPCollections.add(candidate);
break;
case FUSE:
// All of the consumers of the candidate PCollection can be fused into this stage. Do so.
fusedCollections.add(candidate);
fusedTransforms.addAll(pipeline.getPerElementConsumers(candidate));
for (PTransformNode consumer : pipeline.getPerElementConsumers(candidate)) {
// The outputs of every transform fused into this stage must be either materialized or
// themselves fused away, so add them to the set of candidates.
fusionCandidates.addAll(pipeline.getOutputPCollections(consumer));
sideInputs.addAll(pipeline.getSideInputs(consumer));
}
break;
default:
throw new IllegalStateException(String.format("Unknown type of %s %s", PCollectionFusibility.class.getSimpleName(), fusibility));
}
}
return ImmutableExecutableStage.ofFullComponents(pipeline.getComponents(), environment, inputPCollection, sideInputs, userStates, timers, fusedTransforms.build(), materializedPCollections, DEFAULT_WIRE_CODER_SETTINGS);
}
use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.
the class GreedyPipelineFuser method sanitizeDanglingPTransformInputs.
private static ExecutableStage sanitizeDanglingPTransformInputs(ExecutableStage stage) {
/* Possible inputs to a PTransform can only be those which are:
* <ul>
* <li>Explicit input PCollection to the stage
* <li>Outputs of a PTransform within the same stage
* <li>Timer PCollections
* <li>Side input PCollections
* <li>Explicit outputs from the stage
* </ul>
*/
Set<String> possibleInputs = new HashSet<>();
possibleInputs.add(stage.getInputPCollection().getId());
possibleInputs.addAll(stage.getOutputPCollections().stream().map(PCollectionNode::getId).collect(Collectors.toSet()));
possibleInputs.addAll(stage.getSideInputs().stream().map(s -> s.collection().getId()).collect(Collectors.toSet()));
possibleInputs.addAll(stage.getTransforms().stream().flatMap(t -> t.getTransform().getOutputsMap().values().stream()).collect(Collectors.toSet()));
Set<String> danglingInputs = stage.getTransforms().stream().flatMap(t -> t.getTransform().getInputsMap().values().stream()).filter(in -> !possibleInputs.contains(in)).collect(Collectors.toSet());
ImmutableList.Builder<PTransformNode> pTransformNodesBuilder = ImmutableList.builder();
for (PTransformNode transformNode : stage.getTransforms()) {
PTransform transform = transformNode.getTransform();
Map<String, String> validInputs = transform.getInputsMap().entrySet().stream().filter(e -> !danglingInputs.contains(e.getValue())).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
if (!validInputs.equals(transform.getInputsMap())) {
// Dangling inputs found so recreate pTransform without the dangling inputs.
transformNode = PipelineNode.pTransform(transformNode.getId(), transform.toBuilder().clearInputs().putAllInputs(validInputs).build());
}
pTransformNodesBuilder.add(transformNode);
}
ImmutableList<PTransformNode> pTransformNodes = pTransformNodesBuilder.build();
Components.Builder componentBuilder = stage.getComponents().toBuilder();
// Update the pTransforms in components.
componentBuilder.clearTransforms().putAllTransforms(pTransformNodes.stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform)));
Map<String, PCollection> validPCollectionMap = stage.getComponents().getPcollectionsMap().entrySet().stream().filter(e -> !danglingInputs.contains(e.getKey())).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
// Update pCollections in the components.
componentBuilder.clearPcollections().putAllPcollections(validPCollectionMap);
return ImmutableExecutableStage.of(componentBuilder.build(), stage.getEnvironment(), stage.getInputPCollection(), stage.getSideInputs(), stage.getUserStates(), stage.getTimers(), pTransformNodes, stage.getOutputPCollections(), stage.getWireCoderSettings());
}
use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.
the class GreedyPipelineFuser method fusePipeline.
/**
* Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
*
* <p>The input is the initial collection of siblings sets which will be fused into {@link
* ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
* PTransform), where the PTransform consumes input elements on a per-element basis from the
* PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
* siblings which can execute within a single {@link ExecutableStage}, determined by {@link
* GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
*
* <p>While a pending sibling set exists:
*
* <ul>
* <li>Retrieve a pending sibling set from the front of the queue.
* <li>If the pending sibling set has already been created, continue. Each materialized {@link
* PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
* {@link PTransformNode} may only be present in a single stage rooted at a single {@link
* PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
* multiple times.
* <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
* of the stage
* <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
* consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
* <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
* them to the queue of sibling sets.
* </ul>
*/
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
Set<ExecutableStage> stages = new LinkedHashSet<>();
Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
while (!pendingSiblingSets.isEmpty()) {
// Only introduce new PCollection consumers. Not performing this introduces potential
// duplicate paths through the pipeline.
Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
if (siblingSet.isEmpty()) {
LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
continue;
}
// Create the stage with these siblings as the initial consuming transforms
ExecutableStage stage = fuseSiblings(siblingSet);
// don't place them in multiple stages.
for (CollectionConsumer sibling : siblingSet) {
consumedCollectionsAndTransforms.put(sibling, stage);
}
stages.add(stage);
for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
// Get all of the descendant consumers of each materialized PCollection, and add them to the
// queue of pending siblings.
DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
pendingSiblingSets.addAll(siblings);
}
}
// TODO: Figure out where to store this.
DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
// as can compatible producers/consumers if a PCollection is only materialized once.
return FusedPipeline.of(deduplicated.getDeduplicatedComponents(), stages.stream().map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage)).map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs).collect(Collectors.toSet()), Sets.union(deduplicated.getIntroducedTransforms(), unfusedTransforms.stream().map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform)).collect(Collectors.toSet())), requirements);
}
Aggregations