Example 11 with PTransformNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.

the class FusedPipeline method toPipeline.

/**
 * Returns the {@link RunnerApi.Pipeline} representation of this {@link FusedPipeline}.
 *
 * <p>The {@link Components} of the returned pipeline will contain all of the {@link PTransform
 * PTransforms} present in the original Pipeline that this {@link FusedPipeline} was created from,
 * plus all of the {@link ExecutableStage ExecutableStages} contained within this {@link
 * FusedPipeline}. The {@link Pipeline#getRootTransformIdsList()} will contain all of the runner
 * executed transforms and all of the {@link ExecutableStage executable stages} contained within
 * the Pipeline.
 */
public RunnerApi.Pipeline toPipeline() {
    Map<String, PTransform> executableStageTransforms = getEnvironmentExecutedTransforms();
    Set<String> executableTransformIds =
        Sets.union(
            executableStageTransforms.keySet(),
            getRunnerExecutedTransforms().stream()
                .map(PTransformNode::getId)
                .collect(Collectors.toSet()));
    // Augment the initial transforms with all of the executable transforms.
    Components fusedComponents =
        getComponents().toBuilder().putAllTransforms(executableStageTransforms).build();
    List<String> rootTransformIds =
        StreamSupport.stream(
                QueryablePipeline.forTransforms(executableTransformIds, fusedComponents)
                    .getTopologicallyOrderedTransforms()
                    .spliterator(),
                false)
            .map(PTransformNode::getId)
            .collect(Collectors.toList());
    Pipeline res =
        Pipeline.newBuilder()
            .setComponents(fusedComponents)
            .addAllRootTransformIds(rootTransformIds)
            .addAllRequirements(getRequirements())
            .build();
    // Validate that fusion didn't produce a malformed pipeline.
    PipelineValidator.validate(res);
    return res;
}
Also used : SyntheticComponents(org.apache.beam.runners.core.construction.SyntheticComponents) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline)
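
A minimal usage sketch (not taken from the Beam sources) showing how FusedPipeline#toPipeline fits into a fusion workflow. GreedyPipelineFuser.fuse and toPipeline are the real entry points; the class name FusionSketch, the method fuseAndMaterialize, and the variable originalPipeline are illustrative assumptions.

import org.apache.beam.model.pipeline.v1.RunnerApi;
import org.apache.beam.runners.core.construction.graph.FusedPipeline;
import org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser;

class FusionSketch {
    static RunnerApi.Pipeline fuseAndMaterialize(RunnerApi.Pipeline originalPipeline) {
        // GreedyPipelineFuser groups environment-executed transforms into
        // ExecutableStages; toPipeline() folds those stages back into a proto.
        FusedPipeline fused = GreedyPipelineFuser.fuse(originalPipeline);
        RunnerApi.Pipeline fusedProto = fused.toPipeline();
        // The root transform ids now cover the runner-executed transforms plus the
        // synthetic ExecutableStage transforms, in topological order.
        for (String rootId : fusedProto.getRootTransformIdsList()) {
            System.out.println("root transform: " + rootId);
        }
        return fusedProto;
    }
}

Because toPipeline() finishes with PipelineValidator.validate, a malformed fusion result fails here rather than at execution time.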

Example 12 with PTransformNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.

the class QueryablePipeline method buildNetwork.

private MutableNetwork<PipelineNode, PipelineEdge> buildNetwork(Collection<String> transformIds, Components components) {
    MutableNetwork<PipelineNode, PipelineEdge> network = NetworkBuilder.directed().allowsParallelEdges(true).allowsSelfLoops(false).build();
    Set<PCollectionNode> unproducedCollections = new HashSet<>();
    for (String transformId : transformIds) {
        PTransform transform = components.getTransformsOrThrow(transformId);
        PTransformNode transformNode = PipelineNode.pTransform(transformId, this.components.getTransformsOrThrow(transformId));
        network.addNode(transformNode);
        for (String produced : transform.getOutputsMap().values()) {
            PCollectionNode producedNode = PipelineNode.pCollection(produced, components.getPcollectionsOrThrow(produced));
            network.addNode(producedNode);
            network.addEdge(transformNode, producedNode, new PerElementEdge());
            checkArgument(
                network.inDegree(producedNode) == 1,
                "A %s should have exactly one producing %s, but found %s:\nPCollection:\n%s\nProducers:\n%s",
                PCollectionNode.class.getSimpleName(),
                PTransformNode.class.getSimpleName(),
                network.predecessors(producedNode).size(),
                producedNode,
                network.predecessors(producedNode));
            unproducedCollections.remove(producedNode);
        }
        for (Map.Entry<String, String> consumed : transform.getInputsMap().entrySet()) {
            // This loop may add an edge between the consumed PCollection and the current PTransform.
            // The local name of the transform must be used to determine the type of edge.
            String pcollectionId = consumed.getValue();
            PCollectionNode consumedNode = PipelineNode.pCollection(pcollectionId, this.components.getPcollectionsOrThrow(pcollectionId));
            if (network.addNode(consumedNode)) {
                // This node has been added to the network for the first time, so it has no producer.
                unproducedCollections.add(consumedNode);
            }
            if (getLocalSideInputNames(transform).contains(consumed.getKey())) {
                network.addEdge(consumedNode, transformNode, new SingletonEdge());
            } else {
                network.addEdge(consumedNode, transformNode, new PerElementEdge());
            }
        }
    }
    checkArgument(
        unproducedCollections.isEmpty(),
        "%ss %s were consumed but never produced",
        PCollectionNode.class.getSimpleName(),
        unproducedCollections);
    return network;
}
Also used : PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Map(java.util.Map) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)
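
buildNetwork is private, so the graph it constructs is reached through the QueryablePipeline factory methods, as in Example 11. A rough sketch of that public surface follows; the class and method names are illustrative, and the id set is assumed to contain only primitive (leaf) transforms, since the single-producer check above would otherwise reject a composite and its inner transform producing the same PCollection.

import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.beam.model.pipeline.v1.RunnerApi.Components;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode;
import org.apache.beam.runners.core.construction.graph.QueryablePipeline;

class QueryablePipelineSketch {
    static List<String> topologicalTransformIds(
            Collection<String> primitiveTransformIds, Components components) {
        // forTransforms builds the network shown above over exactly these ids.
        QueryablePipeline queryable =
            QueryablePipeline.forTransforms(primitiveTransformIds, components);
        // Walk the graph in topological order, as FusedPipeline#toPipeline does.
        return StreamSupport.stream(
                queryable.getTopologicallyOrderedTransforms().spliterator(), false)
            .map(PTransformNode::getId)
            .collect(Collectors.toList());
    }
}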

Example 13 with PTransformNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.

the class OutputDeduplicator method deduplicateStageOutput.

/**
 * Returns an {@link ExecutableStage} where all of the {@link PCollectionNode PCollections}
 * matching the original are replaced with the introduced partial {@link PCollection} in all
 * references made within the {@link ExecutableStage}.
 */
private static ExecutableStage deduplicateStageOutput(ExecutableStage stage, Map<String, PCollectionNode> originalToPartial) {
    Collection<PTransformNode> updatedTransforms = new ArrayList<>();
    for (PTransformNode transform : stage.getTransforms()) {
        PTransform updatedTransform = updateOutputs(transform.getTransform(), originalToPartial);
        updatedTransforms.add(PipelineNode.pTransform(transform.getId(), updatedTransform));
    }
    Collection<PCollectionNode> updatedOutputs = new ArrayList<>();
    for (PCollectionNode output : stage.getOutputPCollections()) {
        updatedOutputs.add(originalToPartial.getOrDefault(output.getId(), output));
    }
    RunnerApi.Components updatedStageComponents =
        stage
            .getComponents()
            .toBuilder()
            .clearTransforms()
            .putAllTransforms(
                updatedTransforms.stream()
                    .collect(
                        Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform)))
            .putAllPcollections(
                originalToPartial.values().stream()
                    .collect(
                        Collectors.toMap(PCollectionNode::getId, PCollectionNode::getPCollection)))
            .build();
    return ImmutableExecutableStage.of(
        updatedStageComponents,
        stage.getEnvironment(),
        stage.getInputPCollection(),
        stage.getSideInputs(),
        stage.getUserStates(),
        stage.getTimers(),
        updatedTransforms,
        updatedOutputs,
        stage.getWireCoderSettings());
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ArrayList(java.util.ArrayList) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)
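
The private updateOutputs helper invoked above is not part of this example. The sketch below is a hypothetical stand-in with the same shape, showing how a transform's outputs map can be re-pointed at partial PCollections; remapOutputs and OutputRemapSketch are invented names, while getOutputsMap, putOutputs, and PCollectionNode#getId come from the protos and classes already used above.

import java.util.Map;
import org.apache.beam.model.pipeline.v1.RunnerApi.PTransform;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode;

class OutputRemapSketch {
    // Hypothetical helper, not the Beam-internal updateOutputs: re-point every
    // output that has a partial replacement at the partial PCollection's id.
    static PTransform remapOutputs(
            PTransform transform, Map<String, PCollectionNode> originalToPartial) {
        PTransform.Builder updated = transform.toBuilder();
        for (Map.Entry<String, String> output : transform.getOutputsMap().entrySet()) {
            PCollectionNode partial = originalToPartial.get(output.getValue());
            if (partial != null) {
                // Keep the local output name; only the referenced PCollection id changes.
                updated.putOutputs(output.getKey(), partial.getId());
            }
        }
        return updated.build();
    }
}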

Example 14 with PTransformNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.

the class OutputDeduplicator method ensureSingleProducer.

/**
 * Ensure that no {@link PCollection} output by any of the {@code stages} or {@code
 * unfusedTransforms} is produced by more than one of those stages or transforms.
 *
 * <p>For each {@link PCollection} output by multiple stages and/or transforms, each producer is
 * rewritten to produce a partial {@link PCollection}; the partial PCollections are then flattened
 * together via an introduced Flatten node that produces the original output.
 */
static DeduplicationResult ensureSingleProducer(
        QueryablePipeline pipeline,
        Collection<ExecutableStage> stages,
        Collection<PTransformNode> unfusedTransforms) {
    RunnerApi.Components.Builder unzippedComponents = pipeline.getComponents().toBuilder();
    Multimap<PCollectionNode, StageOrTransform> pcollectionProducers =
        getProducers(pipeline, stages, unfusedTransforms);
    Multimap<StageOrTransform, PCollectionNode> requiresNewOutput = HashMultimap.create();
    // Find every PCollection with more than one producer; each such producer (and any
    // ExecutableStage containing it) must be rewritten to emit a partial output.
    for (Map.Entry<PCollectionNode, Collection<StageOrTransform>> collectionProducer :
            pcollectionProducers.asMap().entrySet()) {
        if (collectionProducer.getValue().size() > 1) {
            for (StageOrTransform producer : collectionProducer.getValue()) {
                requiresNewOutput.put(producer, collectionProducer.getKey());
            }
        }
    }
    Map<ExecutableStage, ExecutableStage> updatedStages = new LinkedHashMap<>();
    Map<String, PTransformNode> updatedTransforms = new LinkedHashMap<>();
    Multimap<String, PCollectionNode> originalToPartial = HashMultimap.create();
    for (Map.Entry<StageOrTransform, Collection<PCollectionNode>> deduplicationTargets : requiresNewOutput.asMap().entrySet()) {
        if (deduplicationTargets.getKey().getStage() != null) {
            StageDeduplication deduplication =
                deduplicatePCollections(
                    deduplicationTargets.getKey().getStage(),
                    deduplicationTargets.getValue(),
                    unzippedComponents::containsPcollections);
            for (Entry<String, PCollectionNode> originalToPartialReplacement : deduplication.getOriginalToPartialPCollections().entrySet()) {
                originalToPartial.put(originalToPartialReplacement.getKey(), originalToPartialReplacement.getValue());
                unzippedComponents.putPcollections(originalToPartialReplacement.getValue().getId(), originalToPartialReplacement.getValue().getPCollection());
            }
            updatedStages.put(deduplicationTargets.getKey().getStage(), deduplication.getUpdatedStage());
        } else if (deduplicationTargets.getKey().getTransform() != null) {
            PTransformDeduplication deduplication =
                deduplicatePCollections(
                    deduplicationTargets.getKey().getTransform(),
                    deduplicationTargets.getValue(),
                    unzippedComponents::containsPcollections);
            for (Entry<String, PCollectionNode> originalToPartialReplacement : deduplication.getOriginalToPartialPCollections().entrySet()) {
                originalToPartial.put(originalToPartialReplacement.getKey(), originalToPartialReplacement.getValue());
                unzippedComponents.putPcollections(originalToPartialReplacement.getValue().getId(), originalToPartialReplacement.getValue().getPCollection());
            }
            updatedTransforms.put(deduplicationTargets.getKey().getTransform().getId(), deduplication.getUpdatedTransform());
        } else {
            throw new IllegalStateException(
                String.format(
                    "%s with no %s or %s",
                    StageOrTransform.class.getSimpleName(),
                    ExecutableStage.class.getSimpleName(),
                    PTransformNode.class.getSimpleName()));
        }
    }
    Set<PTransformNode> introducedFlattens = new LinkedHashSet<>();
    for (Map.Entry<String, Collection<PCollectionNode>> partialFlattenTargets : originalToPartial.asMap().entrySet()) {
        String flattenId = SyntheticComponents.uniqueId("unzipped_flatten", unzippedComponents::containsTransforms);
        PTransform flattenPartialPCollections =
            createFlattenOfPartials(
                flattenId, partialFlattenTargets.getKey(), partialFlattenTargets.getValue());
        unzippedComponents.putTransforms(flattenId, flattenPartialPCollections);
        introducedFlattens.add(PipelineNode.pTransform(flattenId, flattenPartialPCollections));
    }
    Components components = unzippedComponents.build();
    return DeduplicationResult.of(components, introducedFlattens, updatedStages, updatedTransforms);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) LinkedHashMap(java.util.LinkedHashMap) SyntheticComponents(org.apache.beam.runners.core.construction.SyntheticComponents) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Entry(java.util.Map.Entry) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Collection(java.util.Collection) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)
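
createFlattenOfPartials is not reproduced in this example. Below is a minimal sketch of the shape of the introduced Flatten, assuming the standard Flatten URN from PTransformTranslation; the method name flattenOfPartials and the local input/output names are illustrative choices, not the Beam-internal ones.

import java.util.Collection;
import org.apache.beam.model.pipeline.v1.RunnerApi;
import org.apache.beam.runners.core.construction.PTransformTranslation;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode;

class FlattenOfPartialsSketch {
    // Sketch: the partial PCollections become the Flatten's inputs and the
    // original PCollection id is its single output.
    static RunnerApi.PTransform flattenOfPartials(
            String originalPCollectionId, Collection<PCollectionNode> partials) {
        RunnerApi.PTransform.Builder flatten =
            RunnerApi.PTransform.newBuilder()
                .setSpec(
                    RunnerApi.FunctionSpec.newBuilder()
                        .setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN));
        int i = 0;
        for (PCollectionNode partial : partials) {
            // "partial_<n>" is an arbitrary, illustrative local input name.
            flatten.putInputs("partial_" + i++, partial.getId());
        }
        flatten.putOutputs("output", originalPCollectionId);
        return flatten.build();
    }
}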

Example 15 with PTransformNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode in project beam by apache.

the class GreedyPipelineFuser method getDescendantConsumers.

/**
 * Retrieve all descendant {@link PTransformNode PTransforms} which are executed within an {@link
 * Environment}, such that there is a path between this input {@link PCollectionNode} and the
 * descendant {@link PTransformNode} with no intermediate {@link PTransformNode} which executes
 * within an environment.
 *
 * <p>This occurs as follows:
 *
 * <ul>
 *   <li>For each consumer of the input {@link PCollectionNode}:
 *       <ul>
 *         <li>If that {@link PTransformNode} executes within an environment, add it to the
 *             collection of descendants
 *         <li>If that {@link PTransformNode} does not execute within an environment, for each
 *             output {@link PCollectionNode} that the {@link PTransformNode} produces, add the
 *             result of recursively applying this method to that {@link PCollectionNode}.
 *       </ul>
 * </ul>
 *
 * <p>As {@link PCollectionNode PCollections} output by a {@link PTransformNode} that executes
 * within an {@link Environment} are not recursively inspected, {@link PTransformNode PTransforms}
 * reachable only via a path including that node as an intermediate node cannot be returned as a
 * descendant consumer of the original {@link PCollectionNode}.
 */
private DescendantConsumers getDescendantConsumers(PCollectionNode inputPCollection) {
    Set<PTransformNode> unfused = new HashSet<>();
    NavigableSet<CollectionConsumer> downstreamConsumers = new TreeSet<>();
    for (PTransformNode consumer : pipeline.getPerElementConsumers(inputPCollection)) {
        if (pipeline.getEnvironment(consumer).isPresent()) {
            // The base case: this descendant consumes elements from the input PCollection
            // and executes within an environment, so it is recorded as a fusible consumer.
            downstreamConsumers.add(CollectionConsumer.of(inputPCollection, consumer));
        } else {
            LOG.debug("Adding {} {} to the set of runner-executed transforms", PTransformNode.class.getSimpleName(), consumer.getId());
            unfused.add(consumer);
            for (PCollectionNode output : pipeline.getOutputPCollections(consumer)) {
                // Recurse to all of the output PCollections of this PTransform.
                DescendantConsumers descendants = getDescendantConsumers(output);
                unfused.addAll(descendants.getUnfusedNodes());
                downstreamConsumers.addAll(descendants.getFusibleConsumers());
            }
        }
    }
    return DescendantConsumers.of(unfused, downstreamConsumers);
}
Also used : TreeSet(java.util.TreeSet) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)
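
CollectionConsumer and DescendantConsumers are private to GreedyPipelineFuser, so the sketch below simplifies the same recursion using only the QueryablePipeline methods seen above: it collects the nearest environment-executed consumers reachable from a PCollection without crossing another environment-executed transform. The class and method names are illustrative.

import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode;
import org.apache.beam.runners.core.construction.graph.QueryablePipeline;

class DescendantConsumersSketch {
    static Set<PTransformNode> environmentExecutedDescendants(
            QueryablePipeline pipeline, PCollectionNode input) {
        Set<PTransformNode> result = new LinkedHashSet<>();
        for (PTransformNode consumer : pipeline.getPerElementConsumers(input)) {
            if (pipeline.getEnvironment(consumer).isPresent()) {
                // Base case: an environment-executed consumer; do not descend past it.
                result.add(consumer);
            } else {
                // Runner-executed transform: recurse through each of its outputs.
                for (PCollectionNode output : pipeline.getOutputPCollections(consumer)) {
                    result.addAll(environmentExecutedDescendants(pipeline, output));
                }
            }
        }
        return result;
    }
}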

Aggregations

PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode): 33
PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode): 22
Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components): 20
PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform): 20
Test (org.junit.Test): 20
Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment): 15
PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection): 12
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 9
Collection (java.util.Collection): 8
Map (java.util.Map): 8
LinkedHashSet (java.util.LinkedHashSet): 7
ArrayList (java.util.ArrayList): 6
HashSet (java.util.HashSet): 6
Collectors (java.util.stream.Collectors): 6
DeduplicationResult (org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult): 6
Pipeline (org.apache.beam.sdk.Pipeline): 6
PTransformTranslation (org.apache.beam.runners.core.construction.PTransformTranslation): 5
ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 5
HashMap (java.util.HashMap): 4
TreeSet (java.util.TreeSet): 4