Example 21 with PCollectionNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode in project beam by apache.

the class GreedyPipelineFuser method fusePipeline.

/**
 * Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
 *
 * <p>The input is the initial collection of sibling sets which will be fused into {@link
 * ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
 * PTransform), where the PTransform consumes input elements on a per-element basis from the
 * PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
 * siblings which can execute within a single {@link ExecutableStage}, determined by {@link
 * GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
 *
 * <p>While a pending sibling set exists:
 *
 * <ul>
 *   <li>Retrieve a pending sibling set from the front of the queue.
 *   <li>If the pending sibling set has already been created, continue. Each materialized {@link
 *       PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
 *       {@link PTransformNode} may only be present in a single stage rooted at a single {@link
 *       PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
 *       multiple times.
 *   <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
 *       of the stage
 *   <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
 *       consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
 *   <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
 *       them to the queue of sibling sets.
 * </ul>
 */
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
    Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
    Set<ExecutableStage> stages = new LinkedHashSet<>();
    Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
    Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
    while (!pendingSiblingSets.isEmpty()) {
        // Only introduce new PCollection consumers. Not performing this introduces potential
        // duplicate paths through the pipeline.
        Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
        Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
        checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
        if (siblingSet.isEmpty()) {
            LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
            continue;
        }
        // Create the stage with these siblings as the initial consuming transforms
        ExecutableStage stage = fuseSiblings(siblingSet);
        // Record each consumed (PCollection, PTransform) pair as belonging to this stage so we
        // don't place them in multiple stages.
        for (CollectionConsumer sibling : siblingSet) {
            consumedCollectionsAndTransforms.put(sibling, stage);
        }
        stages.add(stage);
        for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
            // Get all of the descendant consumers of each materialized PCollection, and add them to the
            // queue of pending siblings.
            DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
            unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
            NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
            pendingSiblingSets.addAll(siblings);
        }
    }
    // TODO: Figure out where to store this.
    DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
    // TODO: Stages can be fused with each other, if doing so does not introduce duplicate paths
    // for an element to take through the Pipeline. Compatible siblings can generally be fused,
    // as can compatible producers/consumers if a PCollection is only materialized once.
    return FusedPipeline.of(
        deduplicated.getDeduplicatedComponents(),
        stages.stream()
            .map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage))
            .map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs)
            .collect(Collectors.toSet()),
        Sets.union(
            deduplicated.getIntroducedTransforms(),
            unfusedTransforms.stream()
                .map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform))
                .collect(Collectors.toSet())),
        requirements);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) HashMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultimap) TreeSet(java.util.TreeSet) ComparisonChain(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ComparisonChain) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) Map(java.util.Map) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Logger(org.slf4j.Logger) Collection(java.util.Collection) Set(java.util.Set) NavigableSet(java.util.NavigableSet) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Collectors(java.util.stream.Collectors) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) AutoValue(com.google.auto.value.AutoValue) Entry(java.util.Map.Entry) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Queue(java.util.Queue) ArrayDeque(java.util.ArrayDeque) Comparator(java.util.Comparator) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)
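
For context, a minimal sketch of how a caller might drive this fusion step end to end. It assumes the static entry point GreedyPipelineFuser.fuse(Pipeline) and the FusedPipeline accessors getFusedStages() and getRunnerExecutedTransforms(); the pipeline proto is taken as a parameter rather than built here.

import org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline;
import org.apache.beam.runners.core.construction.graph.ExecutableStage;
import org.apache.beam.runners.core.construction.graph.FusedPipeline;
import org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode;

/** Sketch only: fuse a portable pipeline proto and summarize the resulting stages. */
class FusionSummarySketch {

    static void summarize(Pipeline pipelineProto) {
        // fuse(...) builds the initial sibling sets and runs the loop shown in fusePipeline above.
        FusedPipeline fused = GreedyPipelineFuser.fuse(pipelineProto);
        for (ExecutableStage stage : fused.getFusedStages()) {
            System.out.printf(
                "stage rooted at %s has %d transforms and %d materialized outputs%n",
                stage.getInputPCollection().getId(),
                stage.getTransforms().size(),
                stage.getOutputPCollections().size());
        }
        for (PTransformNode unfused : fused.getRunnerExecutedTransforms()) {
            System.out.println("runner-executed transform: " + unfused.getId());
        }
    }
}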

Example 22 with PCollectionNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode in project beam by apache.

the class ExecutableStage method toPTransform.

/**
 * Returns a composite {@link PTransform} which is equivalent to this {@link ExecutableStage} as
 * follows:
 *
 * <ul>
 *   <li>The {@link PTransform#getSubtransformsList()} is empty. This ensures that executable
 *       stages are treated as primitive transforms.
 *   <li>The only {@link PCollection PCollections} in the {@link PTransform#getInputsMap()} is the
 *       result of {@link #getInputPCollection()} and {@link #getSideInputs()}.
 *   <li>The output {@link PCollection PCollections} in the values of {@link
 *       PTransform#getOutputsMap()} are the {@link PCollectionNode PCollections} returned by
 *       {@link #getOutputPCollections()}.
 *   <li>The {@link PTransform#getSpec()} contains an {@link ExecutableStagePayload} with inputs
 *       and outputs equal to the PTransform's inputs and outputs, and transforms equal to the
 *       result of {@link #getTransforms}.
 * </ul>
 *
 * <p>The executable stage can be reconstructed from the resulting {@link ExecutableStagePayload}
 * via {@link #fromPayload(ExecutableStagePayload)}.
 */
default PTransform toPTransform(String uniqueName) {
    PTransform.Builder pt = PTransform.newBuilder().setUniqueName(uniqueName);
    ExecutableStagePayload.Builder payload = ExecutableStagePayload.newBuilder();
    payload.setEnvironment(getEnvironment());
    payload.addAllWireCoderSettings(getWireCoderSettings());
    // Populate inputs and outputs of the stage payload and outer PTransform simultaneously.
    PCollectionNode input = getInputPCollection();
    pt.putInputs("input", getInputPCollection().getId());
    payload.setInput(input.getId());
    for (SideInputReference sideInput : getSideInputs()) {
        // Side inputs of the ExecutableStage itself can be uniquely identified by inner PTransform
        // name and local name.
        String outerLocalName = String.format("%s:%s", sideInput.transform().getId(), sideInput.localName());
        pt.putInputs(outerLocalName, sideInput.collection().getId());
        payload.addSideInputs(SideInputId.newBuilder().setTransformId(sideInput.transform().getId()).setLocalName(sideInput.localName()));
    }
    for (UserStateReference userState : getUserStates()) {
        payload.addUserStates(UserStateId.newBuilder().setTransformId(userState.transform().getId()).setLocalName(userState.localName()));
    }
    for (TimerReference timer : getTimers()) {
        payload.addTimers(TimerId.newBuilder().setTransformId(timer.transform().getId()).setLocalName(timer.localName()));
    }
    int outputIndex = 0;
    for (PCollectionNode output : getOutputPCollections()) {
        pt.putOutputs(String.format("materialized_%d", outputIndex), output.getId());
        payload.addOutputs(output.getId());
        outputIndex++;
    }
    // The inner PTransforms of this stage are not exposed on the outer composite; they only
    // appear in the stage payload.
    for (PTransformNode transform : getTransforms()) {
        payload.addTransforms(transform.getId());
    }
    payload.setComponents(
        getComponents()
            .toBuilder()
            .clearTransforms()
            .putAllTransforms(
                getTransforms().stream()
                    .collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform))));
    pt.setSpec(FunctionSpec.newBuilder().setUrn(ExecutableStage.URN).setPayload(payload.build().toByteString()).build());
    return pt.build();
}
Also used : ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)
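
A hedged sketch of the round trip the javadoc describes: encode the stage with toPTransform, then rebuild it with fromPayload. Beyond the methods shown in this example, it assumes only the protobuf-generated ExecutableStagePayload.parseFrom; the stage name "fused_stage" is an arbitrary placeholder.

import org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload;
import org.apache.beam.model.pipeline.v1.RunnerApi.PTransform;
import org.apache.beam.runners.core.construction.graph.ExecutableStage;

/** Sketch only: round-trip an ExecutableStage through its composite PTransform form. */
class StageRoundTripSketch {

    static ExecutableStage roundTrip(ExecutableStage stage) throws Exception {
        // The composite transform looks primitive: no subtransforms, the payload carries the stage.
        PTransform composite = stage.toPTransform("fused_stage");
        ExecutableStagePayload payload =
            ExecutableStagePayload.parseFrom(composite.getSpec().getPayload());
        // fromPayload reconstructs the stage, as the javadoc above states.
        return ExecutableStage.fromPayload(payload);
    }
}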

Example 23 with PCollectionNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode in project beam by apache.

the class OutputDeduplicator method createPartialPCollections.

/**
 * Returns a {@link Map} from the ID of a {@link PCollectionNode PCollection} to a {@link
 * PCollectionNode} that contains part of that {@link PCollectionNode PCollection}.
 */
private static Map<String, PCollectionNode> createPartialPCollections(Collection<PCollectionNode> duplicates, Predicate<String> existingPCollectionIds) {
    Map<String, PCollectionNode> unzippedOutputs = new LinkedHashMap<>();
    Predicate<String> existingOrNewIds = existingPCollectionIds.or(id -> unzippedOutputs.values().stream().map(PCollectionNode::getId).anyMatch(id::equals));
    for (PCollectionNode duplicateOutput : duplicates) {
        String id = SyntheticComponents.uniqueId(duplicateOutput.getId(), existingOrNewIds);
        PCollection partial = duplicateOutput.getPCollection().toBuilder().setUniqueName(id).build();
        // Check to make sure there is only one duplicated output with the same id - which ensures we
        // only introduce one 'partial output' per producer of that output.
        PCollectionNode alreadyDeduplicated = unzippedOutputs.put(duplicateOutput.getId(), PipelineNode.pCollection(id, partial));
        checkArgument(alreadyDeduplicated == null, "a duplicate should only appear once per stage");
    }
    return unzippedOutputs;
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) LinkedHashMap(java.util.LinkedHashMap)
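
The renaming step can be shown in isolation. This sketch derives a partial node from an existing PCollectionNode; it assumes the caller already holds a unique id (the method above obtains one via SyntheticComponents.uniqueId) and uses only APIs visible in the example.

import org.apache.beam.model.pipeline.v1.RunnerApi.PCollection;
import org.apache.beam.runners.core.construction.graph.PipelineNode;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode;

/** Sketch only: build a renamed partial copy of a PCollectionNode. */
class PartialPCollectionSketch {

    static PCollectionNode partialOf(PCollectionNode original, String uniquePartialId) {
        // Keep every field of the original PCollection proto except its unique name.
        PCollection renamed =
            original.getPCollection().toBuilder().setUniqueName(uniquePartialId).build();
        return PipelineNode.pCollection(uniquePartialId, renamed);
    }
}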

Example 24 with PCollectionNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode in project beam by apache.

the class OutputDeduplicator method createFlattenOfPartials.

private static PTransform createFlattenOfPartials(String transformId, String outputId, Collection<PCollectionNode> generatedInputs) {
    PTransform.Builder newFlattenBuilder = PTransform.newBuilder();
    int i = 0;
    for (PCollectionNode generatedInput : generatedInputs) {
        String localInputId = String.format("input_%s", i);
        i++;
        newFlattenBuilder.putInputs(localInputId, generatedInput.getId());
    }
    // Flatten all of the new partial nodes together.
    return newFlattenBuilder.setUniqueName(transformId).putOutputs("output", outputId).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)).build();
}
Also used : PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)
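
A sketch of the shape this synthetic flatten takes for two partial outputs. The local input keys, the "output" key, and the flatten URN mirror the method above; the transform name and PCollection ids are made-up placeholders, not ids the deduplicator actually generates.

import org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec;
import org.apache.beam.model.pipeline.v1.RunnerApi.PTransform;
import org.apache.beam.runners.core.construction.PTransformTranslation;

/** Sketch only: the flatten introduced to merge two partial PCollections. */
class SyntheticFlattenSketch {

    static PTransform flattenTwoPartials() {
        // Ids below are illustrative; OutputDeduplicator picks synthetic ids that avoid clashes.
        return PTransform.newBuilder()
            .setUniqueName("shared.out_flatten")
            .putInputs("input_0", "shared.out:0")
            .putInputs("input_1", "shared.out:1")
            .putOutputs("output", "shared.out")
            .setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN))
            .build();
    }
}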

Example 25 with PCollectionNode

use of org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode in project beam by apache.

the class OutputDeduplicatorTest method duplicateOverStages.

@Test
public void duplicateOverStages() {
    /* When multiple stages and a runner-executed transform produce a PCollection, all should be
     * replaced with synthetic flattens.
     * original graph:
     *             --> one -> .out \
     * red -> .out |                -> shared -> .out -> blue -> .out
     *             --> two -> .out /
     *
     * fused graph:
     *             --> [one -> .out -> shared ->] .out
     * red -> .out |                                   (shared.out) -> blue -> .out
     *             --> [two -> .out -> shared ->] .out
     *
     * deduplicated graph:
     *             --> [one -> .out -> shared ->] .out:0 \
     * red -> .out |                                      -> shared -> .out -> blue ->.out
     *             --> [two -> .out -> shared ->] .out:1 /
     */
    PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
    PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
    PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
    PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
    PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
    PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
    PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
    PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
    PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
    PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
    RunnerApi.Components components =
        Components.newBuilder()
            .putTransforms("one", one)
            .putPcollections(oneOut.getUniqueName(), oneOut)
            .putTransforms("two", two)
            .putPcollections(twoOut.getUniqueName(), twoOut)
            .putTransforms("shared", shared)
            .putPcollections(sharedOut.getUniqueName(), sharedOut)
            .putTransforms("red", red)
            .putPcollections(redOut.getUniqueName(), redOut)
            .putTransforms("blue", blue)
            .putPcollections(blueOut.getUniqueName(), blueOut)
            .build();
    ExecutableStage oneStage =
        ImmutableExecutableStage.of(
            components,
            Environment.getDefaultInstance(),
            PipelineNode.pCollection(redOut.getUniqueName(), redOut),
            ImmutableList.of(),
            ImmutableList.of(),
            ImmutableList.of(),
            ImmutableList.of(PipelineNode.pTransform("one", one), PipelineNode.pTransform("shared", shared)),
            ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)),
            DEFAULT_WIRE_CODER_SETTINGS);
    ExecutableStage twoStage =
        ImmutableExecutableStage.of(
            components,
            Environment.getDefaultInstance(),
            PipelineNode.pCollection(redOut.getUniqueName(), redOut),
            ImmutableList.of(),
            ImmutableList.of(),
            ImmutableList.of(),
            ImmutableList.of(PipelineNode.pTransform("two", two), PipelineNode.pTransform("shared", shared)),
            ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)),
            DEFAULT_WIRE_CODER_SETTINGS);
    PTransformNode redTransform = PipelineNode.pTransform("red", red);
    PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
    QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
    DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage), ImmutableList.of(redTransform, blueTransform));
    assertThat(result.getIntroducedTransforms(), hasSize(1));
    PTransformNode introduced = getOnlyElement(result.getIntroducedTransforms());
    assertThat(introduced.getTransform().getOutputsMap().size(), equalTo(1));
    assertThat(getOnlyElement(introduced.getTransform().getOutputsMap().values()), equalTo(sharedOut.getUniqueName()));
    assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introduced.getTransform().getInputsMap().values().toArray(new String[0])));
    assertThat(result.getDeduplicatedStages().keySet(), hasSize(2));
    List<String> stageOutputs = result.getDeduplicatedStages().values().stream().flatMap(stage -> stage.getOutputPCollections().stream().map(PCollectionNode::getId)).collect(Collectors.toList());
    assertThat(stageOutputs, containsInAnyOrder(introduced.getTransform().getInputsMap().values().toArray()));
    assertThat(result.getDeduplicatedTransforms().keySet(), empty());
    assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(stageOutputs.toArray(new String[0])));
    assertThat(result.getDeduplicatedComponents().getTransformsMap(), hasEntry(introduced.getId(), introduced.getTransform()));
}
Also used : PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) RunWith(org.junit.runner.RunWith) Matchers.hasItems(org.hamcrest.Matchers.hasItems) ArrayList(java.util.ArrayList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Map(java.util.Map) Matchers.hasSize(org.hamcrest.Matchers.hasSize) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Matchers.empty(org.hamcrest.Matchers.empty) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) Collectors(java.util.stream.Collectors) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) DEFAULT_WIRE_CODER_SETTINGS(org.apache.beam.runners.core.construction.graph.ExecutableStage.DEFAULT_WIRE_CODER_SETTINGS)
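
Whatever produces a DeduplicationResult still has to substitute the replacement stages, exactly as fusePipeline does at the end of Example 21. A minimal sketch of that substitution, using only the getOrDefault pattern shown there:

import java.util.Set;
import java.util.stream.Collectors;
import org.apache.beam.runners.core.construction.graph.ExecutableStage;
import org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult;

/** Sketch only: swap each original stage for its deduplicated replacement, if any. */
class ApplyDeduplicationSketch {

    static Set<ExecutableStage> withReplacements(
            Set<ExecutableStage> originalStages, DeduplicationResult result) {
        return originalStages.stream()
            .map(stage -> result.getDeduplicatedStages().getOrDefault(stage, stage))
            .collect(Collectors.toSet());
    }
}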

Aggregations

PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) 34
PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) 22
PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) 19
Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment) 14
Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components) 12
PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) 12
Test (org.junit.Test) 12
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi) 10
Map (java.util.Map) 9
Collection (java.util.Collection) 7
LinkedHashSet (java.util.LinkedHashSet) 7
ArrayList (java.util.ArrayList) 6
HashSet (java.util.HashSet) 6
Collectors (java.util.stream.Collectors) 6
List (java.util.List) 5
PTransformTranslation (org.apache.beam.runners.core.construction.PTransformTranslation) 5
DeduplicationResult (org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) 5
ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) 5
HashMap (java.util.HashMap) 4
TreeSet (java.util.TreeSet) 4