Search in sources :

Example 1 with Sets

use of org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets in project beam by apache.

the class GreedyPipelineFuser method fusePipeline.

/**
 * Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
 *
 * <p>The input is the initial collection of siblings sets which will be fused into {@link
 * ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
 * PTransform), where the PTransform consumes input elements on a per-element basis from the
 * PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
 * siblings which can execute within a single {@link ExecutableStage}, determined by {@link
 * GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
 *
 * <p>While a pending sibling set exists:
 *
 * <ul>
 *   <li>Retrieve a pending sibling set from the front of the queue.
 *   <li>If the pending sibling set has already been created, continue. Each materialized {@link
 *       PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
 *       {@link PTransformNode} may only be present in a single stage rooted at a single {@link
 *       PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
 *       multiple times.
 *   <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
 *       of the stage
 *   <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
 *       consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
 *   <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
 *       them to the queue of sibling sets.
 * </ul>
 */
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
    Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
    Set<ExecutableStage> stages = new LinkedHashSet<>();
    Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
    Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
    while (!pendingSiblingSets.isEmpty()) {
        // Only introduce new PCollection consumers. Not performing this introduces potential
        // duplicate paths through the pipeline.
        Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
        Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
        checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
        if (siblingSet.isEmpty()) {
            LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
            continue;
        }
        // Create the stage with these siblings as the initial consuming transforms
        ExecutableStage stage = fuseSiblings(siblingSet);
        // don't place them in multiple stages.
        for (CollectionConsumer sibling : siblingSet) {
            consumedCollectionsAndTransforms.put(sibling, stage);
        }
        stages.add(stage);
        for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
            // Get all of the descendant consumers of each materialized PCollection, and add them to the
            // queue of pending siblings.
            DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
            unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
            NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
            pendingSiblingSets.addAll(siblings);
        }
    }
    // TODO: Figure out where to store this.
    DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
    // as can compatible producers/consumers if a PCollection is only materialized once.
    return FusedPipeline.of(deduplicated.getDeduplicatedComponents(), stages.stream().map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage)).map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs).collect(Collectors.toSet()), Sets.union(deduplicated.getIntroducedTransforms(), unfusedTransforms.stream().map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform)).collect(Collectors.toSet())), requirements);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) HashMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultimap) TreeSet(java.util.TreeSet) ComparisonChain(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ComparisonChain) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) Map(java.util.Map) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) LinkedHashSet(java.util.LinkedHashSet) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Logger(org.slf4j.Logger) Collection(java.util.Collection) Set(java.util.Set) NavigableSet(java.util.NavigableSet) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Collectors(java.util.stream.Collectors) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) AutoValue(com.google.auto.value.AutoValue) Entry(java.util.Map.Entry) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Queue(java.util.Queue) ArrayDeque(java.util.ArrayDeque) Comparator(java.util.Comparator) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) NavigableSet(java.util.NavigableSet) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) NavigableSet(java.util.NavigableSet) HashMap(java.util.HashMap) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ArrayDeque(java.util.ArrayDeque) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult)

Aggregations

AutoValue (com.google.auto.value.AutoValue)1 ArrayDeque (java.util.ArrayDeque)1 Collection (java.util.Collection)1 Comparator (java.util.Comparator)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 Map (java.util.Map)1 Entry (java.util.Map.Entry)1 NavigableSet (java.util.NavigableSet)1 Queue (java.util.Queue)1 Set (java.util.Set)1 TreeSet (java.util.TreeSet)1 Collectors (java.util.stream.Collectors)1 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)1 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)1 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)1 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)1 Pipeline (org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline)1 PTransformTranslation (org.apache.beam.runners.core.construction.PTransformTranslation)1