Search in sources :

Example 31 with Environment

use of org.apache.beam.model.pipeline.v1.RunnerApi.Environment in project beam by apache.

the class GreedyPCollectionFusers method canFuseParDo.

/**
 * A ParDo can be fused into a stage if it executes in the same Environment as that stage, and no
 * transform that are upstream of any of its side input are present in that stage.
 *
 * <p>A ParDo that consumes a side input cannot process an element until all of the side inputs
 * contain data for the side input window that contains the element.
 */
private static boolean canFuseParDo(PTransformNode parDo, Environment environment, PCollectionNode candidate, Collection<PCollectionNode> stagePCollections, QueryablePipeline pipeline) {
    Optional<Environment> env = pipeline.getEnvironment(parDo);
    checkArgument(env.isPresent(), "A %s must have an %s associated with it", ParDoPayload.class.getSimpleName(), Environment.class.getSimpleName());
    if (!env.get().equals(environment)) {
        // is never possible.
        return false;
    }
    try {
        ParDoPayload payload = ParDoPayload.parseFrom(parDo.getTransform().getSpec().getPayload());
        if (Maps.filterKeys(parDo.getTransform().getInputsMap(), s -> payload.getTimerFamilySpecsMap().containsKey(s)).values().contains(candidate.getId())) {
            // Allow fusion across timer PCollections because they are a self loop.
            return true;
        } else if (payload.getStateSpecsCount() > 0 || payload.getTimerFamilySpecsCount() > 0) {
            // key-partitioned and preserves keys, these ParDos do not fuse into an existing stage.
            return false;
        } else if (!pipeline.getSideInputs(parDo).isEmpty()) {
            // executable stage alongside any transforms which are upstream of any of its side inputs.
            return false;
        }
    } catch (InvalidProtocolBufferException e) {
        throw new IllegalArgumentException(e);
    }
    return true;
}
Also used : ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment)

Example 32 with Environment

use of org.apache.beam.model.pipeline.v1.RunnerApi.Environment in project beam by apache.

the class GreedyStageFuser method forGrpcPortRead.

/**
 * Returns an {@link ExecutableStage} where the initial {@link PTransformNode PTransform} is a
 * Remote gRPC Port Read, reading elements from the materialized {@link PCollectionNode
 * PCollection}.
 *
 * @param initialNodes the initial set of sibling transforms to fuse into this node. All of the
 *     transforms must consume the {@code inputPCollection} on a per-element basis, and must all
 *     be mutually compatible.
 */
public static ExecutableStage forGrpcPortRead(QueryablePipeline pipeline, PCollectionNode inputPCollection, Set<PTransformNode> initialNodes) {
    checkArgument(!initialNodes.isEmpty(), "%s must contain at least one %s.", GreedyStageFuser.class.getSimpleName(), PTransformNode.class.getSimpleName());
    // Choose the environment from an arbitrary node. The initial nodes may not be empty for this
    // subgraph to make any sense, there has to be at least one processor node
    // (otherwise the stage is gRPC Read -> gRPC Write, which doesn't do anything).
    Environment environment = getStageEnvironment(pipeline, initialNodes);
    ImmutableSet.Builder<PTransformNode> fusedTransforms = ImmutableSet.builder();
    fusedTransforms.addAll(initialNodes);
    Set<SideInputReference> sideInputs = new LinkedHashSet<>();
    Set<UserStateReference> userStates = new LinkedHashSet<>();
    Set<TimerReference> timers = new LinkedHashSet<>();
    Set<PCollectionNode> fusedCollections = new LinkedHashSet<>();
    Set<PCollectionNode> materializedPCollections = new LinkedHashSet<>();
    Queue<PCollectionNode> fusionCandidates = new ArrayDeque<>();
    for (PTransformNode initialConsumer : initialNodes) {
        fusionCandidates.addAll(pipeline.getOutputPCollections(initialConsumer));
        sideInputs.addAll(pipeline.getSideInputs(initialConsumer));
        userStates.addAll(pipeline.getUserStates(initialConsumer));
        timers.addAll(pipeline.getTimers(initialConsumer));
    }
    while (!fusionCandidates.isEmpty()) {
        PCollectionNode candidate = fusionCandidates.poll();
        if (fusedCollections.contains(candidate) || materializedPCollections.contains(candidate)) {
            // This should generally mean we get to a Flatten via multiple paths through the graph and
            // we've already determined what to do with the output.
            LOG.debug("Skipping fusion candidate {} because it is {} in this {}", candidate, fusedCollections.contains(candidate) ? "fused" : "materialized", ExecutableStage.class.getSimpleName());
            continue;
        }
        PCollectionFusibility fusibility = canFuse(pipeline, candidate, environment, fusedCollections);
        switch(fusibility) {
            case MATERIALIZE:
                materializedPCollections.add(candidate);
                break;
            case FUSE:
                // All of the consumers of the candidate PCollection can be fused into this stage. Do so.
                fusedCollections.add(candidate);
                fusedTransforms.addAll(pipeline.getPerElementConsumers(candidate));
                for (PTransformNode consumer : pipeline.getPerElementConsumers(candidate)) {
                    // The outputs of every transform fused into this stage must be either materialized or
                    // themselves fused away, so add them to the set of candidates.
                    fusionCandidates.addAll(pipeline.getOutputPCollections(consumer));
                    sideInputs.addAll(pipeline.getSideInputs(consumer));
                }
                break;
            default:
                throw new IllegalStateException(String.format("Unknown type of %s %s", PCollectionFusibility.class.getSimpleName(), fusibility));
        }
    }
    return ImmutableExecutableStage.ofFullComponents(pipeline.getComponents(), environment, inputPCollection, sideInputs, userStates, timers, fusedTransforms.build(), materializedPCollections, DEFAULT_WIRE_CODER_SETTINGS);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) ArrayDeque(java.util.ArrayDeque) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment)

Example 33 with Environment

use of org.apache.beam.model.pipeline.v1.RunnerApi.Environment in project beam by apache.

the class GreedyPipelineFuser method fusePipeline.

/**
 * Fuses a {@link Pipeline} into a collection of {@link ExecutableStage}.
 *
 * <p>The input is the initial collection of siblings sets which will be fused into {@link
 * ExecutableStage stages}. A sibling in this context represents a pair of (PCollection,
 * PTransform), where the PTransform consumes input elements on a per-element basis from the
 * PCollection, represented by a {@link CollectionConsumer}. A sibling set is a collection of
 * siblings which can execute within a single {@link ExecutableStage}, determined by {@link
 * GreedyPCollectionFusers#isCompatible(PTransformNode, PTransformNode, QueryablePipeline)}.
 *
 * <p>While a pending sibling set exists:
 *
 * <ul>
 *   <li>Retrieve a pending sibling set from the front of the queue.
 *   <li>If the pending sibling set has already been created, continue. Each materialized {@link
 *       PTransformNode} can be consumed by any number of {@link ExecutableStage stages}, but each
 *       {@link PTransformNode} may only be present in a single stage rooted at a single {@link
 *       PCollectionNode}, otherwise it will process elements of that {@link PCollectionNode}
 *       multiple times.
 *   <li>Create a {@link GreedyStageFuser} with those siblings as the initial consuming transforms
 *       of the stage
 *   <li>For each materialized {@link PCollectionNode}, find all of the descendant in-environment
 *       consumers. See {@link #getDescendantConsumers(PCollectionNode)} for details.
 *   <li>Construct all of the sibling sets from the descendant in-environment consumers, and add
 *       them to the queue of sibling sets.
 * </ul>
 */
private FusedPipeline fusePipeline(Collection<PTransformNode> initialUnfusedTransforms, NavigableSet<NavigableSet<CollectionConsumer>> initialConsumers, Set<String> requirements) {
    Map<CollectionConsumer, ExecutableStage> consumedCollectionsAndTransforms = new HashMap<>();
    Set<ExecutableStage> stages = new LinkedHashSet<>();
    Set<PTransformNode> unfusedTransforms = new LinkedHashSet<>(initialUnfusedTransforms);
    Queue<Set<CollectionConsumer>> pendingSiblingSets = new ArrayDeque<>(initialConsumers);
    while (!pendingSiblingSets.isEmpty()) {
        // Only introduce new PCollection consumers. Not performing this introduces potential
        // duplicate paths through the pipeline.
        Set<CollectionConsumer> candidateSiblings = pendingSiblingSets.poll();
        Set<CollectionConsumer> siblingSet = Sets.difference(candidateSiblings, consumedCollectionsAndTransforms.keySet());
        checkState(siblingSet.equals(candidateSiblings) || siblingSet.isEmpty(), "Inconsistent collection of siblings reported for a %s. Initial attempt missed %s", PCollectionNode.class.getSimpleName(), siblingSet);
        if (siblingSet.isEmpty()) {
            LOG.debug("Filtered out duplicate stage root {}", candidateSiblings);
            continue;
        }
        // Create the stage with these siblings as the initial consuming transforms
        ExecutableStage stage = fuseSiblings(siblingSet);
        // don't place them in multiple stages.
        for (CollectionConsumer sibling : siblingSet) {
            consumedCollectionsAndTransforms.put(sibling, stage);
        }
        stages.add(stage);
        for (PCollectionNode materializedOutput : stage.getOutputPCollections()) {
            // Get all of the descendant consumers of each materialized PCollection, and add them to the
            // queue of pending siblings.
            DescendantConsumers descendantConsumers = getDescendantConsumers(materializedOutput);
            unfusedTransforms.addAll(descendantConsumers.getUnfusedNodes());
            NavigableSet<NavigableSet<CollectionConsumer>> siblings = groupSiblings(descendantConsumers.getFusibleConsumers());
            pendingSiblingSets.addAll(siblings);
        }
    }
    // TODO: Figure out where to store this.
    DeduplicationResult deduplicated = OutputDeduplicator.ensureSingleProducer(pipeline, stages, unfusedTransforms);
    // as can compatible producers/consumers if a PCollection is only materialized once.
    return FusedPipeline.of(deduplicated.getDeduplicatedComponents(), stages.stream().map(stage -> deduplicated.getDeduplicatedStages().getOrDefault(stage, stage)).map(GreedyPipelineFuser::sanitizeDanglingPTransformInputs).collect(Collectors.toSet()), Sets.union(deduplicated.getIntroducedTransforms(), unfusedTransforms.stream().map(transform -> deduplicated.getDeduplicatedTransforms().getOrDefault(transform.getId(), transform)).collect(Collectors.toSet())), requirements);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) HashMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultimap) TreeSet(java.util.TreeSet) ComparisonChain(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ComparisonChain) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) Map(java.util.Map) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) LinkedHashSet(java.util.LinkedHashSet) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Logger(org.slf4j.Logger) Collection(java.util.Collection) Set(java.util.Set) NavigableSet(java.util.NavigableSet) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Collectors(java.util.stream.Collectors) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) AutoValue(com.google.auto.value.AutoValue) Entry(java.util.Map.Entry) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Queue(java.util.Queue) ArrayDeque(java.util.ArrayDeque) Comparator(java.util.Comparator) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) NavigableSet(java.util.NavigableSet) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) NavigableSet(java.util.NavigableSet) HashMap(java.util.HashMap) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ArrayDeque(java.util.ArrayDeque) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult)

Example 34 with Environment

use of org.apache.beam.model.pipeline.v1.RunnerApi.Environment in project beam by apache.

the class ImmutableExecutableStageTest method ofFullComponentsOnlyHasStagePTransforms.

@Test
public void ofFullComponentsOnlyHasStagePTransforms() throws Exception {
    Environment env = Environments.createDockerEnvironment("foo");
    PTransform pt = PTransform.newBuilder().putInputs("input", "input.out").putInputs("side_input", "sideInput.in").putInputs("timer", "timer.pc").putOutputs("output", "output.out").putOutputs("timer", "timer.pc").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(RunnerApi.FunctionSpec.newBuilder()).putSideInputs("side_input", RunnerApi.SideInput.getDefaultInstance()).putStateSpecs("user_state", RunnerApi.StateSpec.getDefaultInstance()).putTimerFamilySpecs("timer", RunnerApi.TimerFamilySpec.getDefaultInstance()).build().toByteString())).build();
    PCollection input = PCollection.newBuilder().setUniqueName("input.out").build();
    PCollection sideInput = PCollection.newBuilder().setUniqueName("sideInput.in").build();
    PCollection timer = PCollection.newBuilder().setUniqueName("timer.pc").build();
    PCollection output = PCollection.newBuilder().setUniqueName("output.out").build();
    Components components = Components.newBuilder().putTransforms("pt", pt).putTransforms("other_pt", PTransform.newBuilder().setUniqueName("other").build()).putPcollections("input.out", input).putPcollections("sideInput.in", sideInput).putPcollections("timer.pc", timer).putPcollections("output.out", output).putEnvironments("foo", env).build();
    PTransformNode transformNode = PipelineNode.pTransform("pt", pt);
    SideInputReference sideInputRef = SideInputReference.of(transformNode, "side_input", PipelineNode.pCollection("sideInput.in", sideInput));
    UserStateReference userStateRef = UserStateReference.of(transformNode, "user_state", PipelineNode.pCollection("input.out", input));
    TimerReference timerRef = TimerReference.of(transformNode, "timer");
    ImmutableExecutableStage stage = ImmutableExecutableStage.ofFullComponents(components, env, PipelineNode.pCollection("input.out", input), Collections.singleton(sideInputRef), Collections.singleton(userStateRef), Collections.singleton(timerRef), Collections.singleton(PipelineNode.pTransform("pt", pt)), Collections.singleton(PipelineNode.pCollection("output.out", output)), DEFAULT_WIRE_CODER_SETTINGS);
    assertThat(stage.getComponents().containsTransforms("pt"), is(true));
    assertThat(stage.getComponents().containsTransforms("other_pt"), is(false));
    PTransform stagePTransform = stage.toPTransform("foo");
    assertThat(stagePTransform.getOutputsMap(), hasValue("output.out"));
    assertThat(stagePTransform.getOutputsCount(), equalTo(1));
    assertThat(stagePTransform.getInputsMap(), allOf(hasValue("input.out"), hasValue("sideInput.in")));
    assertThat(stagePTransform.getInputsCount(), equalTo(2));
    ExecutableStagePayload payload = ExecutableStagePayload.parseFrom(stagePTransform.getSpec().getPayload());
    assertThat(payload.getTransformsList(), contains("pt"));
    assertThat(ExecutableStage.fromPayload(payload), equalTo(stage));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 35 with Environment

use of org.apache.beam.model.pipeline.v1.RunnerApi.Environment in project beam by apache.

the class DataflowPipelineTranslatorTest method testSetSdkContainerImageInPipelineProto.

/**
 * Tests that when {@link DataflowPipelineOptions#setSdkContainerImage(String)} pipeline option is
 * set, {@link DataflowRunner} sets that value as the {@link DockerPayload#getContainerImage()} of
 * the default {@link Environment} used when generating the model pipeline proto.
 */
@Test
public void testSetSdkContainerImageInPipelineProto() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    String containerImage = "gcr.io/image:foo";
    options.as(DataflowPipelineOptions.class).setSdkContainerImage(containerImage);
    Pipeline p = Pipeline.create(options);
    SdkComponents sdkComponents = createSdkComponents(options);
    RunnerApi.Pipeline proto = PipelineTranslation.toProto(p, sdkComponents, true);
    JobSpecification specification = DataflowPipelineTranslator.fromOptions(options).translate(p, proto, sdkComponents, DataflowRunner.fromOptions(options), Collections.emptyList());
    RunnerApi.Pipeline pipelineProto = specification.getPipelineProto();
    assertEquals(1, pipelineProto.getComponents().getEnvironmentsCount());
    Environment defaultEnvironment = Iterables.getOnlyElement(pipelineProto.getComponents().getEnvironmentsMap().values());
    DockerPayload payload = DockerPayload.parseFrom(defaultEnvironment.getPayload());
    assertEquals(DataflowRunner.getContainerImageForJob(options), payload.getContainerImage());
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) DockerPayload(org.apache.beam.model.pipeline.v1.RunnerApi.DockerPayload) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)33 Test (org.junit.Test)28 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)17 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)14 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)13 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)12 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)11 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)8 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)8 Map (java.util.Map)7 RemoteEnvironment (org.apache.beam.runners.fnexecution.environment.RemoteEnvironment)7 Pipeline (org.apache.beam.sdk.Pipeline)7 IOException (java.io.IOException)6 SdkComponents (org.apache.beam.runners.core.construction.SdkComponents)5 EnvironmentFactory (org.apache.beam.runners.fnexecution.environment.EnvironmentFactory)5 Provider (org.apache.beam.runners.fnexecution.environment.EnvironmentFactory.Provider)5 ServerFactory (org.apache.beam.sdk.fn.server.ServerFactory)5 InvalidProtocolBufferException (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException)5 Matchers.containsString (org.hamcrest.Matchers.containsString)5 ArrayList (java.util.ArrayList)4