Search in sources :

Example 41 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in the Apache Beam project.

the class QueryablePipelineTest method perElementConsumersWithConsumingMultipleTimes.

/**
 * Checks that {@link QueryablePipeline#getPerElementConsumers(PCollectionNode)} reports a
 * transform exactly once even when that transform consumes the queried node several times.
 *
 * <p>The output of a bounded read is passed three times to a single flatten; the result is
 * expected to contain one consumer, and that consumer must carry the flatten URN.
 */
@Test
public void perElementConsumersWithConsumingMultipleTimes() {
    Pipeline pipeline = Pipeline.create();
    PCollection<Long> readResult =
            pipeline.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
    // Feed the very same PCollection into one flatten three times.
    PCollectionList.of(readResult)
            .and(readResult)
            .and(readResult)
            .apply("flatten", Flatten.pCollections());

    Components protoComponents = PipelineTranslation.toProto(pipeline).getComponents();
    // This breaks if the way that IDs are assigned to PTransforms changes in PipelineTranslation
    String readOutputId =
            getOnlyElement(
                    protoComponents.getTransformsOrThrow("BoundedRead").getOutputsMap().values());

    Set<PTransformNode> consumers =
            QueryablePipeline.forPrimitivesIn(protoComponents)
                    .getPerElementConsumers(
                            PipelineNode.pCollection(
                                    readOutputId,
                                    protoComponents.getPcollectionsOrThrow(readOutputId)));

    // One consumer only, despite the three edges from the read output into the flatten.
    assertThat(consumers.size(), equalTo(1));
    PTransformNode soleConsumer = getOnlyElement(consumers);
    assertThat(
            soleConsumer.getTransform().getSpec().getUrn(),
            equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 42 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in the Apache Beam project.

the class GreedyPipelineFuserTest method parDoWithTimerRootsStage.

/*
   * Pipeline under test:
   *   impulse -> .out -> parDo -> .out -> timer -> .out
   * Expected result of fusion:
   *   (impulse.out) -> parDo -> (parDo.out)
   *   (parDo.out) -> timer
   */
@Test
public void parDoWithTimerRootsStage() {
    // Payload for the plain ParDo: only an (empty) DoFn spec.
    ParDoPayload plainPayload =
            ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build();
    // Payload for the timer ParDo: the same, plus a timer family spec. That timer spec is
    // what prevents this transform from fusing with an upstream ParDo.
    ParDoPayload timerPayload =
            ParDoPayload.newBuilder()
                    .setDoFn(FunctionSpec.newBuilder())
                    .putTimerFamilySpecs("timer", TimerFamilySpec.getDefaultInstance())
                    .build();

    // (impulse.out) -> parDo -> (parDo.out)
    PTransform parDoTransform =
            PTransform.newBuilder()
                    .setUniqueName("ParDo")
                    .putInputs("input", "impulse.out")
                    .putOutputs("output", "parDo.out")
                    .setSpec(
                            FunctionSpec.newBuilder()
                                    .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                                    .setPayload(plainPayload.toByteString()))
                    .setEnvironmentId("common")
                    .build();

    // (parDo.out) -> timer -> timer.out
    PTransform timerTransform =
            PTransform.newBuilder()
                    .setUniqueName("TimerParDo")
                    .putInputs("input", "parDo.out")
                    .putInputs("timer", "timer.out")
                    .putOutputs("timer", "timer.out")
                    .putOutputs("output", "output.out")
                    .setSpec(
                            FunctionSpec.newBuilder()
                                    .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                                    .setPayload(timerPayload.toByteString()))
                    .setEnvironmentId("common")
                    .build();

    Components components =
            partialComponents.toBuilder()
                    .putTransforms("parDo", parDoTransform)
                    .putPcollections("parDo.out", pc("parDo.out"))
                    .putTransforms("timer", timerTransform)
                    .putPcollections("timer.out", pc("timer.out"))
                    .putPcollections("output.out", pc("output.out"))
                    .putEnvironments("common", Environments.createDockerEnvironment("common"))
                    .build();

    Pipeline pipelineProto =
            Pipeline.newBuilder()
                    .setComponents(components)
                    .addRequirements(ParDoTranslation.REQUIRES_STATEFUL_PROCESSING_URN)
                    .build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(pipelineProto);

    // Only the impulse is left for the runner to execute.
    assertThat(
            fused.getRunnerExecutedTransforms(),
            containsInAnyOrder(
                    PipelineNode.pTransform("impulse", components.getTransformsOrThrow("impulse"))));
    // The two ParDos end up in separate stages, split at parDo.out.
    assertThat(
            fused.getFusedStages(),
            containsInAnyOrder(
                    ExecutableStageMatcher.withInput("impulse.out")
                            .withOutputs("parDo.out")
                            .withTransforms("parDo"),
                    ExecutableStageMatcher.withInput("parDo.out")
                            .withNoOutputs()
                            .withTransforms("timer")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 43 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in the Apache Beam project.

the class GreedyPipelineFuserTest method compositesIgnored.

/*
   * Input graph (the composite is drawn in parentheses):
   *   impulse -> .out -> ( read -> .out --> goTransform -> .out
   *                                      \
   *                                       -> pyTransform -> .out )
   * Expected stages after fusion:
   *   (impulse.out) -> read -> (read.out)
   *   (read.out) -> goTransform
   *   (read.out) -> pyTransform
   */
@Test
public void compositesIgnored() {
    // "read" and "goTransform" are ParDos in different environments ("py" and "go").
    PTransform read =
            PTransform.newBuilder()
                    .setUniqueName("Read")
                    .putInputs("input", "impulse.out")
                    .putOutputs("output", "read.out")
                    .setSpec(
                            FunctionSpec.newBuilder()
                                    .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                                    .setPayload(
                                            ParDoPayload.newBuilder()
                                                    .setDoFn(FunctionSpec.newBuilder())
                                                    .build()
                                                    .toByteString()))
                    .setEnvironmentId("py")
                    .build();
    PTransform goTransform =
            PTransform.newBuilder()
                    .setUniqueName("GoTransform")
                    .putInputs("input", "read.out")
                    .putOutputs("output", "go.out")
                    .setSpec(
                            FunctionSpec.newBuilder()
                                    .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                                    .setPayload(
                                            ParDoPayload.newBuilder()
                                                    .setDoFn(FunctionSpec.newBuilder())
                                                    .build()
                                                    .toByteString()))
                    .setEnvironmentId("go")
                    .build();
    // "pyTransform" is a window assignment in the "py" environment.
    PTransform pyTransform =
            PTransform.newBuilder()
                    .setUniqueName("PyTransform")
                    .putInputs("input", "read.out")
                    .putOutputs("output", "py.out")
                    .setSpec(
                            FunctionSpec.newBuilder()
                                    .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
                                    .setPayload(
                                            WindowIntoPayload.newBuilder()
                                                    .setWindowFn(FunctionSpec.newBuilder())
                                                    .build()
                                                    .toByteString()))
                    .setEnvironmentId("py")
                    .build();
    // The composite has no spec of its own; it only lists the three transforms above.
    PTransform composite =
            PTransform.newBuilder()
                    .setUniqueName("CompositeMultiLang")
                    .putInputs("input", "impulse.out")
                    .putOutputs("pyOut", "py.out")
                    .putOutputs("goOut", "go.out")
                    .addSubtransforms("read")
                    .addSubtransforms("goTransform")
                    .addSubtransforms("pyTransform")
                    .build();

    Components components =
            partialComponents.toBuilder()
                    .putTransforms("read", read)
                    .putPcollections("read.out", pc("read.out"))
                    .putTransforms("goTransform", goTransform)
                    .putPcollections("go.out", pc("go.out"))
                    .putTransforms("pyTransform", pyTransform)
                    .putPcollections("py.out", pc("py.out"))
                    .putTransforms("compositeMultiLang", composite)
                    .build();

    Pipeline pipelineProto =
            Pipeline.newBuilder()
                    .addRootTransformIds("impulse")
                    .addRootTransformIds("compositeMultiLang")
                    .setComponents(components)
                    .build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(pipelineProto);

    // Impulse is the runner transform
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(1));
    assertThat(fused.getFusedStages(), hasSize(3));
    assertThat(
            fused.getFusedStages(),
            containsInAnyOrder(
                    ExecutableStageMatcher.withInput("impulse.out")
                            .withOutputs("read.out")
                            .withTransforms("read"),
                    ExecutableStageMatcher.withInput("read.out")
                            .withNoOutputs()
                            .withTransforms("pyTransform"),
                    ExecutableStageMatcher.withInput("read.out")
                            .withNoOutputs()
                            .withTransforms("goTransform")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Test(org.junit.Test)

Example 44 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in the Apache Beam project.

the class GreedyPipelineFuserTest method flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized.

/*
   * Input graph: two branches in different environments ("go" and "py") are merged by a
   * flatten whose output is then consumed by one ParDo per environment.
   *
   * goImpulse -> .out -> goRead -> .out \                    -> goParDo -> .out
   *                                      -> flatten -> .out |
   * pyImpulse -> .out -> pyRead -> .out /                    -> pyParDo -> .out
   *
   * becomes
   * (goImpulse.out) -> goRead -> goRead.out -> flatten -> (flatten.out_synthetic0)
   * (pyImpulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out_synthetic1)
   * flatten.out_synthetic0 & flatten.out_synthetic1 -> synthetic_flatten -> flatten.out
   * (flatten.out) -> goParDo
   * (flatten.out) -> pyParDo
   *
   * The test does not hard-code the names of the introduced PCollections; it reads them back
   * from the inputs of the runner-executed flatten and matches the stage outputs against them.
   */
@Test
public void flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized() {
    // Built from scratch (not from partialComponents): coders, a windowing strategy, both
    // impulses, both reads, the flatten, both downstream ParDos and the two docker environments.
    Components components = Components.newBuilder().putCoders("coder", Coder.newBuilder().build()).putCoders("windowCoder", Coder.newBuilder().build()).putWindowingStrategies("ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build()).putTransforms("pyImpulse", PTransform.newBuilder().setUniqueName("PyImpulse").putOutputs("output", "pyImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("pyImpulse.out", pc("pyImpulse.out")).putTransforms("pyRead", PTransform.newBuilder().setUniqueName("PyRead").putInputs("input", "pyImpulse.out").putOutputs("output", "pyRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build()).putPcollections("pyRead.out", pc("pyRead.out")).putTransforms("goImpulse", PTransform.newBuilder().setUniqueName("GoImpulse").putOutputs("output", "goImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("goImpulse.out", pc("goImpulse.out")).putTransforms("goRead", PTransform.newBuilder().setUniqueName("GoRead").putInputs("input", "goImpulse.out").putOutputs("output", "goRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goRead.out", pc("goRead.out")).putTransforms("flatten", PTransform.newBuilder().setUniqueName("Flatten").putInputs("goReadInput", "goRead.out").putInputs("pyReadInput", "pyRead.out").putOutputs("output", "flatten.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)).build()).putPcollections("flatten.out", pc("flatten.out")).putTransforms("pyParDo", 
PTransform.newBuilder().setUniqueName("PyParDo").putInputs("input", "flatten.out").putOutputs("output", "pyParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build()).putPcollections("pyParDo.out", pc("pyParDo.out")).putTransforms("goParDo", PTransform.newBuilder().setUniqueName("GoParDo").putInputs("input", "flatten.out").putOutputs("output", "goParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goParDo.out", pc("goParDo.out")).putEnvironments("go", Environments.createDockerEnvironment("go")).putEnvironments("py", Environments.createDockerEnvironment("py")).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).build());
    // Three runner-executed transforms are expected: the two impulses and one flatten.
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(3));
    assertThat("The runner should include the impulses for both languages, plus an introduced flatten", fused.getRunnerExecutedTransforms(), hasItems(PipelineNode.pTransform("pyImpulse", components.getTransformsOrThrow("pyImpulse")), PipelineNode.pTransform("goImpulse", components.getTransformsOrThrow("goImpulse"))));
    // Find the runner-executed transform whose single output is "flatten.out". If several
    // matched, the last one iterated would win; getOnlyElement requires exactly one output
    // per runner transform.
    PTransformNode flattenNode = null;
    for (PTransformNode runnerTransform : fused.getRunnerExecutedTransforms()) {
        if (getOnlyElement(runnerTransform.getTransform().getOutputsMap().values()).equals("flatten.out")) {
            flattenNode = runnerTransform;
        }
    }
    assertThat(flattenNode, not(nullValue()));
    assertThat(flattenNode.getTransform().getSpec().getUrn(), equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
    // The flatten must read from two distinct PCollections (HashSet removes duplicates).
    assertThat(new HashSet<>(flattenNode.getTransform().getInputsMap().values()), hasSize(2));
    // Matcher that accepts any of the flatten's input ids; used below as the expected output
    // of the two upstream stages, since the exact id each stage produces is not pinned here.
    Collection<String> introducedOutputs = flattenNode.getTransform().getInputsMap().values();
    AnyOf<String> anyIntroducedPCollection = anyOf(introducedOutputs.stream().map(Matchers::equalTo).collect(Collectors.toSet()));
    // Four stages: one per read (each also containing "flatten") and one per downstream ParDo.
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput("goImpulse.out").withOutputs(anyIntroducedPCollection).withTransforms("goRead", "flatten"), ExecutableStageMatcher.withInput("pyImpulse.out").withOutputs(anyIntroducedPCollection).withTransforms("pyRead", "flatten"), ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("goParDo"), ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("pyParDo")));
    // Collect the ids of every output PCollection across all fused stages ...
    Set<String> materializedStageOutputs = fused.getFusedStages().stream().flatMap(executableStage -> executableStage.getOutputPCollections().stream()).map(PCollectionNode::getId).collect(Collectors.toSet());
    // ... and require that set to be exactly the flatten's inputs.
    assertThat("All materialized stage outputs should be flattened, and no more", materializedStageOutputs, containsInAnyOrder(flattenNode.getTransform().getInputsMap().values().toArray(new String[0])));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Matchers(org.hamcrest.Matchers) Test(org.junit.Test)

Example 45 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in the Apache Beam project.

the class GreedyPipelineFuserTest method singleEnvironmentBecomesASingleStage.

/*
   * Every non-impulse transform runs in the same environment ("py"):
   *   impulse -> .out -> read -> .out -> parDo -> .out -> window -> .out
   * so fusion is expected to yield one stage:
   *   (impulse.out) -> read -> read.out -> parDo -> parDo.out -> window
   */
@Test
public void singleEnvironmentBecomesASingleStage() {
    // "read" and "parDo" carry the same ParDo spec, so it is built once and reused.
    FunctionSpec parDoSpec =
            FunctionSpec.newBuilder()
                    .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
                    .setPayload(
                            ParDoPayload.newBuilder()
                                    .setDoFn(FunctionSpec.newBuilder())
                                    .build()
                                    .toByteString())
                    .build();
    FunctionSpec windowSpec =
            FunctionSpec.newBuilder()
                    .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
                    .setPayload(
                            WindowIntoPayload.newBuilder()
                                    .setWindowFn(FunctionSpec.newBuilder())
                                    .build()
                                    .toByteString())
                    .build();

    PTransform read =
            PTransform.newBuilder()
                    .setUniqueName("Read")
                    .putInputs("input", "impulse.out")
                    .putOutputs("output", "read.out")
                    .setSpec(parDoSpec)
                    .setEnvironmentId("py")
                    .build();
    PTransform parDo =
            PTransform.newBuilder()
                    .setUniqueName("ParDo")
                    .putInputs("input", "read.out")
                    .putOutputs("output", "parDo.out")
                    .setSpec(parDoSpec)
                    .setEnvironmentId("py")
                    .build();
    PTransform window =
            PTransform.newBuilder()
                    .setUniqueName("Window")
                    .putInputs("input", "parDo.out")
                    .putOutputs("output", "window.out")
                    .setSpec(windowSpec)
                    .setEnvironmentId("py")
                    .build();

    Components components =
            partialComponents.toBuilder()
                    .putTransforms("read", read)
                    .putPcollections("read.out", pc("read.out"))
                    .putTransforms("parDo", parDo)
                    .putPcollections("parDo.out", pc("parDo.out"))
                    .putTransforms("window", window)
                    .putPcollections("window.out", pc("window.out"))
                    .build();

    Pipeline pipelineProto = Pipeline.newBuilder().setComponents(components).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(pipelineProto);

    // The impulse stays with the runner; everything else collapses into a single stage.
    assertThat(
            fused.getRunnerExecutedTransforms(),
            contains(
                    PipelineNode.pTransform("impulse", components.getTransformsOrThrow("impulse"))));
    assertThat(
            fused.getFusedStages(),
            contains(
                    ExecutableStageMatcher.withInput("impulse.out")
                            .withNoOutputs()
                            .withTransforms("read", "parDo", "window")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Test(org.junit.Test)

Aggregations

Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)49 Test (org.junit.Test)37 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)19 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)18 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)12 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)10 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)10 Map (java.util.Map)9 Pipeline (org.apache.beam.sdk.Pipeline)9 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)7 Collection (java.util.Collection)6 Collectors (java.util.stream.Collectors)6 List (java.util.List)5 ExecutableStagePayload (org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload)5 FunctionSpec (org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec)5 DeduplicationResult (org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult)5 ArrayList (java.util.ArrayList)4 Pipeline (org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline)4 PCollection (org.apache.beam.sdk.values.PCollection)4 ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)4