Search in sources :

Example 71 with Components

use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

the class GreedyPipelineFuserTest method flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized.

/*
 * Pipeline under test (two environments, "go" and "py", meeting at one flatten):
 *
 *   goImpulse -> .out -> goRead -> .out \                    -> goParDo -> .out
 *                                        -> flatten -> .out |
 *   pyImpulse -> .out -> pyRead -> .out /                    -> pyParDo -> .out
 *
 * Expected shape after fusion:
 *
 *   (goImpulse.out) -> goRead -> goRead.out -> flatten -> (flatten.out_synthetic0)
 *   (pyImpulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out_synthetic1)
 *   flatten.out_synthetic0 & flatten.out_synthetic1 -> synthetic_flatten -> flatten.out
 *   (flatten.out) -> goParDo
 *   (flatten.out) -> pyParDo
 */
@Test
public void flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized() {
    // Specs shared by several transforms below; each transform gets an equal copy.
    FunctionSpec impulseSpec =
        FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN).build();
    FunctionSpec parDoSpec =
        FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
            .setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())
            .build();
    FunctionSpec flattenSpec =
        FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN).build();

    // Python branch feeding the flatten.
    PTransform pyImpulse =
        PTransform.newBuilder()
            .setUniqueName("PyImpulse")
            .putOutputs("output", "pyImpulse.out")
            .setSpec(impulseSpec)
            .build();
    PTransform pyRead =
        PTransform.newBuilder()
            .setUniqueName("PyRead")
            .putInputs("input", "pyImpulse.out")
            .putOutputs("output", "pyRead.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();

    // Go branch feeding the flatten.
    PTransform goImpulse =
        PTransform.newBuilder()
            .setUniqueName("GoImpulse")
            .putOutputs("output", "goImpulse.out")
            .setSpec(impulseSpec)
            .build();
    PTransform goRead =
        PTransform.newBuilder()
            .setUniqueName("GoRead")
            .putInputs("input", "goImpulse.out")
            .putOutputs("output", "goRead.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("go")
            .build();

    // The flatten joining both branches, and one consumer per environment.
    PTransform flatten =
        PTransform.newBuilder()
            .setUniqueName("Flatten")
            .putInputs("goReadInput", "goRead.out")
            .putInputs("pyReadInput", "pyRead.out")
            .putOutputs("output", "flatten.out")
            .setSpec(flattenSpec)
            .build();
    PTransform pyParDo =
        PTransform.newBuilder()
            .setUniqueName("PyParDo")
            .putInputs("input", "flatten.out")
            .putOutputs("output", "pyParDo.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();
    PTransform goParDo =
        PTransform.newBuilder()
            .setUniqueName("GoParDo")
            .putInputs("input", "flatten.out")
            .putOutputs("output", "goParDo.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("go")
            .build();

    Components components =
        Components.newBuilder()
            .putCoders("coder", Coder.newBuilder().build())
            .putCoders("windowCoder", Coder.newBuilder().build())
            .putWindowingStrategies(
                "ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build())
            .putTransforms("pyImpulse", pyImpulse)
            .putPcollections("pyImpulse.out", pc("pyImpulse.out"))
            .putTransforms("pyRead", pyRead)
            .putPcollections("pyRead.out", pc("pyRead.out"))
            .putTransforms("goImpulse", goImpulse)
            .putPcollections("goImpulse.out", pc("goImpulse.out"))
            .putTransforms("goRead", goRead)
            .putPcollections("goRead.out", pc("goRead.out"))
            .putTransforms("flatten", flatten)
            .putPcollections("flatten.out", pc("flatten.out"))
            .putTransforms("pyParDo", pyParDo)
            .putPcollections("pyParDo.out", pc("pyParDo.out"))
            .putTransforms("goParDo", goParDo)
            .putPcollections("goParDo.out", pc("goParDo.out"))
            .putEnvironments("go", Environments.createDockerEnvironment("go"))
            .putEnvironments("py", Environments.createDockerEnvironment("py"))
            .build();

    FusedPipeline fused =
        GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).build());

    // Exactly three runner-executed transforms are expected; two of them are the impulses.
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(3));
    assertThat(
        "The runner should include the impulses for both languages, plus an introduced flatten",
        fused.getRunnerExecutedTransforms(),
        hasItems(
            PipelineNode.pTransform("pyImpulse", pyImpulse),
            PipelineNode.pTransform("goImpulse", goImpulse)));

    // Locate the runner-executed transform whose single output is the original flatten.out.
    PTransformNode flattenNode = null;
    for (PTransformNode candidate : fused.getRunnerExecutedTransforms()) {
        String onlyOutput = getOnlyElement(candidate.getTransform().getOutputsMap().values());
        if ("flatten.out".equals(onlyOutput)) {
            flattenNode = candidate;
        }
    }
    assertThat(flattenNode, not(nullValue()));

    PTransform runnerFlatten = flattenNode.getTransform();
    assertThat(
        runnerFlatten.getSpec().getUrn(), equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));

    // The runner-side flatten must read two distinct PCollections.
    Collection<String> introducedOutputs = runnerFlatten.getInputsMap().values();
    assertThat(new HashSet<>(introducedOutputs), hasSize(2));

    AnyOf<String> anyIntroducedPCollection = anyOf(introducedOutputs.stream().map(Matchers::equalTo).collect(Collectors.toSet()));
    assertThat(
        fused.getFusedStages(),
        containsInAnyOrder(
            ExecutableStageMatcher.withInput("goImpulse.out")
                .withOutputs(anyIntroducedPCollection)
                .withTransforms("goRead", "flatten"),
            ExecutableStageMatcher.withInput("pyImpulse.out")
                .withOutputs(anyIntroducedPCollection)
                .withTransforms("pyRead", "flatten"),
            ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("goParDo"),
            ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("pyParDo")));

    // Every PCollection a stage materializes must be one of the runner flatten's inputs.
    Set<String> materializedStageOutputs =
        fused.getFusedStages().stream()
            .flatMap(stage -> stage.getOutputPCollections().stream())
            .map(PCollectionNode::getId)
            .collect(Collectors.toSet());
    assertThat(
        "All materialized stage outputs should be flattened, and no more",
        materializedStageOutputs,
        containsInAnyOrder(introducedOutputs.toArray(new String[0])));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Matchers(org.hamcrest.Matchers) Test(org.junit.Test)

Example 72 with Components

use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

the class GreedyPipelineFuserTest method singleEnvironmentBecomesASingleStage.

/*
 * Pipeline under test (every SDK transform runs in the "py" environment):
 *
 *   impulse -> .out -> read -> .out -> parDo -> .out -> window -> .out
 *
 * Expected shape after fusion (one stage rooted at the impulse output):
 *
 *   (impulse.out) -> read -> read.out -> parDo -> parDo.out -> window
 */
@Test
public void singleEnvironmentBecomesASingleStage() {
    // Spec shared by the two ParDo-style transforms.
    FunctionSpec parDoSpec =
        FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
            .setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())
            .build();
    FunctionSpec windowSpec =
        FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
            .setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())
            .build();

    PTransform read =
        PTransform.newBuilder()
            .setUniqueName("Read")
            .putInputs("input", "impulse.out")
            .putOutputs("output", "read.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();
    PTransform parDo =
        PTransform.newBuilder()
            .setUniqueName("ParDo")
            .putInputs("input", "read.out")
            .putOutputs("output", "parDo.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();
    PTransform window =
        PTransform.newBuilder()
            .setUniqueName("Window")
            .putInputs("input", "parDo.out")
            .putOutputs("output", "window.out")
            .setSpec(windowSpec)
            .setEnvironmentId("py")
            .build();

    // Extends the shared partialComponents fixture with the three chained transforms.
    Components components =
        partialComponents.toBuilder()
            .putTransforms("read", read)
            .putPcollections("read.out", pc("read.out"))
            .putTransforms("parDo", parDo)
            .putPcollections("parDo.out", pc("parDo.out"))
            .putTransforms("window", window)
            .putPcollections("window.out", pc("window.out"))
            .build();

    FusedPipeline fused =
        GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).build());

    // Only the impulse stays with the runner; everything else lands in a single stage.
    assertThat(
        fused.getRunnerExecutedTransforms(),
        contains(PipelineNode.pTransform("impulse", components.getTransformsOrThrow("impulse"))));
    assertThat(
        fused.getFusedStages(),
        contains(
            ExecutableStageMatcher.withInput("impulse.out")
                .withNoOutputs()
                .withTransforms("read", "parDo", "window")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Test(org.junit.Test)

Example 73 with Components

use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

the class DataflowPipelineTranslatorTest method testPortablePipelineContainsExpectedDependenciesAndCapabilities.

/**
 * Translates a small portable pipeline whose only registered environment carries two file
 * dependencies and the Java capabilities, then checks that the single environment in the
 * translated pipeline proto lists exactly those capabilities and dependencies.
 */
@Test
public void testPortablePipelineContainsExpectedDependenciesAndCapabilities() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    options.setExperiments(Arrays.asList("beam_fn_api"));
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

    // Impulse -> map every element to "" -> one-minute fixed windows.
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        .apply(Impulse.create())
        .apply(
            MapElements.via(
                new SimpleFunction<byte[], String>() {

                    @Override
                    public String apply(byte[] input) {
                        return "";
                    }
                }))
        .apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))));
    runner.replaceV1Transforms(pipeline);

    // Two throwaway files act as the environment's dependencies.
    File file1 = File.createTempFile("file1-", ".txt");
    file1.deleteOnExit();
    File file2 = File.createTempFile("file2-", ".txt");
    file2.deleteOnExit();
    ImmutableList<String> filesToStage =
        ImmutableList.of("file1.txt=" + file1, "file2.txt=" + file2);

    SdkComponents sdkComponents = SdkComponents.create();
    RunnerApi.Environment environment =
        Environments.createDockerEnvironment(DataflowRunner.getContainerImageForJob(options))
            .toBuilder()
            .addAllDependencies(Environments.getArtifacts(filesToStage))
            .addAllCapabilities(Environments.getJavaCapabilities())
            .build();
    sdkComponents.registerEnvironment(environment);

    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
    JobSpecification result =
        translator.translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList());

    // The translated proto must contain exactly one environment; inspect it.
    Components componentsProto = result.getPipelineProto().getComponents();
    RunnerApi.Environment translatedEnvironment =
        Iterables.getOnlyElement(componentsProto.getEnvironmentsMap().values());
    assertThat(
        translatedEnvironment.getCapabilitiesList(),
        containsInAnyOrder(Environments.getJavaCapabilities().toArray(new String[0])));
    assertThat(
        translatedEnvironment.getDependenciesList(),
        containsInAnyOrder(
            Environments.getArtifacts(filesToStage).toArray(new ArtifactInformation[0])));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) Pipeline(org.apache.beam.sdk.Pipeline) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) File(java.io.File) Test(org.junit.Test)

Example 74 with Components

use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

the class GreedyStageFuserTest method flattenWithHeterogeneousInputsAndOutputs.

@Test
public void flattenWithHeterogeneousInputsAndOutputs() {
    // Pipeline under test:
    //
    // (impulse.out) -> pyRead -> pyRead.out \                           -> pyParDo -> pyParDo.out
    // (impulse.out) ->                       -> flatten -> flatten.out |
    // (impulse.out) -> goRead -> goRead.out /                           -> goWindow -> goWindow.out
    //
    // which is expected to fuse into:
    //
    // (impulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out)
    // (impulse.out) -> goRead -> goRead.out -> flatten -> (flatten.out)
    // (flatten.out) -> pyParDo -> pyParDo.out
    // (flatten.out) -> goWindow -> goWindow.out

    // Spec shared by the three ParDo transforms.
    FunctionSpec parDoSpec =
        FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
            .setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())
            .build();

    PTransform pyRead =
        PTransform.newBuilder()
            .putInputs("input", "impulse.out")
            .putOutputs("output", "pyRead.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();
    PTransform goRead =
        PTransform.newBuilder()
            .putInputs("input", "impulse.out")
            .putOutputs("output", "goRead.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("go")
            .build();
    PTransform pyParDo =
        PTransform.newBuilder()
            .putInputs("input", "flatten.out")
            .putOutputs("output", "pyParDo.out")
            .setSpec(parDoSpec)
            .setEnvironmentId("py")
            .build();
    PTransform goWindow =
        PTransform.newBuilder()
            .putInputs("input", "flatten.out")
            .putOutputs("output", "goWindow.out")
            .setSpec(
                FunctionSpec.newBuilder()
                    .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
                    .setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())
                    .build())
            .setEnvironmentId("go")
            .build();
    PTransform flatten =
        PTransform.newBuilder()
            .putInputs("py_input", "pyRead.out")
            .putInputs("go_input", "goRead.out")
            .putOutputs("output", "flatten.out")
            .setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN).build())
            .build();
    PCollection flattenPc = PCollection.newBuilder().setUniqueName("flatten.out").build();

    Components components =
        partialComponents.toBuilder()
            .putTransforms("pyRead", pyRead)
            .putPcollections("pyRead.out", PCollection.newBuilder().setUniqueName("pyRead.out").build())
            .putTransforms("goRead", goRead)
            .putPcollections("goRead.out", PCollection.newBuilder().setUniqueName("goRead.out").build())
            .putTransforms("flatten", flatten)
            .putPcollections("flatten.out", flattenPc)
            .putTransforms("pyParDo", pyParDo)
            .putPcollections("pyParDo.out", PCollection.newBuilder().setUniqueName("pyParDo.out").build())
            .putTransforms("goWindow", goWindow)
            .putPcollections("goWindow.out", PCollection.newBuilder().setUniqueName("goWindow.out").build())
            .putEnvironments("go", Environments.createDockerEnvironment("go"))
            .putEnvironments("py", Environments.createDockerEnvironment("py"))
            .build();
    QueryablePipeline p = QueryablePipeline.forPrimitivesIn(components);

    // One stage per language, each rooted at the impulse output and seeded with that language's read.
    ExecutableStage readFromPy =
        GreedyStageFuser.forGrpcPortRead(
            p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("pyRead", pyRead)));
    ExecutableStage readFromGo =
        GreedyStageFuser.forGrpcPortRead(
            p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("goRead", goRead)));

    assertThat(
        readFromPy.getOutputPCollections(),
        contains(PipelineNode.pCollection("flatten.out", flattenPc)));
    // The py stage has to materialize flatten.out so that the `go` side can consume it. pyParDo
    // reads that materialized PCollection, so it cannot live inside this stage. The go window is
    // excluded from the go stage for the same reason.
    assertThat(
        readFromPy.getTransforms(), not(hasItem(PipelineNode.pTransform("pyParDo", pyParDo))));
    assertThat(
        readFromGo.getOutputPCollections(),
        contains(PipelineNode.pCollection("flatten.out", flattenPc)));
    assertThat(
        readFromGo.getTransforms(), not(hasItem(PipelineNode.pTransform("goWindow", goWindow))));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 75 with Components

use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

the class ProtoOverridesTest method replacesOnlyMatching.

/**
 * Replaces the transform with URN {@code beam:second} and checks that the replacement and its
 * extra components show up in the updated pipeline while the remaining components and the root
 * transform ids stay equal to the originals.
 */
@Test
public void replacesOnlyMatching() {
    // Original pipeline: two root transforms, one PCollection and one coder.
    PTransform first =
        PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn("beam:first")).build();
    PTransform second =
        PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn("beam:second")).build();
    Components originalComponents =
        Components.newBuilder()
            .putTransforms("first", first)
            .putTransforms("second", second)
            .putPcollections(
                "intermediatePc", PCollection.newBuilder().setUniqueName("intermediate").build())
            .putCoders("coder", Coder.newBuilder().setSpec(FunctionSpec.getDefaultInstance()).build())
            .build();
    RunnerApi.Pipeline p =
        Pipeline.newBuilder()
            .addAllRootTransformIds(ImmutableList.of("first", "second"))
            .setComponents(originalComponents)
            .build();

    // Replacement for "second", plus the extra components handed to the replacer.
    ByteString replacementPayload =
        ByteString.copyFrom("foo-bar-baz".getBytes(StandardCharsets.UTF_8));
    PTransform secondReplacement =
        PTransform.newBuilder()
            .addSubtransforms("second_sub")
            .setSpec(
                FunctionSpec.newBuilder()
                    .setUrn("beam:second:replacement")
                    .setPayload(replacementPayload))
            .build();
    WindowingStrategy introducedWS =
        WindowingStrategy.newBuilder().setAccumulationMode(AccumulationMode.Enum.ACCUMULATING).build();
    RunnerApi.Components extraComponents =
        Components.newBuilder()
            .putPcollections(
                "intermediatePc",
                PCollection.newBuilder().setUniqueName("intermediate_replacement").build())
            .putWindowingStrategies("new_ws", introducedWS)
            .putTransforms("second_sub", PTransform.getDefaultInstance())
            .build();

    Pipeline updated =
        ProtoOverrides.updateTransform(
            "beam:second", p, new TestReplacer(secondReplacement, extraComponents));
    Components updatedComponents = updated.getComponents();

    // The matching transform is swapped out and the extra components are present.
    PTransform updatedSecond = updatedComponents.getTransformsOrThrow("second");
    assertThat(updatedSecond, equalTo(secondReplacement));
    assertThat(updatedComponents.getWindowingStrategiesOrThrow("new_ws"), equalTo(introducedWS));
    assertThat(
        updatedComponents.getTransformsOrThrow("second_sub"),
        equalTo(PTransform.getDefaultInstance()));
    // TODO: This might not be appropriate. Merging in the other direction might force that callers
    // are well behaved.
    assertThat(
        updatedComponents.getPcollectionsOrThrow("intermediatePc").getUniqueName(),
        equalTo("intermediate_replacement"));

    // Everything the replacement did not touch must compare equal to the original.
    assertThat(
        updatedComponents.getTransformsOrThrow("first"),
        equalTo(p.getComponents().getTransformsOrThrow("first")));
    assertThat(
        updatedComponents.getCodersOrThrow("coder"),
        equalTo(p.getComponents().getCodersOrThrow("coder")));
    assertThat(updated.getRootTransformIdsList(), equalTo(p.getRootTransformIdsList()));
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) WindowingStrategy(org.apache.beam.model.pipeline.v1.RunnerApi.WindowingStrategy) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Aggregations

Test (org.junit.Test)55 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)49 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)40 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)31 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)20 Map (java.util.Map)16 WindowedValue (org.apache.beam.sdk.util.WindowedValue)16 IOException (java.io.IOException)15 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)15 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)14 Coder (org.apache.beam.sdk.coders.Coder)14 SdkComponents (org.apache.beam.runners.core.construction.SdkComponents)13 Pipeline (org.apache.beam.sdk.Pipeline)13 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)12 FunctionSpec (org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec)11 KvCoder (org.apache.beam.sdk.coders.KvCoder)11 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)11 ArrayList (java.util.ArrayList)10 List (java.util.List)10 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)10