Search in sources :

Example 71 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class RegisterNodeFunction method apply.

@Override
public Node apply(MutableNetwork<Node, Edge> input) {
    for (Node node : input.nodes()) {
        if (node instanceof RemoteGrpcPortNode || node instanceof ParallelInstructionNode || node instanceof InstructionOutputNode) {
            continue;
        }
        throw new IllegalArgumentException(String.format("Network contains unknown type of node: %s", input));
    }
    // Fix all non output nodes to have named edges.
    for (Node node : input.nodes()) {
        if (node instanceof InstructionOutputNode) {
            continue;
        }
        for (Node successor : input.successors(node)) {
            for (Edge edge : input.edgesConnecting(node, successor)) {
                if (edge instanceof DefaultEdge) {
                    input.removeEdge(edge);
                    input.addEdge(node, successor, MultiOutputInfoEdge.create(new MultiOutputInfo().setTag(idGenerator.getId())));
                }
            }
        }
    }
    // We start off by replacing all edges within the graph with edges that have the named
    // outputs from the predecessor step. For ParallelInstruction Source nodes and RemoteGrpcPort
    // nodes this is a generated port id. All ParDoInstructions will have already
    ProcessBundleDescriptor.Builder processBundleDescriptor = ProcessBundleDescriptor.newBuilder().setId(idGenerator.getId()).setStateApiServiceDescriptor(stateApiServiceDescriptor);
    // For intermediate PCollections we fabricate, we make a bogus WindowingStrategy
    // TODO: create a correct windowing strategy, including coders and environment
    SdkComponents sdkComponents = SdkComponents.create(pipeline.getComponents(), null);
    // Default to use the Java environment if pipeline doesn't have environment specified.
    if (pipeline.getComponents().getEnvironmentsMap().isEmpty()) {
        sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
    }
    String fakeWindowingStrategyId = "fakeWindowingStrategy" + idGenerator.getId();
    try {
        RunnerApi.MessageWithComponents fakeWindowingStrategyProto = WindowingStrategyTranslation.toMessageProto(WindowingStrategy.globalDefault(), sdkComponents);
        processBundleDescriptor.putWindowingStrategies(fakeWindowingStrategyId, fakeWindowingStrategyProto.getWindowingStrategy()).putAllCoders(fakeWindowingStrategyProto.getComponents().getCodersMap()).putAllEnvironments(fakeWindowingStrategyProto.getComponents().getEnvironmentsMap());
    } catch (IOException exc) {
        throw new RuntimeException("Could not convert default windowing stratey to proto", exc);
    }
    Map<Node, String> nodesToPCollections = new HashMap<>();
    ImmutableMap.Builder<String, NameContext> ptransformIdToNameContexts = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<SideInputInfo>> ptransformIdToSideInputInfos = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<PCollectionView<?>>> ptransformIdToPCollectionViews = ImmutableMap.builder();
    ImmutableMap.Builder<String, NameContext> pcollectionIdToNameContexts = ImmutableMap.builder();
    ImmutableMap.Builder<InstructionOutputNode, String> instructionOutputNodeToCoderIdBuilder = ImmutableMap.builder();
    // 2. Generate new PCollectionId and register it with ProcessBundleDescriptor.
    for (InstructionOutputNode node : Iterables.filter(input.nodes(), InstructionOutputNode.class)) {
        InstructionOutput instructionOutput = node.getInstructionOutput();
        String coderId = "generatedCoder" + idGenerator.getId();
        instructionOutputNodeToCoderIdBuilder.put(node, coderId);
        try (ByteString.Output output = ByteString.newOutput()) {
            try {
                Coder<?> javaCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(instructionOutput.getCodec()));
                sdkComponents.registerCoder(javaCoder);
                RunnerApi.Coder coderProto = CoderTranslation.toProto(javaCoder, sdkComponents);
                processBundleDescriptor.putCoders(coderId, coderProto);
            } catch (IOException e) {
                throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
            } catch (Exception e) {
                // Coder probably wasn't a java coder
                OBJECT_MAPPER.writeValue(output, instructionOutput.getCodec());
                processBundleDescriptor.putCoders(coderId, RunnerApi.Coder.newBuilder().setSpec(RunnerApi.FunctionSpec.newBuilder().setPayload(output.toByteString())).build());
            }
        } catch (IOException e) {
            throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
        }
        // Generate new PCollection ID and map it to relevant node.
        // Will later be used to fill PTransform inputs/outputs information.
        String pcollectionId = "generatedPcollection" + idGenerator.getId();
        processBundleDescriptor.putPcollections(pcollectionId, RunnerApi.PCollection.newBuilder().setCoderId(coderId).setWindowingStrategyId(fakeWindowingStrategyId).build());
        nodesToPCollections.put(node, pcollectionId);
        pcollectionIdToNameContexts.put(pcollectionId, NameContext.create(null, instructionOutput.getOriginalName(), instructionOutput.getSystemName(), instructionOutput.getName()));
    }
    processBundleDescriptor.putAllCoders(sdkComponents.toComponents().getCodersMap());
    Map<InstructionOutputNode, String> instructionOutputNodeToCoderIdMap = instructionOutputNodeToCoderIdBuilder.build();
    for (ParallelInstructionNode node : Iterables.filter(input.nodes(), ParallelInstructionNode.class)) {
        ParallelInstruction parallelInstruction = node.getParallelInstruction();
        String ptransformId = "generatedPtransform" + idGenerator.getId();
        ptransformIdToNameContexts.put(ptransformId, NameContext.create(null, parallelInstruction.getOriginalName(), parallelInstruction.getSystemName(), parallelInstruction.getName()));
        RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
        RunnerApi.FunctionSpec.Builder transformSpec = RunnerApi.FunctionSpec.newBuilder();
        if (parallelInstruction.getParDo() != null) {
            ParDoInstruction parDoInstruction = parallelInstruction.getParDo();
            CloudObject userFnSpec = CloudObject.fromSpec(parDoInstruction.getUserFn());
            String userFnClassName = userFnSpec.getClassName();
            if ("CombineValuesFn".equals(userFnClassName) || "KeyedCombineFn".equals(userFnClassName)) {
                transformSpec = transformCombineValuesFnToFunctionSpec(userFnSpec);
                ptransformIdToPCollectionViews.put(ptransformId, Collections.emptyList());
            } else {
                String parDoPTransformId = getString(userFnSpec, PropertyNames.SERIALIZED_FN);
                RunnerApi.PTransform parDoPTransform = pipeline.getComponents().getTransformsOrDefault(parDoPTransformId, null);
                // TODO: only the non-null branch should exist; for migration ease only
                if (parDoPTransform != null) {
                    checkArgument(parDoPTransform.getSpec().getUrn().equals(PTransformTranslation.PAR_DO_TRANSFORM_URN), "Found transform \"%s\" for ParallelDo instruction, " + " but that transform had unexpected URN \"%s\" (expected \"%s\")", parDoPTransformId, parDoPTransform.getSpec().getUrn(), PTransformTranslation.PAR_DO_TRANSFORM_URN);
                    RunnerApi.ParDoPayload parDoPayload;
                    try {
                        parDoPayload = RunnerApi.ParDoPayload.parseFrom(parDoPTransform.getSpec().getPayload());
                    } catch (InvalidProtocolBufferException exc) {
                        throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
                    }
                    ImmutableList.Builder<PCollectionView<?>> pcollectionViews = ImmutableList.builder();
                    for (Map.Entry<String, SideInput> sideInputEntry : parDoPayload.getSideInputsMap().entrySet()) {
                        pcollectionViews.add(transformSideInputForRunner(pipeline, parDoPTransform, sideInputEntry.getKey(), sideInputEntry.getValue()));
                        transformSideInputForSdk(pipeline, parDoPTransform, sideInputEntry.getKey(), processBundleDescriptor, pTransform);
                    }
                    ptransformIdToPCollectionViews.put(ptransformId, pcollectionViews.build());
                    transformSpec.setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(parDoPayload.toByteString());
                } else {
                    // legacy path - bytes are the FunctionSpec's payload field, basically, and
                    // SDKs expect it in the PTransform's payload field
                    byte[] userFnBytes = getBytes(userFnSpec, PropertyNames.SERIALIZED_FN);
                    transformSpec.setUrn(ParDoTranslation.CUSTOM_JAVA_DO_FN_URN).setPayload(ByteString.copyFrom(userFnBytes));
                }
                // Add side input information for batch pipelines
                if (parDoInstruction.getSideInputs() != null) {
                    ptransformIdToSideInputInfos.put(ptransformId, forSideInputInfos(parDoInstruction.getSideInputs(), true));
                }
            }
        } else if (parallelInstruction.getRead() != null) {
            ReadInstruction readInstruction = parallelInstruction.getRead();
            CloudObject sourceSpec = CloudObject.fromSpec(CloudSourceUtils.flattenBaseSpecs(readInstruction.getSource()).getSpec());
            // TODO: Need to plumb through the SDK specific function spec.
            transformSpec.setUrn(JAVA_SOURCE_URN);
            try {
                byte[] serializedSource = Base64.getDecoder().decode(getString(sourceSpec, SERIALIZED_SOURCE));
                ByteString sourceByteString = ByteString.copyFrom(serializedSource);
                transformSpec.setPayload(sourceByteString);
            } catch (Exception e) {
                throw new IllegalArgumentException(String.format("Unable to process Read %s", parallelInstruction), e);
            }
        } else if (parallelInstruction.getFlatten() != null) {
            transformSpec.setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN);
        } else {
            throw new IllegalArgumentException(String.format("Unknown type of ParallelInstruction %s", parallelInstruction));
        }
        for (Node predecessorOutput : input.predecessors(node)) {
            pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(predecessorOutput));
        }
        for (Edge edge : input.outEdges(node)) {
            Node nodeOutput = input.incidentNodes(edge).target();
            MultiOutputInfoEdge edge2 = (MultiOutputInfoEdge) edge;
            pTransform.putOutputs(edge2.getMultiOutputInfo().getTag(), nodesToPCollections.get(nodeOutput));
        }
        pTransform.setSpec(transformSpec);
        processBundleDescriptor.putTransforms(ptransformId, pTransform.build());
    }
    // Add the PTransforms representing the remote gRPC nodes
    for (RemoteGrpcPortNode node : Iterables.filter(input.nodes(), RemoteGrpcPortNode.class)) {
        RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
        Set<Node> predecessors = input.predecessors(node);
        Set<Node> successors = input.successors(node);
        if (predecessors.isEmpty() && !successors.isEmpty()) {
            Node instructionOutputNode = Iterables.getOnlyElement(successors);
            pTransform.putOutputs("generatedOutput" + idGenerator.getId(), nodesToPCollections.get(instructionOutputNode));
            pTransform.setSpec(RunnerApi.FunctionSpec.newBuilder().setUrn(DATA_INPUT_URN).setPayload(node.getRemoteGrpcPort().toBuilder().setCoderId(instructionOutputNodeToCoderIdMap.get(instructionOutputNode)).build().toByteString()).build());
        } else if (!predecessors.isEmpty() && successors.isEmpty()) {
            Node instructionOutputNode = Iterables.getOnlyElement(predecessors);
            pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(instructionOutputNode));
            pTransform.setSpec(RunnerApi.FunctionSpec.newBuilder().setUrn(DATA_OUTPUT_URN).setPayload(node.getRemoteGrpcPort().toBuilder().setCoderId(instructionOutputNodeToCoderIdMap.get(instructionOutputNode)).build().toByteString()).build());
        } else {
            throw new IllegalStateException("Expected either one input OR one output " + "InstructionOutputNode for this RemoteGrpcPortNode");
        }
        processBundleDescriptor.putTransforms(node.getPrimitiveTransformId(), pTransform.build());
    }
    return RegisterRequestNode.create(RegisterRequest.newBuilder().addProcessBundleDescriptor(processBundleDescriptor).build(), ptransformIdToNameContexts.build(), ptransformIdToSideInputInfos.build(), ptransformIdToPCollectionViews.build(), pcollectionIdToNameContexts.build());
}
Also used : HashMap(java.util.HashMap) MultiOutputInfo(com.google.api.services.dataflow.model.MultiOutputInfo) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RegisterRequestNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RegisterRequestNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) DataflowPortabilityPCollectionView(org.apache.beam.runners.dataflow.worker.DataflowPortabilityPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) ProcessBundleDescriptor(org.apache.beam.model.fnexecution.v1.BeamFnApi.ProcessBundleDescriptor) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) ReadInstruction(com.google.api.services.dataflow.model.ReadInstruction) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) NameContext(org.apache.beam.runners.dataflow.worker.counters.NameContext) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge)

Example 72 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class GreedyStageFuserTest method userStateIncludedInStage.

@Test
public void userStateIncludedInStage() {
    Environment env = Environments.createDockerEnvironment("common");
    PTransform readTransform = PTransform.newBuilder().putInputs("input", "impulse.out").putOutputs("output", "read.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("common").build();
    PTransform parDoTransform = PTransform.newBuilder().putInputs("input", "read.out").putOutputs("output", "parDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).putStateSpecs("state_spec", StateSpec.getDefaultInstance()).build().toByteString())).setEnvironmentId("common").build();
    PCollection userStateMainInputPCollection = PCollection.newBuilder().setUniqueName("read.out").build();
    QueryablePipeline p = QueryablePipeline.forPrimitivesIn(partialComponents.toBuilder().putTransforms("read", readTransform).putPcollections("read.out", userStateMainInputPCollection).putTransforms("user_state", PTransform.newBuilder().putInputs("input", "impulse.out").putOutputs("output", "user_state.out").build()).putPcollections("user_state.out", PCollection.newBuilder().setUniqueName("user_state.out").build()).putTransforms("parDo", parDoTransform).putPcollections("parDo.out", PCollection.newBuilder().setUniqueName("parDo.out").build()).putEnvironments("common", env).build());
    PCollectionNode readOutput = getOnlyElement(p.getOutputPCollections(PipelineNode.pTransform("read", readTransform)));
    ExecutableStage subgraph = GreedyStageFuser.forGrpcPortRead(p, readOutput, ImmutableSet.of(PipelineNode.pTransform("parDo", parDoTransform)));
    PTransformNode parDoNode = PipelineNode.pTransform("parDo", parDoTransform);
    UserStateReference userStateRef = UserStateReference.of(parDoNode, "state_spec", PipelineNode.pCollection("read.out", userStateMainInputPCollection));
    assertThat(subgraph.getUserStates(), contains(userStateRef));
    assertThat(subgraph.getOutputPCollections(), emptyIterable());
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 73 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class GreedyStageFuserTest method materializesWithDifferentEnvSibling.

@Test
public void materializesWithDifferentEnvSibling() {
    // (impulse.out) -> read -> read.out -> parDo -> parDo.out
    // \
    // -> window -> window.out
    // Fuses into
    // (impulse.out) -> read -> (read.out)
    // (read.out) -> parDo -> parDo.out
    // (read.out) -> window -> window.out
    // The window can't be fused into the stage, which forces the PCollection to be materialized.
    // ParDo in this case _could_ be fused into the stage, but is not for simplicity of
    // implementation
    Environment env = Environments.createDockerEnvironment("common");
    PTransform readTransform = PTransform.newBuilder().putInputs("input", "impulse.out").putOutputs("output", "read.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("common").build();
    QueryablePipeline p = QueryablePipeline.forPrimitivesIn(partialComponents.toBuilder().putTransforms("read", readTransform).putPcollections("read.out", PCollection.newBuilder().setUniqueName("read.out").build()).putTransforms("parDo", PTransform.newBuilder().putInputs("input", "read.out").putOutputs("output", "parDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("common").build()).putPcollections("parDo.out", PCollection.newBuilder().setUniqueName("parDo.out").build()).putTransforms("window", PTransform.newBuilder().putInputs("input", "read.out").putOutputs("output", "window.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("rare").build()).putPcollections("window.out", PCollection.newBuilder().setUniqueName("window.out").build()).putEnvironments("rare", Environments.createDockerEnvironment("rare")).putEnvironments("common", env).build());
    PTransformNode readNode = PipelineNode.pTransform("read", readTransform);
    PCollectionNode readOutput = getOnlyElement(p.getOutputPCollections(readNode));
    ExecutableStage subgraph = GreedyStageFuser.forGrpcPortRead(p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("read", readTransform)));
    assertThat(subgraph.getOutputPCollections(), contains(readOutput));
    assertThat(subgraph.getTransforms(), contains(readNode));
}
Also used : PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 74 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class GreedyStageFuserTest method flattenWithHeterogeneousInputsAndOutputs.

@Test
public void flattenWithHeterogeneousInputsAndOutputs() {
    // (impulse.out) -> pyRead -> pyRead.out \                           -> pyParDo -> pyParDo.out
    // (impulse.out) ->                       -> flatten -> flatten.out |
    // (impulse.out) -> goRead -> goRead.out /                           -> goWindow -> goWindow.out
    // fuses into
    // (impulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out)
    // (impulse.out) -> goRead -> goRead.out -> flatten -> (flatten.out)
    // (flatten.out) -> pyParDo -> pyParDo.out
    // (flatten.out) -> goWindow -> goWindow.out
    PTransform pyRead = PTransform.newBuilder().putInputs("input", "impulse.out").putOutputs("output", "pyRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString()).build()).setEnvironmentId("py").build();
    PTransform goRead = PTransform.newBuilder().putInputs("input", "impulse.out").putOutputs("output", "goRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString()).build()).setEnvironmentId("go").build();
    PTransform pyParDo = PTransform.newBuilder().putInputs("input", "flatten.out").putOutputs("output", "pyParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString()).build()).setEnvironmentId("py").build();
    PTransform goWindow = PTransform.newBuilder().putInputs("input", "flatten.out").putOutputs("output", "goWindow.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString()).build()).setEnvironmentId("go").build();
    PCollection flattenPc = PCollection.newBuilder().setUniqueName("flatten.out").build();
    Components components = partialComponents.toBuilder().putTransforms("pyRead", pyRead).putPcollections("pyRead.out", PCollection.newBuilder().setUniqueName("pyRead.out").build()).putTransforms("goRead", goRead).putPcollections("goRead.out", PCollection.newBuilder().setUniqueName("goRead.out").build()).putTransforms("flatten", PTransform.newBuilder().putInputs("py_input", "pyRead.out").putInputs("go_input", "goRead.out").putOutputs("output", "flatten.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN).build()).build()).putPcollections("flatten.out", flattenPc).putTransforms("pyParDo", pyParDo).putPcollections("pyParDo.out", PCollection.newBuilder().setUniqueName("pyParDo.out").build()).putTransforms("goWindow", goWindow).putPcollections("goWindow.out", PCollection.newBuilder().setUniqueName("goWindow.out").build()).putEnvironments("go", Environments.createDockerEnvironment("go")).putEnvironments("py", Environments.createDockerEnvironment("py")).build();
    QueryablePipeline p = QueryablePipeline.forPrimitivesIn(components);
    ExecutableStage readFromPy = GreedyStageFuser.forGrpcPortRead(p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("pyRead", pyRead)));
    ExecutableStage readFromGo = GreedyStageFuser.forGrpcPortRead(p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("goRead", goRead)));
    assertThat(readFromPy.getOutputPCollections(), contains(PipelineNode.pCollection("flatten.out", flattenPc)));
    // The stage must materialize the flatten, so the `go` stage can read it; this means that this
    // parDo can't be in the stage, as it'll be a reader of that materialized PCollection. The same
    // is true for the go window.
    assertThat(readFromPy.getTransforms(), not(hasItem(PipelineNode.pTransform("pyParDo", pyParDo))));
    assertThat(readFromGo.getOutputPCollections(), contains(PipelineNode.pCollection("flatten.out", flattenPc)));
    assertThat(readFromGo.getTransforms(), not(hasItem(PipelineNode.pTransform("goWindow", goWindow))));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 75 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class ProtoOverridesTest method replacesOnlyMatching.

@Test
public void replacesOnlyMatching() {
    RunnerApi.Pipeline p = Pipeline.newBuilder().addAllRootTransformIds(ImmutableList.of("first", "second")).setComponents(Components.newBuilder().putTransforms("first", PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn("beam:first")).build()).putTransforms("second", PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn("beam:second")).build()).putPcollections("intermediatePc", PCollection.newBuilder().setUniqueName("intermediate").build()).putCoders("coder", Coder.newBuilder().setSpec(FunctionSpec.getDefaultInstance()).build())).build();
    PTransform secondReplacement = PTransform.newBuilder().addSubtransforms("second_sub").setSpec(FunctionSpec.newBuilder().setUrn("beam:second:replacement").setPayload(ByteString.copyFrom("foo-bar-baz".getBytes(StandardCharsets.UTF_8)))).build();
    WindowingStrategy introducedWS = WindowingStrategy.newBuilder().setAccumulationMode(AccumulationMode.Enum.ACCUMULATING).build();
    RunnerApi.Components extraComponents = Components.newBuilder().putPcollections("intermediatePc", PCollection.newBuilder().setUniqueName("intermediate_replacement").build()).putWindowingStrategies("new_ws", introducedWS).putTransforms("second_sub", PTransform.getDefaultInstance()).build();
    Pipeline updated = ProtoOverrides.updateTransform("beam:second", p, new TestReplacer(secondReplacement, extraComponents));
    PTransform updatedSecond = updated.getComponents().getTransformsOrThrow("second");
    assertThat(updatedSecond, equalTo(secondReplacement));
    assertThat(updated.getComponents().getWindowingStrategiesOrThrow("new_ws"), equalTo(introducedWS));
    assertThat(updated.getComponents().getTransformsOrThrow("second_sub"), equalTo(PTransform.getDefaultInstance()));
    // TODO: This might not be appropriate. Merging in the other direction might force that callers
    // are well behaved.
    assertThat(updated.getComponents().getPcollectionsOrThrow("intermediatePc").getUniqueName(), equalTo("intermediate_replacement"));
    // Assert that the untouched components are unchanged.
    assertThat(updated.getComponents().getTransformsOrThrow("first"), equalTo(p.getComponents().getTransformsOrThrow("first")));
    assertThat(updated.getComponents().getCodersOrThrow("coder"), equalTo(p.getComponents().getCodersOrThrow("coder")));
    assertThat(updated.getRootTransformIdsList(), equalTo(p.getRootTransformIdsList()));
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) WindowingStrategy(org.apache.beam.model.pipeline.v1.RunnerApi.WindowingStrategy) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Test(org.junit.Test)

Aggregations

PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)58 Test (org.junit.Test)41 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)28 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)23 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)22 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)21 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)19 Map (java.util.Map)18 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)18 ArrayList (java.util.ArrayList)16 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)16 WindowedValue (org.apache.beam.sdk.util.WindowedValue)11 Collection (java.util.Collection)9 SdkComponents (org.apache.beam.runners.core.construction.SdkComponents)9 Collectors (java.util.stream.Collectors)8 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)8 IOException (java.io.IOException)7 HashSet (java.util.HashSet)7 FunctionSpec (org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec)7 Pipeline (org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline)7