Search in sources :

Example 66 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class GreedyPipelineFuserTest method parDoWithTimerRootsStage.

/*
   * impulse -> .out -> parDo -> .out -> timer -> .out
   * becomes
   * (impulse.out) -> parDo -> (parDo.out)
   * (parDo.out) -> timer
   */
@Test
public void parDoWithTimerRootsStage() {
    // (impulse.out) -> parDo -> (parDo.out)
    // (parDo.out) -> timer -> timer.out
    // timer has a timer spec which prevents it from fusing with an upstream ParDo
    PTransform parDoTransform = PTransform.newBuilder().setUniqueName("ParDo").putInputs("input", "impulse.out").putOutputs("output", "parDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("common").build();
    PTransform timerTransform = PTransform.newBuilder().setUniqueName("TimerParDo").putInputs("input", "parDo.out").putInputs("timer", "timer.out").putOutputs("timer", "timer.out").putOutputs("output", "output.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).putTimerFamilySpecs("timer", TimerFamilySpec.getDefaultInstance()).build().toByteString())).setEnvironmentId("common").build();
    Components components = partialComponents.toBuilder().putTransforms("parDo", parDoTransform).putPcollections("parDo.out", pc("parDo.out")).putTransforms("timer", timerTransform).putPcollections("timer.out", pc("timer.out")).putPcollections("output.out", pc("output.out")).putEnvironments("common", Environments.createDockerEnvironment("common")).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).addRequirements(ParDoTranslation.REQUIRES_STATEFUL_PROCESSING_URN).build());
    assertThat(fused.getRunnerExecutedTransforms(), containsInAnyOrder(PipelineNode.pTransform("impulse", components.getTransformsOrThrow("impulse"))));
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput("impulse.out").withOutputs("parDo.out").withTransforms("parDo"), ExecutableStageMatcher.withInput("parDo.out").withNoOutputs().withTransforms("timer")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 67 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class GreedyPipelineFuserTest method sanitizedTransforms.

@Test
public void sanitizedTransforms() throws Exception {
    PCollection flattenOutput = pc("flatten.out");
    PCollection read1Output = pc("read1.out");
    PCollection read2Output = pc("read2.out");
    PCollection impulse1Output = pc("impulse1.out");
    PCollection impulse2Output = pc("impulse2.out");
    PTransform flattenTransform = PTransform.newBuilder().setUniqueName("Flatten").putInputs(read1Output.getUniqueName(), read1Output.getUniqueName()).putInputs(read2Output.getUniqueName(), read2Output.getUniqueName()).putOutputs(flattenOutput.getUniqueName(), flattenOutput.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform read1Transform = PTransform.newBuilder().setUniqueName("read1").putInputs(impulse1Output.getUniqueName(), impulse1Output.getUniqueName()).putOutputs(read1Output.getUniqueName(), read1Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform read2Transform = PTransform.newBuilder().setUniqueName("read2").putInputs(impulse2Output.getUniqueName(), impulse2Output.getUniqueName()).putOutputs(read2Output.getUniqueName(), read2Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform impulse1Transform = PTransform.newBuilder().setUniqueName("impulse1").putOutputs(impulse1Output.getUniqueName(), impulse1Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).build();
    PTransform impulse2Transform = PTransform.newBuilder().setUniqueName("impulse2").putOutputs(impulse2Output.getUniqueName(), impulse2Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).build();
    Pipeline impulse = Pipeline.newBuilder().addRootTransformIds(impulse1Transform.getUniqueName()).addRootTransformIds(impulse2Transform.getUniqueName()).addRootTransformIds(flattenTransform.getUniqueName()).setComponents(Components.newBuilder().putCoders("coder", Coder.newBuilder().build()).putCoders("windowCoder", Coder.newBuilder().build()).putWindowingStrategies("ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build()).putEnvironments("py", Environments.createDockerEnvironment("py")).putPcollections(flattenOutput.getUniqueName(), flattenOutput).putTransforms(flattenTransform.getUniqueName(), flattenTransform).putPcollections(read1Output.getUniqueName(), read1Output).putTransforms(read1Transform.getUniqueName(), read1Transform).putPcollections(read2Output.getUniqueName(), read2Output).putTransforms(read2Transform.getUniqueName(), read2Transform).putPcollections(impulse1Output.getUniqueName(), impulse1Output).putTransforms(impulse1Transform.getUniqueName(), impulse1Transform).putPcollections(impulse2Output.getUniqueName(), impulse2Output).putTransforms(impulse2Transform.getUniqueName(), impulse2Transform).build()).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(impulse);
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(2));
    assertThat(fused.getFusedStages(), hasSize(2));
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput(impulse1Output.getUniqueName()).withTransforms(flattenTransform.getUniqueName(), read1Transform.getUniqueName()), ExecutableStageMatcher.withInput(impulse2Output.getUniqueName()).withTransforms(flattenTransform.getUniqueName(), read2Transform.getUniqueName())));
    assertThat(fused.getFusedStages().stream().flatMap(s -> s.getComponents().getTransformsOrThrow(flattenTransform.getUniqueName()).getInputsMap().values().stream()).collect(Collectors.toList()), containsInAnyOrder(read1Output.getUniqueName(), read2Output.getUniqueName()));
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Test(org.junit.Test)

Example 68 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class DataflowPipelineTranslatorTest method testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker.

@Test
public void testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker() throws Exception {
    List<String> experiments = new ArrayList<>(ImmutableList.of(GcpOptions.STREAMING_ENGINE_EXPERIMENT, GcpOptions.WINDMILL_SERVICE_EXPERIMENT, "use_runner_v2"));
    JobSpecification jobSpec = runStreamingGroupIntoBatchesAndGetJobSpec(true, experiments);
    List<Step> steps = jobSpec.getJob().getSteps();
    Step shardedStateStep = steps.get(steps.size() - 1);
    Map<String, Object> properties = shardedStateStep.getProperties();
    assertTrue(properties.containsKey(PropertyNames.USES_KEYED_STATE));
    assertTrue(properties.containsKey(PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertEquals("true", getString(properties, PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertTrue(properties.containsKey(PropertyNames.PRESERVES_KEYS));
    assertEquals("true", getString(properties, PropertyNames.PRESERVES_KEYS));
    // Also checks the runner proto is correctly populated.
    Map<String, RunnerApi.PTransform> transformMap = jobSpec.getPipelineProto().getComponents().getTransformsMap();
    boolean transformFound = false;
    for (Map.Entry<String, RunnerApi.PTransform> transform : transformMap.entrySet()) {
        RunnerApi.FunctionSpec spec = transform.getValue().getSpec();
        if (spec.getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_WITH_SHARDED_KEY_URN)) {
            for (String subtransform : transform.getValue().getSubtransformsList()) {
                RunnerApi.PTransform ptransform = transformMap.get(subtransform);
                if (ptransform.getSpec().getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_URN)) {
                    transformFound = true;
                }
            }
        }
    }
    assertTrue(transformFound);
    boolean coderFound = false;
    Map<String, RunnerApi.Coder> coderMap = jobSpec.getPipelineProto().getComponents().getCodersMap();
    for (Map.Entry<String, RunnerApi.Coder> coder : coderMap.entrySet()) {
        if (coder.getValue().getSpec().getUrn().equals(ModelCoders.SHARDED_KEY_CODER_URN)) {
            coderFound = true;
        }
    }
    assertTrue(coderFound);
}
Also used : SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) ArrayList(java.util.ArrayList) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Step(com.google.api.services.dataflow.model.Step) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) PTransform(org.apache.beam.sdk.transforms.PTransform) Test(org.junit.Test)

Example 69 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class CreateExecutableStageNodeFunction method apply.

@Override
public Node apply(MutableNetwork<Node, Edge> input) {
    for (Node node : input.nodes()) {
        if (node instanceof RemoteGrpcPortNode || node instanceof ParallelInstructionNode || node instanceof InstructionOutputNode) {
            continue;
        }
        throw new IllegalArgumentException(String.format("Network contains unknown type of node: %s", input));
    }
    // Fix all non output nodes to have named edges.
    for (Node node : input.nodes()) {
        if (node instanceof InstructionOutputNode) {
            continue;
        }
        for (Node successor : input.successors(node)) {
            for (Edge edge : input.edgesConnecting(node, successor)) {
                if (edge instanceof DefaultEdge) {
                    input.removeEdge(edge);
                    input.addEdge(node, successor, MultiOutputInfoEdge.create(new MultiOutputInfo().setTag(idGenerator.getId())));
                }
            }
        }
    }
    RunnerApi.Components.Builder componentsBuilder = RunnerApi.Components.newBuilder();
    componentsBuilder.mergeFrom(this.pipeline.getComponents());
    // Default to use the Java environment if pipeline doesn't have environment specified.
    if (pipeline.getComponents().getEnvironmentsMap().isEmpty()) {
        String envId = Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn() + idGenerator.getId();
        componentsBuilder.putEnvironments(envId, Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
    }
    // By default, use GlobalWindow for all languages.
    // For java, if there is a IntervalWindowCoder, then use FixedWindow instead.
    // TODO: should get real WindowingStategy from pipeline proto.
    String globalWindowingStrategyId = "generatedGlobalWindowingStrategy" + idGenerator.getId();
    String intervalWindowEncodingWindowingStrategyId = "generatedIntervalWindowEncodingWindowingStrategy" + idGenerator.getId();
    SdkComponents sdkComponents = SdkComponents.create(pipeline.getComponents(), null);
    try {
        registerWindowingStrategy(globalWindowingStrategyId, WindowingStrategy.globalDefault(), componentsBuilder, sdkComponents);
        registerWindowingStrategy(intervalWindowEncodingWindowingStrategyId, WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(1))), componentsBuilder, sdkComponents);
    } catch (IOException exc) {
        throw new RuntimeException("Could not convert default windowing stratey to proto", exc);
    }
    Map<Node, String> nodesToPCollections = new HashMap<>();
    ImmutableMap.Builder<String, NameContext> ptransformIdToNameContexts = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<SideInputInfo>> ptransformIdToSideInputInfos = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<PCollectionView<?>>> ptransformIdToPCollectionViews = ImmutableMap.builder();
    // A field of ExecutableStage which includes the PCollection goes to worker side.
    Set<PCollectionNode> executableStageOutputs = new HashSet<>();
    // A field of ExecutableStage which includes the PCollection goes to runner side.
    Set<PCollectionNode> executableStageInputs = new HashSet<>();
    for (InstructionOutputNode node : Iterables.filter(input.nodes(), InstructionOutputNode.class)) {
        InstructionOutput instructionOutput = node.getInstructionOutput();
        String coderId = "generatedCoder" + idGenerator.getId();
        String windowingStrategyId;
        try (ByteString.Output output = ByteString.newOutput()) {
            try {
                Coder<?> javaCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(instructionOutput.getCodec()));
                Coder<?> elementCoder = ((WindowedValueCoder<?>) javaCoder).getValueCoder();
                sdkComponents.registerCoder(elementCoder);
                RunnerApi.Coder coderProto = CoderTranslation.toProto(elementCoder, sdkComponents);
                componentsBuilder.putCoders(coderId, coderProto);
                // For now, Dataflow runner harness only deal with FixedWindow.
                if (javaCoder instanceof FullWindowedValueCoder) {
                    FullWindowedValueCoder<?> windowedValueCoder = (FullWindowedValueCoder<?>) javaCoder;
                    Coder<?> windowCoder = windowedValueCoder.getWindowCoder();
                    if (windowCoder instanceof IntervalWindowCoder) {
                        windowingStrategyId = intervalWindowEncodingWindowingStrategyId;
                    } else if (windowCoder instanceof GlobalWindow.Coder) {
                        windowingStrategyId = globalWindowingStrategyId;
                    } else {
                        throw new UnsupportedOperationException(String.format("Dataflow portable runner harness doesn't support windowing with %s", windowCoder));
                    }
                } else {
                    throw new UnsupportedOperationException("Dataflow portable runner harness only supports FullWindowedValueCoder");
                }
            } catch (IOException e) {
                throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
            } catch (Exception e) {
                // Coder probably wasn't a java coder
                OBJECT_MAPPER.writeValue(output, instructionOutput.getCodec());
                componentsBuilder.putCoders(coderId, RunnerApi.Coder.newBuilder().setSpec(RunnerApi.FunctionSpec.newBuilder().setPayload(output.toByteString())).build());
                // For non-java coder, hope it's GlobalWindows by default.
                // TODO(BEAM-6231): Actually discover the right windowing strategy.
                windowingStrategyId = globalWindowingStrategyId;
            }
        } catch (IOException e) {
            throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
        }
        // TODO(BEAM-6275): Set correct IsBounded on generated PCollections
        String pcollectionId = node.getPcollectionId();
        RunnerApi.PCollection pCollection = RunnerApi.PCollection.newBuilder().setCoderId(coderId).setWindowingStrategyId(windowingStrategyId).setIsBounded(RunnerApi.IsBounded.Enum.BOUNDED).build();
        nodesToPCollections.put(node, pcollectionId);
        componentsBuilder.putPcollections(pcollectionId, pCollection);
        // is set
        if (isExecutableStageOutputPCollection(input, node)) {
            executableStageOutputs.add(PipelineNode.pCollection(pcollectionId, pCollection));
        }
        if (isExecutableStageInputPCollection(input, node)) {
            executableStageInputs.add(PipelineNode.pCollection(pcollectionId, pCollection));
        }
    }
    componentsBuilder.putAllCoders(sdkComponents.toComponents().getCodersMap());
    Set<PTransformNode> executableStageTransforms = new HashSet<>();
    Set<TimerReference> executableStageTimers = new HashSet<>();
    List<UserStateId> userStateIds = new ArrayList<>();
    Set<SideInputReference> executableStageSideInputs = new HashSet<>();
    for (ParallelInstructionNode node : Iterables.filter(input.nodes(), ParallelInstructionNode.class)) {
        ImmutableMap.Builder<String, PCollectionNode> sideInputIds = ImmutableMap.builder();
        ParallelInstruction parallelInstruction = node.getParallelInstruction();
        String ptransformId = "generatedPtransform" + idGenerator.getId();
        ptransformIdToNameContexts.put(ptransformId, NameContext.create(null, parallelInstruction.getOriginalName(), parallelInstruction.getSystemName(), parallelInstruction.getName()));
        RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
        RunnerApi.FunctionSpec.Builder transformSpec = RunnerApi.FunctionSpec.newBuilder();
        List<String> timerIds = new ArrayList<>();
        if (parallelInstruction.getParDo() != null) {
            ParDoInstruction parDoInstruction = parallelInstruction.getParDo();
            CloudObject userFnSpec = CloudObject.fromSpec(parDoInstruction.getUserFn());
            String userFnClassName = userFnSpec.getClassName();
            if (userFnClassName.equals("CombineValuesFn") || userFnClassName.equals("KeyedCombineFn")) {
                transformSpec = transformCombineValuesFnToFunctionSpec(userFnSpec);
                ptransformIdToPCollectionViews.put(ptransformId, Collections.emptyList());
            } else {
                String parDoPTransformId = getString(userFnSpec, PropertyNames.SERIALIZED_FN);
                RunnerApi.PTransform parDoPTransform = pipeline.getComponents().getTransformsOrDefault(parDoPTransformId, null);
                // TODO: only the non-null branch should exist; for migration ease only
                if (parDoPTransform != null) {
                    checkArgument(parDoPTransform.getSpec().getUrn().equals(PTransformTranslation.PAR_DO_TRANSFORM_URN), "Found transform \"%s\" for ParallelDo instruction, " + " but that transform had unexpected URN \"%s\" (expected \"%s\")", parDoPTransformId, parDoPTransform.getSpec().getUrn(), PTransformTranslation.PAR_DO_TRANSFORM_URN);
                    RunnerApi.ParDoPayload parDoPayload;
                    try {
                        parDoPayload = RunnerApi.ParDoPayload.parseFrom(parDoPTransform.getSpec().getPayload());
                    } catch (InvalidProtocolBufferException exc) {
                        throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
                    }
                    // user timers and user state.
                    for (Map.Entry<String, RunnerApi.TimerFamilySpec> entry : parDoPayload.getTimerFamilySpecsMap().entrySet()) {
                        timerIds.add(entry.getKey());
                    }
                    for (Map.Entry<String, RunnerApi.StateSpec> entry : parDoPayload.getStateSpecsMap().entrySet()) {
                        UserStateId.Builder builder = UserStateId.newBuilder();
                        builder.setTransformId(parDoPTransformId);
                        builder.setLocalName(entry.getKey());
                        userStateIds.add(builder.build());
                    }
                    // To facilitate the creation of Set executableStageSideInputs.
                    for (String sideInputTag : parDoPayload.getSideInputsMap().keySet()) {
                        String sideInputPCollectionId = parDoPTransform.getInputsOrThrow(sideInputTag);
                        RunnerApi.PCollection sideInputPCollection = pipeline.getComponents().getPcollectionsOrThrow(sideInputPCollectionId);
                        pTransform.putInputs(sideInputTag, sideInputPCollectionId);
                        PCollectionNode pCollectionNode = PipelineNode.pCollection(sideInputPCollectionId, sideInputPCollection);
                        sideInputIds.put(sideInputTag, pCollectionNode);
                    }
                    // To facilitate the creation of Map(ptransformId -> pCollectionView), which is
                    // required by constructing an ExecutableStageNode.
                    ImmutableList.Builder<PCollectionView<?>> pcollectionViews = ImmutableList.builder();
                    for (Map.Entry<String, RunnerApi.SideInput> sideInputEntry : parDoPayload.getSideInputsMap().entrySet()) {
                        pcollectionViews.add(RegisterNodeFunction.transformSideInputForRunner(pipeline, parDoPTransform, sideInputEntry.getKey(), sideInputEntry.getValue()));
                    }
                    ptransformIdToPCollectionViews.put(ptransformId, pcollectionViews.build());
                    transformSpec.setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(parDoPayload.toByteString());
                } else {
                    // legacy path - bytes are the FunctionSpec's payload field, basically, and
                    // SDKs expect it in the PTransform's payload field
                    byte[] userFnBytes = getBytes(userFnSpec, PropertyNames.SERIALIZED_FN);
                    transformSpec.setUrn(ParDoTranslation.CUSTOM_JAVA_DO_FN_URN).setPayload(ByteString.copyFrom(userFnBytes));
                }
                if (parDoInstruction.getSideInputs() != null) {
                    ptransformIdToSideInputInfos.put(ptransformId, forSideInputInfos(parDoInstruction.getSideInputs(), true));
                }
            }
        } else if (parallelInstruction.getRead() != null) {
            ReadInstruction readInstruction = parallelInstruction.getRead();
            CloudObject sourceSpec = CloudObject.fromSpec(CloudSourceUtils.flattenBaseSpecs(readInstruction.getSource()).getSpec());
            // TODO: Need to plumb through the SDK specific function spec.
            transformSpec.setUrn(JAVA_SOURCE_URN);
            try {
                byte[] serializedSource = Base64.getDecoder().decode(getString(sourceSpec, SERIALIZED_SOURCE));
                ByteString sourceByteString = ByteString.copyFrom(serializedSource);
                transformSpec.setPayload(sourceByteString);
            } catch (Exception e) {
                throw new IllegalArgumentException(String.format("Unable to process Read %s", parallelInstruction), e);
            }
        } else if (parallelInstruction.getFlatten() != null) {
            transformSpec.setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN);
        } else {
            throw new IllegalArgumentException(String.format("Unknown type of ParallelInstruction %s", parallelInstruction));
        }
        // predecessor in a ParDo. This PCollection is called the "main input".
        for (Node predecessorOutput : input.predecessors(node)) {
            pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(predecessorOutput));
        }
        for (Edge edge : input.outEdges(node)) {
            Node nodeOutput = input.incidentNodes(edge).target();
            MultiOutputInfoEdge edge2 = (MultiOutputInfoEdge) edge;
            pTransform.putOutputs(edge2.getMultiOutputInfo().getTag(), nodesToPCollections.get(nodeOutput));
        }
        pTransform.setSpec(transformSpec);
        PTransformNode pTransformNode = PipelineNode.pTransform(ptransformId, pTransform.build());
        executableStageTransforms.add(pTransformNode);
        for (String timerId : timerIds) {
            executableStageTimers.add(TimerReference.of(pTransformNode, timerId));
        }
        ImmutableMap<String, PCollectionNode> sideInputIdToPCollectionNodes = sideInputIds.build();
        for (String sideInputTag : sideInputIdToPCollectionNodes.keySet()) {
            SideInputReference sideInputReference = SideInputReference.of(pTransformNode, sideInputTag, sideInputIdToPCollectionNodes.get(sideInputTag));
            executableStageSideInputs.add(sideInputReference);
        }
        executableStageTransforms.add(pTransformNode);
    }
    if (executableStageInputs.size() != 1) {
        throw new UnsupportedOperationException("ExecutableStage only support one input PCollection");
    }
    PCollectionNode executableInput = executableStageInputs.iterator().next();
    RunnerApi.Components executableStageComponents = componentsBuilder.build();
    // Get Environment from ptransform, otherwise, use JAVA_SDK_HARNESS_ENVIRONMENT as default.
    Environment executableStageEnv = getEnvironmentFromPTransform(executableStageComponents, executableStageTransforms);
    if (executableStageEnv == null) {
        executableStageEnv = Environments.JAVA_SDK_HARNESS_ENVIRONMENT;
    }
    Set<UserStateReference> executableStageUserStateReference = new HashSet<>();
    for (UserStateId userStateId : userStateIds) {
        executableStageUserStateReference.add(UserStateReference.fromUserStateId(userStateId, executableStageComponents));
    }
    ExecutableStage executableStage = ImmutableExecutableStage.ofFullComponents(executableStageComponents, executableStageEnv, executableInput, executableStageSideInputs, executableStageUserStateReference, executableStageTimers, executableStageTransforms, executableStageOutputs, DEFAULT_WIRE_CODER_SETTINGS);
    return ExecutableStageNode.create(executableStage, ptransformIdToNameContexts.build(), ptransformIdToSideInputInfos.build(), ptransformIdToPCollectionViews.build());
}
Also used : HashMap(java.util.HashMap) MultiOutputInfo(com.google.api.services.dataflow.model.MultiOutputInfo) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) ExecutableStageNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ExecutableStageNode) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ArrayList(java.util.ArrayList) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) SideInputReference(org.apache.beam.runners.core.construction.graph.SideInputReference) ImmutableExecutableStage(org.apache.beam.runners.core.construction.graph.ImmutableExecutableStage) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) HashSet(java.util.HashSet) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) IntervalWindowCoder(org.apache.beam.sdk.transforms.windowing.IntervalWindow.IntervalWindowCoder) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) TimerReference(org.apache.beam.runners.core.construction.graph.TimerReference) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) ReadInstruction(com.google.api.services.dataflow.model.ReadInstruction) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) NameContext(org.apache.beam.runners.dataflow.worker.counters.NameContext) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) UserStateId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.UserStateId) UserStateReference(org.apache.beam.runners.core.construction.graph.UserStateReference) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge)

Example 70 with PTransform

use of org.apache.beam.model.pipeline.v1.RunnerApi.PTransform in project beam by apache.

the class RegisterNodeFunction method transformCombineValuesFnToFunctionSpec.

/**
 * Transforms a CombineValuesFn {@link ParDoInstruction} to an Apache Beam {@link
 * RunnerApi.FunctionSpec}.
 */
private RunnerApi.FunctionSpec.Builder transformCombineValuesFnToFunctionSpec(CloudObject userFn) {
    // Grab the Combine PTransform. This transform is the composite PTransform representing the
    // entire CombinePerKey, and it contains the CombinePayload we need.
    String combinePTransformId = getString(userFn, PropertyNames.SERIALIZED_FN);
    RunnerApi.PTransform combinePerKeyPTransform = pipeline.getComponents().getTransformsOrDefault(combinePTransformId, null);
    checkArgument(combinePerKeyPTransform != null, "Transform with id \"%s\" not found in pipeline.", combinePTransformId);
    checkArgument(combinePerKeyPTransform.getSpec().getUrn().equals(COMBINE_PER_KEY_URN), "Found transform \"%s\" for Combine instruction, " + "but that transform had unexpected URN \"%s\" (expected \"%s\")", combinePerKeyPTransform, combinePerKeyPTransform.getSpec().getUrn(), COMBINE_PER_KEY_URN);
    RunnerApi.CombinePayload combinePayload;
    try {
        combinePayload = RunnerApi.CombinePayload.parseFrom(combinePerKeyPTransform.getSpec().getPayload());
    } catch (InvalidProtocolBufferException exc) {
        throw new RuntimeException("Combine did not have a CombinePayload", exc);
    }
    String phase = getString(userFn, WorkerPropertyNames.PHASE, CombinePhase.ALL);
    String urn;
    switch(phase) {
        case CombinePhase.ALL:
            urn = COMBINE_GROUPED_VALUES_URN;
            break;
        case CombinePhase.ADD:
            urn = COMBINE_PRECOMBINE_URN;
            break;
        case CombinePhase.MERGE:
            urn = COMBINE_MERGE_URN;
            break;
        case CombinePhase.EXTRACT:
            urn = COMBINE_EXTRACT_URN;
            break;
        default:
            throw new RuntimeException("Encountered unknown Combine Phase: " + phase);
    }
    return RunnerApi.FunctionSpec.newBuilder().setUrn(urn).setPayload(combinePayload.toByteString());
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)

Aggregations

PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)58 Test (org.junit.Test)41 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)28 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)23 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)22 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)21 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)19 Map (java.util.Map)18 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)18 ArrayList (java.util.ArrayList)16 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)16 WindowedValue (org.apache.beam.sdk.util.WindowedValue)11 Collection (java.util.Collection)9 SdkComponents (org.apache.beam.runners.core.construction.SdkComponents)9 Collectors (java.util.stream.Collectors)8 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)8 IOException (java.io.IOException)7 HashSet (java.util.HashSet)7 FunctionSpec (org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec)7 Pipeline (org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline)7