
Example 31 with Pipeline

use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

the class FlinkStreamingPortablePipelineTranslator method translateExecutableStage.

private <InputT, OutputT> void translateExecutableStage(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    // TODO: Fail on splittable DoFns.
    // TODO: Special-case single outputs to avoid multiplexing PCollections.
    RunnerApi.Components components = pipeline.getComponents();
    RunnerApi.PTransform transform = components.getTransformsOrThrow(id);
    Map<String, String> outputs = transform.getOutputsMap();
    final RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String inputPCollectionId = stagePayload.getInput();
    final TransformedSideInputs transformedSideInputs;
    if (stagePayload.getSideInputsCount() > 0) {
        transformedSideInputs = transformSideInputs(stagePayload, components, context);
    } else {
        transformedSideInputs = new TransformedSideInputs(Collections.emptyMap(), null);
    }
    Map<TupleTag<?>, OutputTag<WindowedValue<?>>> tagsToOutputTags = Maps.newLinkedHashMap();
    Map<TupleTag<?>, Coder<WindowedValue<?>>> tagsToCoders = Maps.newLinkedHashMap();
    // TODO: does it matter which output we designate as "main"?
    final TupleTag<OutputT> mainOutputTag = outputs.isEmpty() ? null : new TupleTag(outputs.keySet().iterator().next());
    // associate output tags with ids, output manager uses these Integer ids to serialize state
    BiMap<String, Integer> outputIndexMap = createOutputMap(outputs.keySet());
    Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
    Map<TupleTag<?>, Integer> tagsToIds = Maps.newHashMap();
    Map<String, TupleTag<?>> collectionIdToTupleTag = Maps.newHashMap();
    // order output names for deterministic mapping
    for (String localOutputName : new TreeMap<>(outputIndexMap).keySet()) {
        String collectionId = outputs.get(localOutputName);
        Coder<WindowedValue<?>> windowCoder = (Coder) instantiateCoder(collectionId, components);
        outputCoders.put(localOutputName, windowCoder);
        TupleTag<?> tupleTag = new TupleTag<>(localOutputName);
        CoderTypeInformation<WindowedValue<?>> typeInformation = new CoderTypeInformation(windowCoder, context.getPipelineOptions());
        tagsToOutputTags.put(tupleTag, new OutputTag<>(localOutputName, typeInformation));
        tagsToCoders.put(tupleTag, windowCoder);
        tagsToIds.put(tupleTag, outputIndexMap.get(localOutputName));
        collectionIdToTupleTag.put(collectionId, tupleTag);
    }
    final SingleOutputStreamOperator<WindowedValue<OutputT>> outputStream;
    DataStream<WindowedValue<InputT>> inputDataStream = context.getDataStreamOrThrow(inputPCollectionId);
    CoderTypeInformation<WindowedValue<OutputT>> outputTypeInformation = !outputs.isEmpty() ? new CoderTypeInformation(outputCoders.get(mainOutputTag.getId()), context.getPipelineOptions()) : null;
    ArrayList<TupleTag<?>> additionalOutputTags = Lists.newArrayList();
    for (TupleTag<?> tupleTag : tagsToCoders.keySet()) {
        if (!mainOutputTag.getId().equals(tupleTag.getId())) {
            additionalOutputTags.add(tupleTag);
        }
    }
    final Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
    final boolean stateful = stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0;
    final boolean hasSdfProcessFn = stagePayload.getComponents().getTransformsMap().values().stream().anyMatch(pTransform -> pTransform.getSpec().getUrn().equals(PTransformTranslation.SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN));
    Coder keyCoder = null;
    KeySelector<WindowedValue<InputT>, ?> keySelector = null;
    if (stateful || hasSdfProcessFn) {
        // Stateful/SDF stages are only allowed for KV input.
        Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
        if (!(valueCoder instanceof KvCoder)) {
            throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for stateful DoFn '%s' must be KvCoder but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
        }
        if (stateful) {
            keyCoder = ((KvCoder) valueCoder).getKeyCoder();
            keySelector = new KvToByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
        } else {
            // The input element of an SDF stage is KV(KV(element, restriction), size);
            // use the key of the inner KvCoder (the element) as the key.
            if (!(((KvCoder) valueCoder).getKeyCoder() instanceof KvCoder)) {
                throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for splittable DoFn '%s' must be KVCoder(KvCoder, DoubleCoder) but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
            }
            keyCoder = ((KvCoder) ((KvCoder) valueCoder).getKeyCoder()).getKeyCoder();
            keySelector = new SdfByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
        }
        inputDataStream = inputDataStream.keyBy(keySelector);
    }
    DoFnOperator.MultiOutputOutputManagerFactory<OutputT> outputManagerFactory =
        new DoFnOperator.MultiOutputOutputManagerFactory<>(
            mainOutputTag,
            tagsToOutputTags,
            tagsToCoders,
            tagsToIds,
            new SerializablePipelineOptions(context.getPipelineOptions()));
    DoFnOperator<InputT, OutputT> doFnOperator =
        new ExecutableStageDoFnOperator<>(
            transform.getUniqueName(),
            windowedInputCoder,
            Collections.emptyMap(),
            mainOutputTag,
            additionalOutputTags,
            outputManagerFactory,
            transformedSideInputs.unionTagToView,
            new ArrayList<>(transformedSideInputs.unionTagToView.values()),
            getSideInputIdToPCollectionViewMap(stagePayload, components),
            context.getPipelineOptions(),
            stagePayload,
            context.getJobInfo(),
            FlinkExecutableStageContextFactory.getInstance(),
            collectionIdToTupleTag,
            getWindowingStrategy(inputPCollectionId, components),
            keyCoder,
            keySelector);
    final String operatorName = generateNameFromStagePayload(stagePayload);
    if (transformedSideInputs.unionTagToView.isEmpty()) {
        outputStream = inputDataStream.transform(operatorName, outputTypeInformation, doFnOperator);
    } else {
        DataStream<RawUnionValue> sideInputStream = transformedSideInputs.unionedSideInputs.broadcast();
        if (stateful || hasSdfProcessFn) {
            // We have to manually construct the two-input transform because we're not
            // allowed to have only one input keyed, normally. Since Flink 1.5.0 it's
            // possible to use the Broadcast State Pattern which provides a more elegant
            // way to process keyed main input with broadcast state, but it's not feasible
            // here because it breaks the DoFnOperator abstraction.
            TwoInputTransformation<WindowedValue<KV<?, InputT>>, RawUnionValue, WindowedValue<OutputT>> rawFlinkTransform =
                new TwoInputTransformation(
                    inputDataStream.getTransformation(),
                    sideInputStream.getTransformation(),
                    transform.getUniqueName(),
                    doFnOperator,
                    outputTypeInformation,
                    inputDataStream.getParallelism());
            rawFlinkTransform.setStateKeyType(((KeyedStream) inputDataStream).getKeyType());
            rawFlinkTransform.setStateKeySelectors(((KeyedStream) inputDataStream).getKeySelector(), null);
            // We have to cheat around the SingleOutputStreamOperator constructor being protected.
            outputStream =
                new SingleOutputStreamOperator(
                    inputDataStream.getExecutionEnvironment(), rawFlinkTransform) {};
        } else {
            outputStream = inputDataStream.connect(sideInputStream).transform(operatorName, outputTypeInformation, doFnOperator);
        }
    }
    // Assign a unique but consistent id to re-map operator state
    outputStream.uid(transform.getUniqueName());
    if (mainOutputTag != null) {
        context.addDataStream(outputs.get(mainOutputTag.getId()), outputStream);
    }
    for (TupleTag<?> tupleTag : additionalOutputTags) {
        context.addDataStream(outputs.get(tupleTag.getId()), outputStream.getSideOutput(tagsToOutputTags.get(tupleTag)));
    }
}
Also used : KvToByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector) TupleTag(org.apache.beam.sdk.values.TupleTag) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) OutputTag(org.apache.flink.util.OutputTag) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) WindowDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) ExecutableStageDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.ExecutableStageDoFnOperator) SdfByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.SdfByteBufferKeySelector) TwoInputTransformation(org.apache.flink.streaming.api.transformations.TwoInputTransformation) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) SingletonKeyedWorkItemCoder(org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) Coder(org.apache.beam.sdk.coders.Coder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) IOException(java.io.IOException)
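
The loop above iterates output names through a TreeMap so that the name-to-index assignment is deterministic across submissions; the output manager then serializes state keyed by those integer ids. The createOutputMap helper itself is not part of this excerpt; below is a minimal hypothetical sketch of such a helper, assuming sorted-order assignment is its only contract (not the Beam-internal implementation):

import java.util.Set;
import java.util.TreeSet;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

// Hypothetical sketch: assign ids 0..n-1 in sorted-name order so the same
// pipeline always produces the same name -> id mapping.
static BiMap<String, Integer> createOutputMap(Set<String> localOutputNames) {
    BiMap<String, Integer> outputMap = HashBiMap.create();
    int nextId = 0;
    for (String name : new TreeSet<>(localOutputNames)) {
        outputMap.put(name, nextId++);
    }
    return outputMap;
}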

Example 32 with Pipeline

use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

the class WorkerCustomSourcesTest method translateIOToCloudSource.

static com.google.api.services.dataflow.model.Source translateIOToCloudSource(BoundedSource<?> io, DataflowPipelineOptions options) throws Exception {
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline p = Pipeline.create(options);
    p.begin().apply(Read.from(io));
    // Note that we specifically perform this replacement since this is what the DataflowRunner
    // does and the DataflowRunner class does not expose a way to perform these replacements
    // without running the pipeline.
    p.replaceAll(Collections.singletonList(SplittableParDo.PRIMITIVE_BOUNDED_READ_OVERRIDE));
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    SdkComponents sdkComponents = SdkComponents.create();
    RunnerApi.Environment defaultEnvironmentForDataflow = Environments.createDockerEnvironment("dummy-image-url");
    sdkComponents.registerEnvironment(defaultEnvironmentForDataflow);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
    Job workflow = translator.translate(p, pipelineProto, sdkComponents, runner, new ArrayList<DataflowPackage>()).getJob();
    Step step = workflow.getSteps().get(0);
    return stepToCloudSource(step);
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) ArrayList(java.util.ArrayList) DataflowRunner(org.apache.beam.runners.dataflow.DataflowRunner) Step(com.google.api.services.dataflow.model.Step) DataflowPipelineTranslator(org.apache.beam.runners.dataflow.DataflowPipelineTranslator) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) Job(com.google.api.services.dataflow.model.Job) Pipeline(org.apache.beam.sdk.Pipeline)
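
A call site for this helper might look like the following sketch; the bounded source and the option values are placeholders chosen for illustration, not a working Dataflow configuration:

import com.google.api.services.dataflow.model.Source;
import org.apache.beam.runners.dataflow.DataflowRunner;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Placeholder options; a real test would also stub out the Dataflow client.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setRunner(DataflowRunner.class);
options.setProject("dummy-project");
options.setTempLocation("gs://dummy-bucket/tmp");
// Translate a simple bounded source into its cloud Source representation.
Source cloudSource = translateIOToCloudSource(CountingSource.upTo(10L), options);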

Example 33 with Pipeline

use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

the class InsertFetchAndFilterStreamingSideInputNodesTest method testSdkParDoWithSideInput.

@Test
public void testSdkParDoWithSideInput() throws Exception {
    Pipeline p = Pipeline.create();
    PCollection<String> pc = p.apply(Create.of("a", "b", "c"));
    PCollectionView<List<String>> pcView = pc.apply(View.asList());
    pc.apply(ParDo.of(new TestDoFn()).withSideInputs(pcView));
    RunnerApi.Pipeline pipeline = PipelineTranslation.toProto(p);
    Node predecessor = createParDoNode("predecessor");
    InstructionOutputNode mainInput = InstructionOutputNode.create(new InstructionOutput(), "fakeId");
    Node sideInputParDo = createParDoNode(findParDoWithSideInput(pipeline));
    MutableNetwork<Node, Edge> network = createEmptyNetwork();
    network.addNode(predecessor);
    network.addNode(mainInput);
    network.addNode(sideInputParDo);
    network.addEdge(predecessor, mainInput, DefaultEdge.create());
    network.addEdge(mainInput, sideInputParDo, DefaultEdge.create());
    network = InsertFetchAndFilterStreamingSideInputNodes.with(pipeline).forNetwork(network);
    Node mainInputClone = InstructionOutputNode.create(mainInput.getInstructionOutput(), "fakeId");
    Node fetchAndFilter = FetchAndFilterStreamingSideInputsNode.create(pcView.getWindowingStrategyInternal(), ImmutableMap.of(pcView, ParDoTranslation.translateWindowMappingFn(pcView.getWindowMappingFn(), SdkComponents.create(PipelineOptionsFactory.create()))), NameContextsForTests.nameContextForTest());
    MutableNetwork<Node, Edge> expectedNetwork = createEmptyNetwork();
    expectedNetwork.addNode(predecessor);
    expectedNetwork.addNode(mainInputClone);
    expectedNetwork.addNode(fetchAndFilter);
    expectedNetwork.addNode(mainInput);
    expectedNetwork.addNode(sideInputParDo);
    expectedNetwork.addEdge(predecessor, mainInputClone, DefaultEdge.create());
    expectedNetwork.addEdge(mainInputClone, fetchAndFilter, DefaultEdge.create());
    expectedNetwork.addEdge(fetchAndFilter, mainInput, DefaultEdge.create());
    expectedNetwork.addEdge(mainInput, sideInputParDo, DefaultEdge.create());
    assertThatNetworksAreIdentical(expectedNetwork, network);
}
Also used : FetchAndFilterStreamingSideInputsNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.FetchAndFilterStreamingSideInputsNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) Pipeline(org.apache.beam.sdk.Pipeline) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) ArrayList(java.util.ArrayList) List(java.util.List) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge) Test(org.junit.Test)
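
The test above relies on a findParDoWithSideInput helper whose body is not shown in this excerpt. A plausible reconstruction under that assumption: walk the pipeline proto and return the id of the first ParDo whose payload declares a side input.

import java.util.Map;
import org.apache.beam.model.pipeline.v1.RunnerApi;
import org.apache.beam.runners.core.construction.PTransformTranslation;

// Hypothetical sketch of findParDoWithSideInput.
static String findParDoWithSideInput(RunnerApi.Pipeline pipeline) throws Exception {
    for (Map.Entry<String, RunnerApi.PTransform> entry :
            pipeline.getComponents().getTransformsMap().entrySet()) {
        RunnerApi.PTransform transform = entry.getValue();
        if (!PTransformTranslation.PAR_DO_TRANSFORM_URN.equals(transform.getSpec().getUrn())) {
            continue;
        }
        RunnerApi.ParDoPayload payload =
            RunnerApi.ParDoPayload.parseFrom(transform.getSpec().getPayload());
        if (payload.getSideInputsCount() > 0) {
            return entry.getKey();
        }
    }
    throw new IllegalStateException("No ParDo with a side input found in pipeline");
}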

Example 34 with Pipeline

use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

the class CreateExecutableStageNodeFunction method transformCombineValuesFnToFunctionSpec.

/**
 * Transforms a CombineValuesFn {@link ParDoInstruction} to an Apache Beam {@link
 * RunnerApi.FunctionSpec}.
 */
private RunnerApi.FunctionSpec.Builder transformCombineValuesFnToFunctionSpec(CloudObject userFn) {
    // Grab the Combine PTransform. This transform is the composite PTransform representing the
    // entire CombinePerKey, and it contains the CombinePayload we need.
    String combinePTransformId = getString(userFn, PropertyNames.SERIALIZED_FN);
    RunnerApi.PTransform combinePerKeyPTransform = pipeline.getComponents().getTransformsOrDefault(combinePTransformId, null);
    checkArgument(combinePerKeyPTransform != null, "Transform with id \"%s\" not found in pipeline.", combinePTransformId);
    checkArgument(combinePerKeyPTransform.getSpec().getUrn().equals(COMBINE_PER_KEY_URN), "Found transform \"%s\" for Combine instruction, " + "but that transform had unexpected URN \"%s\" (expected \"%s\")", combinePerKeyPTransform, combinePerKeyPTransform.getSpec().getUrn(), COMBINE_PER_KEY_URN);
    RunnerApi.CombinePayload combinePayload;
    try {
        combinePayload = RunnerApi.CombinePayload.parseFrom(combinePerKeyPTransform.getSpec().getPayload());
    } catch (InvalidProtocolBufferException exc) {
        throw new RuntimeException("Combine did not have a CombinePayload", exc);
    }
    String phase = getString(userFn, WorkerPropertyNames.PHASE, CombinePhase.ALL);
    String urn;
    switch(phase) {
        case CombinePhase.ALL:
            urn = COMBINE_GROUPED_VALUES_URN;
            break;
        case CombinePhase.ADD:
            urn = COMBINE_PRECOMBINE_URN;
            break;
        case CombinePhase.MERGE:
            urn = COMBINE_MERGE_URN;
            break;
        case CombinePhase.EXTRACT:
            urn = COMBINE_EXTRACT_URN;
            break;
        default:
            throw new RuntimeException("Encountered unknown Combine Phase: " + phase);
    }
    return RunnerApi.FunctionSpec.newBuilder().setUrn(urn).setPayload(combinePayload.toByteString());
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)
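
For orientation, a hedged sketch of how a caller might assemble the CloudObject this method consumes; the class name and transform id are illustrative placeholders, and the packages of the worker-internal CombinePhase and WorkerPropertyNames classes are assumed to match those of the enclosing class:

import org.apache.beam.runners.dataflow.util.CloudObject;
import org.apache.beam.runners.dataflow.util.PropertyNames;
import org.apache.beam.runners.dataflow.util.Structs;
// CombinePhase and WorkerPropertyNames as imported by the enclosing class.

// "CombineValuesFn" and the transform id below are placeholders.
CloudObject userFn = CloudObject.forClassName("CombineValuesFn");
Structs.addString(userFn, PropertyNames.SERIALIZED_FN, "CombinePerKey-transform-id");
Structs.addString(userFn, WorkerPropertyNames.PHASE, CombinePhase.ADD);
// transformCombineValuesFnToFunctionSpec(userFn) then yields a FunctionSpec
// builder carrying COMBINE_PRECOMBINE_URN plus the original CombinePayload bytes.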

Example 35 with Pipeline

use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

the class RegisterNodeFunction method transformSideInputForSdk.

/**
 * Modifies the process bundle descriptor and updates the PTransform that the SDK harness will see
 * with length prefixed coders used on the side input PCollection and windowing strategy.
 */
private static final void transformSideInputForSdk(RunnerApi.Pipeline pipeline, RunnerApi.PTransform originalPTransform, String sideInputTag, ProcessBundleDescriptor.Builder processBundleDescriptor, RunnerApi.PTransform.Builder updatedPTransform) {
    RunnerApi.PCollection sideInputPCollection = pipeline.getComponents().getPcollectionsOrThrow(originalPTransform.getInputsOrThrow(sideInputTag));
    RunnerApi.WindowingStrategy sideInputWindowingStrategy = pipeline.getComponents().getWindowingStrategiesOrThrow(sideInputPCollection.getWindowingStrategyId());
    // TODO: We should not length prefix the window or key for the SDK side since the
    // key and window are already length delimited via protobuf itself. But we need to
    // maintain the length prefixing within the Runner harness to match the bytes that were
    // materialized to the side input sink.
    // We take the original pipeline coders and add any coders we have added when processing side
    // inputs before building new length prefixed variants.
    RunnerApi.Components.Builder componentsBuilder = pipeline.getComponents().toBuilder();
    componentsBuilder.putAllCoders(processBundleDescriptor.getCodersMap());
    String updatedSdkSideInputCoderId = LengthPrefixUnknownCoders.addLengthPrefixedCoder(sideInputPCollection.getCoderId(), componentsBuilder, false);
    String updatedSdkSideInputWindowCoderId = LengthPrefixUnknownCoders.addLengthPrefixedCoder(sideInputWindowingStrategy.getWindowCoderId(), componentsBuilder, false);
    processBundleDescriptor.putAllCoders(componentsBuilder.getCodersMap());
    String updatedSdkWindowingStrategyId = SyntheticComponents.uniqueId(sideInputPCollection.getWindowingStrategyId() + "-runner_generated", processBundleDescriptor.getWindowingStrategiesMap().keySet()::contains);
    processBundleDescriptor.putWindowingStrategies(updatedSdkWindowingStrategyId, sideInputWindowingStrategy.toBuilder().setWindowCoderId(updatedSdkSideInputWindowCoderId).build());
    RunnerApi.PCollection updatedSdkSideInputPcollection = sideInputPCollection.toBuilder().setCoderId(updatedSdkSideInputCoderId).setWindowingStrategyId(updatedSdkWindowingStrategyId).build();
    // Replace the contents of the PCollection with the updated side input PCollection
    // specification and insert it into the update PTransform.
    processBundleDescriptor.putPcollections(originalPTransform.getInputsOrThrow(sideInputTag), updatedSdkSideInputPcollection);
    updatedPTransform.putInputs(sideInputTag, originalPTransform.getInputsOrThrow(sideInputTag));
}
Also used : SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) SyntheticComponents(org.apache.beam.runners.core.construction.SyntheticComponents) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)
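
SyntheticComponents.uniqueId takes a preferred id plus a predicate over existing ids and returns an id the predicate does not match, which is what keeps the runner-generated windowing strategy id from colliding with entries already in the descriptor. A small usage sketch (the ids are illustrative):

import java.util.HashSet;
import java.util.Set;
import org.apache.beam.runners.core.construction.SyntheticComponents;

// If the preferred id is taken, uniqueId derives a fresh variant of it; the
// exact suffix scheme is an implementation detail.
Set<String> existing = new HashSet<>();
existing.add("ws-1-runner_generated");
String freshId = SyntheticComponents.uniqueId("ws-1-runner_generated", existing::contains);
// freshId is guaranteed not to be contained in `existing`.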

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 117
Test (org.junit.Test): 87
Pipeline (org.apache.beam.sdk.Pipeline): 82
SdkComponents (org.apache.beam.runners.core.construction.SdkComponents): 44
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 43
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 38
Map (java.util.Map): 32
KV (org.apache.beam.sdk.values.KV): 26
Job (com.google.api.services.dataflow.model.Job): 25
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 24
KvCoder (org.apache.beam.sdk.coders.KvCoder): 24
Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components): 23
Coder (org.apache.beam.sdk.coders.Coder): 23
ArrayList (java.util.ArrayList): 22
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 22
HashMap (java.util.HashMap): 20
List (java.util.List): 20
ExecutableStage (org.apache.beam.runners.core.construction.graph.ExecutableStage): 19
IOException (java.io.IOException): 18
PCollection (org.apache.beam.sdk.values.PCollection): 18