Search in sources :

Example 6 with Coder

use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

the class ProcessBundleDescriptorsTest method testLengthPrefixingOfKeyCoderInStatefulExecutableStage.

/**
 * Verifies that turning a stateful executable stage into a ProcessBundleDescriptor wraps the key
 * coder of the stateful transform in a LengthPrefixCoder, both on the stage's main input and on
 * the timer family coder.
 */
@Test
public void testLengthPrefixingOfKeyCoderInStatefulExecutableStage() throws Exception {
    // Build a pipeline whose stateful ParDo is keyed by a coder that is not a known model coder.
    Pipeline p = Pipeline.create();
    Coder<Void> keycoder = VoidCoder.of();
    assertThat(ModelCoderRegistrar.isKnownCoder(keycoder), is(false));
    p.apply("impulse", Impulse.create())
        .apply(
            "create",
            ParDo.of(
                new DoFn<byte[], KV<Void, String>>() {

                    @ProcessElement
                    public void process(ProcessContext ctxt) {
                    }
                }))
        .setCoder(KvCoder.of(keycoder, StringUtf8Coder.of()))
        .apply(
            "userState",
            ParDo.of(
                new DoFn<KV<Void, String>, KV<Void, String>>() {

                    @StateId("stateId")
                    private final StateSpec<BagState<String>> bufferState = StateSpecs.bag(StringUtf8Coder.of());

                    @TimerId("timerId")
                    private final TimerSpec timerSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME);

                    @ProcessElement
                    public void processElement(
                        @Element KV<Void, String> element,
                        @StateId("stateId") BagState<String> state,
                        @TimerId("timerId") Timer timer,
                        OutputReceiver<KV<Void, String>> r) {
                    }

                    @OnTimer("timerId")
                    public void onTimer() {
                    }
                }))
        .apply("gbk", GroupByKey.create());

    // Fuse the pipeline and pick the stage that declares the "stateId" user state.
    FusedPipeline fused = GreedyPipelineFuser.fuse(PipelineTranslation.toProto(p));
    Optional<ExecutableStage> statefulStage =
        Iterables.tryFind(
            fused.getFusedStages(),
            (ExecutableStage candidate) ->
                candidate.getUserStates().stream()
                    .anyMatch(userState -> userState.localName().equals("stateId")));
    checkState(statefulStage.isPresent(), "Expected a stage with user state.");
    ExecutableStage stage = statefulStage.get();
    PipelineNode.PCollectionNode mainInput = stage.getInputPCollection();

    // Before translation the key coder must still be the plain Java-serialized coder.
    Map<String, RunnerApi.Coder> codersBefore = stage.getComponents().getCodersMap();
    RunnerApi.Coder mainInputCoderBefore = codersBefore.get(mainInput.getPCollection().getCoderId());
    RunnerApi.Coder keyCoderBefore =
        codersBefore.get(ModelCoders.getKvCoderComponents(mainInputCoderBefore).keyCoderId());
    assertThat(keyCoderBefore.getSpec().getUrn(), is(CoderTranslation.JAVA_SERIALIZED_CODER_URN));

    // After translation the main input's key coder must be length prefixed.
    BeamFnApi.ProcessBundleDescriptor descriptor =
        ProcessBundleDescriptors.fromExecutableStage(
                "test_stage", stage, Endpoints.ApiServiceDescriptor.getDefaultInstance())
            .getProcessBundleDescriptor();
    Map<String, RunnerApi.Coder> codersAfter = descriptor.getCodersMap();
    RunnerApi.Coder mainInputCoderAfter =
        codersAfter.get(descriptor.getPcollectionsOrThrow(mainInput.getId()).getCoderId());
    RunnerApi.Coder keyCoderAfter =
        codersAfter.get(ModelCoders.getKvCoderComponents(mainInputCoderAfter).keyCoderId());
    ensureLengthPrefixed(keyCoderAfter, keyCoderBefore, codersAfter);

    // The first component of the timer family coder (treated here as the timer key coder) must
    // be length prefixed as well.
    TimerReference timerReference = Iterables.getOnlyElement(stage.getTimers());
    RunnerApi.ParDoPayload payload =
        RunnerApi.ParDoPayload.parseFrom(
            descriptor.getTransformsOrThrow(timerReference.transform().getId()).getSpec().getPayload());
    RunnerApi.TimerFamilySpec timerFamily = payload.getTimerFamilySpecsOrThrow(timerReference.localName());
    RunnerApi.Coder timerFamilyCoder = codersAfter.get(timerFamily.getTimerFamilyCoderId());
    RunnerApi.Coder timerKeyCoderAfter = codersAfter.get(timerFamilyCoder.getComponentCoderIds(0));
    ensureLengthPrefixed(timerKeyCoderAfter, keyCoderBefore, codersAfter);
}
Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) Endpoints(org.apache.beam.model.pipeline.v1.Endpoints) StateSpec(org.apache.beam.sdk.state.StateSpec) KV(org.apache.beam.sdk.values.KV) CoderTranslation(org.apache.beam.runners.core.construction.CoderTranslation) TimerSpecs(org.apache.beam.sdk.state.TimerSpecs) Coder(org.apache.beam.sdk.coders.Coder) Impulse(org.apache.beam.sdk.transforms.Impulse) GreedyPipelineFuser(org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) Optional(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Optional) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) TimerSpec(org.apache.beam.sdk.state.TimerSpec) Map(java.util.Map) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) TimerReference(org.apache.beam.runners.core.construction.graph.TimerReference) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) ModelCoderRegistrar(org.apache.beam.runners.core.construction.ModelCoderRegistrar) Pipeline(org.apache.beam.sdk.Pipeline) ProcessContext(org.apache.beam.sdk.transforms.DoFn.ProcessContext) RestrictionTracker(org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) FusedPipeline(org.apache.beam.runners.core.construction.graph.FusedPipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) KvCoder(org.apache.beam.sdk.coders.KvCoder) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) ProcessElement(org.apache.beam.sdk.transforms.DoFn.ProcessElement) Test(org.junit.Test) BeamFnApi(org.apache.beam.model.fnexecution.v1.BeamFnApi) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) ModelCoders(org.apache.beam.runners.core.construction.ModelCoders) Serializable(java.io.Serializable) 
SplittableParDoExpander(org.apache.beam.runners.core.construction.graph.SplittableParDoExpander) BagState(org.apache.beam.sdk.state.BagState) StateSpecs(org.apache.beam.sdk.state.StateSpecs) ParDo(org.apache.beam.sdk.transforms.ParDo) ProtoOverrides(org.apache.beam.runners.core.construction.graph.ProtoOverrides) Timer(org.apache.beam.sdk.state.Timer) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) TimeDomain(org.apache.beam.sdk.state.TimeDomain) TimerReference(org.apache.beam.runners.core.construction.graph.TimerReference) ProcessContext(org.apache.beam.sdk.transforms.DoFn.ProcessContext) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) BagState(org.apache.beam.sdk.state.BagState) TimerSpec(org.apache.beam.sdk.state.TimerSpec) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) KvCoder(org.apache.beam.sdk.coders.KvCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) BeamFnApi(org.apache.beam.model.fnexecution.v1.BeamFnApi) FusedPipeline(org.apache.beam.runners.core.construction.graph.FusedPipeline) KV(org.apache.beam.sdk.values.KV) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) Pipeline(org.apache.beam.sdk.Pipeline) FusedPipeline(org.apache.beam.runners.core.construction.graph.FusedPipeline) Timer(org.apache.beam.sdk.state.Timer) ProcessElement(org.apache.beam.sdk.transforms.DoFn.ProcessElement) Test(org.junit.Test)

Example 7 with Coder

use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

the class RemoteExecutionTest method testExecution.

/**
 * Executes a single fused stage (create -> len -> addKeys) through the bundle processor and
 * checks that every remote output received the expected keyed string lengths.
 */
@Test
public void testExecution() throws Exception {
    launchSdkHarness(PipelineOptionsFactory.create());

    // Emit three words, map each to its length, and key every length by "foo".
    Pipeline p = Pipeline.create();
    p.apply("impulse", Impulse.create())
        .apply(
            "create",
            ParDo.of(
                new DoFn<byte[], String>() {

                    @ProcessElement
                    public void process(ProcessContext ctxt) {
                        for (String word : new String[] {"zero", "one", "two"}) {
                            ctxt.output(word);
                        }
                    }
                }))
        .apply(
            "len",
            ParDo.of(
                new DoFn<String, Long>() {

                    @ProcessElement
                    public void process(ProcessContext ctxt) {
                        String word = ctxt.element();
                        ctxt.output((long) word.length());
                    }
                }))
        .apply("addKeys", WithKeys.of("foo"))
        .setCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianLongCoder.of()))
        .apply("gbk", GroupByKey.create());

    // The whole pipeline up to the GroupByKey is expected to fuse into exactly one stage.
    FusedPipeline fused = GreedyPipelineFuser.fuse(PipelineTranslation.toProto(p));
    checkState(fused.getFusedStages().size() == 1, "Expected exactly one fused stage");
    ExecutableStage stage = fused.getFusedStages().iterator().next();

    ExecutableProcessBundleDescriptor descriptor =
        ProcessBundleDescriptors.fromExecutableStage("my_stage", stage, dataServer.getApiServiceDescriptor());
    BundleProcessor processor =
        controlClient.getProcessor(
            descriptor.getProcessBundleDescriptor(), descriptor.getRemoteInputDestinations());

    // Register one synchronized collecting receiver per remote output.
    Map<String, ? super Coder<WindowedValue<?>>> remoteOutputCoders = descriptor.getRemoteOutputCoders();
    Map<String, Collection<? super WindowedValue<?>>> collectedByOutput = new HashMap<>();
    Map<String, RemoteOutputReceiver<?>> receiversByOutput = new HashMap<>();
    for (Entry<String, ? super Coder<WindowedValue<?>>> coderEntry : remoteOutputCoders.entrySet()) {
        List<? super WindowedValue<?>> collected = Collections.synchronizedList(new ArrayList<>());
        String outputId = coderEntry.getKey();
        collectedByOutput.put(outputId, collected);
        receiversByOutput.put(
            outputId,
            RemoteOutputReceiver.of(
                (Coder) coderEntry.getValue(),
                (FnDataReceiver<? super WindowedValue<?>>) collected::add));
    }

    // Feed a single empty element to the only input; closing the bundle ends the try block.
    try (RemoteBundle bundle = processor.newBundle(receiversByOutput, BundleProgressHandler.ignored())) {
        Iterables.getOnlyElement(bundle.getInputReceivers().values()).accept(valueInGlobalWindow(new byte[0]));
    }

    // "zero" has length 4; "one" and "two" have length 3.
    for (Collection<? super WindowedValue<?>> received : collectedByOutput.values()) {
        assertThat(
            received,
            containsInAnyOrder(
                valueInGlobalWindow(byteValueOf("foo", 4)),
                valueInGlobalWindow(byteValueOf("foo", 3)),
                valueInGlobalWindow(byteValueOf("foo", 3))));
    }
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) BundleProcessor(org.apache.beam.runners.fnexecution.control.SdkHarnessClient.BundleProcessor) WindowedValue(org.apache.beam.sdk.util.WindowedValue) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) ExecutableProcessBundleDescriptor(org.apache.beam.runners.fnexecution.control.ProcessBundleDescriptors.ExecutableProcessBundleDescriptor) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) FnDataReceiver(org.apache.beam.sdk.fn.data.FnDataReceiver) FusedPipeline(org.apache.beam.runners.core.construction.graph.FusedPipeline) Pipeline(org.apache.beam.sdk.Pipeline) FusedPipeline(org.apache.beam.runners.core.construction.graph.FusedPipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) Collection(java.util.Collection) PCollection(org.apache.beam.sdk.values.PCollection) Test(org.junit.Test)

Example 8 with Coder

use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

the class ProcessBundleDescriptors method fromExecutableStageInternal.

/**
 * Assembles the {@link ExecutableProcessBundleDescriptor} for an executable stage.
 *
 * @param id identifier set on the resulting ProcessBundleDescriptor
 * @param stage stage whose transforms and components populate the descriptor
 * @param dataEndpoint endpoint passed to the stage input/output wiring; also set as the timer
 *     endpoint when the stage has timer specs
 * @param stateEndpoint state endpoint to set on the descriptor, or {@code null} to leave it unset
 * @throws IOException declared by this method; the helpers it calls are not shown here
 */
private static ExecutableProcessBundleDescriptor fromExecutableStageInternal(String id, ExecutableStage stage, ApiServiceDescriptor dataEndpoint, @Nullable ApiServiceDescriptor stateEndpoint) throws IOException {
    // Start from all of the stage's components, but keep only the stage's own transforms.
    // TODO: Remove the unreachable subcomponents if the size of the descriptor matters.
    Map<String, PTransform> transformsById =
        stage.getTransforms().stream()
            .collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform));
    Components.Builder components = stage.getComponents().toBuilder();
    components.clearTransforms();
    components.putAllTransforms(transformsById);

    // Use the wire coder setting registered for the stage input, or the default instance.
    String inputId = stage.getInputPCollection().getId();
    WireCoderSetting inputWireCoderSetting =
        stage.getWireCoderSettings().stream()
            .filter(setting -> setting.getInputOrOutputId().equals(inputId))
            .findAny()
            .orElse(WireCoderSetting.getDefaultInstance());

    // The order of these does not matter.
    ImmutableList.Builder<RemoteInputDestination> inputDestinations = ImmutableList.builder();
    inputDestinations.add(
        addStageInput(dataEndpoint, stage.getInputPCollection(), components, inputWireCoderSetting));
    ImmutableMap.Builder<String, Coder> remoteOutputCoders = ImmutableMap.builder();
    remoteOutputCoders.putAll(
        addStageOutputs(dataEndpoint, stage.getOutputPCollections(), components, stage.getWireCoderSettings()));

    Map<String, Map<String, SideInputSpec>> sideInputSpecs = addSideInputs(stage, components);
    // Bag user state specs are derived from a snapshot of the components built at this point.
    Map<String, Map<String, BagUserStateSpec>> bagUserStateSpecs = forBagUserStates(stage, components.build());
    Map<String, Map<String, TimerSpec>> timerSpecs = forTimerSpecs(stage, components);
    lengthPrefixAnyInputCoder(inputId, components);

    // Copy data from components to ProcessBundleDescriptor.
    ProcessBundleDescriptor.Builder bundleDescriptor = ProcessBundleDescriptor.newBuilder();
    bundleDescriptor.setId(id);
    if (stateEndpoint != null) {
        bundleDescriptor.setStateApiServiceDescriptor(stateEndpoint);
    }
    if (!timerSpecs.isEmpty()) {
        // By default use the data endpoint for timers, in the future considering enabling specifying
        // a different ApiServiceDescriptor for timers.
        bundleDescriptor.setTimerApiServiceDescriptor(dataEndpoint);
    }
    bundleDescriptor.putAllCoders(components.getCodersMap());
    bundleDescriptor.putAllEnvironments(components.getEnvironmentsMap());
    bundleDescriptor.putAllPcollections(components.getPcollectionsMap());
    bundleDescriptor.putAllWindowingStrategies(components.getWindowingStrategiesMap());
    bundleDescriptor.putAllTransforms(components.getTransformsMap());

    return ExecutableProcessBundleDescriptor.of(
        bundleDescriptor.build(),
        inputDestinations.build(),
        remoteOutputCoders.build(),
        sideInputSpecs,
        bagUserStateSpecs,
        timerSpecs);
}
Also used : Coder(org.apache.beam.sdk.coders.Coder) ByteStringCoder(org.apache.beam.runners.fnexecution.wire.ByteStringCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) RemoteInputDestination(org.apache.beam.runners.fnexecution.data.RemoteInputDestination) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ProcessBundleDescriptor(org.apache.beam.model.fnexecution.v1.BeamFnApi.ProcessBundleDescriptor) WireCoderSetting(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.WireCoderSetting) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)

Example 9 with Coder

use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

the class ProcessBundleDescriptors method addSideInputs.

/**
 * Rewrites the coder of every side input PCollection of {@code stage} inside {@code components}
 * to a length-prefixed coder and builds a {@link SideInputSpec} for each side input.
 *
 * @param stage stage whose side inputs are processed
 * @param components component builder that is updated in place
 * @return side input specs keyed first by transform id, then by side input local name
 */
private static Map<String, Map<String, SideInputSpec>> addSideInputs(ExecutableStage stage, Components.Builder components) throws IOException {
    ImmutableTable.Builder<String, String, SideInputSpec> specsByTransformAndName = ImmutableTable.builder();
    for (SideInputReference sideInput : stage.getSideInputs()) {
        PCollectionNode collectionNode = sideInput.collection();
        PCollection collection = collectionNode.getPCollection();

        // Update the coder specification for side inputs to be length prefixed so that the
        // SDK and Runner agree on how to encode/decode the key, window, and values for
        // side inputs.
        String prefixedCoderId =
            LengthPrefixUnknownCoders.addLengthPrefixedCoder(collection.getCoderId(), components, false);
        PCollection prefixedCollection = collection.toBuilder().setCoderId(prefixedCoderId).build();
        components.putPcollections(collectionNode.getId(), prefixedCollection);

        // The runner-side wire coder is instantiated against the updated components.
        FullWindowedValueCoder<KV<?, ?>> wireCoder =
            (FullWindowedValueCoder) WireCoders.instantiateRunnerWireCoder(collectionNode, components.build());

        String transformId = sideInput.transform().getId();
        String localName = sideInput.localName();
        SideInputSpec spec =
            SideInputSpec.of(
                transformId,
                localName,
                getAccessPattern(sideInput),
                wireCoder.getValueCoder(),
                wireCoder.getWindowCoder());
        specsByTransformAndName.put(transformId, localName, spec);
    }
    return specsByTransformAndName.build().rowMap();
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) SideInputReference(org.apache.beam.runners.core.construction.graph.SideInputReference) KV(org.apache.beam.sdk.values.KV) ImmutableTable(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableTable) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)

Example 10 with Coder

use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

the class ProcessBundleDescriptors method addStageOutputs.

/**
 * Registers every output PCollection of a stage via {@code addStageOutput} and collects the
 * resulting coders.
 *
 * @param dataEndpoint endpoint forwarded to {@code addStageOutput}
 * @param outputPCollections stage outputs to register, processed in iteration order
 * @param components component builder forwarded to {@code addStageOutput}
 * @param wireCoderSettings settings searched by output id; the default instance is used when no
 *     setting matches an output
 * @return insertion-ordered map from the PTransform id reported for each output to its coder
 */
private static Map<String, Coder<WindowedValue<?>>> addStageOutputs(ApiServiceDescriptor dataEndpoint, Collection<PCollectionNode> outputPCollections, Components.Builder components, Collection<WireCoderSetting> wireCoderSettings) throws IOException {
    Map<String, Coder<WindowedValue<?>>> codersByTransformId = new LinkedHashMap<>();
    for (PCollectionNode output : outputPCollections) {
        String outputId = output.getId();
        WireCoderSetting settingForOutput =
            wireCoderSettings.stream()
                .filter(candidate -> candidate.getInputOrOutputId().equals(outputId))
                .findAny()
                .orElse(WireCoderSetting.getDefaultInstance());
        OutputEncoding encoding = addStageOutput(dataEndpoint, components, output, settingForOutput);
        codersByTransformId.put(encoding.getPTransformId(), encoding.getCoder());
    }
    return codersByTransformId;
}
Also used : Coder(org.apache.beam.sdk.coders.Coder) ByteStringCoder(org.apache.beam.runners.fnexecution.wire.ByteStringCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) WireCoderSetting(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.WireCoderSetting) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) LinkedHashMap(java.util.LinkedHashMap)

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)48 Coder (org.apache.beam.sdk.coders.Coder)33 WindowedValue (org.apache.beam.sdk.util.WindowedValue)32 KvCoder (org.apache.beam.sdk.coders.KvCoder)30 Test (org.junit.Test)30 Map (java.util.Map)23 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)23 HashMap (java.util.HashMap)21 KV (org.apache.beam.sdk.values.KV)20 ArrayList (java.util.ArrayList)19 IOException (java.io.IOException)18 StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder)17 List (java.util.List)16 ExecutableStage (org.apache.beam.runners.core.construction.graph.ExecutableStage)16 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)15 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)15 Collection (java.util.Collection)13 Pipeline (org.apache.beam.sdk.Pipeline)13 FusedPipeline (org.apache.beam.runners.core.construction.graph.FusedPipeline)12 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)11