Search in sources :

Example 1 with SideInput

use of org.apache.beam.model.pipeline.v1.RunnerApi.SideInput in project beam by apache.

the class ParDoTranslation method translateParDo.

/**
 * Translate a ParDo.
 */
public static <InputT> ParDoPayload translateParDo(ParDo.MultiOutput<InputT, ?> parDo, PCollection<InputT> mainInput, DoFnSchemaInformation doFnSchemaInformation, Pipeline pipeline, SdkComponents components) throws IOException {
    final DoFn<?, ?> doFn = parDo.getFn();
    final DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
    final String restrictionCoderId;
    if (signature.processElement().isSplittable()) {
        DoFnInvoker<?, ?> doFnInvoker = DoFnInvokers.invokerFor(doFn);
        final Coder<?> restrictionAndWatermarkStateCoder = KvCoder.of(doFnInvoker.invokeGetRestrictionCoder(pipeline.getCoderRegistry()), doFnInvoker.invokeGetWatermarkEstimatorStateCoder(pipeline.getCoderRegistry()));
        restrictionCoderId = components.registerCoder(restrictionAndWatermarkStateCoder);
    } else {
        restrictionCoderId = "";
    }
    Coder<BoundedWindow> windowCoder = (Coder<BoundedWindow>) mainInput.getWindowingStrategy().getWindowFn().windowCoder();
    Coder<?> keyCoder;
    if (signature.usesState() || signature.usesTimers()) {
        checkArgument(mainInput.getCoder() instanceof KvCoder, "DoFn's that use state or timers must have an input PCollection with a KvCoder but received %s", mainInput.getCoder());
        keyCoder = ((KvCoder) mainInput.getCoder()).getKeyCoder();
    } else {
        keyCoder = null;
    }
    return payloadForParDoLike(new ParDoLike() {

        @Override
        public FunctionSpec translateDoFn(SdkComponents newComponents) {
            return ParDoTranslation.translateDoFn(parDo.getFn(), parDo.getMainOutputTag(), parDo.getSideInputs(), doFnSchemaInformation, newComponents);
        }

        @Override
        public Map<String, SideInput> translateSideInputs(SdkComponents components) {
            Map<String, SideInput> sideInputs = new HashMap<>();
            for (PCollectionView<?> sideInput : parDo.getSideInputs().values()) {
                sideInputs.put(sideInput.getTagInternal().getId(), translateView(sideInput, components));
            }
            return sideInputs;
        }

        @Override
        public Map<String, RunnerApi.StateSpec> translateStateSpecs(SdkComponents components) throws IOException {
            Map<String, RunnerApi.StateSpec> stateSpecs = new HashMap<>();
            for (Map.Entry<String, StateDeclaration> state : signature.stateDeclarations().entrySet()) {
                RunnerApi.StateSpec spec = translateStateSpec(getStateSpecOrThrow(state.getValue(), doFn), components);
                stateSpecs.put(state.getKey(), spec);
            }
            return stateSpecs;
        }

        @Override
        public ParDoLikeTimerFamilySpecs translateTimerFamilySpecs(SdkComponents newComponents) {
            Map<String, RunnerApi.TimerFamilySpec> timerFamilySpecs = new HashMap<>();
            for (Map.Entry<String, TimerDeclaration> timer : signature.timerDeclarations().entrySet()) {
                RunnerApi.TimerFamilySpec spec = translateTimerFamilySpec(getTimerSpecOrThrow(timer.getValue(), doFn), newComponents, keyCoder, windowCoder);
                timerFamilySpecs.put(timer.getKey(), spec);
            }
            for (Map.Entry<String, DoFnSignature.TimerFamilyDeclaration> timerFamily : signature.timerFamilyDeclarations().entrySet()) {
                RunnerApi.TimerFamilySpec spec = translateTimerFamilySpec(DoFnSignatures.getTimerFamilySpecOrThrow(timerFamily.getValue(), doFn), newComponents, keyCoder, windowCoder);
                timerFamilySpecs.put(timerFamily.getKey(), spec);
            }
            String onWindowExpirationTimerFamilySpec = null;
            if (signature.onWindowExpiration() != null) {
                RunnerApi.TimerFamilySpec spec = RunnerApi.TimerFamilySpec.newBuilder().setTimeDomain(translateTimeDomain(TimeDomain.EVENT_TIME)).setTimerFamilyCoderId(registerCoderOrThrow(components, Timer.Coder.of(keyCoder, windowCoder))).build();
                for (int i = 0; i < Integer.MAX_VALUE; ++i) {
                    onWindowExpirationTimerFamilySpec = "onWindowExpiration" + i;
                    if (!timerFamilySpecs.containsKey(onWindowExpirationTimerFamilySpec)) {
                        break;
                    }
                }
                timerFamilySpecs.put(onWindowExpirationTimerFamilySpec, spec);
            }
            return ParDoLikeTimerFamilySpecs.create(timerFamilySpecs, onWindowExpirationTimerFamilySpec);
        }

        @Override
        public boolean isStateful() {
            return !signature.stateDeclarations().isEmpty() || !signature.timerDeclarations().isEmpty() || !signature.timerFamilyDeclarations().isEmpty() || signature.onWindowExpiration() != null;
        }

        @Override
        public boolean isSplittable() {
            return signature.processElement().isSplittable();
        }

        @Override
        public boolean isRequiresStableInput() {
            return signature.processElement().requiresStableInput();
        }

        @Override
        public boolean isRequiresTimeSortedInput() {
            return signature.processElement().requiresTimeSortedInput();
        }

        @Override
        public boolean requestsFinalization() {
            return (signature.startBundle() != null && signature.startBundle().extraParameters().contains(Parameter.bundleFinalizer())) || (signature.processElement() != null && signature.processElement().extraParameters().contains(Parameter.bundleFinalizer())) || (signature.finishBundle() != null && signature.finishBundle().extraParameters().contains(Parameter.bundleFinalizer()));
        }

        @Override
        public String translateRestrictionCoderId(SdkComponents newComponents) {
            return restrictionCoderId;
        }
    }, components);
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) KvCoder(org.apache.beam.sdk.coders.KvCoder) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) IOException(java.io.IOException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) StateSpec(org.apache.beam.sdk.state.StateSpec) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Map(java.util.Map) HashMap(java.util.HashMap) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 2 with SideInput

use of org.apache.beam.model.pipeline.v1.RunnerApi.SideInput in project beam by apache.

the class ParDoTranslation method getSideInputs.

public static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> application) throws IOException {
    PTransform<?, ?> transform = application.getTransform();
    if (transform instanceof ParDo.MultiOutput) {
        return ((ParDo.MultiOutput<?, ?>) transform).getSideInputs().values().stream().collect(Collectors.toList());
    }
    SdkComponents sdkComponents = SdkComponents.create(application.getPipeline().getOptions());
    RunnerApi.PTransform parDoProto = PTransformTranslation.toProto(application, sdkComponents);
    ParDoPayload payload = ParDoPayload.parseFrom(parDoProto.getSpec().getPayload());
    List<PCollectionView<?>> views = new ArrayList<>();
    RehydratedComponents components = RehydratedComponents.forComponents(sdkComponents.toComponents());
    for (Map.Entry<String, SideInput> sideInputEntry : payload.getSideInputsMap().entrySet()) {
        String sideInputTag = sideInputEntry.getKey();
        RunnerApi.SideInput sideInput = sideInputEntry.getValue();
        PCollection<?> originalPCollection = checkNotNull((PCollection<?>) application.getInputs().get(new TupleTag<>(sideInputTag)), "no input with tag %s", sideInputTag);
        views.add(PCollectionViewTranslation.viewFromProto(sideInput, sideInputTag, originalPCollection, parDoProto, components));
    }
    return views;
}
Also used : ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) ArrayList(java.util.ArrayList) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PCollectionView(org.apache.beam.sdk.values.PCollectionView) ParDo(org.apache.beam.sdk.transforms.ParDo) Map(java.util.Map) HashMap(java.util.HashMap) MultiOutput(org.apache.beam.sdk.transforms.ParDo.MultiOutput) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput)

Example 3 with SideInput

use of org.apache.beam.model.pipeline.v1.RunnerApi.SideInput in project beam by apache.

the class ExecutableStage method toPTransform.

/**
 * Returns a composite {@link PTransform} which is equivalent to this {@link ExecutableStage} as
 * follows:
 *
 * <ul>
 *   <li>The {@link PTransform#getSubtransformsList()} is empty. This ensures that executable
 *       stages are treated as primitive transforms.
 *   <li>The only {@link PCollection PCollections} in the {@link PTransform#getInputsMap()} is the
 *       result of {@link #getInputPCollection()} and {@link #getSideInputs()}.
 *   <li>The output {@link PCollection PCollections} in the values of {@link
 *       PTransform#getOutputsMap()} are the {@link PCollectionNode PCollections} returned by
 *       {@link #getOutputPCollections()}.
 *   <li>The {@link PTransform#getSpec()} contains an {@link ExecutableStagePayload} with inputs
 *       and outputs equal to the PTransform's inputs and outputs, and transforms equal to the
 *       result of {@link #getTransforms}.
 * </ul>
 *
 * <p>The executable stage can be reconstructed from the resulting {@link ExecutableStagePayload}
 * via {@link #fromPayload(ExecutableStagePayload)}.
 */
default PTransform toPTransform(String uniqueName) {
    PTransform.Builder pt = PTransform.newBuilder().setUniqueName(uniqueName);
    ExecutableStagePayload.Builder payload = ExecutableStagePayload.newBuilder();
    payload.setEnvironment(getEnvironment());
    payload.addAllWireCoderSettings(getWireCoderSettings());
    // Populate inputs and outputs of the stage payload and outer PTransform simultaneously.
    PCollectionNode input = getInputPCollection();
    pt.putInputs("input", getInputPCollection().getId());
    payload.setInput(input.getId());
    for (SideInputReference sideInput : getSideInputs()) {
        // Side inputs of the ExecutableStage itself can be uniquely identified by inner PTransform
        // name and local name.
        String outerLocalName = String.format("%s:%s", sideInput.transform().getId(), sideInput.localName());
        pt.putInputs(outerLocalName, sideInput.collection().getId());
        payload.addSideInputs(SideInputId.newBuilder().setTransformId(sideInput.transform().getId()).setLocalName(sideInput.localName()));
    }
    for (UserStateReference userState : getUserStates()) {
        payload.addUserStates(UserStateId.newBuilder().setTransformId(userState.transform().getId()).setLocalName(userState.localName()));
    }
    for (TimerReference timer : getTimers()) {
        payload.addTimers(TimerId.newBuilder().setTransformId(timer.transform().getId()).setLocalName(timer.localName()));
    }
    int outputIndex = 0;
    for (PCollectionNode output : getOutputPCollections()) {
        pt.putOutputs(String.format("materialized_%d", outputIndex), output.getId());
        payload.addOutputs(output.getId());
        outputIndex++;
    }
    // stage payload.
    for (PTransformNode transform : getTransforms()) {
        payload.addTransforms(transform.getId());
    }
    payload.setComponents(getComponents().toBuilder().clearTransforms().putAllTransforms(getTransforms().stream().collect(Collectors.toMap(PTransformNode::getId, PTransformNode::getTransform))));
    pt.setSpec(FunctionSpec.newBuilder().setUrn(ExecutableStage.URN).setPayload(payload.build().toByteString()).build());
    return pt.build();
}
Also used : ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)

Example 4 with SideInput

use of org.apache.beam.model.pipeline.v1.RunnerApi.SideInput in project beam by apache.

the class ImmutableExecutableStageTest method ofFullComponentsOnlyHasStagePTransforms.

@Test
public void ofFullComponentsOnlyHasStagePTransforms() throws Exception {
    Environment env = Environments.createDockerEnvironment("foo");
    PTransform pt = PTransform.newBuilder().putInputs("input", "input.out").putInputs("side_input", "sideInput.in").putInputs("timer", "timer.pc").putOutputs("output", "output.out").putOutputs("timer", "timer.pc").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(RunnerApi.FunctionSpec.newBuilder()).putSideInputs("side_input", RunnerApi.SideInput.getDefaultInstance()).putStateSpecs("user_state", RunnerApi.StateSpec.getDefaultInstance()).putTimerFamilySpecs("timer", RunnerApi.TimerFamilySpec.getDefaultInstance()).build().toByteString())).build();
    PCollection input = PCollection.newBuilder().setUniqueName("input.out").build();
    PCollection sideInput = PCollection.newBuilder().setUniqueName("sideInput.in").build();
    PCollection timer = PCollection.newBuilder().setUniqueName("timer.pc").build();
    PCollection output = PCollection.newBuilder().setUniqueName("output.out").build();
    Components components = Components.newBuilder().putTransforms("pt", pt).putTransforms("other_pt", PTransform.newBuilder().setUniqueName("other").build()).putPcollections("input.out", input).putPcollections("sideInput.in", sideInput).putPcollections("timer.pc", timer).putPcollections("output.out", output).putEnvironments("foo", env).build();
    PTransformNode transformNode = PipelineNode.pTransform("pt", pt);
    SideInputReference sideInputRef = SideInputReference.of(transformNode, "side_input", PipelineNode.pCollection("sideInput.in", sideInput));
    UserStateReference userStateRef = UserStateReference.of(transformNode, "user_state", PipelineNode.pCollection("input.out", input));
    TimerReference timerRef = TimerReference.of(transformNode, "timer");
    ImmutableExecutableStage stage = ImmutableExecutableStage.ofFullComponents(components, env, PipelineNode.pCollection("input.out", input), Collections.singleton(sideInputRef), Collections.singleton(userStateRef), Collections.singleton(timerRef), Collections.singleton(PipelineNode.pTransform("pt", pt)), Collections.singleton(PipelineNode.pCollection("output.out", output)), DEFAULT_WIRE_CODER_SETTINGS);
    assertThat(stage.getComponents().containsTransforms("pt"), is(true));
    assertThat(stage.getComponents().containsTransforms("other_pt"), is(false));
    PTransform stagePTransform = stage.toPTransform("foo");
    assertThat(stagePTransform.getOutputsMap(), hasValue("output.out"));
    assertThat(stagePTransform.getOutputsCount(), equalTo(1));
    assertThat(stagePTransform.getInputsMap(), allOf(hasValue("input.out"), hasValue("sideInput.in")));
    assertThat(stagePTransform.getInputsCount(), equalTo(2));
    ExecutableStagePayload payload = ExecutableStagePayload.parseFrom(stagePTransform.getSpec().getPayload());
    assertThat(payload.getTransformsList(), contains("pt"));
    assertThat(ExecutableStage.fromPayload(payload), equalTo(stage));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 5 with SideInput

use of org.apache.beam.model.pipeline.v1.RunnerApi.SideInput in project beam by apache.

the class QueryablePipelineTest method transformWithSideAndMainInputs.

/**
 * Tests that inputs that are only side inputs are not returned from {@link
 * QueryablePipeline#getPerElementConsumers(PCollectionNode)} and are returned from {@link
 * QueryablePipeline#getSideInputs(PTransformNode)}.
 */
@Test
public void transformWithSideAndMainInputs() {
    Pipeline p = Pipeline.create();
    PCollection<byte[]> impulse = p.apply("Impulse", Impulse.create());
    PCollectionView<String> view = p.apply("Create", Create.of("foo")).apply("View", View.asSingleton());
    impulse.apply("par_do", ParDo.of(new TestFn()).withSideInputs(view).withOutputTags(new TupleTag<>(), TupleTagList.empty()));
    Components components = PipelineTranslation.toProto(p).getComponents();
    QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
    String mainInputName = getOnlyElement(PipelineNode.pTransform("Impulse", components.getTransformsOrThrow("Impulse")).getTransform().getOutputsMap().values());
    PCollectionNode mainInput = PipelineNode.pCollection(mainInputName, components.getPcollectionsOrThrow(mainInputName));
    PTransform parDoTransform = components.getTransformsOrThrow("par_do");
    String sideInputLocalName = getOnlyElement(parDoTransform.getInputsMap().entrySet().stream().filter(entry -> !entry.getValue().equals(mainInputName)).map(Map.Entry::getKey).collect(Collectors.toSet()));
    String sideInputCollectionId = parDoTransform.getInputsOrThrow(sideInputLocalName);
    PCollectionNode sideInput = PipelineNode.pCollection(sideInputCollectionId, components.getPcollectionsOrThrow(sideInputCollectionId));
    PTransformNode parDoNode = PipelineNode.pTransform("par_do", components.getTransformsOrThrow("par_do"));
    SideInputReference sideInputRef = SideInputReference.of(parDoNode, sideInputLocalName, sideInput);
    assertThat(qp.getSideInputs(parDoNode), contains(sideInputRef));
    assertThat(qp.getPerElementConsumers(mainInput), contains(parDoNode));
    assertThat(qp.getPerElementConsumers(sideInput), not(contains(parDoNode)));
}
Also used : Count(org.apache.beam.sdk.transforms.Count) PBegin(org.apache.beam.sdk.values.PBegin) Matchers.not(org.hamcrest.Matchers.not) Matchers.hasKey(org.hamcrest.Matchers.hasKey) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) PCollectionList(org.apache.beam.sdk.values.PCollectionList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) Set(java.util.Set) Collectors(java.util.stream.Collectors) ParDo(org.apache.beam.sdk.transforms.ParDo) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Matchers.is(org.hamcrest.Matchers.is) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Impulse(org.apache.beam.sdk.transforms.Impulse) View(org.apache.beam.sdk.transforms.View) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) TupleTagList(org.apache.beam.sdk.values.TupleTagList) Environments(org.apache.beam.runners.core.construction.Environments) ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) Read(org.apache.beam.sdk.io.Read) TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Pipeline(org.apache.beam.sdk.Pipeline) ExpectedException(org.junit.rules.ExpectedException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) CountingSource(org.apache.beam.sdk.io.CountingSource) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) Rule(org.junit.Rule) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Pipeline(org.apache.beam.sdk.Pipeline) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)7 PCollectionView (org.apache.beam.sdk.values.PCollectionView)7 Map (java.util.Map)5 HashMap (java.util.HashMap)4 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)4 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)4 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)4 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)4 IOException (java.io.IOException)3 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)3 ExecutableStagePayload (org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload)3 SideInput (org.apache.beam.model.pipeline.v1.RunnerApi.SideInput)3 Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString)3 Test (org.junit.Test)3 ParDoInstruction (com.google.api.services.dataflow.model.ParDoInstruction)2 ArrayList (java.util.ArrayList)2 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)2 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)2 ParDoPayload (org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload)2 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)2