Search in sources :

Example 36 with PCollection

use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

the class OutputDeduplicator method createPartialPCollections.

/**
 * Returns a {@link Map} from the ID of a {@link PCollectionNode PCollection} to a {@link
 * PCollectionNode} that contains part of that {@link PCollectionNode PCollection}.
 */
private static Map<String, PCollectionNode> createPartialPCollections(Collection<PCollectionNode> duplicates, Predicate<String> existingPCollectionIds) {
    Map<String, PCollectionNode> unzippedOutputs = new LinkedHashMap<>();
    Predicate<String> existingOrNewIds = existingPCollectionIds.or(id -> unzippedOutputs.values().stream().map(PCollectionNode::getId).anyMatch(id::equals));
    for (PCollectionNode duplicateOutput : duplicates) {
        String id = SyntheticComponents.uniqueId(duplicateOutput.getId(), existingOrNewIds);
        PCollection partial = duplicateOutput.getPCollection().toBuilder().setUniqueName(id).build();
        // Check to make sure there is only one duplicated output with the same id - which ensures we
        // only introduce one 'partial output' per producer of that output.
        PCollectionNode alreadyDeduplicated = unzippedOutputs.put(duplicateOutput.getId(), PipelineNode.pCollection(id, partial));
        checkArgument(alreadyDeduplicated == null, "a duplicate should only appear once per stage");
    }
    return unzippedOutputs;
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) LinkedHashMap(java.util.LinkedHashMap)

Example 37 with PCollection

use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

the class ImmutableExecutableStageTest method ofFullComponentsOnlyHasStagePTransforms.

@Test
public void ofFullComponentsOnlyHasStagePTransforms() throws Exception {
    Environment env = Environments.createDockerEnvironment("foo");
    PTransform pt = PTransform.newBuilder().putInputs("input", "input.out").putInputs("side_input", "sideInput.in").putInputs("timer", "timer.pc").putOutputs("output", "output.out").putOutputs("timer", "timer.pc").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(RunnerApi.FunctionSpec.newBuilder()).putSideInputs("side_input", RunnerApi.SideInput.getDefaultInstance()).putStateSpecs("user_state", RunnerApi.StateSpec.getDefaultInstance()).putTimerFamilySpecs("timer", RunnerApi.TimerFamilySpec.getDefaultInstance()).build().toByteString())).build();
    PCollection input = PCollection.newBuilder().setUniqueName("input.out").build();
    PCollection sideInput = PCollection.newBuilder().setUniqueName("sideInput.in").build();
    PCollection timer = PCollection.newBuilder().setUniqueName("timer.pc").build();
    PCollection output = PCollection.newBuilder().setUniqueName("output.out").build();
    Components components = Components.newBuilder().putTransforms("pt", pt).putTransforms("other_pt", PTransform.newBuilder().setUniqueName("other").build()).putPcollections("input.out", input).putPcollections("sideInput.in", sideInput).putPcollections("timer.pc", timer).putPcollections("output.out", output).putEnvironments("foo", env).build();
    PTransformNode transformNode = PipelineNode.pTransform("pt", pt);
    SideInputReference sideInputRef = SideInputReference.of(transformNode, "side_input", PipelineNode.pCollection("sideInput.in", sideInput));
    UserStateReference userStateRef = UserStateReference.of(transformNode, "user_state", PipelineNode.pCollection("input.out", input));
    TimerReference timerRef = TimerReference.of(transformNode, "timer");
    ImmutableExecutableStage stage = ImmutableExecutableStage.ofFullComponents(components, env, PipelineNode.pCollection("input.out", input), Collections.singleton(sideInputRef), Collections.singleton(userStateRef), Collections.singleton(timerRef), Collections.singleton(PipelineNode.pTransform("pt", pt)), Collections.singleton(PipelineNode.pCollection("output.out", output)), DEFAULT_WIRE_CODER_SETTINGS);
    assertThat(stage.getComponents().containsTransforms("pt"), is(true));
    assertThat(stage.getComponents().containsTransforms("other_pt"), is(false));
    PTransform stagePTransform = stage.toPTransform("foo");
    assertThat(stagePTransform.getOutputsMap(), hasValue("output.out"));
    assertThat(stagePTransform.getOutputsCount(), equalTo(1));
    assertThat(stagePTransform.getInputsMap(), allOf(hasValue("input.out"), hasValue("sideInput.in")));
    assertThat(stagePTransform.getInputsCount(), equalTo(2));
    ExecutableStagePayload payload = ExecutableStagePayload.parseFrom(stagePTransform.getSpec().getPayload());
    assertThat(payload.getTransformsList(), contains("pt"));
    assertThat(ExecutableStage.fromPayload(payload), equalTo(stage));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 38 with PCollection

use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

the class OutputDeduplicatorTest method duplicateOverStages.

@Test
public void duplicateOverStages() {
    /* When multiple stages and a runner-executed transform produce a PCollection, all should be
     * replaced with synthetic flattens.
     * original graph:
     *             --> one -> .out \
     * red -> .out |                -> shared -> .out -> blue -> .out
     *             --> two -> .out /
     *
     * fused graph:
     *             --> [one -> .out -> shared ->] .out
     * red -> .out |                                   (shared.out) -> blue -> .out
     *             --> [two -> .out -> shared ->] .out
     *
     * deduplicated graph:
     *             --> [one -> .out -> shared ->] .out:0 \
     * red -> .out |                                      -> shared -> .out -> blue ->.out
     *             --> [two -> .out -> shared ->] .out:1 /
     */
    PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
    PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
    PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
    PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
    PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
    PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
    PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
    PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
    PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
    PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
    RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("two", two).putPcollections(twoOut.getUniqueName(), twoOut).putTransforms("shared", shared).putPcollections(sharedOut.getUniqueName(), sharedOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
    ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
    ExecutableStage twoStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("two", two), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
    PTransformNode redTransform = PipelineNode.pTransform("red", red);
    PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
    QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
    DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage), ImmutableList.of(redTransform, blueTransform));
    assertThat(result.getIntroducedTransforms(), hasSize(1));
    PTransformNode introduced = getOnlyElement(result.getIntroducedTransforms());
    assertThat(introduced.getTransform().getOutputsMap().size(), equalTo(1));
    assertThat(getOnlyElement(introduced.getTransform().getOutputsMap().values()), equalTo(sharedOut.getUniqueName()));
    assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introduced.getTransform().getInputsMap().values().toArray(new String[0])));
    assertThat(result.getDeduplicatedStages().keySet(), hasSize(2));
    List<String> stageOutputs = result.getDeduplicatedStages().values().stream().flatMap(stage -> stage.getOutputPCollections().stream().map(PCollectionNode::getId)).collect(Collectors.toList());
    assertThat(stageOutputs, containsInAnyOrder(introduced.getTransform().getInputsMap().values().toArray()));
    assertThat(result.getDeduplicatedTransforms().keySet(), empty());
    assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(stageOutputs.toArray(new String[0])));
    assertThat(result.getDeduplicatedComponents().getTransformsMap(), hasEntry(introduced.getId(), introduced.getTransform()));
}
Also used : PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) RunWith(org.junit.runner.RunWith) Matchers.hasItems(org.hamcrest.Matchers.hasItems) ArrayList(java.util.ArrayList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) Map(java.util.Map) Matchers.hasSize(org.hamcrest.Matchers.hasSize) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Matchers.empty(org.hamcrest.Matchers.empty) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) Collectors(java.util.stream.Collectors) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) DEFAULT_WIRE_CODER_SETTINGS(org.apache.beam.runners.core.construction.graph.ExecutableStage.DEFAULT_WIRE_CODER_SETTINGS) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DeduplicationResult(org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 39 with PCollection

use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

the class QueryablePipelineTest method transformWithSideAndMainInputs.

/**
 * Tests that inputs that are only side inputs are not returned from {@link
 * QueryablePipeline#getPerElementConsumers(PCollectionNode)} and are returned from {@link
 * QueryablePipeline#getSideInputs(PTransformNode)}.
 */
@Test
public void transformWithSideAndMainInputs() {
    Pipeline p = Pipeline.create();
    PCollection<byte[]> impulse = p.apply("Impulse", Impulse.create());
    PCollectionView<String> view = p.apply("Create", Create.of("foo")).apply("View", View.asSingleton());
    impulse.apply("par_do", ParDo.of(new TestFn()).withSideInputs(view).withOutputTags(new TupleTag<>(), TupleTagList.empty()));
    Components components = PipelineTranslation.toProto(p).getComponents();
    QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
    String mainInputName = getOnlyElement(PipelineNode.pTransform("Impulse", components.getTransformsOrThrow("Impulse")).getTransform().getOutputsMap().values());
    PCollectionNode mainInput = PipelineNode.pCollection(mainInputName, components.getPcollectionsOrThrow(mainInputName));
    PTransform parDoTransform = components.getTransformsOrThrow("par_do");
    String sideInputLocalName = getOnlyElement(parDoTransform.getInputsMap().entrySet().stream().filter(entry -> !entry.getValue().equals(mainInputName)).map(Map.Entry::getKey).collect(Collectors.toSet()));
    String sideInputCollectionId = parDoTransform.getInputsOrThrow(sideInputLocalName);
    PCollectionNode sideInput = PipelineNode.pCollection(sideInputCollectionId, components.getPcollectionsOrThrow(sideInputCollectionId));
    PTransformNode parDoNode = PipelineNode.pTransform("par_do", components.getTransformsOrThrow("par_do"));
    SideInputReference sideInputRef = SideInputReference.of(parDoNode, sideInputLocalName, sideInput);
    assertThat(qp.getSideInputs(parDoNode), contains(sideInputRef));
    assertThat(qp.getPerElementConsumers(mainInput), contains(parDoNode));
    assertThat(qp.getPerElementConsumers(sideInput), not(contains(parDoNode)));
}
Also used : Count(org.apache.beam.sdk.transforms.Count) PBegin(org.apache.beam.sdk.values.PBegin) Matchers.not(org.hamcrest.Matchers.not) Matchers.hasKey(org.hamcrest.Matchers.hasKey) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) PCollectionList(org.apache.beam.sdk.values.PCollectionList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) Set(java.util.Set) Collectors(java.util.stream.Collectors) ParDo(org.apache.beam.sdk.transforms.ParDo) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Matchers.is(org.hamcrest.Matchers.is) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Impulse(org.apache.beam.sdk.transforms.Impulse) View(org.apache.beam.sdk.transforms.View) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) TupleTagList(org.apache.beam.sdk.values.TupleTagList) Environments(org.apache.beam.runners.core.construction.Environments) ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) Read(org.apache.beam.sdk.io.Read) TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Pipeline(org.apache.beam.sdk.Pipeline) ExpectedException(org.junit.rules.ExpectedException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) CountingSource(org.apache.beam.sdk.io.CountingSource) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) Rule(org.junit.Rule) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Pipeline(org.apache.beam.sdk.Pipeline) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 40 with PCollection

use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

the class QueryablePipelineTest method retainOnlyPrimitivesComposites.

@Test
public void retainOnlyPrimitivesComposites() {
    Pipeline p = Pipeline.create();
    p.apply(new org.apache.beam.sdk.transforms.PTransform<PBegin, PCollection<Long>>() {

        @Override
        public PCollection<Long> expand(PBegin input) {
            return input.apply(Impulse.create()).apply(Window.into(FixedWindows.of(Duration.standardMinutes(5L)))).apply(MapElements.into(TypeDescriptors.longs()).via(l -> 1L));
        }
    });
    Components originalComponents = PipelineTranslation.toProto(p).getComponents();
    Collection<String> primitiveComponents = QueryablePipeline.getPrimitiveTransformIds(originalComponents);
    // Read, Window.Assign, ParDo. This will need to be updated if the expansions change.
    assertThat(primitiveComponents, hasSize(3));
    for (String transformId : primitiveComponents) {
        assertThat(originalComponents.getTransformsMap(), hasKey(transformId));
    }
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PCollection(org.apache.beam.sdk.values.PCollection) PBegin(org.apache.beam.sdk.values.PBegin) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)45 Test (org.junit.Test)45 Pipeline (org.apache.beam.sdk.Pipeline)25 PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)24 PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection)22 PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode)22 Map (java.util.Map)21 Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components)21 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)21 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)17 ArrayList (java.util.ArrayList)16 HashMap (java.util.HashMap)14 Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment)13 SdkComponents (org.apache.beam.runners.core.construction.SdkComponents)13 PCollection (org.apache.beam.sdk.values.PCollection)12 Coder (org.apache.beam.sdk.coders.Coder)11 KV (org.apache.beam.sdk.values.KV)11 Collection (java.util.Collection)10 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)10 IOException (java.io.IOException)9