Search in sources :

Example 36 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class QueryablePipelineTest method transformWithSideAndMainInputs.

/**
 * Checks how {@link QueryablePipeline} classifies the inputs of a ParDo that has one main input
 * and one side input: the side input must be reported by {@link
 * QueryablePipeline#getSideInputs(PTransformNode)} and must not make the ParDo show up in {@link
 * QueryablePipeline#getPerElementConsumers(PCollectionNode)} for that collection.
 */
@Test
public void transformWithSideAndMainInputs() {
    // Build a pipeline where "par_do" reads the Impulse output and takes a singleton view as
    // its side input.
    Pipeline pipeline = Pipeline.create();
    PCollection<byte[]> impulseOutput = pipeline.apply("Impulse", Impulse.create());
    PCollectionView<String> singletonView =
        pipeline.apply("Create", Create.of("foo")).apply("View", View.asSingleton());
    impulseOutput.apply(
        "par_do",
        ParDo.of(new TestFn())
            .withSideInputs(singletonView)
            .withOutputTags(new TupleTag<>(), TupleTagList.empty()));

    Components components = PipelineTranslation.toProto(pipeline).getComponents();
    QueryablePipeline queryable = QueryablePipeline.forPrimitivesIn(components);

    // The main input is the single output of the "Impulse" transform.
    PTransformNode impulseNode =
        PipelineNode.pTransform("Impulse", components.getTransformsOrThrow("Impulse"));
    String mainInputId = getOnlyElement(impulseNode.getTransform().getOutputsMap().values());
    PCollectionNode mainInputNode =
        PipelineNode.pCollection(mainInputId, components.getPcollectionsOrThrow(mainInputId));

    // Every input of "par_do" that does not point at the main input is a side input; exactly
    // one such local name is expected.
    PTransform parDoProto = components.getTransformsOrThrow("par_do");
    Set<String> sideInputLocalNames =
        parDoProto.getInputsMap().entrySet().stream()
            .filter(input -> !input.getValue().equals(mainInputId))
            .map(Map.Entry::getKey)
            .collect(Collectors.toSet());
    String sideInputTag = getOnlyElement(sideInputLocalNames);
    String sideInputId = parDoProto.getInputsOrThrow(sideInputTag);
    PCollectionNode sideInputNode =
        PipelineNode.pCollection(sideInputId, components.getPcollectionsOrThrow(sideInputId));

    PTransformNode parDo = PipelineNode.pTransform("par_do", parDoProto);
    SideInputReference expectedSideInput = SideInputReference.of(parDo, sideInputTag, sideInputNode);

    assertThat(queryable.getSideInputs(parDo), contains(expectedSideInput));
    assertThat(queryable.getPerElementConsumers(mainInputNode), contains(parDo));
    // NOTE(review): not(contains(x)) only rejects the case where the consumers are exactly [x];
    // a membership matcher would be stricter - confirm whether that was the intent.
    assertThat(queryable.getPerElementConsumers(sideInputNode), not(contains(parDo)));
}
Also used : Count(org.apache.beam.sdk.transforms.Count) PBegin(org.apache.beam.sdk.values.PBegin) Matchers.not(org.hamcrest.Matchers.not) Matchers.hasKey(org.hamcrest.Matchers.hasKey) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) PCollectionList(org.apache.beam.sdk.values.PCollectionList) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) Collection(java.util.Collection) Set(java.util.Set) Collectors(java.util.stream.Collectors) ParDo(org.apache.beam.sdk.transforms.ParDo) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Matchers.is(org.hamcrest.Matchers.is) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Impulse(org.apache.beam.sdk.transforms.Impulse) View(org.apache.beam.sdk.transforms.View) PipelineTranslation(org.apache.beam.runners.core.construction.PipelineTranslation) TupleTagList(org.apache.beam.sdk.values.TupleTagList) Environments(org.apache.beam.runners.core.construction.Environments) ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) Read(org.apache.beam.sdk.io.Read) TupleTag(org.apache.beam.sdk.values.TupleTag) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) 
PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Pipeline(org.apache.beam.sdk.Pipeline) ExpectedException(org.junit.rules.ExpectedException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) CountingSource(org.apache.beam.sdk.io.CountingSource) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) Rule(org.junit.Rule) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Iterables.getOnlyElement(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables.getOnlyElement) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) Pipeline(org.apache.beam.sdk.Pipeline) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)

Example 37 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class DoFnOperatorTest method keyedParDoPushbackDataCheckpointing.

/**
 * Delegates to {@code pushbackDataCheckpointing} with a supplier that builds a keyed {@link
 * DoFnOperator} over two side inputs and wraps it in a {@link
 * KeyedTwoInputStreamOperatorTestHarness}.
 */
@Test
public void keyedParDoPushbackDataCheckpointing() throws Exception {
    pushbackDataCheckpointing(() -> {
        StringUtf8Coder stringCoder = StringUtf8Coder.of();
        Coder<WindowedValue<String>> windowedCoder =
            WindowedValue.getFullCoder(stringCoder, IntervalWindow.getCoder());
        TupleTag<String> mainOutputTag = new TupleTag<>("main-output");

        // Each element is keyed by its own encoded value.
        KeySelector<WindowedValue<String>, ByteBuffer> byElementValue =
            element -> FlinkKeyUtils.encodeKey(element.getValue(), stringCoder);

        ImmutableMap.Builder<Integer, PCollectionView<?>> viewsByIndex = ImmutableMap.builder();
        viewsByIndex.put(1, view1);
        viewsByIndex.put(2, view2);
        ImmutableMap<Integer, PCollectionView<?>> sideInputMapping = viewsByIndex.build();

        DoFnOperator<String, String> operator =
            new DoFnOperator<>(
                new IdentityDoFn<>(),
                "stepName",
                windowedCoder,
                Collections.emptyMap(),
                mainOutputTag,
                Collections.emptyList(),
                new DoFnOperator.MultiOutputOutputManagerFactory<>(
                    mainOutputTag,
                    windowedCoder,
                    new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
                WindowingStrategy.of(FixedWindows.of(Duration.millis(100))),
                /* side-input mapping */
                sideInputMapping,
                /* side inputs */
                ImmutableList.of(view1, view2),
                FlinkPipelineOptions.defaults(),
                stringCoder,
                byElementValue,
                DoFnSchemaInformation.create(),
                Collections.emptyMap());

        CoderTypeInformation<ByteBuffer> keyTypeInfo =
            new CoderTypeInformation<>(
                FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults());

        // we use a dummy key for the second input since it is considered to be broadcast
        return new KeyedTwoInputStreamOperatorTestHarness<>(
            operator, byElementValue, null, keyTypeInfo);
    });
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) Arrays(java.util.Arrays) StateNamespace(org.apache.beam.runners.core.StateNamespace) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) IsIterableContainingInOrder.contains(org.hamcrest.collection.IsIterableContainingInOrder.contains) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) TimerSpecs(org.apache.beam.sdk.state.TimerSpecs) DoFnRunner(org.apache.beam.runners.core.DoFnRunner) FlinkMetricContainer(org.apache.beam.runners.flink.metrics.FlinkMetricContainer) StepContext(org.apache.beam.runners.core.StepContext) ValueState(org.apache.beam.sdk.state.ValueState) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) TimerInternals(org.apache.beam.runners.core.TimerInternals) ByteBuffer(java.nio.ByteBuffer) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) TypeFactory(com.fasterxml.jackson.databind.type.TypeFactory) Create(org.apache.beam.sdk.transforms.Create) TwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) LRUMap(com.fasterxml.jackson.databind.util.LRUMap) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) 
FluentIterable(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.FluentIterable) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) KvCoder(org.apache.beam.sdk.coders.KvCoder) KeySelector(org.apache.flink.api.java.functions.KeySelector) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) OutputTag(org.apache.flink.util.OutputTag) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) Collectors(java.util.stream.Collectors) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) Objects(java.util.Objects) List(java.util.List) WatermarkHoldState(org.apache.beam.sdk.state.WatermarkHoldState) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Timer(org.apache.beam.sdk.state.Timer) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Optional(java.util.Optional) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) StateTag(org.apache.beam.runners.core.StateTag) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) Whitebox(org.powermock.reflect.Whitebox) KV(org.apache.beam.sdk.values.KV) Assert.assertThrows(org.junit.Assert.assertThrows) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) StateNamespaces(org.apache.beam.runners.core.StateNamespaces) Supplier(java.util.function.Supplier) 
StateTags(org.apache.beam.runners.core.StateTags) ArrayList(java.util.ArrayList) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) TimerSpec(org.apache.beam.sdk.state.TimerSpec) CoderTypeSerializer(org.apache.beam.runners.flink.translation.types.CoderTypeSerializer) TupleTag(org.apache.beam.sdk.values.TupleTag) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Pipeline(org.apache.beam.sdk.Pipeline) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Before(org.junit.Before) DoFn(org.apache.beam.sdk.transforms.DoFn) PCollectionViewTesting(org.apache.beam.sdk.testing.PCollectionViewTesting) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Function(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Function) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Mockito(org.mockito.Mockito) Matchers.emptyIterable(org.hamcrest.Matchers.emptyIterable) StateSpecs(org.apache.beam.sdk.state.StateSpecs) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Collections(java.util.Collections) TimeDomain(org.apache.beam.sdk.state.TimeDomain) Assert.assertEquals(org.junit.Assert.assertEquals) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) TupleTag(org.apache.beam.sdk.values.TupleTag) ByteBuffer(java.nio.ByteBuffer) PCollectionView(org.apache.beam.sdk.values.PCollectionView) WindowedValue(org.apache.beam.sdk.util.WindowedValue) 
StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) Test(org.junit.Test)

Example 38 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class DoFnOperatorTest method testSideInputs.

/**
 * Drives a {@link DoFnOperator} with two side inputs through a two-input test harness and
 * asserts that both main-input elements come out, in the order they were pushed.
 *
 * <p>Sequence: side-input records for both views are sent first, then the two main-input
 * elements, then a second round of side-input records, and finally the harness output is
 * compared against the two main-input elements.
 *
 * @param keyed when {@code true}, the operator is given a key coder and key selector and is run
 *     in a {@link KeyedTwoInputStreamOperatorTestHarness}; otherwise it is run unkeyed in a
 *     plain {@link TwoInputStreamOperatorTestHarness}
 * @throws Exception if constructing, opening, feeding, or closing the harness fails
 */
void testSideInputs(boolean keyed) throws Exception {
    Coder<WindowedValue<String>> coder = WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
    TupleTag<String> outputTag = new TupleTag<>("main-output");
    ImmutableMap<Integer, PCollectionView<?>> sideInputMapping = ImmutableMap.<Integer, PCollectionView<?>>builder().put(1, view1).put(2, view2).build();
    Coder<String> keyCoder = StringUtf8Coder.of();
    // Stays null for the unkeyed variant, so it can be handed to the operator as-is.
    KeySelector<WindowedValue<String>, ByteBuffer> keySelector = null;
    if (keyed) {
        keySelector = value -> FlinkKeyUtils.encodeKey(value.getValue(), keyCoder);
    }
    DoFnOperator<String, String> doFnOperator =
        new DoFnOperator<>(
            new IdentityDoFn<>(),
            "stepName",
            coder,
            Collections.emptyMap(),
            outputTag,
            Collections.emptyList(),
            new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, coder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
            WindowingStrategy.of(FixedWindows.of(Duration.millis(100))),
            /* side-input mapping */
            sideInputMapping,
            /* side inputs */
            ImmutableList.of(view1, view2),
            FlinkPipelineOptions.defaults(),
            keyed ? keyCoder : null,
            keySelector,
            DoFnSchemaInformation.create(),
            Collections.emptyMap());
    // Build exactly one harness. Creating the unkeyed harness unconditionally and then replacing
    // it in the keyed case would leave an extra harness that is never opened or closed.
    final TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, WindowedValue<String>> testHarness;
    if (keyed) {
        // we use a dummy key for the second input since it is considered to be broadcast
        testHarness = new KeyedTwoInputStreamOperatorTestHarness<>(doFnOperator, keySelector, null, new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
    } else {
        testHarness = new TwoInputStreamOperatorTestHarness<>(doFnOperator);
    }
    testHarness.open();
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
    IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
    // Side-input records that arrive before any main-input element (union tags 1 and 2 match the
    // keys used in sideInputMapping).
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(1, valuesInWindow(PCollectionViewTesting.materializeValuesFor(view1.getPipeline().getOptions(), View.asIterable(), "hello", "ciao"), new Instant(0), firstWindow))));
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(2, valuesInWindow(PCollectionViewTesting.materializeValuesFor(view2.getPipeline().getOptions(), View.asIterable(), "foo", "bar"), new Instant(0), secondWindow))));
    // Push the regular (main-input) elements.
    WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
    WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
    testHarness.processElement1(new StreamRecord<>(helloElement));
    testHarness.processElement1(new StreamRecord<>(worldElement));
    // A second round of side-input records, sent after the main-input elements.
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(1, valuesInWindow(PCollectionViewTesting.materializeValuesFor(view1.getPipeline().getOptions(), View.asIterable(), "hello", "ciao"), new Instant(1000), firstWindow))));
    testHarness.processElement2(new StreamRecord<>(new RawUnionValue(2, valuesInWindow(PCollectionViewTesting.materializeValuesFor(view2.getPipeline().getOptions(), View.asIterable(), "foo", "bar"), new Instant(1000), secondWindow))));
    // Both main-input elements must have been emitted, in push order.
    assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(helloElement, worldElement));
    testHarness.close();
}
Also used : TwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness) KeyedTwoInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) Instant(org.joda.time.Instant) ByteBuffer(java.nio.ByteBuffer) PCollectionView(org.apache.beam.sdk.values.PCollectionView)

Example 39 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class CreateExecutableStageNodeFunction method apply.

@Override
public Node apply(MutableNetwork<Node, Edge> input) {
    for (Node node : input.nodes()) {
        if (node instanceof RemoteGrpcPortNode || node instanceof ParallelInstructionNode || node instanceof InstructionOutputNode) {
            continue;
        }
        throw new IllegalArgumentException(String.format("Network contains unknown type of node: %s", input));
    }
    // Fix all non output nodes to have named edges.
    for (Node node : input.nodes()) {
        if (node instanceof InstructionOutputNode) {
            continue;
        }
        for (Node successor : input.successors(node)) {
            for (Edge edge : input.edgesConnecting(node, successor)) {
                if (edge instanceof DefaultEdge) {
                    input.removeEdge(edge);
                    input.addEdge(node, successor, MultiOutputInfoEdge.create(new MultiOutputInfo().setTag(idGenerator.getId())));
                }
            }
        }
    }
    RunnerApi.Components.Builder componentsBuilder = RunnerApi.Components.newBuilder();
    componentsBuilder.mergeFrom(this.pipeline.getComponents());
    // Default to use the Java environment if pipeline doesn't have environment specified.
    if (pipeline.getComponents().getEnvironmentsMap().isEmpty()) {
        String envId = Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn() + idGenerator.getId();
        componentsBuilder.putEnvironments(envId, Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
    }
    // By default, use GlobalWindow for all languages.
    // For Java, if there is an IntervalWindowCoder, then use FixedWindows instead.
    // TODO: should get the real WindowingStrategy from the pipeline proto.
    String globalWindowingStrategyId = "generatedGlobalWindowingStrategy" + idGenerator.getId();
    String intervalWindowEncodingWindowingStrategyId = "generatedIntervalWindowEncodingWindowingStrategy" + idGenerator.getId();
    SdkComponents sdkComponents = SdkComponents.create(pipeline.getComponents(), null);
    try {
        registerWindowingStrategy(globalWindowingStrategyId, WindowingStrategy.globalDefault(), componentsBuilder, sdkComponents);
        registerWindowingStrategy(intervalWindowEncodingWindowingStrategyId, WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(1))), componentsBuilder, sdkComponents);
    } catch (IOException exc) {
        throw new RuntimeException("Could not convert default windowing stratey to proto", exc);
    }
    Map<Node, String> nodesToPCollections = new HashMap<>();
    ImmutableMap.Builder<String, NameContext> ptransformIdToNameContexts = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<SideInputInfo>> ptransformIdToSideInputInfos = ImmutableMap.builder();
    ImmutableMap.Builder<String, Iterable<PCollectionView<?>>> ptransformIdToPCollectionViews = ImmutableMap.builder();
    // A field of ExecutableStage which includes the PCollection goes to worker side.
    Set<PCollectionNode> executableStageOutputs = new HashSet<>();
    // A field of ExecutableStage which includes the PCollection goes to runner side.
    Set<PCollectionNode> executableStageInputs = new HashSet<>();
    for (InstructionOutputNode node : Iterables.filter(input.nodes(), InstructionOutputNode.class)) {
        InstructionOutput instructionOutput = node.getInstructionOutput();
        String coderId = "generatedCoder" + idGenerator.getId();
        String windowingStrategyId;
        try (ByteString.Output output = ByteString.newOutput()) {
            try {
                Coder<?> javaCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(instructionOutput.getCodec()));
                Coder<?> elementCoder = ((WindowedValueCoder<?>) javaCoder).getValueCoder();
                sdkComponents.registerCoder(elementCoder);
                RunnerApi.Coder coderProto = CoderTranslation.toProto(elementCoder, sdkComponents);
                componentsBuilder.putCoders(coderId, coderProto);
                // For now, Dataflow runner harness only deal with FixedWindow.
                if (javaCoder instanceof FullWindowedValueCoder) {
                    FullWindowedValueCoder<?> windowedValueCoder = (FullWindowedValueCoder<?>) javaCoder;
                    Coder<?> windowCoder = windowedValueCoder.getWindowCoder();
                    if (windowCoder instanceof IntervalWindowCoder) {
                        windowingStrategyId = intervalWindowEncodingWindowingStrategyId;
                    } else if (windowCoder instanceof GlobalWindow.Coder) {
                        windowingStrategyId = globalWindowingStrategyId;
                    } else {
                        throw new UnsupportedOperationException(String.format("Dataflow portable runner harness doesn't support windowing with %s", windowCoder));
                    }
                } else {
                    throw new UnsupportedOperationException("Dataflow portable runner harness only supports FullWindowedValueCoder");
                }
            } catch (IOException e) {
                throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
            } catch (Exception e) {
                // Coder probably wasn't a java coder
                OBJECT_MAPPER.writeValue(output, instructionOutput.getCodec());
                componentsBuilder.putCoders(coderId, RunnerApi.Coder.newBuilder().setSpec(RunnerApi.FunctionSpec.newBuilder().setPayload(output.toByteString())).build());
                // For non-java coder, hope it's GlobalWindows by default.
                // TODO(BEAM-6231): Actually discover the right windowing strategy.
                windowingStrategyId = globalWindowingStrategyId;
            }
        } catch (IOException e) {
            throw new IllegalArgumentException(String.format("Unable to encode coder %s for output %s", instructionOutput.getCodec(), instructionOutput), e);
        }
        // TODO(BEAM-6275): Set correct IsBounded on generated PCollections
        String pcollectionId = node.getPcollectionId();
        RunnerApi.PCollection pCollection = RunnerApi.PCollection.newBuilder().setCoderId(coderId).setWindowingStrategyId(windowingStrategyId).setIsBounded(RunnerApi.IsBounded.Enum.BOUNDED).build();
        nodesToPCollections.put(node, pcollectionId);
        componentsBuilder.putPcollections(pcollectionId, pCollection);
        // is set
        if (isExecutableStageOutputPCollection(input, node)) {
            executableStageOutputs.add(PipelineNode.pCollection(pcollectionId, pCollection));
        }
        if (isExecutableStageInputPCollection(input, node)) {
            executableStageInputs.add(PipelineNode.pCollection(pcollectionId, pCollection));
        }
    }
    componentsBuilder.putAllCoders(sdkComponents.toComponents().getCodersMap());
    Set<PTransformNode> executableStageTransforms = new HashSet<>();
    Set<TimerReference> executableStageTimers = new HashSet<>();
    List<UserStateId> userStateIds = new ArrayList<>();
    Set<SideInputReference> executableStageSideInputs = new HashSet<>();
    for (ParallelInstructionNode node : Iterables.filter(input.nodes(), ParallelInstructionNode.class)) {
        ImmutableMap.Builder<String, PCollectionNode> sideInputIds = ImmutableMap.builder();
        ParallelInstruction parallelInstruction = node.getParallelInstruction();
        String ptransformId = "generatedPtransform" + idGenerator.getId();
        ptransformIdToNameContexts.put(ptransformId, NameContext.create(null, parallelInstruction.getOriginalName(), parallelInstruction.getSystemName(), parallelInstruction.getName()));
        RunnerApi.PTransform.Builder pTransform = RunnerApi.PTransform.newBuilder();
        RunnerApi.FunctionSpec.Builder transformSpec = RunnerApi.FunctionSpec.newBuilder();
        List<String> timerIds = new ArrayList<>();
        if (parallelInstruction.getParDo() != null) {
            ParDoInstruction parDoInstruction = parallelInstruction.getParDo();
            CloudObject userFnSpec = CloudObject.fromSpec(parDoInstruction.getUserFn());
            String userFnClassName = userFnSpec.getClassName();
            if (userFnClassName.equals("CombineValuesFn") || userFnClassName.equals("KeyedCombineFn")) {
                transformSpec = transformCombineValuesFnToFunctionSpec(userFnSpec);
                ptransformIdToPCollectionViews.put(ptransformId, Collections.emptyList());
            } else {
                String parDoPTransformId = getString(userFnSpec, PropertyNames.SERIALIZED_FN);
                RunnerApi.PTransform parDoPTransform = pipeline.getComponents().getTransformsOrDefault(parDoPTransformId, null);
                // TODO: only the non-null branch should exist; for migration ease only
                if (parDoPTransform != null) {
                    checkArgument(parDoPTransform.getSpec().getUrn().equals(PTransformTranslation.PAR_DO_TRANSFORM_URN), "Found transform \"%s\" for ParallelDo instruction, " + " but that transform had unexpected URN \"%s\" (expected \"%s\")", parDoPTransformId, parDoPTransform.getSpec().getUrn(), PTransformTranslation.PAR_DO_TRANSFORM_URN);
                    RunnerApi.ParDoPayload parDoPayload;
                    try {
                        parDoPayload = RunnerApi.ParDoPayload.parseFrom(parDoPTransform.getSpec().getPayload());
                    } catch (InvalidProtocolBufferException exc) {
                        throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
                    }
                    // user timers and user state.
                    for (Map.Entry<String, RunnerApi.TimerFamilySpec> entry : parDoPayload.getTimerFamilySpecsMap().entrySet()) {
                        timerIds.add(entry.getKey());
                    }
                    for (Map.Entry<String, RunnerApi.StateSpec> entry : parDoPayload.getStateSpecsMap().entrySet()) {
                        UserStateId.Builder builder = UserStateId.newBuilder();
                        builder.setTransformId(parDoPTransformId);
                        builder.setLocalName(entry.getKey());
                        userStateIds.add(builder.build());
                    }
                    // To facilitate the creation of Set executableStageSideInputs.
                    for (String sideInputTag : parDoPayload.getSideInputsMap().keySet()) {
                        String sideInputPCollectionId = parDoPTransform.getInputsOrThrow(sideInputTag);
                        RunnerApi.PCollection sideInputPCollection = pipeline.getComponents().getPcollectionsOrThrow(sideInputPCollectionId);
                        pTransform.putInputs(sideInputTag, sideInputPCollectionId);
                        PCollectionNode pCollectionNode = PipelineNode.pCollection(sideInputPCollectionId, sideInputPCollection);
                        sideInputIds.put(sideInputTag, pCollectionNode);
                    }
                    // To facilitate the creation of Map(ptransformId -> pCollectionView), which is
                    // required by constructing an ExecutableStageNode.
                    ImmutableList.Builder<PCollectionView<?>> pcollectionViews = ImmutableList.builder();
                    for (Map.Entry<String, RunnerApi.SideInput> sideInputEntry : parDoPayload.getSideInputsMap().entrySet()) {
                        pcollectionViews.add(RegisterNodeFunction.transformSideInputForRunner(pipeline, parDoPTransform, sideInputEntry.getKey(), sideInputEntry.getValue()));
                    }
                    ptransformIdToPCollectionViews.put(ptransformId, pcollectionViews.build());
                    transformSpec.setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(parDoPayload.toByteString());
                } else {
                    // legacy path - bytes are the FunctionSpec's payload field, basically, and
                    // SDKs expect it in the PTransform's payload field
                    byte[] userFnBytes = getBytes(userFnSpec, PropertyNames.SERIALIZED_FN);
                    transformSpec.setUrn(ParDoTranslation.CUSTOM_JAVA_DO_FN_URN).setPayload(ByteString.copyFrom(userFnBytes));
                }
                if (parDoInstruction.getSideInputs() != null) {
                    ptransformIdToSideInputInfos.put(ptransformId, forSideInputInfos(parDoInstruction.getSideInputs(), true));
                }
            }
        } else if (parallelInstruction.getRead() != null) {
            ReadInstruction readInstruction = parallelInstruction.getRead();
            CloudObject sourceSpec = CloudObject.fromSpec(CloudSourceUtils.flattenBaseSpecs(readInstruction.getSource()).getSpec());
            // TODO: Need to plumb through the SDK specific function spec.
            transformSpec.setUrn(JAVA_SOURCE_URN);
            try {
                byte[] serializedSource = Base64.getDecoder().decode(getString(sourceSpec, SERIALIZED_SOURCE));
                ByteString sourceByteString = ByteString.copyFrom(serializedSource);
                transformSpec.setPayload(sourceByteString);
            } catch (Exception e) {
                throw new IllegalArgumentException(String.format("Unable to process Read %s", parallelInstruction), e);
            }
        } else if (parallelInstruction.getFlatten() != null) {
            transformSpec.setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN);
        } else {
            throw new IllegalArgumentException(String.format("Unknown type of ParallelInstruction %s", parallelInstruction));
        }
        // predecessor in a ParDo. This PCollection is called the "main input".
        for (Node predecessorOutput : input.predecessors(node)) {
            pTransform.putInputs("generatedInput" + idGenerator.getId(), nodesToPCollections.get(predecessorOutput));
        }
        for (Edge edge : input.outEdges(node)) {
            Node nodeOutput = input.incidentNodes(edge).target();
            MultiOutputInfoEdge edge2 = (MultiOutputInfoEdge) edge;
            pTransform.putOutputs(edge2.getMultiOutputInfo().getTag(), nodesToPCollections.get(nodeOutput));
        }
        pTransform.setSpec(transformSpec);
        PTransformNode pTransformNode = PipelineNode.pTransform(ptransformId, pTransform.build());
        executableStageTransforms.add(pTransformNode);
        for (String timerId : timerIds) {
            executableStageTimers.add(TimerReference.of(pTransformNode, timerId));
        }
        ImmutableMap<String, PCollectionNode> sideInputIdToPCollectionNodes = sideInputIds.build();
        for (String sideInputTag : sideInputIdToPCollectionNodes.keySet()) {
            SideInputReference sideInputReference = SideInputReference.of(pTransformNode, sideInputTag, sideInputIdToPCollectionNodes.get(sideInputTag));
            executableStageSideInputs.add(sideInputReference);
        }
        executableStageTransforms.add(pTransformNode);
    }
    if (executableStageInputs.size() != 1) {
        throw new UnsupportedOperationException("ExecutableStage only support one input PCollection");
    }
    PCollectionNode executableInput = executableStageInputs.iterator().next();
    RunnerApi.Components executableStageComponents = componentsBuilder.build();
    // Get Environment from ptransform, otherwise, use JAVA_SDK_HARNESS_ENVIRONMENT as default.
    Environment executableStageEnv = getEnvironmentFromPTransform(executableStageComponents, executableStageTransforms);
    if (executableStageEnv == null) {
        executableStageEnv = Environments.JAVA_SDK_HARNESS_ENVIRONMENT;
    }
    Set<UserStateReference> executableStageUserStateReference = new HashSet<>();
    for (UserStateId userStateId : userStateIds) {
        executableStageUserStateReference.add(UserStateReference.fromUserStateId(userStateId, executableStageComponents));
    }
    ExecutableStage executableStage = ImmutableExecutableStage.ofFullComponents(executableStageComponents, executableStageEnv, executableInput, executableStageSideInputs, executableStageUserStateReference, executableStageTimers, executableStageTransforms, executableStageOutputs, DEFAULT_WIRE_CODER_SETTINGS);
    return ExecutableStageNode.create(executableStage, ptransformIdToNameContexts.build(), ptransformIdToSideInputInfos.build(), ptransformIdToPCollectionViews.build());
}
Also used : HashMap(java.util.HashMap) MultiOutputInfo(com.google.api.services.dataflow.model.MultiOutputInfo) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) ExecutableStageNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ExecutableStageNode) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) ArrayList(java.util.ArrayList) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) SideInputReference(org.apache.beam.runners.core.construction.graph.SideInputReference) ImmutableExecutableStage(org.apache.beam.runners.core.construction.graph.ImmutableExecutableStage) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) HashSet(java.util.HashSet) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) 
ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) PCollectionView(org.apache.beam.sdk.values.PCollectionView) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap) IntervalWindowCoder(org.apache.beam.sdk.transforms.windowing.IntervalWindow.IntervalWindowCoder) RemoteGrpcPortNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.RemoteGrpcPortNode) TimerReference(org.apache.beam.runners.core.construction.graph.TimerReference) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) ReadInstruction(com.google.api.services.dataflow.model.ReadInstruction) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) NameContext(org.apache.beam.runners.dataflow.worker.counters.NameContext) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) IOException(java.io.IOException) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) 
FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) UserStateId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.UserStateId) UserStateReference(org.apache.beam.runners.core.construction.graph.UserStateReference) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge)

Example 40 with PCollectionView

use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

the class InsertFetchAndFilterStreamingSideInputNodes method forNetwork.

/**
 * Places a {@link FetchAndFilterStreamingSideInputsNode} in front of every ParDo in
 * {@code network} that executes in the SDK harness and whose {@code ParDoPayload} declares at
 * least one side input.
 *
 * <p>When no pipeline is available the network is returned untouched. Otherwise the network is
 * modified in place and the same instance is returned.
 *
 * @param network the instruction graph to rewire
 * @return {@code network}, after any rewiring has been applied
 */
public MutableNetwork<Node, Edge> forNetwork(MutableNetwork<Node, Edge> network) {
    if (pipeline == null) {
        return network;
    }
    RunnerApi.Components components = pipeline.getComponents();
    RehydratedComponents rehydratedComponents = RehydratedComponents.forComponents(components);
    // Walk a snapshot of the instruction nodes: the loop body adds nodes and edges to the network.
    ImmutableList<ParallelInstructionNode> instructionNodes = ImmutableList.copyOf(Iterables.filter(network.nodes(), ParallelInstructionNode.class));
    for (ParallelInstructionNode node : instructionNodes) {
        // If this isn't a ParDo, or it doesn't execute in the SDK harness, there is nothing to do.
        ParDoInstruction parDo = node.getParallelInstruction().getParDo();
        if (parDo == null) {
            continue;
        }
        if (!ExecutionLocation.SDK_HARNESS.equals(node.getExecutionLocation())) {
            continue;
        }
        CloudObject fnSpec = CloudObject.fromSpec(parDo.getUserFn());
        // The serialized-fn property is used below as the id of the ParDo's PTransform.
        String transformId = getString(fnSpec, PropertyNames.SERIALIZED_FN);
        // Combine nodes carry CombinePayloads, which have no side inputs, so they are skipped.
        String fnClassName = fnSpec.getClassName();
        boolean isCombineFn = "CombineValuesFn".equals(fnClassName) || "KeyedCombineFn".equals(fnClassName);
        if (isCombineFn) {
            continue;
        }
        RunnerApi.PTransform transform = components.getTransformsOrDefault(transformId, null);
        // TODO: only the non-null branch should exist; for migration ease only
        if (transform == null) {
            continue;
        }
        RunnerApi.ParDoPayload payload;
        try {
            payload = RunnerApi.ParDoPayload.parseFrom(transform.getSpec().getPayload());
        } catch (InvalidProtocolBufferException exc) {
            throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
        }
        // A ParDo without side inputs needs no fetch-and-filter node.
        if (payload.getSideInputsMap().isEmpty()) {
            continue;
        }
        // The main input is the single transform input that is not declared as a side input.
        String mainInputLocalName = Iterables.getOnlyElement(Sets.difference(transform.getInputsMap().keySet(), payload.getSideInputsMap().keySet()));
        String mainInputPCollectionId = transform.getInputsOrThrow(mainInputLocalName);
        String windowingStrategyId = components.getPcollectionsOrThrow(mainInputPCollectionId).getWindowingStrategyId();
        RunnerApi.WindowingStrategy strategyProto = components.getWindowingStrategiesOrThrow(windowingStrategyId);
        WindowingStrategy mainInputWindowing;
        try {
            mainInputWindowing = WindowingStrategyTranslation.fromProto(strategyProto, rehydratedComponents);
        } catch (InvalidProtocolBufferException e) {
            throw new IllegalStateException(String.format("Unable to decode windowing strategy %s.", strategyProto), e);
        }
        // Collect, per side input view, the window mapping fn that the SDK will be asked to apply.
        ImmutableMap.Builder<PCollectionView<?>, RunnerApi.FunctionSpec> windowMappingFnsByView = ImmutableMap.builder();
        payload.getSideInputsMap().forEach((tag, sideInputSpec) -> {
            PCollectionView<?> view = RegisterNodeFunction.transformSideInputForRunner(pipeline, transform, tag, sideInputSpec);
            windowMappingFnsByView.put(view, sideInputSpec.getWindowMappingFn());
        });
        ImmutableMap<PCollectionView<?>, RunnerApi.FunctionSpec> windowMappingFns = windowMappingFnsByView.build();
        NameContext handlerNameContext = NameContext.create(null, node.getParallelInstruction().getOriginalName(), node.getParallelInstruction().getSystemName(), node.getParallelInstruction().getName());
        Node sideInputHandler = FetchAndFilterStreamingSideInputsNode.create(mainInputWindowing, windowMappingFns, handlerNameContext);
        // Rewire predecessor -> ParDo into
        // predecessor -> side input handler -> copy of predecessor -> ParDo,
        // so the handler sits between the main input and the ParDo.
        Edge mainInputEdge = Iterables.getOnlyElement(network.inEdges(node));
        InstructionOutputNode upstream = (InstructionOutputNode) network.incidentNodes(mainInputEdge).source();
        InstructionOutputNode upstreamCopy = InstructionOutputNode.create(upstream.getInstructionOutput(), upstream.getPcollectionId());
        network.removeEdge(mainInputEdge);
        network.addNode(sideInputHandler);
        network.addNode(upstreamCopy);
        // Each new edge is a fresh clone of the removed main-input edge.
        network.addEdge(upstream, sideInputHandler, mainInputEdge.clone());
        network.addEdge(sideInputHandler, upstreamCopy, mainInputEdge.clone());
        network.addEdge(upstreamCopy, node, mainInputEdge.clone());
    }
    return network;
}
Also used : FetchAndFilterStreamingSideInputsNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.FetchAndFilterStreamingSideInputsNode) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) PCollectionView(org.apache.beam.sdk.values.PCollectionView) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge)

Aggregations

PCollectionView (org.apache.beam.sdk.values.PCollectionView)67 Map (java.util.Map)29 HashMap (java.util.HashMap)28 Test (org.junit.Test)28 TupleTag (org.apache.beam.sdk.values.TupleTag)27 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)22 Coder (org.apache.beam.sdk.coders.Coder)21 KV (org.apache.beam.sdk.values.KV)20 Instant (org.joda.time.Instant)20 KvCoder (org.apache.beam.sdk.coders.KvCoder)18 WindowedValue (org.apache.beam.sdk.util.WindowedValue)18 PCollection (org.apache.beam.sdk.values.PCollection)18 DoFn (org.apache.beam.sdk.transforms.DoFn)16 ArrayList (java.util.ArrayList)15 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)14 List (java.util.List)13 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)13 IOException (java.io.IOException)12 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)12 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)10