Example 51 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

From the class GreedyStageFuserTest, method fusesFlattenWithDifferentEnvironmentInputs.

@Test
public void fusesFlattenWithDifferentEnvironmentInputs() {
    // (impulse.out) -> read -> read.out \
    //                                    -> flatten -> flatten.out -> window -> window.out
    // (impulse.out) -> envRead -> envRead.out /
    // fuses into
    // read -> read.out -> flatten -> flatten.out -> window -> window.out
    // envRead -> envRead.out -> flatten -> (flatten.out)
    // (flatten.out) -> window -> window.out
    PTransform readTransform = PTransform.newBuilder()
        .putInputs("input", "impulse.out")
        .putOutputs("output", "read.out")
        .setSpec(FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
            .setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString()))
        .setEnvironmentId("common")
        .build();
    PTransform otherEnvRead = PTransform.newBuilder()
        .putInputs("impulse", "impulse.out")
        .putOutputs("output", "envRead.out")
        .setSpec(FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
            .setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString()))
        .setEnvironmentId("rare")
        .build();
    PTransform flattenTransform = PTransform.newBuilder()
        .putInputs("readInput", "read.out")
        .putInputs("otherEnvInput", "envRead.out")
        .putOutputs("output", "flatten.out")
        .setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN))
        .build();
    PTransform windowTransform = PTransform.newBuilder()
        .putInputs("input", "flatten.out")
        .putOutputs("output", "window.out")
        .setSpec(FunctionSpec.newBuilder()
            .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
            .setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString()))
        .setEnvironmentId("common")
        .build();
    Components components = partialComponents.toBuilder()
        .putTransforms("read", readTransform)
        .putPcollections("read.out", PCollection.newBuilder().setUniqueName("read.out").build())
        .putTransforms("envRead", otherEnvRead)
        .putPcollections("envRead.out", PCollection.newBuilder().setUniqueName("envRead.out").build())
        .putTransforms("flatten", flattenTransform)
        .putPcollections("flatten.out", PCollection.newBuilder().setUniqueName("flatten.out").build())
        .putTransforms("window", windowTransform)
        .putPcollections("window.out", PCollection.newBuilder().setUniqueName("window.out").build())
        .putEnvironments("common", Environments.createDockerEnvironment("common"))
        .putEnvironments("rare", Environments.createDockerEnvironment("rare"))
        .build();
    QueryablePipeline p = QueryablePipeline.forPrimitivesIn(components);
    ExecutableStage subgraph = GreedyStageFuser.forGrpcPortRead(p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("read", readTransform)));
    assertThat(subgraph.getOutputPCollections(), emptyIterable());
    assertThat(subgraph, hasSubtransforms("read", "flatten", "window"));
    // Flatten shows up in both subgraphs, but each element traverses only one path into the
    // flatten.
    ExecutableStage readFromOtherEnv = GreedyStageFuser.forGrpcPortRead(p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("envRead", otherEnvRead)));
    assertThat(readFromOtherEnv.getOutputPCollections(), contains(PipelineNode.pCollection("flatten.out", components.getPcollectionsOrThrow("flatten.out"))));
    assertThat(readFromOtherEnv, hasSubtransforms("envRead", "flatten"));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Test(org.junit.Test)
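
Every example on this page leans on the same protobuf builder pattern for assembling a Components message by hand. A minimal standalone sketch of that pattern, using only builder calls that appear verbatim in the test above; the "myRead"/"myRead.out" ids are invented for illustration:

// Minimal sketch: hand-assembling a RunnerApi.Components proto.
// The builder calls mirror the test above; the "myRead"/"myRead.out" ids are hypothetical.
Components components = Components.newBuilder()
    .putTransforms("myRead", PTransform.newBuilder()
        .putInputs("input", "impulse.out")
        .putOutputs("output", "myRead.out")
        .build())
    .putPcollections("myRead.out", PCollection.newBuilder().setUniqueName("myRead.out").build())
    .putEnvironments("common", Environments.createDockerEnvironment("common"))
    .build();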

Example 52 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

From the class PTransformTranslationTest, method toAndFromProto.

@Test
public void toAndFromProto() throws IOException {
    SdkComponents components = SdkComponents.create(spec.getTransform().getPipeline().getOptions());
    RunnerApi.PTransform converted = convert(spec, components);
    Components protoComponents = components.toComponents();
    // Sanity checks
    assertThat(converted.getInputsCount(), equalTo(spec.getTransform().getInputs().size()));
    assertThat(converted.getOutputsCount(), equalTo(spec.getTransform().getOutputs().size()));
    assertThat(converted.getSubtransformsCount(), equalTo(spec.getChildren().size()));
    assertThat(converted.getUniqueName(), equalTo(spec.getTransform().getFullName()));
    for (PValue inputValue : spec.getTransform().getInputs().values()) {
        PCollection<?> inputPc = (PCollection<?>) inputValue;
        protoComponents.getPcollectionsOrThrow(components.registerPCollection(inputPc));
    }
    for (PValue outputValue : spec.getTransform().getOutputs().values()) {
        PCollection<?> outputPc = (PCollection<?>) outputValue;
        protoComponents.getPcollectionsOrThrow(components.registerPCollection(outputPc));
    }
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PCollection(org.apache.beam.sdk.values.PCollection) PValue(org.apache.beam.sdk.values.PValue) Test(org.junit.Test)
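
Stripped of assertions, the round trip this test exercises reduces to the sketch below; pc stands for a hypothetical PCollection, and registerPCollection can throw IOException, which is why the test method declares it.

// Sketch of the register-then-look-up round trip (pc is a hypothetical PCollection).
SdkComponents sdkComponents = SdkComponents.create(pipeline.getOptions());
String pcId = sdkComponents.registerPCollection(pc); // may throw IOException
RunnerApi.PCollection pcProto = sdkComponents.toComponents().getPcollectionsOrThrow(pcId);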

Example 53 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

From the class BatchSideInputHandlerFactoryTest, method createExecutableStage.

private static ExecutableStage createExecutableStage(Collection<SideInputReference> sideInputs) {
    Components components = Components.getDefaultInstance();
    Environment environment = Environment.getDefaultInstance();
    PCollectionNode inputCollection = PipelineNode.pCollection("collection-id", RunnerApi.PCollection.getDefaultInstance());
    return ImmutableExecutableStage.of(components, environment, inputCollection, sideInputs, Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), DEFAULT_WIRE_CODER_SETTINGS);
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Environment(org.apache.beam.model.pipeline.v1.RunnerApi.Environment) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)
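
A hedged usage sketch: since every argument other than the side inputs is a default instance or an empty list, passing an empty collection yields a bare skeleton stage, which is presumably how the surrounding test class invokes this helper.

// Hypothetical call site: an executable stage with no side inputs at all.
ExecutableStage emptyStage = createExecutableStage(Collections.emptyList());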

Example 54 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

From the class SparkBatchPortablePipelineTranslator, method translateExecutableStage.

private static <InputT, OutputT, SideInputT> void translateExecutableStage(PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {
    RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transformNode.getTransform().getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String inputPCollectionId = stagePayload.getInput();
    Dataset inputDataset = context.popDataset(inputPCollectionId);
    Map<String, String> outputs = transformNode.getTransform().getOutputsMap();
    BiMap<String, Integer> outputExtractionMap = createOutputMap(outputs.values());
    Components components = pipeline.getComponents();
    Coder windowCoder = getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder();
    ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>> broadcastVariables = broadcastSideInputs(stagePayload, context);
    JavaRDD<RawUnionValue> staged;
    if (stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0) {
        Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
        Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
        // Stateful stages require KV input so that elements can be grouped by key
        if (!(valueCoder instanceof KvCoder)) {
            throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for stateful DoFn '%s' must be KvCoder but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
        }
        Coder keyCoder = ((KvCoder) valueCoder).getKeyCoder();
        Coder innerValueCoder = ((KvCoder) valueCoder).getValueCoder();
        WindowingStrategy windowingStrategy = getWindowingStrategy(inputPCollectionId, components);
        WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
        WindowedValue.WindowedValueCoder wvCoder = WindowedValue.FullWindowedValueCoder.of(innerValueCoder, windowFn.windowCoder());
        JavaPairRDD<ByteArray, Iterable<WindowedValue<KV>>> groupedByKey = groupByKeyPair(inputDataset, keyCoder, wvCoder);
        SparkExecutableStageFunction<KV, SideInputT> function = new SparkExecutableStageFunction<>(context.getSerializableOptions(), stagePayload, context.jobInfo, outputExtractionMap, SparkExecutableStageContextFactory.getInstance(), broadcastVariables, MetricsAccumulator.getInstance(), windowCoder);
        staged = groupedByKey.flatMap(function.forPair());
    } else {
        JavaRDD<WindowedValue<InputT>> inputRdd = ((BoundedDataset<InputT>) inputDataset).getRDD();
        SparkExecutableStageFunction<InputT, SideInputT> function = new SparkExecutableStageFunction<>(context.getSerializableOptions(), stagePayload, context.jobInfo, outputExtractionMap, SparkExecutableStageContextFactory.getInstance(), broadcastVariables, MetricsAccumulator.getInstance(), windowCoder);
        staged = inputRdd.mapPartitions(function);
    }
    String intermediateId = getExecutableStageIntermediateId(transformNode);
    context.pushDataset(intermediateId, new Dataset() {

        @Override
        public void cache(String storageLevel, Coder<?> coder) {
            StorageLevel level = StorageLevel.fromString(storageLevel);
            staged.persist(level);
        }

        @Override
        public void action() {
            // Empty function to force computation of RDD.
            staged.foreach(TranslationUtils.emptyVoidFunction());
        }

        @Override
        public void setName(String name) {
            staged.setName(name);
        }
    });
    // pop dataset to mark RDD as used
    context.popDataset(intermediateId);
    for (String outputId : outputs.values()) {
        JavaRDD<WindowedValue<OutputT>> outputRdd = staged.flatMap(new SparkExecutableStageExtractionFunction<>(outputExtractionMap.get(outputId)));
        context.pushDataset(outputId, new BoundedDataset<>(outputRdd));
    }
    if (outputs.isEmpty()) {
        // After pipeline translation, we traverse the set of unconsumed PCollections and add a
        // no-op sink to each to make sure they are materialized by Spark. However, some SDK-executed
        // stages have no runner-visible output after fusion. We handle this case by adding a sink
        // here.
        JavaRDD<WindowedValue<OutputT>> outputRdd = staged.flatMap((rawUnionValue) -> Collections.emptyIterator());
        context.pushDataset(String.format("EmptyOutputSink_%d", context.nextSinkId()), new BoundedDataset<>(outputRdd));
    }
}
Also used : PipelineTranslatorUtils.getWindowingStrategy(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.getWindowingStrategy) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) ByteArray(org.apache.beam.runners.spark.util.ByteArray) List(java.util.List) StorageLevel(org.apache.spark.storage.StorageLevel) KvCoder(org.apache.beam.sdk.coders.KvCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) PipelineTranslatorUtils.getWindowedValueCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.getWindowedValueCoder) Coder(org.apache.beam.sdk.coders.Coder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) IOException(java.io.IOException) KV(org.apache.beam.sdk.values.KV) Tuple2(scala.Tuple2)
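
The pushDataset/popDataset calls form a simple ownership contract: every dataset pushed under an id must be popped exactly once by whichever translator consumes that PCollection. A sketch of the consuming side, using only calls that already appear in this example; outputPCollectionId is a stand-in name:

// Hypothetical downstream translator: claim the RDD pushed by translateExecutableStage.
Dataset upstream = context.popDataset(outputPCollectionId);
JavaRDD<WindowedValue<OutputT>> rdd = ((BoundedDataset<OutputT>) upstream).getRDD();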

Example 55 with Components

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Components in project beam by apache.

From the class SparkBatchPortablePipelineTranslator, method translateGroupByKey.

private static <K, V> void translateGroupByKey(PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {
    RunnerApi.Components components = pipeline.getComponents();
    String inputId = getInputId(transformNode);
    Dataset inputDataset = context.popDataset(inputId);
    JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
    WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
    KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
    Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
    Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
    WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
    WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    WindowedValue.WindowedValueCoder<V> wvCoder = WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());
    JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
    Partitioner partitioner = getPartitioner(context);
    // As this is batch, we can ignore triggering and allowed lateness parameters.
    if (windowingStrategy.getWindowFn().equals(new GlobalWindows()) && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
        // we can drop the windows and recover them later
        groupedByKeyAndWindow = GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(inputRdd, inputKeyCoder, inputValueCoder, partitioner);
    } else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // we can have a memory sensitive translation for non-merging windows
        groupedByKeyAndWindow = GroupNonMergingWindowsFunctions.groupByKeyAndWindow(inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
    } else {
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly = GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
        // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
        groupedByKeyAndWindow = groupedByKeyOnly.flatMap(new SparkGroupAlsoByWindowViaOutputBufferFn<>(windowingStrategy, new TranslationUtils.InMemoryStateInternalsFactory<>(), SystemReduceFn.buffering(inputValueCoder), context.serializablePipelineOptions));
    }
    context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
Also used : PipelineTranslatorUtils.getWindowingStrategy(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.getWindowingStrategy) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Partitioner(org.apache.spark.Partitioner) HashPartitioner(org.apache.spark.HashPartitioner) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) KvCoder(org.apache.beam.sdk.coders.KvCoder) JavaRDD(org.apache.spark.api.java.JavaRDD)
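
The branch selection above reduces to two predicates; restated in isolation with the same Beam calls the method already uses, nothing new:

// Fast path: global windows plus end-of-window timestamps mean windows can be
// dropped and recovered later.
boolean globalWindowFastPath =
    windowingStrategy.getWindowFn().equals(new GlobalWindows())
        && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW);
// Memory-sensitive path: available whenever the windows never merge.
boolean nonMergingPath = GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy);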

Aggregations

Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components): 49 usages
Test (org.junit.Test): 37 usages
PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform): 19 usages
PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode): 18 usages
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 12 usages
PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection): 10 usages
PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode): 10 usages
Map (java.util.Map): 9 usages
Pipeline (org.apache.beam.sdk.Pipeline): 9 usages
Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment): 7 usages
Collection (java.util.Collection): 6 usages
Collectors (java.util.stream.Collectors): 6 usages
List (java.util.List): 5 usages
ExecutableStagePayload (org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload): 5 usages
FunctionSpec (org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec): 5 usages
DeduplicationResult (org.apache.beam.runners.core.construction.graph.OutputDeduplicator.DeduplicationResult): 5 usages
ArrayList (java.util.ArrayList): 4 usages
Pipeline (org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline): 4 usages
PCollection (org.apache.beam.sdk.values.PCollection): 4 usages
ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 4 usages