Use of org.apache.beam.sdk.common.runner.v1.RunnerApi.Components in project beam by apache.
From the class GreedyStageFuserTest, the method fusesFlattenWithDifferentEnvironmentInputs:
@Test
public void fusesFlattenWithDifferentEnvironmentInputs() {
// (impulse.out) -> read -> read.out \
//                                     -> flatten -> flatten.out -> window -> window.out
// (impulse.out) -> envRead -> envRead.out /
// fuses into
// read -> read.out -> flatten -> flatten.out -> window -> window.out
// envRead -> envRead.out -> flatten -> (flatten.out)
// (flatten.out) -> window -> window.out
PTransform readTransform = PTransform.newBuilder()
    .putInputs("input", "impulse.out")
    .putOutputs("output", "read.out")
    .setSpec(FunctionSpec.newBuilder()
        .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
        .setPayload(ParDoPayload.newBuilder()
            .setDoFn(FunctionSpec.newBuilder())
            .build()
            .toByteString()))
    .setEnvironmentId("common")
    .build();
PTransform otherEnvRead = PTransform.newBuilder()
    .putInputs("impulse", "impulse.out")
    .putOutputs("output", "envRead.out")
    .setSpec(FunctionSpec.newBuilder()
        .setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN)
        .setPayload(ParDoPayload.newBuilder()
            .setDoFn(FunctionSpec.newBuilder())
            .build()
            .toByteString()))
    .setEnvironmentId("rare")
    .build();
PTransform flattenTransform = PTransform.newBuilder()
    .putInputs("readInput", "read.out")
    .putInputs("otherEnvInput", "envRead.out")
    .putOutputs("output", "flatten.out")
    .setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN))
    .build();
PTransform windowTransform = PTransform.newBuilder()
    .putInputs("input", "flatten.out")
    .putOutputs("output", "window.out")
    .setSpec(FunctionSpec.newBuilder()
        .setUrn(PTransformTranslation.ASSIGN_WINDOWS_TRANSFORM_URN)
        .setPayload(WindowIntoPayload.newBuilder()
            .setWindowFn(FunctionSpec.newBuilder())
            .build()
            .toByteString()))
    .setEnvironmentId("common")
    .build();
Components components = partialComponents.toBuilder()
    .putTransforms("read", readTransform)
    .putPcollections("read.out", PCollection.newBuilder().setUniqueName("read.out").build())
    .putTransforms("envRead", otherEnvRead)
    .putPcollections("envRead.out", PCollection.newBuilder().setUniqueName("envRead.out").build())
    .putTransforms("flatten", flattenTransform)
    .putPcollections("flatten.out", PCollection.newBuilder().setUniqueName("flatten.out").build())
    .putTransforms("window", windowTransform)
    .putPcollections("window.out", PCollection.newBuilder().setUniqueName("window.out").build())
    .putEnvironments("common", Environments.createDockerEnvironment("common"))
    .putEnvironments("rare", Environments.createDockerEnvironment("rare"))
    .build();
QueryablePipeline p = QueryablePipeline.forPrimitivesIn(components);
ExecutableStage subgraph = GreedyStageFuser.forGrpcPortRead(
    p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("read", readTransform)));
assertThat(subgraph.getOutputPCollections(), emptyIterable());
assertThat(subgraph, hasSubtransforms("read", "flatten", "window"));
// Flatten shows up in both of these subgraphs, but elements only go through a path to the
// flatten once.
ExecutableStage readFromOtherEnv = GreedyStageFuser.forGrpcPortRead(
    p, impulseOutputNode, ImmutableSet.of(PipelineNode.pTransform("envRead", otherEnvRead)));
assertThat(
    readFromOtherEnv.getOutputPCollections(),
    contains(PipelineNode.pCollection(
        "flatten.out", components.getPcollectionsOrThrow("flatten.out"))));
assertThat(readFromOtherEnv, hasSubtransforms("envRead", "flatten"));
}
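The hasSubtransforms assertion above is a helper defined in the test class, not a library matcher. A minimal sketch of what such a matcher could look like, assuming Hamcrest's TypeSafeMatcher and the getTransforms() accessor on ExecutableStage (the body here is an illustration, not Beam's actual implementation):

// Hypothetical helper: matches an ExecutableStage whose fused transforms
// have exactly the given ids. Assumes Hamcrest and Guava on the classpath.
private static Matcher<ExecutableStage> hasSubtransforms(String... ids) {
  Set<String> expected = ImmutableSet.copyOf(ids);
  return new TypeSafeMatcher<ExecutableStage>() {
    @Override
    protected boolean matchesSafely(ExecutableStage stage) {
      Set<String> actual = stage.getTransforms().stream()
          .map(PipelineNode.PTransformNode::getId)
          .collect(Collectors.toSet());
      return actual.equals(expected);
    }

    @Override
    public void describeTo(Description description) {
      description.appendText("an ExecutableStage with transforms ").appendValue(expected);
    }
  };
}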
Use of org.apache.beam.sdk.common.runner.v1.RunnerApi.Components in project beam by apache.
From the class PTransformTranslationTest, the method toAndFromProto:
@Test
public void toAndFromProto() throws IOException {
SdkComponents components = SdkComponents.create(spec.getTransform().getPipeline().getOptions());
RunnerApi.PTransform converted = convert(spec, components);
Components protoComponents = components.toComponents();
// Sanity checks
assertThat(converted.getInputsCount(), equalTo(spec.getTransform().getInputs().size()));
assertThat(converted.getOutputsCount(), equalTo(spec.getTransform().getOutputs().size()));
assertThat(converted.getSubtransformsCount(), equalTo(spec.getChildren().size()));
assertThat(converted.getUniqueName(), equalTo(spec.getTransform().getFullName()));
for (PValue inputValue : spec.getTransform().getInputs().values()) {
PCollection<?> inputPc = (PCollection<?>) inputValue;
protoComponents.getPcollectionsOrThrow(components.registerPCollection(inputPc));
}
for (PValue outputValue : spec.getTransform().getOutputs().values()) {
PCollection<?> outputPc = (PCollection<?>) outputValue;
protoComponents.getPcollectionsOrThrow(components.registerPCollection(outputPc));
}
}
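The two loops verify that every input and output PCollection of the original transform was registered in the SDK components and survives the conversion to proto. That pattern can be factored into a small helper; this is a sketch using only calls that appear in the test (the idempotence of registerPCollection is an assumption, flagged in the comment):

// Sketch: assert that a PCollection round-trips through SdkComponents.
// registerPCollection is assumed to return the already-assigned id when the
// PCollection was registered before; getPcollectionsOrThrow fails if absent.
private static void assertRegistered(
    SdkComponents components, Components protoComponents, PCollection<?> pc) throws IOException {
  String id = components.registerPCollection(pc);
  protoComponents.getPcollectionsOrThrow(id);
}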
Use of org.apache.beam.sdk.common.runner.v1.RunnerApi.Components in project beam by apache.
From the class BatchSideInputHandlerFactoryTest, the method createExecutableStage:
private static ExecutableStage createExecutableStage(Collection<SideInputReference> sideInputs) {
Components components = Components.getDefaultInstance();
Environment environment = Environment.getDefaultInstance();
PCollectionNode inputCollection = PipelineNode.pCollection("collection-id", RunnerApi.PCollection.getDefaultInstance());
return ImmutableExecutableStage.of(
    components,
    environment,
    inputCollection,
    sideInputs,
    Collections.emptyList(),
    Collections.emptyList(),
    Collections.emptyList(),
    Collections.emptyList(),
    DEFAULT_WIRE_CODER_SETTINGS);
}
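A caller of this factory supplies the SideInputReference collection; a hypothetical usage sketch, assuming the SideInputReference.of(transform, localName, collection) factory from the same graph package (the ids and local name are illustrative only):

// Hypothetical usage of createExecutableStage with one side input.
PTransformNode consumer =
    PipelineNode.pTransform("transform-id", RunnerApi.PTransform.getDefaultInstance());
PCollectionNode sideCollection =
    PipelineNode.pCollection("side-input-id", RunnerApi.PCollection.getDefaultInstance());
ExecutableStage stage = createExecutableStage(
    Collections.singletonList(SideInputReference.of(consumer, "side_input", sideCollection)));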
Use of org.apache.beam.sdk.common.runner.v1.RunnerApi.Components in project beam by apache.
From the class SparkBatchPortablePipelineTranslator, the method translateExecutableStage:
private static <InputT, OutputT, SideInputT> void translateExecutableStage(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {
RunnerApi.ExecutableStagePayload stagePayload;
try {
stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(
    transformNode.getTransform().getSpec().getPayload());
} catch (IOException e) {
throw new RuntimeException(e);
}
String inputPCollectionId = stagePayload.getInput();
Dataset inputDataset = context.popDataset(inputPCollectionId);
Map<String, String> outputs = transformNode.getTransform().getOutputsMap();
BiMap<String, Integer> outputExtractionMap = createOutputMap(outputs.values());
Components components = pipeline.getComponents();
Coder windowCoder = getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder();
ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>>
    broadcastVariables = broadcastSideInputs(stagePayload, context);
JavaRDD<RawUnionValue> staged;
if (stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0) {
Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
// Stateful stages require KV input so that elements can be grouped by key.
if (!(valueCoder instanceof KvCoder)) {
throw new IllegalStateException(String.format(
    Locale.ENGLISH,
    "The element coder for stateful DoFn '%s' must be KvCoder but is: %s",
    inputPCollectionId,
    valueCoder.getClass().getSimpleName()));
}
Coder keyCoder = ((KvCoder) valueCoder).getKeyCoder();
Coder innerValueCoder = ((KvCoder) valueCoder).getValueCoder();
WindowingStrategy windowingStrategy = getWindowingStrategy(inputPCollectionId, components);
WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
WindowedValue.WindowedValueCoder wvCoder = WindowedValue.FullWindowedValueCoder.of(innerValueCoder, windowFn.windowCoder());
JavaPairRDD<ByteArray, Iterable<WindowedValue<KV>>> groupedByKey =
    groupByKeyPair(inputDataset, keyCoder, wvCoder);
SparkExecutableStageFunction<KV, SideInputT> function = new SparkExecutableStageFunction<>(
    context.getSerializableOptions(),
    stagePayload,
    context.jobInfo,
    outputExtractionMap,
    SparkExecutableStageContextFactory.getInstance(),
    broadcastVariables,
    MetricsAccumulator.getInstance(),
    windowCoder);
staged = groupedByKey.flatMap(function.forPair());
} else {
JavaRDD<WindowedValue<InputT>> inputRdd2 = ((BoundedDataset<InputT>) inputDataset).getRDD();
SparkExecutableStageFunction<InputT, SideInputT> function2 = new SparkExecutableStageFunction<>(
    context.getSerializableOptions(),
    stagePayload,
    context.jobInfo,
    outputExtractionMap,
    SparkExecutableStageContextFactory.getInstance(),
    broadcastVariables,
    MetricsAccumulator.getInstance(),
    windowCoder);
staged = inputRdd2.mapPartitions(function2);
}
String intermediateId = getExecutableStageIntermediateId(transformNode);
context.pushDataset(intermediateId, new Dataset() {
@Override
public void cache(String storageLevel, Coder<?> coder) {
StorageLevel level = StorageLevel.fromString(storageLevel);
staged.persist(level);
}
@Override
public void action() {
// Empty function to force computation of RDD.
staged.foreach(TranslationUtils.emptyVoidFunction());
}
@Override
public void setName(String name) {
staged.setName(name);
}
});
// pop dataset to mark RDD as used
context.popDataset(intermediateId);
for (String outputId : outputs.values()) {
JavaRDD<WindowedValue<OutputT>> outputRdd = staged.flatMap(
    new SparkExecutableStageExtractionFunction<>(outputExtractionMap.get(outputId)));
context.pushDataset(outputId, new BoundedDataset<>(outputRdd));
}
if (outputs.isEmpty()) {
// After pipeline translation, we traverse the set of unconsumed PCollections and add a
// no-op sink to each to make sure they are materialized by Spark. However, some SDK-executed
// stages have no runner-visible output after fusion. We handle this case by adding a sink
// here.
JavaRDD<WindowedValue<OutputT>> outputRdd =
    staged.flatMap((rawUnionValue) -> Collections.emptyIterator());
context.pushDataset(
    String.format("EmptyOutputSink_%d", context.nextSinkId()), new BoundedDataset<>(outputRdd));
}
}
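createOutputMap is a private helper of this translator that assigns each output PCollection id an integer union tag, which SparkExecutableStageFunction and SparkExecutableStageExtractionFunction use to route RawUnionValue elements. A minimal sketch of such a helper, assuming Guava's BiMap/HashBiMap and java.util.TreeSet (the sorted, deterministic assignment is an assumption, not a quote of Beam's code):

// Hypothetical sketch: give each output id a distinct integer tag, assigned
// in sorted order so repeated translations produce the same mapping.
private static BiMap<String, Integer> createOutputMap(Collection<String> outputIds) {
  BiMap<String, Integer> outputMap = HashBiMap.create();
  int tag = 0;
  for (String outputId : new TreeSet<>(outputIds)) {
    outputMap.put(outputId, tag++);
  }
  return outputMap;
}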
Use of org.apache.beam.sdk.common.runner.v1.RunnerApi.Components in project beam by apache.
From the class SparkBatchPortablePipelineTranslator, the method translateGroupByKey:
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {
RunnerApi.Components components = pipeline.getComponents();
String inputId = getInputId(transformNode);
Dataset inputDataset = context.popDataset(inputId);
JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
WindowedValue.WindowedValueCoder<V> wvCoder =
    WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());
JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
Partitioner partitioner = getPartitioner(context);
// As this is batch, we can ignore triggering and allowed lateness parameters.
if (windowingStrategy.getWindowFn().equals(new GlobalWindows())
    && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
// we can drop the windows and recover them later
groupedByKeyAndWindow = GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(
    inputRdd, inputKeyCoder, inputValueCoder, partitioner);
} else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
// we can have a memory sensitive translation for non-merging windows
groupedByKeyAndWindow = GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
    inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
} else {
JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
    GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
// for batch, GroupAlsoByWindow uses an in-memory StateInternals.
groupedByKeyAndWindow = groupedByKeyOnly.flatMap(
    new SparkGroupAlsoByWindowViaOutputBufferFn<>(
        windowingStrategy,
        new TranslationUtils.InMemoryStateInternalsFactory<>(),
        SystemReduceFn.buffering(inputValueCoder),
        context.serializablePipelineOptions));
}
context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
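The first branch is sound because with GlobalWindows and END_OF_WINDOW timestamp combining, every element lives in the single global window and the grouped result's timestamp is fully determined, so the windows can be dropped before the shuffle and re-attached afterwards. A small illustration of the recovery step (the helper name is hypothetical; the WindowedValue and GlobalWindow calls are real Beam APIs):

// Hypothetical illustration: re-wrap a grouped result in the global window
// with the end-of-window timestamp, losing no information.
static <K, V> WindowedValue<KV<K, Iterable<V>>> rewindow(K key, Iterable<V> values) {
  return WindowedValue.timestampedValueInGlobalWindow(
      KV.of(key, values), GlobalWindow.INSTANCE.maxTimestamp());
}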