use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class OutputDeduplicator method createFlattenOfPartials.
private static PTransform createFlattenOfPartials(String transformId, String outputId, Collection<PCollectionNode> generatedInputs) {
PTransform.Builder newFlattenBuilder = PTransform.newBuilder();
int i = 0;
for (PCollectionNode generatedInput : generatedInputs) {
String localInputId = String.format("input_%s", i);
i++;
newFlattenBuilder.putInputs(localInputId, generatedInput.getId());
}
// Flatten all of the new partial nodes together.
return newFlattenBuilder.setUniqueName(transformId).putOutputs("output", outputId).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)).build();
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class ImmutableExecutableStageTest method ofFullComponentsOnlyHasStagePTransforms.
@Test
public void ofFullComponentsOnlyHasStagePTransforms() throws Exception {
Environment env = Environments.createDockerEnvironment("foo");
PTransform pt = PTransform.newBuilder().putInputs("input", "input.out").putInputs("side_input", "sideInput.in").putInputs("timer", "timer.pc").putOutputs("output", "output.out").putOutputs("timer", "timer.pc").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(RunnerApi.FunctionSpec.newBuilder()).putSideInputs("side_input", RunnerApi.SideInput.getDefaultInstance()).putStateSpecs("user_state", RunnerApi.StateSpec.getDefaultInstance()).putTimerFamilySpecs("timer", RunnerApi.TimerFamilySpec.getDefaultInstance()).build().toByteString())).build();
PCollection input = PCollection.newBuilder().setUniqueName("input.out").build();
PCollection sideInput = PCollection.newBuilder().setUniqueName("sideInput.in").build();
PCollection timer = PCollection.newBuilder().setUniqueName("timer.pc").build();
PCollection output = PCollection.newBuilder().setUniqueName("output.out").build();
Components components = Components.newBuilder().putTransforms("pt", pt).putTransforms("other_pt", PTransform.newBuilder().setUniqueName("other").build()).putPcollections("input.out", input).putPcollections("sideInput.in", sideInput).putPcollections("timer.pc", timer).putPcollections("output.out", output).putEnvironments("foo", env).build();
PTransformNode transformNode = PipelineNode.pTransform("pt", pt);
SideInputReference sideInputRef = SideInputReference.of(transformNode, "side_input", PipelineNode.pCollection("sideInput.in", sideInput));
UserStateReference userStateRef = UserStateReference.of(transformNode, "user_state", PipelineNode.pCollection("input.out", input));
TimerReference timerRef = TimerReference.of(transformNode, "timer");
ImmutableExecutableStage stage = ImmutableExecutableStage.ofFullComponents(components, env, PipelineNode.pCollection("input.out", input), Collections.singleton(sideInputRef), Collections.singleton(userStateRef), Collections.singleton(timerRef), Collections.singleton(PipelineNode.pTransform("pt", pt)), Collections.singleton(PipelineNode.pCollection("output.out", output)), DEFAULT_WIRE_CODER_SETTINGS);
assertThat(stage.getComponents().containsTransforms("pt"), is(true));
assertThat(stage.getComponents().containsTransforms("other_pt"), is(false));
PTransform stagePTransform = stage.toPTransform("foo");
assertThat(stagePTransform.getOutputsMap(), hasValue("output.out"));
assertThat(stagePTransform.getOutputsCount(), equalTo(1));
assertThat(stagePTransform.getInputsMap(), allOf(hasValue("input.out"), hasValue("sideInput.in")));
assertThat(stagePTransform.getInputsCount(), equalTo(2));
ExecutableStagePayload payload = ExecutableStagePayload.parseFrom(stagePTransform.getSpec().getPayload());
assertThat(payload.getTransformsList(), contains("pt"));
assertThat(ExecutableStage.fromPayload(payload), equalTo(stage));
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class OutputDeduplicatorTest method duplicateOverStages.
@Test
public void duplicateOverStages() {
/* When multiple stages and a runner-executed transform produce a PCollection, all should be
* replaced with synthetic flattens.
* original graph:
* --> one -> .out \
* red -> .out | -> shared -> .out -> blue -> .out
* --> two -> .out /
*
* fused graph:
* --> [one -> .out -> shared ->] .out
* red -> .out | (shared.out) -> blue -> .out
* --> [two -> .out -> shared ->] .out
*
* deduplicated graph:
* --> [one -> .out -> shared ->] .out:0 \
* red -> .out | -> shared -> .out -> blue ->.out
* --> [two -> .out -> shared ->] .out:1 /
*/
PCollection redOut = PCollection.newBuilder().setUniqueName("red.out").build();
PTransform red = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putOutputs("out", redOut.getUniqueName()).build();
PCollection oneOut = PCollection.newBuilder().setUniqueName("one.out").build();
PTransform one = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", oneOut.getUniqueName()).build();
PCollection twoOut = PCollection.newBuilder().setUniqueName("two.out").build();
PTransform two = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", redOut.getUniqueName()).putOutputs("out", twoOut.getUniqueName()).build();
PCollection sharedOut = PCollection.newBuilder().setUniqueName("shared.out").build();
PTransform shared = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("one", oneOut.getUniqueName()).putInputs("two", twoOut.getUniqueName()).putOutputs("shared", sharedOut.getUniqueName()).build();
PCollection blueOut = PCollection.newBuilder().setUniqueName("blue.out").build();
PTransform blue = PTransform.newBuilder().setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).build()).putInputs("in", sharedOut.getUniqueName()).putOutputs("out", blueOut.getUniqueName()).build();
RunnerApi.Components components = Components.newBuilder().putTransforms("one", one).putPcollections(oneOut.getUniqueName(), oneOut).putTransforms("two", two).putPcollections(twoOut.getUniqueName(), twoOut).putTransforms("shared", shared).putPcollections(sharedOut.getUniqueName(), sharedOut).putTransforms("red", red).putPcollections(redOut.getUniqueName(), redOut).putTransforms("blue", blue).putPcollections(blueOut.getUniqueName(), blueOut).build();
ExecutableStage oneStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("one", one), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
ExecutableStage twoStage = ImmutableExecutableStage.of(components, Environment.getDefaultInstance(), PipelineNode.pCollection(redOut.getUniqueName(), redOut), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableList.of(PipelineNode.pTransform("two", two), PipelineNode.pTransform("shared", shared)), ImmutableList.of(PipelineNode.pCollection(sharedOut.getUniqueName(), sharedOut)), DEFAULT_WIRE_CODER_SETTINGS);
PTransformNode redTransform = PipelineNode.pTransform("red", red);
PTransformNode blueTransform = PipelineNode.pTransform("blue", blue);
QueryablePipeline pipeline = QueryablePipeline.forPrimitivesIn(components);
DeduplicationResult result = OutputDeduplicator.ensureSingleProducer(pipeline, ImmutableList.of(oneStage, twoStage), ImmutableList.of(redTransform, blueTransform));
assertThat(result.getIntroducedTransforms(), hasSize(1));
PTransformNode introduced = getOnlyElement(result.getIntroducedTransforms());
assertThat(introduced.getTransform().getOutputsMap().size(), equalTo(1));
assertThat(getOnlyElement(introduced.getTransform().getOutputsMap().values()), equalTo(sharedOut.getUniqueName()));
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(introduced.getTransform().getInputsMap().values().toArray(new String[0])));
assertThat(result.getDeduplicatedStages().keySet(), hasSize(2));
List<String> stageOutputs = result.getDeduplicatedStages().values().stream().flatMap(stage -> stage.getOutputPCollections().stream().map(PCollectionNode::getId)).collect(Collectors.toList());
assertThat(stageOutputs, containsInAnyOrder(introduced.getTransform().getInputsMap().values().toArray()));
assertThat(result.getDeduplicatedTransforms().keySet(), empty());
assertThat(result.getDeduplicatedComponents().getPcollectionsMap().keySet(), hasItems(stageOutputs.toArray(new String[0])));
assertThat(result.getDeduplicatedComponents().getTransformsMap(), hasEntry(introduced.getId(), introduced.getTransform()));
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class QueryablePipelineTest method transformWithSideAndMainInputs.
/**
* Tests that inputs that are only side inputs are not returned from {@link
* QueryablePipeline#getPerElementConsumers(PCollectionNode)} and are returned from {@link
* QueryablePipeline#getSideInputs(PTransformNode)}.
*/
@Test
public void transformWithSideAndMainInputs() {
Pipeline p = Pipeline.create();
PCollection<byte[]> impulse = p.apply("Impulse", Impulse.create());
PCollectionView<String> view = p.apply("Create", Create.of("foo")).apply("View", View.asSingleton());
impulse.apply("par_do", ParDo.of(new TestFn()).withSideInputs(view).withOutputTags(new TupleTag<>(), TupleTagList.empty()));
Components components = PipelineTranslation.toProto(p).getComponents();
QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
String mainInputName = getOnlyElement(PipelineNode.pTransform("Impulse", components.getTransformsOrThrow("Impulse")).getTransform().getOutputsMap().values());
PCollectionNode mainInput = PipelineNode.pCollection(mainInputName, components.getPcollectionsOrThrow(mainInputName));
PTransform parDoTransform = components.getTransformsOrThrow("par_do");
String sideInputLocalName = getOnlyElement(parDoTransform.getInputsMap().entrySet().stream().filter(entry -> !entry.getValue().equals(mainInputName)).map(Map.Entry::getKey).collect(Collectors.toSet()));
String sideInputCollectionId = parDoTransform.getInputsOrThrow(sideInputLocalName);
PCollectionNode sideInput = PipelineNode.pCollection(sideInputCollectionId, components.getPcollectionsOrThrow(sideInputCollectionId));
PTransformNode parDoNode = PipelineNode.pTransform("par_do", components.getTransformsOrThrow("par_do"));
SideInputReference sideInputRef = SideInputReference.of(parDoNode, sideInputLocalName, sideInput);
assertThat(qp.getSideInputs(parDoNode), contains(sideInputRef));
assertThat(qp.getPerElementConsumers(mainInput), contains(parDoNode));
assertThat(qp.getPerElementConsumers(sideInput), not(contains(parDoNode)));
}
use of org.apache.beam.sdk.common.runner.v1.RunnerApi.PTransform in project beam by apache.
the class GreedyPipelineFuserTest method parDoWithTimerRootsStage.
/*
* impulse -> .out -> parDo -> .out -> timer -> .out
* becomes
* (impulse.out) -> parDo -> (parDo.out)
* (parDo.out) -> timer
*/
@Test
public void parDoWithTimerRootsStage() {
// (impulse.out) -> parDo -> (parDo.out)
// (parDo.out) -> timer -> timer.out
// timer has a timer spec which prevents it from fusing with an upstream ParDo
PTransform parDoTransform = PTransform.newBuilder().setUniqueName("ParDo").putInputs("input", "impulse.out").putOutputs("output", "parDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("common").build();
PTransform timerTransform = PTransform.newBuilder().setUniqueName("TimerParDo").putInputs("input", "parDo.out").putInputs("timer", "timer.out").putOutputs("timer", "timer.out").putOutputs("output", "output.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).putTimerFamilySpecs("timer", TimerFamilySpec.getDefaultInstance()).build().toByteString())).setEnvironmentId("common").build();
Components components = partialComponents.toBuilder().putTransforms("parDo", parDoTransform).putPcollections("parDo.out", pc("parDo.out")).putTransforms("timer", timerTransform).putPcollections("timer.out", pc("timer.out")).putPcollections("output.out", pc("output.out")).putEnvironments("common", Environments.createDockerEnvironment("common")).build();
FusedPipeline fused = GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).addRequirements(ParDoTranslation.REQUIRES_STATEFUL_PROCESSING_URN).build());
assertThat(fused.getRunnerExecutedTransforms(), containsInAnyOrder(PipelineNode.pTransform("impulse", components.getTransformsOrThrow("impulse"))));
assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput("impulse.out").withOutputs("parDo.out").withTransforms("parDo"), ExecutableStageMatcher.withInput("parDo.out").withNoOutputs().withTransforms("timer")));
}
Aggregations