use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
the class ReplacePgbkWithPrecombineFunction method apply.
/**
 * Replaces each precombine partial-group-by-key (Pgbk) node of the network with a ParDo node.
 *
 * <p>The replacement ParDo uses the Pgbk's value combining function spec as its user function,
 * after the spec has been tagged with {@code WorkerPropertyNames.PHASE = CombinePhase.ADD}. Nodes
 * for which {@code isPrecombinePgbk} is false are handed back unchanged.
 *
 * @param network the network whose nodes are passed through
 *     {@code Networks.replaceDirectedNetworkNodes}
 * @return the same {@code network} instance that was passed in
 */
@Override
public MutableNetwork<Node, Edge> apply(MutableNetwork<Node, Edge> network) {
  Networks.replaceDirectedNetworkNodes(
      network,
      (Node candidate) -> {
        if (isPrecombinePgbk(candidate)) {
          ParallelInstruction pgbkInstruction =
              ((ParallelInstructionNode) candidate).getParallelInstruction();

          // The combine function spec becomes the UserFn of the new ParDo. Note that the phase
          // marker is written into the spec map obtained from the existing instruction.
          Map<String, Object> combineFnSpec =
              pgbkInstruction.getPartialGroupByKey().getValueCombiningFn();
          addString(combineFnSpec, WorkerPropertyNames.PHASE, CombinePhase.ADD);
          ParDoInstruction precombineParDo = new ParDoInstruction().setUserFn(combineFnSpec);

          // Start from a copy of the Pgbk instruction, then swap the Pgbk payload for the ParDo.
          ParallelInstruction parDoInstruction = pgbkInstruction.clone();
          parDoInstruction.setPartialGroupByKey(null);
          parDoInstruction.setParDo(precombineParDo);
          return ParallelInstructionNode.create(parDoInstruction, ExecutionLocation.UNKNOWN);
        }
        // Not a precombine Pgbk: keep the node as it is.
        return candidate;
      });
  return network;
}
use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
the class CloneAmbiguousFlattensFunction method cloneFlatten.
/**
 * Performs the actual cloning of an ambiguous flatten.
 *
 * <p>Two copies of the flatten and of its single output node are created: one flatten tagged
 * {@code ExecutionLocation.RUNNER_HARNESS} and one tagged {@code ExecutionLocation.SDK_HARNESS}.
 * Every edge into the original flatten is duplicated onto both copies. Each successor of the
 * original output is attached to the SDK copy only when {@code executesInSdkHarness} reports true
 * for it, and to the runner copy otherwise. Finally the original flatten and its output node are
 * removed from the network.
 *
 * @param flatten the ambiguous flatten node; must be a {@code ParallelInstructionNode} with
 *     exactly one successor, which must be an {@code InstructionOutputNode}
 * @param network the network that is modified
 */
private void cloneFlatten(Node flatten, MutableNetwork<Node, Edge> network) {
  // Look up the flatten's only output and its instruction; both are shared by the clones.
  InstructionOutputNode originalOut =
      (InstructionOutputNode) Iterables.getOnlyElement(network.successors(flatten));
  ParallelInstruction instruction = ((ParallelInstructionNode) flatten).getParallelInstruction();

  // Runner-side copy of the flatten and its output.
  Node runnerCopy = ParallelInstructionNode.create(instruction, ExecutionLocation.RUNNER_HARNESS);
  Node runnerCopyOut =
      InstructionOutputNode.create(
          originalOut.getInstructionOutput(), originalOut.getPcollectionId());
  network.addNode(runnerCopy);
  network.addNode(runnerCopyOut);

  // SDK-side copy of the flatten and its output.
  Node sdkCopy = ParallelInstructionNode.create(instruction, ExecutionLocation.SDK_HARNESS);
  Node sdkCopyOut =
      InstructionOutputNode.create(
          originalOut.getInstructionOutput(), originalOut.getPcollectionId());
  network.addNode(sdkCopy);
  network.addNode(sdkCopyOut);

  // Mirror the flatten -> output edges on both copies. The edge collection is snapshotted
  // before new edges are added to the network.
  ImmutableList<Edge> flattenToOutput =
      ImmutableList.copyOf(network.edgesConnecting(flatten, originalOut));
  for (Edge link : flattenToOutput) {
    network.addEdge(runnerCopy, runnerCopyOut, link.clone());
    network.addEdge(sdkCopy, sdkCopyOut, link.clone());
  }

  // Every predecessor feeds both copies.
  for (Node upstream : network.predecessors(flatten)) {
    ImmutableList<Edge> incoming = ImmutableList.copyOf(network.edgesConnecting(upstream, flatten));
    for (Edge link : incoming) {
      network.addEdge(upstream, runnerCopy, link.clone());
      network.addEdge(upstream, sdkCopy, link.clone());
    }
  }

  // Each successor reads from exactly one copy: the SDK one only if it is known to execute in
  // the SDK harness, the runner one in every other case.
  for (Node downstream : network.successors(originalOut)) {
    Node chosenOut;
    if (executesInSdkHarness(downstream)) {
      chosenOut = sdkCopyOut;
    } else {
      chosenOut = runnerCopyOut;
    }
    ImmutableList<Edge> outgoing =
        ImmutableList.copyOf(network.edgesConnecting(originalOut, downstream));
    for (Edge link : outgoing) {
      network.addEdge(chosenOut, downstream, link.clone());
    }
  }

  // The ambiguous originals are no longer needed.
  network.removeNode(flatten);
  network.removeNode(originalOut);
}
use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
the class FixMultiOutputInfosOnParDoInstructions method apply.
/**
 * Checks, for every ParDo instruction of the map task, that the number of
 * {@code MultiOutputInfo} entries equals the declared number of outputs.
 *
 * <p>A ParDo that declares exactly one output but has a different number of infos gets its infos
 * replaced by a single {@code MultiOutputInfo} whose tag comes from {@code idGenerator}. Any other
 * mismatch is rejected.
 *
 * @param input the map task whose instructions are inspected and possibly updated
 * @return the same {@code input} instance
 * @throws IllegalArgumentException if the counts differ and the declared output count is not 1
 */
@Override
public MapTask apply(MapTask input) {
  for (ParallelInstruction instruction : Apiary.listOrEmpty(input.getInstructions())) {
    ParDoInstruction parDo = instruction.getParDo();
    if (parDo == null) {
      // Only ParDo instructions carry MultiOutputInfos.
      continue;
    }

    int declaredOutputs = Apiary.intOrZero(parDo.getNumOutputs());
    List<MultiOutputInfo> outputInfos = Apiary.listOrEmpty(parDo.getMultiOutputInfos());
    if (declaredOutputs == outputInfos.size()) {
      continue;
    }

    if (declaredOutputs != 1) {
      throw new IllegalArgumentException(String.format("Invalid ParDoInstruction %s, %d outputs specified, found %s tags.", instruction.getSystemName(), declaredOutputs, outputInfos));
    }
    // Single-output ParDo with mismatched infos: install one info with a generated tag.
    MultiOutputInfo generatedInfo = new MultiOutputInfo().setTag(idGenerator.getId());
    parDo.setMultiOutputInfos(ImmutableList.of(generatedInfo));
  }
  return input;
}
use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
the class BeamFnMapTaskExecutorFactory method createPartialGroupByKeyOperation.
/**
 * Creates the operation node for a partial-group-by-key instruction.
 *
 * <p>The instruction's input element codec is decoded and must be a {@code WindowedValueCoder}
 * wrapping a {@code KvCoder}. The resulting keyed coder, the optional value combining function
 * and the side inputs are handed to {@code PartialGroupByKeyParDoFns.create}, and the returned
 * {@code ParDoFn} is wrapped in a {@code ParDoOperation} over the node's output receivers.
 *
 * @param network the network used to look up the node's output receivers
 * @param node the node holding the partial-group-by-key instruction
 * @param options pipeline options forwarded to the ParDoFn factory
 * @param executionContext execution context forwarded to the ParDoFn factory
 * @param operationContext operation context for the ParDoFn and the created operation
 * @return an {@code OperationNode} wrapping the new {@code ParDoOperation}
 * @throws IllegalArgumentException if the decoded coder is not a {@code WindowedValueCoder} or
 *     its value coder is not a {@code KvCoder}
 * @throws Exception declared by the signature; raised by the helpers called here
 */
<K> OperationNode createPartialGroupByKeyOperation(Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options, DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
  PartialGroupByKeyInstruction pgbk = node.getParallelInstruction().getPartialGroupByKey();
  OutputReceiver[] outputs = getOutputReceivers(network, node);

  // Decode the input coder and peel it down to the keyed element coder.
  Coder<?> inputCoder =
      CloudObjects.coderFromCloudObject(CloudObject.fromSpec(pgbk.getInputElementCodec()));
  if (!(inputCoder instanceof WindowedValueCoder)) {
    throw new IllegalArgumentException(String.format("unexpected kind of input coder for PartialGroupByKeyOperation: %s", inputCoder));
  }
  Coder<?> valueCoder = ((WindowedValueCoder<?>) inputCoder).getValueCoder();
  if (!(valueCoder instanceof KvCoder)) {
    throw new IllegalArgumentException(String.format("unexpected kind of input element coder for PartialGroupByKeyOperation: %s", valueCoder));
  }
  @SuppressWarnings("unchecked")
  KvCoder<K, ?> kvCoder = (KvCoder<K, ?>) valueCoder;

  // The combining function is optional; a missing spec is passed on as null.
  CloudObject combineFn;
  if (pgbk.getValueCombiningFn() == null) {
    combineFn = null;
  } else {
    combineFn = CloudObject.fromSpec(pgbk.getValueCombiningFn());
  }

  ParDoFn parDoFn = PartialGroupByKeyParDoFns.create(options, kvCoder, combineFn, pgbk.getSideInputs(), Arrays.<Receiver>asList(outputs), executionContext, operationContext);
  return OperationNode.create(new ParDoOperation(parDoFn, outputs, operationContext));
}
use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
the class IntrinsicMapTaskExecutorFactory method createParDoOperation.
/**
 * Creates the operation node for a ParDo instruction.
 *
 * <p>The first {@code MultiOutputInfo} of the instruction supplies the main output tag. Each
 * successor of {@code node} is assigned an index by its position in
 * {@code network.successors(node)}, and every output tag found on an edge to that successor is
 * mapped to that index. The ParDoFn is created from the instruction's user function and side
 * inputs, then wrapped in a {@code ParDoOperation} over the node's output receivers.
 *
 * @param network the network supplying successors, connecting edges and output receivers
 * @param node the node holding the ParDo instruction
 * @param options pipeline options forwarded to the ParDoFn factory
 * @param executionContext execution context forwarded to the ParDoFn factory
 * @param operationContext operation context for the ParDoFn and the created operation
 * @return an {@code OperationNode} wrapping the new {@code ParDoOperation}
 * @throws Exception declared by the signature; raised by the helpers called here
 */
private OperationNode createParDoOperation(Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options, DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
  ParDoInstruction parDo = node.getParallelInstruction().getParDo();
  TupleTag<?> mainOutputTag = tupleTag(parDo.getMultiOutputInfos().get(0));

  // Map each output tag to the index of the successor its edge leads to.
  ImmutableMap.Builder<TupleTag<?>, Integer> receiverIndexByTag = ImmutableMap.builder();
  int receiverIndex = 0;
  for (Node successor : network.successors(node)) {
    for (Edge edge : network.edgesConnecting(node, successor)) {
      MultiOutputInfoEdge outputEdge = (MultiOutputInfoEdge) edge;
      receiverIndexByTag.put(tupleTag(outputEdge.getMultiOutputInfo()), receiverIndex);
    }
    receiverIndex++;
  }

  CloudObject userFn = CloudObject.fromSpec(parDo.getUserFn());
  ParDoFn parDoFn = parDoFnFactory.create(options, userFn, parDo.getSideInputs(), mainOutputTag, receiverIndexByTag.build(), executionContext, operationContext);

  // Receivers are looked up only after the ParDoFn has been created, as before.
  OutputReceiver[] outputs = getOutputReceivers(network, node);
  return OperationNode.create(new ParDoOperation(parDoFn, outputs, operationContext));
}
Aggregations