Search in sources :

Example 1 with InstructionInput

use of com.google.api.services.dataflow.model.InstructionInput in project beam by apache.

the class IntrinsicMapTaskExecutorFactoryTest method createWriteInstruction.

/**
 * Builds a {@link ParallelInstruction} that carries a {@link WriteInstruction}.
 *
 * <p>The write consumes output {@code producerOutputNum} of the instruction at index
 * {@code producerIndex}. Its sink is described by a {@link CloudObject} created for
 * {@code TestSinkFactory} and uses {@code windowedStringCoder} (declared outside this method)
 * as its codec.
 *
 * @param producerIndex index of the producing instruction
 * @param producerOutputNum output number on the producing instruction
 * @param systemName system name of the instruction; the original name is this value with
 *     {@code "OriginalName"} appended
 * @return the populated instruction
 */
static ParallelInstruction createWriteInstruction(int producerIndex, int producerOutputNum, String systemName) {
    // Reference to the producer output this write reads from.
    InstructionInput producerRef = new InstructionInput()
        .setProducerInstructionIndex(producerIndex)
        .setOutputNum(producerOutputNum);

    // The model Sink is spelled out fully qualified, exactly as the original does.
    CloudObject sinkSpec = CloudObject.forClass(IntrinsicMapTaskExecutorFactoryTest.TestSinkFactory.class);
    com.google.api.services.dataflow.model.Sink sink = new com.google.api.services.dataflow.model.Sink()
        .setSpec(sinkSpec)
        .setCodec(windowedStringCoder);

    WriteInstruction write = new WriteInstruction()
        .setInput(producerRef)
        .setSink(sink);

    return new ParallelInstruction()
        .setWrite(write)
        .setSystemName(systemName)
        .setOriginalName(systemName + "OriginalName");
}
Also used : ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) Sink(org.apache.beam.runners.dataflow.worker.util.common.worker.Sink) WriteInstruction(com.google.api.services.dataflow.model.WriteInstruction) InstructionInput(com.google.api.services.dataflow.model.InstructionInput)

Example 2 with InstructionInput

use of com.google.api.services.dataflow.model.InstructionInput in project beam by apache.

the class MapTaskToNetworkFunctionTest method createInstructionInput.

/**
 * Creates an {@link InstructionInput} referring to a specific output of a producer instruction.
 *
 * @param instructionIndex value stored as the producer instruction index
 * @param outputNum value stored as the output number
 * @return a new input carrying both values
 */
private static InstructionInput createInstructionInput(int instructionIndex, int outputNum) {
    return new InstructionInput()
        .setProducerInstructionIndex(instructionIndex)
        .setOutputNum(outputNum);
}
Also used : InstructionInput(com.google.api.services.dataflow.model.InstructionInput)

Example 3 with InstructionInput

use of com.google.api.services.dataflow.model.InstructionInput in project beam by apache.

the class MapTaskToNetworkFunction method apply.

/**
 * Translates a {@link MapTask} into a directed network of instruction and output nodes.
 *
 * <p>Each parallel instruction is cloned (with its outputs cleared) and wrapped in a
 * {@link ParallelInstructionNode}; each of its original outputs becomes an
 * {@link InstructionOutputNode} connected to the instruction node by an edge. Afterwards the
 * inputs of flatten, ParDo, partial-group-by-key and write instructions are handed to
 * {@code attachInput}, which is defined elsewhere and presumably adds the edge from the
 * producing output node to the consuming instruction node.
 *
 * @param mapTask the map task to translate
 * @return the populated network
 * @throws IllegalArgumentException if an instruction is none of flatten, ParDo,
 *     partial-group-by-key, read or write
 */
@Override
public MutableNetwork<Node, Edge> apply(MapTask mapTask) {
    List<ParallelInstruction> instructions = Apiary.listOrEmpty(mapTask.getInstructions());
    int instructionCount = instructions.size();
    MutableNetwork<Node, Edge> network = NetworkBuilder.directed()
        .allowsSelfLoops(false)
        .allowsParallelEdges(true)
        .expectedNodeCount(instructionCount * 2)
        .build();

    // First pass: register every instruction node together with its output nodes.
    ParallelInstructionNode[] instructionNodes = new ParallelInstructionNode[instructionCount];
    InstructionOutputNode[][] outputNodes = new InstructionOutputNode[instructionCount][];
    for (int index = 0; index < instructionCount; ++index) {
        ParallelInstruction original = instructions.get(index);
        // The output nodes are the source of truth for instruction outputs, so the outputs are
        // captured here and then cleared on the cloned instruction to avoid confusion.
        List<InstructionOutput> outputs = Apiary.listOrEmpty(original.getOutputs());
        outputNodes[index] = new InstructionOutputNode[outputs.size()];
        JsonFactory factory = MoreObjects.firstNonNull(mapTask.getFactory(), Transport.getJsonFactory());
        ParallelInstruction stripped = clone(factory, original).setOutputs(null);
        ParallelInstructionNode node = ParallelInstructionNode.create(stripped, Nodes.ExecutionLocation.UNKNOWN);
        instructionNodes[index] = node;
        network.addNode(node);

        // Link the instruction node to one PCollection node per output.
        int outputIndex = 0;
        for (InstructionOutput output : outputs) {
            InstructionOutputNode outputNode = InstructionOutputNode.create(output, "generatedPcollection" + this.idGenerator.getId());
            network.addNode(outputNode);
            // ParDo outputs carry their multi-output info on the edge; everything else gets a
            // default edge.
            Edge edge = stripped.getParDo() != null
                ? MultiOutputInfoEdge.create(stripped.getParDo().getMultiOutputInfos().get(outputIndex))
                : DefaultEdge.create();
            network.addEdge(node, outputNode, edge);
            outputNodes[index][outputIndex] = outputNode;
            ++outputIndex;
        }
    }

    // Second pass: wire PCollections in as inputs of the instructions that consume them.
    for (ParallelInstructionNode consumer : instructionNodes) {
        ParallelInstruction instruction = consumer.getParallelInstruction();
        if (instruction.getFlatten() != null) {
            for (InstructionInput flattenInput : Apiary.listOrEmpty(instruction.getFlatten().getInputs())) {
                attachInput(flattenInput, network, consumer, outputNodes);
            }
            continue;
        }
        if (instruction.getParDo() != null) {
            attachInput(instruction.getParDo().getInput(), network, consumer, outputNodes);
            continue;
        }
        if (instruction.getPartialGroupByKey() != null) {
            attachInput(instruction.getPartialGroupByKey().getInput(), network, consumer, outputNodes);
            continue;
        }
        if (instruction.getRead() != null) {
            // Reads have no inputs so nothing to do
            continue;
        }
        if (instruction.getWrite() != null) {
            attachInput(instruction.getWrite().getInput(), network, consumer, outputNodes);
            continue;
        }
        throw new IllegalArgumentException(String.format("Unknown type of instruction %s for map task %s", instruction, mapTask));
    }
    return network;
}
Also used : Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) JsonFactory(com.google.api.client.json.JsonFactory) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) InstructionInput(com.google.api.services.dataflow.model.InstructionInput) Edge(org.apache.beam.runners.dataflow.worker.graph.Edges.Edge) MultiOutputInfoEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge) DefaultEdge(org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge)

Example 4 with InstructionInput

use of com.google.api.services.dataflow.model.InstructionInput in project beam by apache.

the class IntrinsicMapTaskExecutorFactoryTest method createPartialGroupByKeyInstruction.

/**
 * Builds a {@link ParallelInstruction} that carries a {@link PartialGroupByKeyInstruction}.
 *
 * <p>The instruction consumes output {@code producerOutputNum} of the instruction at index
 * {@code producerIndex}. Its input element codec is a full windowed-value coder over
 * {@code KV<String, Integer>} with interval windows, and it declares one output named
 * {@code "pgbk_output_name"} whose codec is {@code KV<String, Iterable<Integer>>}.
 *
 * @param producerIndex index of the producing instruction
 * @param producerOutputNum output number on the producing instruction
 * @return the populated instruction
 */
static ParallelInstruction createPartialGroupByKeyInstruction(int producerIndex, int producerOutputNum) {
    // Reference to the producer output feeding the partial group-by-key.
    InstructionInput producerRef = new InstructionInput()
        .setProducerInstructionIndex(producerIndex)
        .setOutputNum(producerOutputNum);

    PartialGroupByKeyInstruction partialGroupByKey = new PartialGroupByKeyInstruction()
        .setInput(producerRef)
        .setInputElementCodec(
            CloudObjects.asCloudObject(
                FullWindowedValueCoder.of(
                    KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()),
                    IntervalWindowCoder.of()),
                /*sdkComponents=*/ null));

    InstructionOutput groupedOutput = new InstructionOutput()
        .setName("pgbk_output_name")
        .setCodec(
            CloudObjects.asCloudObject(
                KvCoder.of(StringUtf8Coder.of(), IterableCoder.of(BigEndianIntegerCoder.of())),
                /*sdkComponents=*/ null))
        .setOriginalName("originalName")
        .setSystemName("systemName");

    return new ParallelInstruction()
        .setOriginalName("pgbk_original_name")
        .setSystemName("pgbk_system_name")
        .setPartialGroupByKey(partialGroupByKey)
        .setOutputs(Arrays.asList(groupedOutput));
}
Also used : ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) PartialGroupByKeyInstruction(com.google.api.services.dataflow.model.PartialGroupByKeyInstruction) InstructionInput(com.google.api.services.dataflow.model.InstructionInput)

Example 5 with InstructionInput

use of com.google.api.services.dataflow.model.InstructionInput in project beam by apache.

the class IntrinsicMapTaskExecutorFactoryTest method createFlattenInstruction.

/**
 * Builds a {@link ParallelInstruction} that carries a {@link FlattenInstruction} with two inputs.
 *
 * <p>The inputs are added in order: first the output identified by
 * ({@code producerIndex1}, {@code producerOutputNum1}), then the one identified by
 * ({@code producerIndex2}, {@code producerOutputNum2}). The instruction declares a single
 * output named {@code "flatten_output_name"} encoded with a UTF-8 string coder.
 *
 * @param producerIndex1 index of the first producing instruction
 * @param producerOutputNum1 output number on the first producer
 * @param producerIndex2 index of the second producing instruction
 * @param producerOutputNum2 output number on the second producer
 * @param systemName system name of the instruction; the original name is this value with
 *     {@code "OriginalName"} appended
 * @return the populated instruction
 */
static ParallelInstruction createFlattenInstruction(int producerIndex1, int producerOutputNum1, int producerIndex2, int producerOutputNum2, String systemName) {
    // Both producer references go into a mutable list, first producer first.
    List<InstructionInput> producerRefs = new ArrayList<>();
    producerRefs.add(
        new InstructionInput()
            .setProducerInstructionIndex(producerIndex1)
            .setOutputNum(producerOutputNum1));
    producerRefs.add(
        new InstructionInput()
            .setProducerInstructionIndex(producerIndex2)
            .setOutputNum(producerOutputNum2));

    FlattenInstruction flatten = new FlattenInstruction().setInputs(producerRefs);

    InstructionOutput flattenedOutput = new InstructionOutput()
        .setName("flatten_output_name")
        .setCodec(CloudObjects.asCloudObject(StringUtf8Coder.of(), /*sdkComponents=*/ null))
        .setOriginalName("originalName")
        .setSystemName("systemName");

    return new ParallelInstruction()
        .setFlatten(flatten)
        .setOutputs(Arrays.asList(flattenedOutput))
        .setSystemName(systemName)
        .setOriginalName(systemName + "OriginalName");
}
Also used : ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) ArrayList(java.util.ArrayList) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) InstructionInput(com.google.api.services.dataflow.model.InstructionInput) FlattenInstruction(com.google.api.services.dataflow.model.FlattenInstruction)

Aggregations

InstructionInput (com.google.api.services.dataflow.model.InstructionInput): 12; ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction): 11; InstructionOutput (com.google.api.services.dataflow.model.InstructionOutput): 9; CloudObject (org.apache.beam.runners.dataflow.util.CloudObject): 8; ParDoInstruction (com.google.api.services.dataflow.model.ParDoInstruction): 6; ArrayList (java.util.ArrayList): 4; AtomicLong (java.util.concurrent.atomic.AtomicLong): 4; SdkComponents (org.apache.beam.runners.core.construction.SdkComponents): 4; Structs.addString (org.apache.beam.runners.dataflow.util.Structs.addString): 4; DataflowCounterUpdateExtractor.splitIntToLong (org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor.splitIntToLong): 4; WorkItemCommitRequest (org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest): 4; UnsignedLong (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.UnsignedLong): 4; HashMap (java.util.HashMap): 3; List (java.util.List): 3; ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 3; WindowedValue (org.apache.beam.sdk.util.WindowedValue): 3; KV (org.apache.beam.sdk.values.KV): 3; ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 3; ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 3; Test (org.junit.Test): 3