
Example 46 with ParallelInstruction

Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.

From the class IntrinsicMapTaskExecutorFactoryTest, method createReadInstruction.

static ParallelInstruction createReadInstruction(String name, Class<? extends ReaderFactory> readerFactoryClass) {
    CloudObject spec = CloudObject.forClass(readerFactoryClass);
    Source cloudSource = new Source();
    cloudSource.setSpec(spec);
    cloudSource.setCodec(windowedStringCoder);
    ReadInstruction readInstruction = new ReadInstruction();
    readInstruction.setSource(cloudSource);
    InstructionOutput output = new InstructionOutput();
    output.setName("read_output_name");
    output.setCodec(windowedStringCoder);
    output.setOriginalName("originalName");
    output.setSystemName("systemName");
    ParallelInstruction instruction = new ParallelInstruction();
    instruction.setSystemName(name);
    instruction.setOriginalName(name + "OriginalName");
    instruction.setRead(readInstruction);
    instruction.setOutputs(Arrays.asList(output));
    return instruction;
}
Also used : ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) ReadInstruction(com.google.api.services.dataflow.model.ReadInstruction) Source(com.google.api.services.dataflow.model.Source)
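
For orientation, a minimal sketch of how a read instruction built this way might be wired into a MapTask for the executor factory. The TestReaderFactory class and the MapTask field values are illustrative assumptions, not part of the test above (MapTask is com.google.api.services.dataflow.model.MapTask; Collections is java.util.Collections).

static MapTask createSingleInstructionMapTask() {
    // Hypothetical wiring: a one-instruction map task around the helper above.
    ParallelInstruction read = createReadInstruction("Read", TestReaderFactory.class);
    MapTask mapTask = new MapTask();
    mapTask.setSystemName("mapTaskSystemName");
    mapTask.setInstructions(Collections.singletonList(read));
    return mapTask;
}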

Example 47 with ParallelInstruction

Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.

From the class IntrinsicMapTaskExecutorFactoryTest, method testCreatePartialGroupByKeyOperationWithCombine.

@Test
public void testCreatePartialGroupByKeyOperationWithCombine() throws Exception {
    int producerIndex = 1;
    int producerOutputNum = 2;
    ParallelInstruction instruction = createPartialGroupByKeyInstruction(producerIndex, producerOutputNum);
    AppliedCombineFn<?, ?, ?, ?> combineFn = AppliedCombineFn.withInputCoder(Sum.ofIntegers(), CoderRegistry.createDefault(), KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()));
    CloudObject cloudCombineFn = CloudObject.forClassName("CombineFn");
    addString(cloudCombineFn, PropertyNames.SERIALIZED_FN, byteArrayToJsonString(serializeToByteArray(combineFn)));
    instruction.getPartialGroupByKey().setValueCombiningFn(cloudCombineFn);
    ParallelInstructionNode instructionNode = ParallelInstructionNode.create(instruction, ExecutionLocation.UNKNOWN);
    when(network.successors(instructionNode))
        .thenReturn(
            ImmutableSet.<Node>of(
                IntrinsicMapTaskExecutorFactory.createOutputReceiversTransform(STAGE, counterSet)
                    .apply(
                        InstructionOutputNode.create(
                            instructionNode.getParallelInstruction().getOutputs().get(0),
                            PCOLLECTION_ID))));
    when(network.outDegree(instructionNode)).thenReturn(1);
    Node operationNode =
        mapTaskExecutorFactory
            .createOperationTransformForParallelInstructionNodes(
                STAGE,
                network,
                options,
                readerRegistry,
                sinkRegistry,
                BatchModeExecutionContext.forTesting(options, counterSet, "testStage"))
            .apply(instructionNode);
    assertThat(operationNode, instanceOf(OperationNode.class));
    assertThat(((OperationNode) operationNode).getOperation(), instanceOf(ParDoOperation.class));
    ParDoOperation pgbkOperation = (ParDoOperation) ((OperationNode) operationNode).getOperation();
    assertEquals(1, pgbkOperation.receivers.length);
    assertEquals(0, pgbkOperation.receivers[0].getReceiverCount());
    assertEquals(Operation.InitializationState.UNSTARTED, pgbkOperation.initializationState);
}
Also used : ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) OperationNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.OperationNode) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) InstructionOutputNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode) ParallelInstructionNode(org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode) Node(org.apache.beam.runners.dataflow.worker.graph.Nodes.Node) ParDoOperation(org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoOperation) Test(org.junit.Test)
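
As an aside, a hedged sketch of the SERIALIZED_FN round trip this test leans on: the combine fn is Java-serialized and then JSON-escaped, and the worker reverses both steps. The helpers are the SerializableUtils and StringUtils methods already statically imported by the test; the cast back to AppliedCombineFn is an assumption about the consumer.

// Encode: Java-serialize the AppliedCombineFn, then JSON-escape the bytes.
byte[] fnBytes = SerializableUtils.serializeToByteArray(combineFn);
String encoded = StringUtils.byteArrayToJsonString(fnBytes);
// Decode: reverse both steps; the description argument only appears in error messages.
AppliedCombineFn<?, ?, ?, ?> decoded =
    (AppliedCombineFn<?, ?, ?, ?>)
        SerializableUtils.deserializeFromByteArray(
            StringUtils.jsonStringToByteArray(encoded), "serialized combine fn");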

Example 48 with ParallelInstruction

Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.

From the class IntrinsicMapTaskExecutorFactoryTest, method createParDoInstruction.

static ParallelInstruction createParDoInstruction(int producerIndex, int producerOutputNum, String systemName, String userName) {
    InstructionInput cloudInput = new InstructionInput();
    cloudInput.setProducerInstructionIndex(producerIndex);
    cloudInput.setOutputNum(producerOutputNum);
    TestDoFn fn = new TestDoFn();
    String serializedFn =
        StringUtils.byteArrayToJsonString(
            SerializableUtils.serializeToByteArray(
                DoFnInfo.forFn(
                    fn,
                    WindowingStrategy.globalDefault(),
                    null /* side input views */,
                    null /* input coder */,
                    new TupleTag<>(PropertyNames.OUTPUT) /* main output id */,
                    DoFnSchemaInformation.create(),
                    Collections.emptyMap())));
    CloudObject cloudUserFn = CloudObject.forClassName("DoFn");
    addString(cloudUserFn, PropertyNames.SERIALIZED_FN, serializedFn);
    MultiOutputInfo mainOutputTag = new MultiOutputInfo();
    mainOutputTag.setTag("1");
    ParDoInstruction parDoInstruction = new ParDoInstruction();
    parDoInstruction.setInput(cloudInput);
    parDoInstruction.setNumOutputs(1);
    parDoInstruction.setMultiOutputInfos(ImmutableList.of(mainOutputTag));
    parDoInstruction.setUserFn(cloudUserFn);
    InstructionOutput output = new InstructionOutput();
    output.setName(systemName + "_output");
    output.setCodec(windowedStringCoder);
    output.setOriginalName("originalName");
    output.setSystemName("systemName");
    ParallelInstruction instruction = new ParallelInstruction();
    instruction.setParDo(parDoInstruction);
    instruction.setOutputs(Arrays.asList(output));
    instruction.setSystemName(systemName);
    instruction.setOriginalName(systemName + "OriginalName");
    instruction.setName(userName);
    return instruction;
}
Also used : ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) MultiOutputInfo(com.google.api.services.dataflow.model.MultiOutputInfo) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) InstructionInput(com.google.api.services.dataflow.model.InstructionInput) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Structs.addString(org.apache.beam.runners.dataflow.util.Structs.addString)
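
The (producerIndex, producerOutputNum) pair in the InstructionInput is a positional reference into the MapTask's instruction list. A minimal sketch of chaining the two helpers above, assuming the read instruction sits at index 0 and exposes one output (TestReaderFactory is again an assumed reader factory class):

// The ParDo reads output 0 of the instruction at index 0 in the list.
ParallelInstruction read = createReadInstruction("Read", TestReaderFactory.class);
ParallelInstruction parDo = createParDoInstruction(0, 0, "ParDoSystemName", "parDoUserName");
List<ParallelInstruction> instructions = Arrays.asList(read, parDo);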

Example 49 with ParallelInstruction

Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.

From the class StreamingDataflowWorkerTest, method testLimitOnOutputBundleSizeWithMultipleSinks.

@Test
public void testLimitOnOutputBundleSizeWithMultipleSinks() throws Exception {
    // Same as testLimitOnOutputBundleSize(), but with 3 sinks for the stage rather than one.
    // Verifies that output bundle size has same limit even with multiple sinks.
    List<Integer> finalizeTracker = Lists.newArrayList();
    TestCountingSource.setFinalizeTracker(finalizeTracker);
    // 100K input messages.
    final int numMessagesInCustomSourceShard = 100000;
    // 10 KB per message x 100K messages => ~1 GB total output size.
    final int inflatedSizePerMessage = 10000;
    List<ParallelInstruction> instructions = new ArrayList<>();
    instructions.addAll(makeUnboundedSourcePipeline(numMessagesInCustomSourceShard, new InflateDoFn(inflatedSizePerMessage)));
    // add two more sinks
    instructions.add(makeSinkInstruction(DEFAULT_DESTINATION_STREAM_ID + "-1", StringUtf8Coder.of(), 1, GlobalWindow.Coder.INSTANCE));
    instructions.add(makeSinkInstruction(DEFAULT_DESTINATION_STREAM_ID + "-2", StringUtf8Coder.of(), 1, GlobalWindow.Coder.INSTANCE));
    FakeWindmillServer server = new FakeWindmillServer(errorCollector);
    StreamingDataflowWorker worker = makeWorker(instructions, createTestingPipelineOptions(server), true);
    worker.start();
    // Test new key.
    server.addWorkToOffer(
        buildInput(
            "work {"
                + "  computation_id: \"computation\""
                + "  input_data_watermark: 0"
                + "  work {"
                + "    key: \"0000000000000001\""
                + "    sharding_key: 1"
                + "    work_token: 1"
                + "    cache_token: 1"
                + "  }"
                + "}",
            null));
    // Matcher to ensure that commit size is within 10% of max bundle size.
    Matcher<Integer> isWithinBundleSizeLimits = both(greaterThan(StreamingDataflowWorker.MAX_SINK_BYTES * 9 / 10)).and(lessThan(StreamingDataflowWorker.MAX_SINK_BYTES * 11 / 10));
    Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
    Windmill.WorkItemCommitRequest commit = result.get(1L);
    assertThat(commit.getSerializedSize(), isWithinBundleSizeLimits);
    // Try another bundle
    server.addWorkToOffer(
        buildInput(
            "work {"
                + "  computation_id: \"computation\""
                + "  input_data_watermark: 0"
                + "  work {"
                + "    key: \"0000000000000001\""
                + "    sharding_key: 1"
                + "    work_token: 2"
                + "    cache_token: 1"
                + "  }"
                + "}",
            null));
    result = server.waitForAndGetCommits(1);
    commit = result.get(2L);
    assertThat(commit.getSerializedSize(), isWithinBundleSizeLimits);
}
Also used : ArrayList(java.util.ArrayList) WorkItemCommitRequest(org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) AtomicLong(java.util.concurrent.atomic.AtomicLong) DataflowCounterUpdateExtractor.splitIntToLong(org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor.splitIntToLong) UnsignedLong(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.UnsignedLong) Windmill(org.apache.beam.runners.dataflow.worker.windmill.Windmill) Test(org.junit.Test)
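
For context, a hypothetical sketch of what an InflateDoFn like the one used above could look like. The input element type and padding character are assumptions; the point is that each input element is padded to a fixed output size, so bytes rather than element count drive the bundle limit.

// Hypothetical: pad each element out to inflatedSize bytes of output.
static class InflateDoFn extends DoFn<KV<Integer, Integer>, String> {
    private final int inflatedSize;

    InflateDoFn(int inflatedSize) {
        this.inflatedSize = inflatedSize;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        char[] padding = new char[inflatedSize];
        Arrays.fill(padding, 'a');
        c.output(new String(padding));
    }
}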

Example 50 with ParallelInstruction

Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.

From the class StreamingDataflowWorkerTest, method runMergeSessionsActions.

// Helper for running tests for merging sessions based upon Actions consisting of GetWorkResponse
// and expected timers and holds in the corresponding commit. All GetData requests are responded
// to with empty state, relying on user worker caching to keep data written.
private void runMergeSessionsActions(List<Action> actions) throws Exception {
    Coder<KV<String, String>> kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    Coder<WindowedValue<KV<String, String>>> windowedKvCoder = FullWindowedValueCoder.of(kvCoder, IntervalWindow.getCoder());
    KvCoder<String, List<String>> groupedCoder = KvCoder.of(StringUtf8Coder.of(), ListCoder.of(StringUtf8Coder.of()));
    Coder<WindowedValue<KV<String, List<String>>>> windowedGroupedCoder = FullWindowedValueCoder.of(groupedCoder, IntervalWindow.getCoder());
    CloudObject spec = CloudObject.forClassName("MergeWindowsDoFn");
    SdkComponents sdkComponents = SdkComponents.create();
    sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
    addString(
        spec,
        PropertyNames.SERIALIZED_FN,
        StringUtils.byteArrayToJsonString(
            WindowingStrategyTranslation.toMessageProto(
                    WindowingStrategy.of(Sessions.withGapDuration(Duration.millis(10)))
                        .withMode(AccumulationMode.DISCARDING_FIRED_PANES)
                        .withTrigger(
                            Repeatedly.forever(
                                AfterWatermark.pastEndOfWindow()
                                    .withLateFirings(AfterPane.elementCountAtLeast(1))))
                        .withAllowedLateness(Duration.standardMinutes(60)),
                    sdkComponents)
                .toByteArray()));
    addObject(
        spec,
        WorkerPropertyNames.INPUT_CODER,
        CloudObjects.asCloudObject(windowedKvCoder, /* sdkComponents= */ null));
    ParallelInstruction mergeWindowsInstruction =
        new ParallelInstruction()
            .setSystemName("MergeWindows-System")
            .setName("MergeWindowsStep")
            .setOriginalName("MergeWindowsOriginal")
            .setParDo(
                new ParDoInstruction()
                    .setInput(
                        new InstructionInput().setProducerInstructionIndex(0).setOutputNum(0))
                    .setNumOutputs(1)
                    .setUserFn(spec))
            .setOutputs(
                Arrays.asList(
                    new InstructionOutput()
                        .setOriginalName(DEFAULT_OUTPUT_ORIGINAL_NAME)
                        .setSystemName(DEFAULT_OUTPUT_SYSTEM_NAME)
                        .setName("output")
                        .setCodec(
                            CloudObjects.asCloudObject(
                                windowedGroupedCoder, /* sdkComponents= */ null))));
    List<ParallelInstruction> instructions = Arrays.asList(makeWindowingSourceInstruction(kvCoder), mergeWindowsInstruction, makeSinkInstruction(groupedCoder, 1));
    FakeWindmillServer server = new FakeWindmillServer(errorCollector);
    StreamingDataflowWorker worker = makeWorker(instructions, createTestingPipelineOptions(server), false);
    Map<String, String> nameMap = new HashMap<>();
    nameMap.put("MergeWindowsStep", "MergeWindows");
    worker.addStateNameMappings(nameMap);
    worker.start();
    // Respond to any GetData requests with empty state.
    for (int i = 0; i < 1000; ++i) {
        server.addDataFnToOffer(EMPTY_DATA_RESPONDER);
    }
    for (int i = 0; i < actions.size(); ++i) {
        Action action = actions.get(i);
        server.addWorkToOffer(action.response);
        Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
        WorkItemCommitRequest actualOutput = result.get(i + 1L);
        assertThat(actualOutput, Matchers.not(Matchers.nullValue()));
        verifyTimers(actualOutput, action.expectedTimers);
        verifyHolds(actualOutput, action.expectedHolds);
    }
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) InstructionOutput(com.google.api.services.dataflow.model.InstructionOutput) KV(org.apache.beam.sdk.values.KV) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Structs.addString(org.apache.beam.runners.dataflow.util.Structs.addString) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) ParallelInstruction(com.google.api.services.dataflow.model.ParallelInstruction) ParDoInstruction(com.google.api.services.dataflow.model.ParDoInstruction) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) WorkItemCommitRequest(org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest) WindowedValue(org.apache.beam.sdk.util.WindowedValue) AtomicLong(java.util.concurrent.atomic.AtomicLong) DataflowCounterUpdateExtractor.splitIntToLong(org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor.splitIntToLong) UnsignedLong(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.UnsignedLong) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) InstructionInput(com.google.api.services.dataflow.model.InstructionInput)
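
A hedged sketch of the Action holder this helper iterates over; the field names mirror their usage in the loop above (response, expectedTimers, expectedHolds), while the exact Windmill types are assumptions.

// Hypothetical shape of an Action: one GetWorkResponse plus the timers and
// holds expected in the resulting commit.
static class Action {
    final Windmill.GetWorkResponse response;
    Windmill.Timer[] expectedTimers = new Windmill.Timer[0];
    Windmill.WatermarkHold[] expectedHolds = new Windmill.WatermarkHold[0];

    Action(Windmill.GetWorkResponse response) {
        this.response = response;
    }
}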

Aggregations

ParallelInstruction (com.google.api.services.dataflow.model.ParallelInstruction): 73 uses
Test (org.junit.Test): 39 uses
InstructionOutput (com.google.api.services.dataflow.model.InstructionOutput): 27 uses
ParallelInstructionNode (org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode): 26 uses
CloudObject (org.apache.beam.runners.dataflow.util.CloudObject): 24 uses
Node (org.apache.beam.runners.dataflow.worker.graph.Nodes.Node): 22 uses
InstructionOutputNode (org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode): 21 uses
Edge (org.apache.beam.runners.dataflow.worker.graph.Edges.Edge): 20 uses
ParDoInstruction (com.google.api.services.dataflow.model.ParDoInstruction): 18 uses
ReadInstruction (com.google.api.services.dataflow.model.ReadInstruction): 17 uses
DefaultEdge (org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge): 17 uses
MultiOutputInfoEdge (org.apache.beam.runners.dataflow.worker.graph.Edges.MultiOutputInfoEdge): 16 uses
Structs.addString (org.apache.beam.runners.dataflow.util.Structs.addString): 12 uses
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 12 uses
InstructionInput (com.google.api.services.dataflow.model.InstructionInput): 11 uses
MapTask (com.google.api.services.dataflow.model.MapTask): 11 uses
AtomicLong (java.util.concurrent.atomic.AtomicLong): 11 uses
DataflowCounterUpdateExtractor.splitIntToLong (org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor.splitIntToLong): 11 uses
WorkItemCommitRequest (org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest): 11 uses
UnsignedLong (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.primitives.UnsignedLong): 11 uses