Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
In the class IntrinsicMapTaskExecutorFactoryTest, the method createReadInstruction.
static ParallelInstruction createReadInstruction(
    String name, Class<? extends ReaderFactory> readerFactoryClass) {
  CloudObject spec = CloudObject.forClass(readerFactoryClass);

  Source cloudSource = new Source();
  cloudSource.setSpec(spec);
  cloudSource.setCodec(windowedStringCoder);

  ReadInstruction readInstruction = new ReadInstruction();
  readInstruction.setSource(cloudSource);

  InstructionOutput output = new InstructionOutput();
  output.setName("read_output_name");
  output.setCodec(windowedStringCoder);
  output.setOriginalName("originalName");
  output.setSystemName("systemName");

  ParallelInstruction instruction = new ParallelInstruction();
  instruction.setSystemName(name);
  instruction.setOriginalName(name + "OriginalName");
  instruction.setRead(readInstruction);
  instruction.setOutputs(Arrays.asList(output));
  return instruction;
}
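Note: a hedged sketch (not from the test itself) of how a helper like this is typically consumed. The executor factory operates on a MapTask carrying a list of ParallelInstructions; SomeReaderFactory and the stage names below are hypothetical placeholders.

// Illustrative only: wrap the read instruction in a MapTask, the structure the
// IntrinsicMapTaskExecutorFactory consumes. SomeReaderFactory and the stage names are hypothetical.
MapTask mapTask = new MapTask();
mapTask.setSystemName("stageSystemName");
mapTask.setStageName("stageName");
mapTask.setInstructions(Arrays.asList(createReadInstruction("Read", SomeReaderFactory.class)));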
Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
In the class IntrinsicMapTaskExecutorFactoryTest, the method testCreatePartialGroupByKeyOperationWithCombine.
@Test
public void testCreatePartialGroupByKeyOperationWithCombine() throws Exception {
  int producerIndex = 1;
  int producerOutputNum = 2;
  ParallelInstruction instruction =
      createPartialGroupByKeyInstruction(producerIndex, producerOutputNum);

  AppliedCombineFn<?, ?, ?, ?> combineFn =
      AppliedCombineFn.withInputCoder(
          Sum.ofIntegers(),
          CoderRegistry.createDefault(),
          KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()));
  CloudObject cloudCombineFn = CloudObject.forClassName("CombineFn");
  addString(
      cloudCombineFn,
      PropertyNames.SERIALIZED_FN,
      byteArrayToJsonString(serializeToByteArray(combineFn)));
  instruction.getPartialGroupByKey().setValueCombiningFn(cloudCombineFn);

  ParallelInstructionNode instructionNode =
      ParallelInstructionNode.create(instruction, ExecutionLocation.UNKNOWN);
  when(network.successors(instructionNode))
      .thenReturn(
          ImmutableSet.<Node>of(
              IntrinsicMapTaskExecutorFactory.createOutputReceiversTransform(STAGE, counterSet)
                  .apply(
                      InstructionOutputNode.create(
                          instructionNode.getParallelInstruction().getOutputs().get(0),
                          PCOLLECTION_ID))));
  when(network.outDegree(instructionNode)).thenReturn(1);

  Node operationNode =
      mapTaskExecutorFactory
          .createOperationTransformForParallelInstructionNodes(
              STAGE,
              network,
              options,
              readerRegistry,
              sinkRegistry,
              BatchModeExecutionContext.forTesting(options, counterSet, "testStage"))
          .apply(instructionNode);

  assertThat(operationNode, instanceOf(OperationNode.class));
  assertThat(((OperationNode) operationNode).getOperation(), instanceOf(ParDoOperation.class));
  ParDoOperation pgbkOperation = (ParDoOperation) ((OperationNode) operationNode).getOperation();
  assertEquals(1, pgbkOperation.receivers.length);
  assertEquals(0, pgbkOperation.receivers[0].getReceiverCount());
  assertEquals(Operation.InitializationState.UNSTARTED, pgbkOperation.initializationState);
}
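Aside: the combine fn serialized above is plain integer addition. A hedged, standalone illustration (not part of the test) of what gets folded per key before the shuffle:

// Illustrative only: Sum.ofIntegers() is a binary combine over ints; values for the
// same key are folded together this way ahead of the shuffle.
Combine.BinaryCombineIntegerFn sum = Sum.ofIntegers();
int partiallyCombined = sum.apply(3, 4); // 7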
Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
In the class IntrinsicMapTaskExecutorFactoryTest, the method createParDoInstruction.
static ParallelInstruction createParDoInstruction(
    int producerIndex, int producerOutputNum, String systemName, String userName) {
  InstructionInput cloudInput = new InstructionInput();
  cloudInput.setProducerInstructionIndex(producerIndex);
  cloudInput.setOutputNum(producerOutputNum);

  TestDoFn fn = new TestDoFn();
  String serializedFn =
      StringUtils.byteArrayToJsonString(
          SerializableUtils.serializeToByteArray(
              DoFnInfo.forFn(
                  fn,
                  WindowingStrategy.globalDefault(),
                  null /* side input views */,
                  null /* input coder */,
                  new TupleTag<>(PropertyNames.OUTPUT) /* main output id */,
                  DoFnSchemaInformation.create(),
                  Collections.emptyMap())));
  CloudObject cloudUserFn = CloudObject.forClassName("DoFn");
  addString(cloudUserFn, PropertyNames.SERIALIZED_FN, serializedFn);

  MultiOutputInfo mainOutputTag = new MultiOutputInfo();
  mainOutputTag.setTag("1");

  ParDoInstruction parDoInstruction = new ParDoInstruction();
  parDoInstruction.setInput(cloudInput);
  parDoInstruction.setNumOutputs(1);
  parDoInstruction.setMultiOutputInfos(ImmutableList.of(mainOutputTag));
  parDoInstruction.setUserFn(cloudUserFn);

  InstructionOutput output = new InstructionOutput();
  output.setName(systemName + "_output");
  output.setCodec(windowedStringCoder);
  output.setOriginalName("originalName");
  output.setSystemName("systemName");

  ParallelInstruction instruction = new ParallelInstruction();
  instruction.setParDo(parDoInstruction);
  instruction.setOutputs(Arrays.asList(output));
  instruction.setSystemName(systemName);
  instruction.setOriginalName(systemName + "OriginalName");
  instruction.setName(userName);
  return instruction;
}
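Note: a hedged sketch (not from the test) of how these helpers compose into a small instruction graph. The ParDo's input refers to its producer by instruction index and output number; the names and SomeReaderFactory below are hypothetical placeholders.

// Illustrative only: a two-instruction graph in which the ParDo consumes output 0 of
// instruction 0 (the read). The names and the reader factory class are hypothetical.
ParallelInstruction read = createReadInstruction("Read", SomeReaderFactory.class);
ParallelInstruction parDo =
    createParDoInstruction(/* producerIndex= */ 0, /* producerOutputNum= */ 0, "DoFn", "DoFnUserName");
List<ParallelInstruction> instructions = Arrays.asList(read, parDo);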
Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
In the class StreamingDataflowWorkerTest, the method testLimitOnOutputBundleSizeWithMultipleSinks.
@Test
public void testLimitOnOutputBundleSizeWithMultipleSinks() throws Exception {
  // Same as testLimitOnOutputBundleSize(), but with 3 sinks for the stage rather than one.
  // Verifies that the output bundle size has the same limit even with multiple sinks.
  List<Integer> finalizeTracker = Lists.newArrayList();
  TestCountingSource.setFinalizeTracker(finalizeTracker);

  // 100K input messages, inflated to ~10 KB each => ~1 GB total output size.
  final int numMessagesInCustomSourceShard = 100000;
  final int inflatedSizePerMessage = 10000;

  List<ParallelInstruction> instructions = new ArrayList<>();
  instructions.addAll(
      makeUnboundedSourcePipeline(
          numMessagesInCustomSourceShard, new InflateDoFn(inflatedSizePerMessage)));
  // Add two more sinks.
  instructions.add(
      makeSinkInstruction(
          DEFAULT_DESTINATION_STREAM_ID + "-1", StringUtf8Coder.of(), 1, GlobalWindow.Coder.INSTANCE));
  instructions.add(
      makeSinkInstruction(
          DEFAULT_DESTINATION_STREAM_ID + "-2", StringUtf8Coder.of(), 1, GlobalWindow.Coder.INSTANCE));

  FakeWindmillServer server = new FakeWindmillServer(errorCollector);
  StreamingDataflowWorker worker =
      makeWorker(instructions, createTestingPipelineOptions(server), true);
  worker.start();

  // Test new key.
  server.addWorkToOffer(
      buildInput(
          "work {" + " computation_id: \"computation\"" + " input_data_watermark: 0"
              + " work {" + " key: \"0000000000000001\"" + " sharding_key: 1"
              + " work_token: 1" + " cache_token: 1" + " }" + "}",
          null));

  // Matcher to ensure that the commit size is within 10% of the max bundle size.
  Matcher<Integer> isWithinBundleSizeLimits =
      both(greaterThan(StreamingDataflowWorker.MAX_SINK_BYTES * 9 / 10))
          .and(lessThan(StreamingDataflowWorker.MAX_SINK_BYTES * 11 / 10));

  Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
  Windmill.WorkItemCommitRequest commit = result.get(1L);
  assertThat(commit.getSerializedSize(), isWithinBundleSizeLimits);

  // Try another bundle.
  server.addWorkToOffer(
      buildInput(
          "work {" + " computation_id: \"computation\"" + " input_data_watermark: 0"
              + " work {" + " key: \"0000000000000001\"" + " sharding_key: 1"
              + " work_token: 2" + " cache_token: 1" + " }" + "}",
          null));
  result = server.waitForAndGetCommits(1);
  commit = result.get(2L);
  assertThat(commit.getSerializedSize(), isWithinBundleSizeLimits);
}
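For orientation (a hedged aside, not part of the test): 100,000 messages at roughly 10,000 bytes each is about 1 GB of output, so the worker has to cut each output bundle off near StreamingDataflowWorker.MAX_SINK_BYTES; the Hamcrest matcher above only checks a band of plus or minus 10% around that constant. A plain-Java equivalent of the bound:

// Illustrative only: the same 10% bound expressed without Hamcrest matchers.
int lowerBound = StreamingDataflowWorker.MAX_SINK_BYTES * 9 / 10;
int upperBound = StreamingDataflowWorker.MAX_SINK_BYTES * 11 / 10;
boolean withinLimits =
    commit.getSerializedSize() > lowerBound && commit.getSerializedSize() < upperBound;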
Use of com.google.api.services.dataflow.model.ParallelInstruction in project beam by apache.
In the class StreamingDataflowWorkerTest, the method runMergeSessionsActions.
// Helper for running merge-sessions tests driven by Actions, each consisting of a
// GetWorkResponse plus the timers and holds expected in the corresponding commit. All
// GetData requests are answered with empty state, relying on the user worker's caching
// to keep written data.
private void runMergeSessionsActions(List<Action> actions) throws Exception {
  Coder<KV<String, String>> kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
  Coder<WindowedValue<KV<String, String>>> windowedKvCoder =
      FullWindowedValueCoder.of(kvCoder, IntervalWindow.getCoder());
  KvCoder<String, List<String>> groupedCoder =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(StringUtf8Coder.of()));
  Coder<WindowedValue<KV<String, List<String>>>> windowedGroupedCoder =
      FullWindowedValueCoder.of(groupedCoder, IntervalWindow.getCoder());

  CloudObject spec = CloudObject.forClassName("MergeWindowsDoFn");
  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
  addString(
      spec,
      PropertyNames.SERIALIZED_FN,
      StringUtils.byteArrayToJsonString(
          WindowingStrategyTranslation.toMessageProto(
                  WindowingStrategy.of(Sessions.withGapDuration(Duration.millis(10)))
                      .withMode(AccumulationMode.DISCARDING_FIRED_PANES)
                      .withTrigger(
                          Repeatedly.forever(
                              AfterWatermark.pastEndOfWindow()
                                  .withLateFirings(AfterPane.elementCountAtLeast(1))))
                      .withAllowedLateness(Duration.standardMinutes(60)),
                  sdkComponents)
              .toByteArray()));
  addObject(
      spec,
      WorkerPropertyNames.INPUT_CODER,
      CloudObjects.asCloudObject(windowedKvCoder, /* sdkComponents= */ null));

  ParallelInstruction mergeWindowsInstruction =
      new ParallelInstruction()
          .setSystemName("MergeWindows-System")
          .setName("MergeWindowsStep")
          .setOriginalName("MergeWindowsOriginal")
          .setParDo(
              new ParDoInstruction()
                  .setInput(new InstructionInput().setProducerInstructionIndex(0).setOutputNum(0))
                  .setNumOutputs(1)
                  .setUserFn(spec))
          .setOutputs(
              Arrays.asList(
                  new InstructionOutput()
                      .setOriginalName(DEFAULT_OUTPUT_ORIGINAL_NAME)
                      .setSystemName(DEFAULT_OUTPUT_SYSTEM_NAME)
                      .setName("output")
                      .setCodec(
                          CloudObjects.asCloudObject(windowedGroupedCoder, /* sdkComponents= */ null))));

  List<ParallelInstruction> instructions =
      Arrays.asList(
          makeWindowingSourceInstruction(kvCoder),
          mergeWindowsInstruction,
          makeSinkInstruction(groupedCoder, 1));

  FakeWindmillServer server = new FakeWindmillServer(errorCollector);
  StreamingDataflowWorker worker =
      makeWorker(instructions, createTestingPipelineOptions(server), false);
  Map<String, String> nameMap = new HashMap<>();
  nameMap.put("MergeWindowsStep", "MergeWindows");
  worker.addStateNameMappings(nameMap);
  worker.start();

  // Respond to any GetData requests with empty state.
  for (int i = 0; i < 1000; ++i) {
    server.addDataFnToOffer(EMPTY_DATA_RESPONDER);
  }

  for (int i = 0; i < actions.size(); ++i) {
    Action action = actions.get(i);
    server.addWorkToOffer(action.response);
    Map<Long, Windmill.WorkItemCommitRequest> result = server.waitForAndGetCommits(1);
    WorkItemCommitRequest actualOutput = result.get(i + 1L);
    assertThat(actualOutput, Matchers.not(Matchers.nullValue()));
    verifyTimers(actualOutput, action.expectedTimers);
    verifyHolds(actualOutput, action.expectedHolds);
  }
}