Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn in project beam by apache.
From the class BeamFnMapTaskExecutorFactory, the method createPartialGroupByKeyOperation:
<K> OperationNode createPartialGroupByKeyOperation(
    Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options,
    DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext)
    throws Exception {
  ParallelInstruction instruction = node.getParallelInstruction();
  PartialGroupByKeyInstruction pgbk = instruction.getPartialGroupByKey();
  OutputReceiver[] receivers = getOutputReceivers(network, node);
  Coder<?> windowedCoder =
      CloudObjects.coderFromCloudObject(CloudObject.fromSpec(pgbk.getInputElementCodec()));
  if (!(windowedCoder instanceof WindowedValueCoder)) {
    throw new IllegalArgumentException(
        String.format(
            "unexpected kind of input coder for PartialGroupByKeyOperation: %s", windowedCoder));
  }
  Coder<?> elemCoder = ((WindowedValueCoder<?>) windowedCoder).getValueCoder();
  if (!(elemCoder instanceof KvCoder)) {
    throw new IllegalArgumentException(
        String.format(
            "unexpected kind of input element coder for PartialGroupByKeyOperation: %s", elemCoder));
  }
  @SuppressWarnings("unchecked")
  KvCoder<K, ?> keyedElementCoder = (KvCoder<K, ?>) elemCoder;
  CloudObject cloudUserFn =
      pgbk.getValueCombiningFn() != null ? CloudObject.fromSpec(pgbk.getValueCombiningFn()) : null;
  ParDoFn fn =
      PartialGroupByKeyParDoFns.create(options, keyedElementCoder, cloudUserFn,
          pgbk.getSideInputs(), Arrays.<Receiver>asList(receivers), executionContext, operationContext);
  return OperationNode.create(new ParDoOperation(fn, receivers, operationContext));
}
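The method hinges on unwrapping the instruction's input coder twice: the cloud spec must decode to a WindowedValueCoder, and the windowed element coder must be a KvCoder so that a key is available to group by. Below is a minimal, self-contained sketch of that unwrapping using public Beam SDK coders; the KvCoder<String, Long> input and the class name CoderUnwrappingDemo are assumptions chosen purely for illustration, standing in for whatever CloudObjects.coderFromCloudObject would return.

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder;

public class CoderUnwrappingDemo {
  public static void main(String[] args) {
    // Hypothetical stand-in for the decoded instruction coder: a windowed KV coder.
    Coder<?> windowedCoder =
        WindowedValue.getFullCoder(
            KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of()), GlobalWindow.Coder.INSTANCE);
    // First check: the instruction's input must be windowed values.
    if (!(windowedCoder instanceof WindowedValueCoder)) {
      throw new IllegalArgumentException("expected a WindowedValueCoder: " + windowedCoder);
    }
    Coder<?> elemCoder = ((WindowedValueCoder<?>) windowedCoder).getValueCoder();
    // Second check: the element must be a key-value pair, or there is no key to group by.
    if (!(elemCoder instanceof KvCoder)) {
      throw new IllegalArgumentException("expected a KvCoder: " + elemCoder);
    }
    KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) elemCoder;
    System.out.println("key coder:   " + kvCoder.getKeyCoder());   // StringUtf8Coder
    System.out.println("value coder: " + kvCoder.getValueCoder()); // VarLongCoder
  }
}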
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn in project beam by apache.
From the class IntrinsicMapTaskExecutorFactory, the method createParDoOperation:
private OperationNode createParDoOperation(
    Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options,
    DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext)
    throws Exception {
  ParallelInstruction instruction = node.getParallelInstruction();
  ParDoInstruction parDo = instruction.getParDo();
  TupleTag<?> mainOutputTag = tupleTag(parDo.getMultiOutputInfos().get(0));
  ImmutableMap.Builder<TupleTag<?>, Integer> outputTagsToReceiverIndicesBuilder =
      ImmutableMap.builder();
  int successorOffset = 0;
  for (Node successor : network.successors(node)) {
    for (Edge edge : network.edgesConnecting(node, successor)) {
      outputTagsToReceiverIndicesBuilder.put(
          tupleTag(((MultiOutputInfoEdge) edge).getMultiOutputInfo()), successorOffset);
    }
    successorOffset += 1;
  }
  ParDoFn fn =
      parDoFnFactory.create(options, CloudObject.fromSpec(parDo.getUserFn()),
          parDo.getSideInputs(), mainOutputTag, outputTagsToReceiverIndicesBuilder.build(),
          executionContext, operationContext);
  OutputReceiver[] receivers = getOutputReceivers(network, node);
  return OperationNode.create(new ParDoOperation(fn, receivers, operationContext));
}
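The double loop above assigns each output tag the index of the successor node that will consume it, so parallel edges into the same successor share one receiver index. Here is a standalone sketch of the same traversal over a Guava MutableNetwork; the string node and edge labels ("parDo", "consumerA", and so on) are hypothetical stand-ins for the worker's Node and MultiOutputInfoEdge types.

import com.google.common.graph.MutableNetwork;
import com.google.common.graph.NetworkBuilder;
import java.util.LinkedHashMap;
import java.util.Map;

public class ReceiverIndexDemo {
  public static void main(String[] args) {
    MutableNetwork<String, String> network =
        NetworkBuilder.directed().allowsParallelEdges(true).build();
    network.addEdge("parDo", "consumerA", "mainOutputEdge");
    network.addEdge("parDo", "consumerB", "sideOutputEdge");

    Map<String, Integer> edgeToReceiverIndex = new LinkedHashMap<>();
    int successorOffset = 0;
    // Same shape as the loop in createParDoOperation: one index per successor,
    // shared by every edge connecting to that successor.
    for (String successor : network.successors("parDo")) {
      for (String edge : network.edgesConnecting("parDo", successor)) {
        edgeToReceiverIndex.put(edge, successorOffset);
      }
      successorOffset += 1;
    }
    System.out.println(edgeToReceiverIndex); // e.g. {mainOutputEdge=0, sideOutputEdge=1}
  }
}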
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn in project beam by apache.
From the class IntrinsicMapTaskExecutorFactory, the method createPartialGroupByKeyOperation (its body is identical to the BeamFnMapTaskExecutorFactory variant above):
<K> OperationNode createPartialGroupByKeyOperation(
    Network<Node, Edge> network, ParallelInstructionNode node, PipelineOptions options,
    DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext)
    throws Exception {
  ParallelInstruction instruction = node.getParallelInstruction();
  PartialGroupByKeyInstruction pgbk = instruction.getPartialGroupByKey();
  OutputReceiver[] receivers = getOutputReceivers(network, node);
  Coder<?> windowedCoder =
      CloudObjects.coderFromCloudObject(CloudObject.fromSpec(pgbk.getInputElementCodec()));
  if (!(windowedCoder instanceof WindowedValueCoder)) {
    throw new IllegalArgumentException(
        String.format(
            "unexpected kind of input coder for PartialGroupByKeyOperation: %s", windowedCoder));
  }
  Coder<?> elemCoder = ((WindowedValueCoder<?>) windowedCoder).getValueCoder();
  if (!(elemCoder instanceof KvCoder)) {
    throw new IllegalArgumentException(
        String.format(
            "unexpected kind of input element coder for PartialGroupByKeyOperation: %s", elemCoder));
  }
  @SuppressWarnings("unchecked")
  KvCoder<K, ?> keyedElementCoder = (KvCoder<K, ?>) elemCoder;
  CloudObject cloudUserFn =
      pgbk.getValueCombiningFn() != null ? CloudObject.fromSpec(pgbk.getValueCombiningFn()) : null;
  ParDoFn fn =
      PartialGroupByKeyParDoFns.create(options, keyedElementCoder, cloudUserFn,
          pgbk.getSideInputs(), Arrays.<Receiver>asList(receivers), executionContext, operationContext);
  return OperationNode.create(new ParDoOperation(fn, receivers, operationContext));
}
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn in project beam by apache.
From the class GroupAlsoByWindowParDoFnFactory, the method create:
@Override
public ParDoFn create(
    PipelineOptions options, CloudObject cloudUserFn, @Nullable List<SideInputInfo> sideInputInfos,
    TupleTag<?> mainOutputTag, Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices,
    final DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext)
    throws Exception {
  Map.Entry<TupleTag<?>, Integer> entry =
      Iterables.getOnlyElement(outputTupleTagsToReceiverIndices.entrySet());
  checkArgument(entry.getKey().equals(mainOutputTag),
      "Output tags should reference only the main output tag: %s vs %s", entry.getKey(), mainOutputTag);
  checkArgument(entry.getValue() == 0,
      "There should be a single receiver, but using receiver index %s", entry.getValue());
  byte[] encodedWindowingStrategy = getBytes(cloudUserFn, PropertyNames.SERIALIZED_FN);
  WindowingStrategy windowingStrategy;
  try {
    windowingStrategy = deserializeWindowingStrategy(encodedWindowingStrategy);
  } catch (Exception e) {
    // TODO: Catch block disappears, becoming an error once Python SDK is compliant.
    if (DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "beam_fn_api")) {
      LOG.info("FnAPI: Unable to deserialize windowing strategy, assuming default", e);
      windowingStrategy = WindowingStrategy.globalDefault();
    } else {
      throw e;
    }
  }
  byte[] serializedCombineFn = getBytes(cloudUserFn, WorkerPropertyNames.COMBINE_FN, null);
  AppliedCombineFn<?, ?, ?, ?> combineFn = null;
  if (serializedCombineFn != null) {
    Object combineFnObj =
        SerializableUtils.deserializeFromByteArray(serializedCombineFn, "serialized combine fn");
    checkArgument(combineFnObj instanceof AppliedCombineFn,
        "unexpected kind of AppliedCombineFn: " + combineFnObj.getClass().getName());
    combineFn = (AppliedCombineFn<?, ?, ?, ?>) combineFnObj;
  }
  Map<String, Object> inputCoderObject = getObject(cloudUserFn, WorkerPropertyNames.INPUT_CODER);
  Coder<?> inputCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(inputCoderObject));
  checkArgument(inputCoder instanceof WindowedValueCoder,
      "Expected WindowedValueCoder for inputCoder, got: " + inputCoder.getClass().getName());
  @SuppressWarnings("unchecked")
  WindowedValueCoder<?> windowedValueCoder = (WindowedValueCoder<?>) inputCoder;
  Coder<?> elemCoder = windowedValueCoder.getValueCoder();
  checkArgument(elemCoder instanceof KvCoder,
      "Expected KvCoder for inputCoder, got: " + elemCoder.getClass().getName());
  @SuppressWarnings("unchecked")
  KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) elemCoder;
  boolean isStreamingPipeline = options.as(StreamingOptions.class).isStreaming();
  SideInputReader sideInputReader = NullSideInputReader.empty();
  @Nullable AppliedCombineFn<?, ?, ?, ?> maybeMergingCombineFn = null;
  if (combineFn != null) {
    sideInputReader =
        executionContext.getSideInputReader(
            sideInputInfos, combineFn.getSideInputViews(), operationContext);
    String phase = getString(cloudUserFn, WorkerPropertyNames.PHASE, CombinePhase.ALL);
    checkArgument(phase.equals(CombinePhase.ALL) || phase.equals(CombinePhase.MERGE),
        "Unexpected phase: %s", phase);
    if (phase.equals(CombinePhase.MERGE)) {
      maybeMergingCombineFn = makeAppliedMergingFunction(combineFn);
    } else {
      maybeMergingCombineFn = combineFn;
    }
  }
  StateInternalsFactory<?> stateInternalsFactory =
      key -> executionContext.getStepContext(operationContext).stateInternals();
  // This will be a GABW Fn for either batch or streaming, with combiner in it or not
  GroupAlsoByWindowFn<?, ?> fn;
  // This will be a FakeKeyedWorkItemCoder for streaming or null for batch
  Coder<?> gabwInputCoder;
  // TODO: do not do this with mess of "if"
  if (isStreamingPipeline) {
    if (maybeMergingCombineFn == null) {
      fn = StreamingGroupAlsoByWindowsDoFns.createForIterable(
          windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
      gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(kvCoder);
    } else {
      fn = StreamingGroupAlsoByWindowsDoFns.create(
          windowingStrategy, stateInternalsFactory,
          (AppliedCombineFn) maybeMergingCombineFn, ((KvCoder) kvCoder).getKeyCoder());
      gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(
          ((AppliedCombineFn) maybeMergingCombineFn).getKvCoder());
    }
  } else {
    if (maybeMergingCombineFn == null) {
      fn = BatchGroupAlsoByWindowsDoFns.createForIterable(
          windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
      gabwInputCoder = null;
    } else {
      fn = BatchGroupAlsoByWindowsDoFns.create(
          windowingStrategy, (AppliedCombineFn) maybeMergingCombineFn);
      gabwInputCoder = null;
    }
  }
  // TODO: or anyhow related to it, do not do this with mess of "if"
  if (maybeMergingCombineFn != null) {
    return new GroupAlsoByWindowsParDoFn(
        options, fn, windowingStrategy,
        ((AppliedCombineFn) maybeMergingCombineFn).getSideInputViews(),
        gabwInputCoder, sideInputReader, mainOutputTag,
        executionContext.getStepContext(operationContext));
  } else {
    return new GroupAlsoByWindowsParDoFn(
        options, fn, windowingStrategy, null, gabwInputCoder, sideInputReader, mainOutputTag,
        executionContext.getStepContext(operationContext));
  }
}
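The nested if blocks at the end of create() amount to a four-way dispatch on (streaming?, combiner present?). Below is a compact sketch of just that decision; the method and class names are hypothetical, and the real factory calls are reduced to descriptive strings because their actual arguments need a full windowing strategy and execution context.

public class GabwDispatchDemo {
  // Mirrors the streaming/batch x combining/buffering dispatch in create().
  static String chooseGroupAlsoByWindowFn(boolean isStreamingPipeline, boolean hasCombineFn) {
    if (isStreamingPipeline) {
      return hasCombineFn
          ? "StreamingGroupAlsoByWindowsDoFns.create(...)"             // merge values as they arrive
          : "StreamingGroupAlsoByWindowsDoFns.createForIterable(...)"; // buffer values per window
    }
    return hasCombineFn
        ? "BatchGroupAlsoByWindowsDoFns.create(...)"
        : "BatchGroupAlsoByWindowsDoFns.createForIterable(...)";
  }

  public static void main(String[] args) {
    System.out.println(chooseGroupAlsoByWindowFn(true, false));
    System.out.println(chooseGroupAlsoByWindowFn(false, true));
  }
}

Note also that only the two streaming branches set a non-null gabwInputCoder (a WindmillKeyedWorkItem.FakeKeyedWorkItemCoder); both batch branches leave it null, as the source comment above the declaration says.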
Use of org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn in project beam by apache.
From the class DefaultParDoFnFactoryTest, the method testCreateSimpleParDoFn:
/**
 * Tests that a {@link SimpleParDoFn} is correctly dispatched to {@code UserParDoFnFactory} and
 * instantiated correctly.
 */
@Test
public void testCreateSimpleParDoFn() throws Exception {
  // A serialized DoFn
  String stringFieldValue = "some state";
  long longFieldValue = 42L;
  TestDoFn fn = new TestDoFn(stringFieldValue, longFieldValue);
  String serializedFn =
      StringUtils.byteArrayToJsonString(
          SerializableUtils.serializeToByteArray(
              DoFnInfo.forFn(
                  fn,
                  WindowingStrategy.globalDefault(),
                  null, /* side input views */
                  null, /* input coder */
                  new TupleTag<>("output"), /* main output */
                  DoFnSchemaInformation.create(),
                  Collections.emptyMap())));
  CloudObject cloudUserFn = CloudObject.forClassName("DoFn");
  addString(cloudUserFn, "serialized_fn", serializedFn);
  // Create the ParDoFn from the serialized DoFn
  ParDoFn parDoFn =
      DEFAULT_FACTORY.create(
          DEFAULT_OPTIONS, cloudUserFn, null, MAIN_OUTPUT,
          ImmutableMap.<TupleTag<?>, Integer>of(MAIN_OUTPUT, 0),
          DEFAULT_EXECUTION_CONTEXT, TestOperationContext.create(counterSet));
  // Test that the factory created the correct class
  assertThat(parDoFn, instanceOf(SimpleParDoFn.class));
  // TODO: move the asserts below into new tests in UserParDoFnFactoryTest, and this test should
  // simply assert that DefaultParDoFnFactory.create() matches UserParDoFnFactory.create()
  // Test that the DoFnInfo reflects the one passed in
  SimpleParDoFn simpleParDoFn = (SimpleParDoFn) parDoFn;
  parDoFn.startBundle(new OutputReceiver());
  // DoFnInfo may not yet be initialized until an element is processed
  parDoFn.processElement(WindowedValue.valueInGlobalWindow("foo"));
  @SuppressWarnings("rawtypes")
  DoFnInfo doFnInfo = simpleParDoFn.getDoFnInfo();
  // Do not cast to TestDoFn yet: the instanceOf assertion below should do that check.
  DoFn innerDoFn = doFnInfo.getDoFn();
  assertThat(innerDoFn, instanceOf(TestDoFn.class));
  assertThat(doFnInfo.getWindowingStrategy().getWindowFn(), instanceOf(GlobalWindows.class));
  assertThat(doFnInfo.getWindowingStrategy().getTrigger(), instanceOf(DefaultTrigger.class));
  // Test that the deserialized user DoFn is as expected
  TestDoFn actualTestDoFn = (TestDoFn) innerDoFn;
  assertEquals(stringFieldValue, actualTestDoFn.stringField);
  assertEquals(longFieldValue, actualTestDoFn.longField);
}
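The test's serialized_fn value is a Java-serialized DoFnInfo whose bytes are then JSON-encoded. Here is a minimal round-trip sketch of the two helpers involved, using a hypothetical Serializable payload (TestState) in place of a full DoFnInfo so the example stays self-contained.

import java.io.Serializable;
import org.apache.beam.sdk.util.SerializableUtils;
import org.apache.beam.sdk.util.StringUtils;

public class SerializedFnRoundTripDemo {
  // Hypothetical stand-in for the DoFnInfo payload used in the test.
  static class TestState implements Serializable {
    final String stringField;
    final long longField;

    TestState(String stringField, long longField) {
      this.stringField = stringField;
      this.longField = longField;
    }
  }

  public static void main(String[] args) {
    TestState original = new TestState("some state", 42L);
    // Serialize and JSON-encode, as the test does for its DoFnInfo.
    String serialized =
        StringUtils.byteArrayToJsonString(SerializableUtils.serializeToByteArray(original));
    // Decode and deserialize, as the worker does when creating the ParDoFn.
    TestState copy =
        (TestState)
            SerializableUtils.deserializeFromByteArray(
                StringUtils.jsonStringToByteArray(serialized), "TestState");
    System.out.println(copy.stringField + " / " + copy.longField); // some state / 42
  }
}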