Use of org.apache.beam.sdk.transforms.join.RawUnionValue in project beam by apache.
The class FlinkExecutableStageFunctionTest, method outputsAreTaggedCorrectly.
@Test
public void outputsAreTaggedCorrectly() throws Exception {
  WindowedValue<Integer> three = WindowedValue.valueInGlobalWindow(3);
  WindowedValue<Integer> four = WindowedValue.valueInGlobalWindow(4);
  WindowedValue<Integer> five = WindowedValue.valueInGlobalWindow(5);
  Map<String, Integer> outputTagMap = ImmutableMap.of("one", 1, "two", 2, "three", 3);
  // We use a real StageBundleFactory here in order to exercise the output receiver factory.
  StageBundleFactory stageBundleFactory =
      new StageBundleFactory() {
        private boolean once;

        @Override
        public RemoteBundle getBundle(
            OutputReceiverFactory receiverFactory,
            TimerReceiverFactory timerReceiverFactory,
            StateRequestHandler stateRequestHandler,
            BundleProgressHandler progressHandler,
            BundleFinalizationHandler finalizationHandler,
            BundleCheckpointHandler checkpointHandler) {
          return new RemoteBundle() {
            @Override
            public String getId() {
              return "bundle-id";
            }

            @Override
            public Map<String, FnDataReceiver> getInputReceivers() {
              return ImmutableMap.of(
                  "input",
                  input -> {
                    /* Ignore input. */
                  });
            }

            @Override
            public Map<KV<String, String>, FnDataReceiver<Timer>> getTimerReceivers() {
              return Collections.emptyMap();
            }

            @Override
            public void requestProgress() {
              throw new UnsupportedOperationException();
            }

            @Override
            public void split(double fractionOfRemainder) {
              throw new UnsupportedOperationException();
            }

            @Override
            public void close() throws Exception {
              if (once) {
                return;
              }
              // Emit all values to the runner when the bundle is closed.
              receiverFactory.create("one").accept(three);
              receiverFactory.create("two").accept(four);
              receiverFactory.create("three").accept(five);
              once = true;
            }
          };
        }

        @Override
        public ProcessBundleDescriptors.ExecutableProcessBundleDescriptor
            getProcessBundleDescriptor() {
          return processBundleDescriptor;
        }

        @Override
        public InstructionRequestHandler getInstructionRequestHandler() {
          return null;
        }

        @Override
        public void close() throws Exception {}
      };
  // Wire the stage bundle factory into our context.
  when(stageContext.getStageBundleFactory(any())).thenReturn(stageBundleFactory);
  FlinkExecutableStageFunction<Integer> function = getFunction(outputTagMap);
  function.open(new Configuration());
  if (isStateful) {
    function.reduce(Collections.emptyList(), collector);
  } else {
    function.mapPartition(Collections.emptyList(), collector);
  }
  // Ensure that the tagged values sent to the collector have the correct union tags as specified
  // in the output map.
  verify(collector).collect(new RawUnionValue(1, three));
  verify(collector).collect(new RawUnionValue(2, four));
  verify(collector).collect(new RawUnionValue(3, five));
  verifyNoMoreInteractions(collector);
}
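For reference, RawUnionValue simply pairs an integer union tag with an untyped value; the tag is what ties each emitted value back to an entry in outputTagMap. A minimal sketch of the round trip, using only the two-argument constructor seen above plus the getUnionTag()/getValue() accessors:

// Minimal sketch: tag 1 corresponds to the "one" entry in outputTagMap above.
RawUnionValue tagged = new RawUnionValue(1, WindowedValue.valueInGlobalWindow(3));
int unionTag = tagged.getUnionTag(); // 1, i.e. routed to the "one" output
Object payload = tagged.getValue();  // the wrapped WindowedValue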
Use of org.apache.beam.sdk.transforms.join.RawUnionValue in project beam by apache.
The class DoFnOperatorTest, method pushbackDataCheckpointing.
void pushbackDataCheckpointing(
    TestHarnessFactory<
            TwoInputStreamOperatorTestHarness<
                WindowedValue<String>, RawUnionValue, WindowedValue<String>>>
        harnessFactory)
    throws Exception {
  TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, WindowedValue<String>>
      testHarness = harnessFactory.create();
  testHarness.open();
  IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
  IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
  // Push in main-input elements.
  WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
  WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
  testHarness.processElement1(new StreamRecord<>(helloElement));
  testHarness.processElement1(new StreamRecord<>(worldElement));
  // Snapshot state, throw away the operator, then restore and verify that we still match
  // main-input elements to the side inputs that we sent earlier.
  OperatorSubtaskState snapshot = testHarness.snapshot(0, 0);
  testHarness = harnessFactory.create();
  testHarness.initializeState(snapshot);
  testHarness.open();
  // Push in some side inputs for both windows.
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              1,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view1.getPipeline().getOptions(), View.asIterable(), "hello", "ciao"),
                  new Instant(0),
                  firstWindow))));
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              2,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view2.getPipeline().getOptions(), View.asIterable(), "foo", "bar"),
                  new Instant(0),
                  secondWindow))));
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      containsInAnyOrder(helloElement, worldElement));
  testHarness.close();
}
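The valueInWindow and valuesInWindow calls above are test-local helpers whose definitions are not shown here. A plausible sketch of valueInWindow, assuming it wraps WindowedValue.of with the default no-firing pane (the helper's exact body is an assumption):

// Hypothetical sketch of the test helper: wraps a value with an explicit
// timestamp and window, using PaneInfo.NO_FIRING.
private static <T> WindowedValue<T> valueInWindow(
    T value, Instant timestamp, BoundedWindow window) {
  return WindowedValue.of(value, timestamp, window, PaneInfo.NO_FIRING);
}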
Use of org.apache.beam.sdk.transforms.join.RawUnionValue in project beam by apache.
The class DoFnOperatorTest, method testSideInputs.
public void testSideInputs(boolean keyed) throws Exception {
  WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
      WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
  TupleTag<String> outputTag = new TupleTag<>("main-output");
  ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
      ImmutableMap.<Integer, PCollectionView<?>>builder().put(1, view1).put(2, view2).build();
  Coder<String> keyCoder = null;
  if (keyed) {
    keyCoder = StringUtf8Coder.of();
  }
  DoFnOperator<String, String, String> doFnOperator =
      new DoFnOperator<>(
          new IdentityDoFn<String>(),
          "stepName",
          windowedValueCoder,
          outputTag,
          Collections.<TupleTag<?>>emptyList(),
          new DoFnOperator.DefaultOutputManagerFactory<String>(),
          WindowingStrategy.globalDefault(),
          sideInputMapping, /* side-input mapping */
          ImmutableList.<PCollectionView<?>>of(view1, view2), /* side inputs */
          PipelineOptionsFactory.as(FlinkPipelineOptions.class),
          keyCoder);
  TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, String> testHarness =
      new TwoInputStreamOperatorTestHarness<>(doFnOperator);
  if (keyed) {
    // We use a dummy key for the second input since it is considered to be broadcast.
    testHarness =
        new KeyedTwoInputStreamOperatorTestHarness<>(
            doFnOperator,
            new StringKeySelector(),
            new DummyKeySelector(),
            BasicTypeInfo.STRING_TYPE_INFO);
  }
  testHarness.open();
  IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
  IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
  // Test that side-input events are kept.
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              1, valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(0), firstWindow))));
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              2, valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(0), secondWindow))));
  // Push in regular main-input elements.
  WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
  WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
  testHarness.processElement1(new StreamRecord<>(helloElement));
  testHarness.processElement1(new StreamRecord<>(worldElement));
  // Test that pushed-back events are kept.
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              1,
              valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(1000), firstWindow))));
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              2, valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(1000), secondWindow))));
  assertThat(
      this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(helloElement, worldElement));
  testHarness.close();
}
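The stripStreamRecordFromWindowedValue helper used in the assertion is likewise test-local. A plausible sketch, assuming it unwraps the Flink StreamRecords in the harness output into WindowedValues and drops everything else, such as watermarks (the actual definition is not shown here):

// Hypothetical sketch of the assertion helper.
@SuppressWarnings("unchecked")
private <T> List<WindowedValue<T>> stripStreamRecordFromWindowedValue(Iterable<Object> output) {
  List<WindowedValue<T>> result = new ArrayList<>();
  for (Object o : output) {
    if (o instanceof StreamRecord && ((StreamRecord<?>) o).getValue() instanceof WindowedValue) {
      result.add((WindowedValue<T>) ((StreamRecord<?>) o).getValue());
    }
  }
  return result;
}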
Use of org.apache.beam.sdk.transforms.join.RawUnionValue in project beam by apache.
The class DoFnOperatorTest, method testSideInputs.
void testSideInputs(boolean keyed) throws Exception {
  Coder<WindowedValue<String>> coder = WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
  TupleTag<String> outputTag = new TupleTag<>("main-output");
  ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
      ImmutableMap.<Integer, PCollectionView<?>>builder().put(1, view1).put(2, view2).build();
  Coder<String> keyCoder = StringUtf8Coder.of();
  KeySelector<WindowedValue<String>, ByteBuffer> keySelector = null;
  if (keyed) {
    keySelector = value -> FlinkKeyUtils.encodeKey(value.getValue(), keyCoder);
  }
  DoFnOperator<String, String> doFnOperator =
      new DoFnOperator<>(
          new IdentityDoFn<>(),
          "stepName",
          coder,
          Collections.emptyMap(),
          outputTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(
              outputTag, coder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
          WindowingStrategy.of(FixedWindows.of(Duration.millis(100))),
          sideInputMapping, /* side-input mapping */
          ImmutableList.of(view1, view2), /* side inputs */
          FlinkPipelineOptions.defaults(),
          keyed ? keyCoder : null,
          keyed ? keySelector : null,
          DoFnSchemaInformation.create(),
          Collections.emptyMap());
  TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, WindowedValue<String>>
      testHarness = new TwoInputStreamOperatorTestHarness<>(doFnOperator);
  if (keyed) {
    // We use a dummy key for the second input since it is considered to be broadcast.
    testHarness =
        new KeyedTwoInputStreamOperatorTestHarness<>(
            doFnOperator,
            keySelector,
            null,
            new CoderTypeInformation<>(
                FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
  }
  testHarness.open();
  IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
  IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
  // Test that side-input events are kept.
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              1,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view1.getPipeline().getOptions(), View.asIterable(), "hello", "ciao"),
                  new Instant(0),
                  firstWindow))));
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              2,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view2.getPipeline().getOptions(), View.asIterable(), "foo", "bar"),
                  new Instant(0),
                  secondWindow))));
  // Push in regular main-input elements.
  WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
  WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
  testHarness.processElement1(new StreamRecord<>(helloElement));
  testHarness.processElement1(new StreamRecord<>(worldElement));
  // Test that pushed-back events are kept.
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              1,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view1.getPipeline().getOptions(), View.asIterable(), "hello", "ciao"),
                  new Instant(1000),
                  firstWindow))));
  testHarness.processElement2(
      new StreamRecord<>(
          new RawUnionValue(
              2,
              valuesInWindow(
                  PCollectionViewTesting.materializeValuesFor(
                      view2.getPipeline().getOptions(), View.asIterable(), "foo", "bar"),
                  new Instant(1000),
                  secondWindow))));
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(helloElement, worldElement));
  testHarness.close();
}
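The keyed variant partitions elements by an encoded key: as the lambda above shows, FlinkKeyUtils.encodeKey serializes the key with its coder into a ByteBuffer so Flink can group on stable bytes rather than on Java object identity. A small usage sketch (the (key, coder) signature is taken from the lambda above; the equality claim relies on ByteBuffer comparing contents):

// Sketch: equal strings encode to equal ByteBuffers, so they land in the
// same key group regardless of object identity.
ByteBuffer key1 = FlinkKeyUtils.encodeKey("Hello", StringUtf8Coder.of());
ByteBuffer key2 = FlinkKeyUtils.encodeKey("Hello", StringUtf8Coder.of());
// key1.equals(key2) holds, since ByteBuffer equality compares remaining bytes.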
Use of org.apache.beam.sdk.transforms.join.RawUnionValue in project beam by apache.
The class FlinkBatchPortablePipelineTranslator, method translateExecutableStage.
private static <InputT> void translateExecutableStage(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  // TODO: Fail on splittable DoFns.
  // TODO: Special-case single outputs to avoid multiplexing PCollections.
  RunnerApi.Components components = pipeline.getComponents();
  Map<String, String> outputs = transform.getTransform().getOutputsMap();
  // Mapping from PCollection id to coder tag id.
  BiMap<String, Integer> outputMap = createOutputMap(outputs.values());
  // Collect all output Coders and create a UnionCoder for our tagged outputs.
  List<Coder<?>> unionCoders = Lists.newArrayList();
  // Enforce tuple tag sorting by union tag index.
  Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
  for (String collectionId : new TreeMap<>(outputMap.inverse()).values()) {
    PCollectionNode collectionNode =
        PipelineNode.pCollection(collectionId, components.getPcollectionsOrThrow(collectionId));
    Coder<WindowedValue<?>> coder;
    try {
      coder = (Coder) WireCoders.instantiateRunnerWireCoder(collectionNode, components);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    outputCoders.put(collectionId, coder);
    unionCoders.add(coder);
  }
  UnionCoder unionCoder = UnionCoder.of(unionCoders);
  TypeInformation<RawUnionValue> typeInformation =
      new CoderTypeInformation<>(unionCoder, context.getPipelineOptions());
  RunnerApi.ExecutableStagePayload stagePayload;
  try {
    stagePayload =
        RunnerApi.ExecutableStagePayload.parseFrom(
            transform.getTransform().getSpec().getPayload());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  String inputPCollectionId = stagePayload.getInput();
  Coder<WindowedValue<InputT>> windowedInputCoder =
      instantiateCoder(inputPCollectionId, components);
  DataSet<WindowedValue<InputT>> inputDataSet = context.getDataSetOrThrow(inputPCollectionId);
  final FlinkExecutableStageFunction<InputT> function =
      new FlinkExecutableStageFunction<>(
          transform.getTransform().getUniqueName(),
          context.getPipelineOptions(),
          stagePayload,
          context.getJobInfo(),
          outputMap,
          FlinkExecutableStageContextFactory.getInstance(),
          getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder(),
          windowedInputCoder);
  final String operatorName = generateNameFromStagePayload(stagePayload);
  final SingleInputUdfOperator taggedDataset;
  if (stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0) {
    Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
    // Stateful stages are only allowed for KV inputs so that we can group on the key.
    if (!(valueCoder instanceof KvCoder)) {
      throw new IllegalStateException(
          String.format(
              Locale.ENGLISH,
              "The element coder for stateful DoFn '%s' must be KvCoder but is: %s",
              inputPCollectionId,
              valueCoder.getClass().getSimpleName()));
    }
    Coder keyCoder = ((KvCoder) valueCoder).getKeyCoder();
    Grouping<WindowedValue<InputT>> groupedInput =
        inputDataSet.groupBy(new KvKeySelector<>(keyCoder));
    boolean requiresTimeSortedInput = requiresTimeSortedInput(stagePayload, false);
    if (requiresTimeSortedInput) {
      groupedInput =
          ((UnsortedGrouping<WindowedValue<InputT>>) groupedInput)
              .sortGroup(WindowedValue::getTimestamp, Order.ASCENDING);
    }
    taggedDataset =
        new GroupReduceOperator<>(groupedInput, typeInformation, function, operatorName);
  } else {
    taggedDataset =
        new MapPartitionOperator<>(inputDataSet, typeInformation, function, operatorName);
  }
  for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
    String collectionId =
        stagePayload
            .getComponents()
            .getTransformsOrThrow(sideInputId.getTransformId())
            .getInputsOrThrow(sideInputId.getLocalName());
    // Register under the global PCollection name. Only ExecutableStageFunction needs to know the
    // mapping from local name to global name and how to translate the broadcast data to a state
    // API view.
    taggedDataset.withBroadcastSet(context.getDataSetOrThrow(collectionId), collectionId);
  }
  for (String collectionId : outputs.values()) {
    pruneOutput(
        taggedDataset,
        context,
        outputMap.get(collectionId),
        outputCoders.get(collectionId),
        collectionId);
  }
  if (outputs.isEmpty()) {
    // NOTE: After pipeline translation, we traverse the set of unconsumed PCollections and add a
    // no-op sink to each to make sure they are materialized by Flink. However, some SDK-executed
    // stages have no runner-visible output after fusion. We handle this case by adding a sink
    // here.
    taggedDataset.output(new DiscardingOutputFormat<>()).name("DiscardingOutput");
  }
}
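The UnionCoder assembled here multiplexes all of the stage's tagged outputs over one dataset: it encodes the union tag and then delegates to the component coder at that index, which is why the translator sorts the coders by union tag above. A self-contained sketch of the round trip, with StringUtf8Coder and VarIntCoder standing in for the runner wire coders (both CoderUtils calls throw CoderException):

// Sketch: union tag 1 selects the second component coder (VarIntCoder).
UnionCoder unionCoder =
    UnionCoder.of(ImmutableList.<Coder<?>>of(StringUtf8Coder.of(), VarIntCoder.of()));
byte[] bytes = CoderUtils.encodeToByteArray(unionCoder, new RawUnionValue(1, 42));
RawUnionValue restored = CoderUtils.decodeFromByteArray(unionCoder, bytes);
// restored.getUnionTag() == 1 and restored.getValue() equals 42.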