Use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.
The class ExecutableStageDoFnOperatorTest, method testWatermarkHandling.
@Test
public void testWatermarkHandling() throws Exception {
TupleTag<Integer> mainOutput = new TupleTag<>("main-output");
DoFnOperator.MultiOutputOutputManagerFactory<Integer> outputManagerFactory =
    new DoFnOperator.MultiOutputOutputManagerFactory(
        mainOutput, VoidCoder.of(), new SerializablePipelineOptions(FlinkPipelineOptions.defaults()));
ExecutableStageDoFnOperator<KV<String, Integer>, Integer> operator =
    getOperator(
        mainOutput,
        Collections.emptyList(),
        outputManagerFactory,
        WindowingStrategy.of(FixedWindows.of(Duration.millis(10))),
        StringUtf8Coder.of(),
        WindowedValue.getFullCoder(
            KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), IntervalWindow.getCoder()));
KeyedOneInputStreamOperatorTestHarness<String, WindowedValue<KV<String, Integer>>, WindowedValue<Integer>> testHarness =
    new KeyedOneInputStreamOperatorTestHarness<>(
        operator,
        val -> val.getValue().getKey(),
        new CoderTypeInformation<>(StringUtf8Coder.of(), FlinkPipelineOptions.defaults()));
RemoteBundle bundle = Mockito.mock(RemoteBundle.class);
when(bundle.getInputReceivers())
    .thenReturn(
        ImmutableMap.<String, FnDataReceiver<WindowedValue>>builder()
            .put("input", Mockito.mock(FnDataReceiver.class))
            .build());
when(bundle.getTimerReceivers())
    .thenReturn(
        ImmutableMap.<KV<String, String>, FnDataReceiver<WindowedValue>>builder()
            .put(KV.of("transform", "timer"), Mockito.mock(FnDataReceiver.class))
            .put(KV.of("transform", "timer2"), Mockito.mock(FnDataReceiver.class))
            .put(KV.of("transform", "timer3"), Mockito.mock(FnDataReceiver.class))
            .build());
when(stageBundleFactory.getBundle(any(), any(), any(), any(), any(), any())).thenReturn(bundle);
testHarness.open();
assertThat(operator.getCurrentOutputWatermark(), is(BoundedWindow.TIMESTAMP_MIN_VALUE.getMillis()));
// No bundle has been started, watermark can be freely advanced
testHarness.processWatermark(0);
assertThat(operator.getCurrentOutputWatermark(), is(0L));
// Trigger a new bundle
IntervalWindow intervalWindow = new IntervalWindow(new Instant(0), new Instant(9));
WindowedValue<KV<String, Integer>> windowedValue =
    WindowedValue.of(KV.of("one", 1), Instant.now(), intervalWindow, PaneInfo.NO_FIRING);
testHarness.processElement(new StreamRecord<>(windowedValue));
// The output watermark should be held back during the bundle
testHarness.processWatermark(1);
assertThat(operator.getEffectiveInputWatermark(), is(1L));
assertThat(operator.getCurrentOutputWatermark(), is(0L));
// After the bundle has been finished, the watermark should be advanced
operator.invokeFinishBundle();
assertThat(operator.getCurrentOutputWatermark(), is(1L));
// Bundle finished, watermark can be freely advanced
testHarness.processWatermark(2);
assertThat(operator.getEffectiveInputWatermark(), is(2L));
assertThat(operator.getCurrentOutputWatermark(), is(2L));
// Trigger a new bundle
testHarness.processElement(new StreamRecord<>(windowedValue));
// cleanup timer
assertThat(testHarness.numEventTimeTimers(), is(1));
// Set a timer
Instant timerTarget = new Instant(5);
Instant timerTarget2 = new Instant(6);
operator.getLockToAcquireForStateAccessDuringBundles().lock();
BiConsumer<String, Instant> timerConsumer =
    (timerId, timestamp) ->
        operator.setTimer(
            Timer.of(
                windowedValue.getValue().getKey(),
                "",
                windowedValue.getWindows(),
                timestamp,
                timestamp,
                PaneInfo.NO_FIRING),
            TimerInternals.TimerData.of(
                "",
                TimerReceiverFactory.encodeToTimerDataTimerId("transform", timerId),
                StateNamespaces.window(IntervalWindow.getCoder(), intervalWindow),
                timestamp,
                timestamp,
                TimeDomain.EVENT_TIME));
timerConsumer.accept("timer", timerTarget);
timerConsumer.accept("timer2", timerTarget2);
assertThat(testHarness.numEventTimeTimers(), is(3));
// Advance input watermark past the timer
// Check the output watermark is held back
long targetWatermark = timerTarget.getMillis() + 100;
testHarness.processWatermark(targetWatermark);
// Do not yet advance the output watermark because we are still processing a bundle
assertThat(testHarness.numEventTimeTimers(), is(3));
assertThat(operator.getCurrentOutputWatermark(), is(2L));
// Check that the timers are fired but the output watermark is advanced no further than
// the minimum timer timestamp of the previous bundle because we are still processing a
// bundle which might contain more timers.
// Timers can create loops if they keep rescheduling themselves when firing
// Thus, we advance the watermark asynchronously to allow for checkpointing to run
operator.invokeFinishBundle();
assertThat(testHarness.numEventTimeTimers(), is(3));
testHarness.setProcessingTime(testHarness.getProcessingTime() + 1);
assertThat(testHarness.numEventTimeTimers(), is(0));
assertThat(operator.getCurrentOutputWatermark(), is(5L));
// Output watermark is advanced synchronously when the bundle finishes,
// no more timers are scheduled
operator.invokeFinishBundle();
assertThat(operator.getCurrentOutputWatermark(), is(targetWatermark));
assertThat(testHarness.numEventTimeTimers(), is(0));
// Watermark is advanced in a blocking fashion on close, not via timers
// Create a bundle with a pending timer to simulate that
testHarness.processElement(new StreamRecord<>(windowedValue));
timerConsumer.accept("timer3", new Instant(targetWatermark));
assertThat(testHarness.numEventTimeTimers(), is(1));
// This should be blocking until the watermark reaches Long.MAX_VALUE.
testHarness.close();
assertThat(testHarness.numEventTimeTimers(), is(0));
assertThat(operator.getCurrentOutputWatermark(), is(Long.MAX_VALUE));
}
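The harness above keys elements with a CoderTypeInformation built from StringUtf8Coder. A minimal standalone sketch of that wrapping (the class name is illustrative, not from the Beam sources):

import org.apache.beam.runners.flink.FlinkPipelineOptions;
import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.flink.api.common.typeinfo.TypeInformation;

public class CoderTypeInformationSketch {
  public static void main(String[] args) {
    // Wrap a Beam coder so Flink can treat it as element type information;
    // (de)serialization is delegated to the coder.
    TypeInformation<String> keyType =
        new CoderTypeInformation<>(StringUtf8Coder.of(), FlinkPipelineOptions.defaults());
    System.out.println(keyType);
  }
}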
Use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.
The class FlinkBatchPortablePipelineTranslator, method pruneOutput.
private static void pruneOutput(
    DataSet<RawUnionValue> taggedDataset,
    BatchTranslationContext context,
    int unionTag,
    Coder<WindowedValue<?>> outputCoder,
    String collectionId) {
TypeInformation<WindowedValue<?>> outputType =
    new CoderTypeInformation<>(outputCoder, context.getPipelineOptions());
FlinkExecutableStagePruningFunction pruningFunction =
    new FlinkExecutableStagePruningFunction(unionTag, context.getPipelineOptions());
FlatMapOperator<RawUnionValue, WindowedValue<?>> pruningOperator =
    new FlatMapOperator<>(
        taggedDataset, outputType, pruningFunction, String.format("ExtractOutput[%s]", unionTag));
context.addDataSet(collectionId, pruningOperator);
}
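pruneOutput shows the batch-side pattern: wrap the output coder in a CoderTypeInformation so the FlatMapOperator knows how to serialize its results. A hedged sketch of the same wrapping for a concrete coder (the class and method names are illustrative):

import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.flink.api.common.typeinfo.TypeInformation;

public class WindowedTypeInfoSketch {
  // Build TypeInformation for globally-windowed strings, mirroring pruneOutput.
  static TypeInformation<WindowedValue<String>> windowedStringType(PipelineOptions options) {
    Coder<WindowedValue<String>> coder =
        WindowedValue.getFullCoder(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE);
    return new CoderTypeInformation<>(coder, options);
  }
}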
Use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.
The class FlinkStreamingPortablePipelineTranslator, method translateFlatten.
private <T> void translateFlatten(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
Map<String, String> allInputs = transform.getInputsMap();
if (allInputs.isEmpty()) {
// create an empty dummy source to satisfy downstream operations
// we cannot create an empty source in Flink, therefore we have to
// add the flatMap that simply never forwards the single element
long shutdownAfterIdleSourcesMs = context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
DataStreamSource<WindowedValue<byte[]>> dummySource =
    context
        .getExecutionEnvironment()
        .addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
DataStream<WindowedValue<T>> result =
    dummySource
        .<WindowedValue<T>>flatMap(
            (s, collector) -> {
              // never return anything
            })
        .returns(
            new CoderTypeInformation<>(
                WindowedValue.getFullCoder((Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE),
                context.getPipelineOptions()));
context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
} else {
DataStream<T> result = null;
// Determine DataStreams that we use as input several times. For those, we need to uniquify
// input streams because Flink seems to swallow watermarks when we have a union of one and
// the same stream.
HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
for (String input : allInputs.values()) {
DataStream<T> current = context.getDataStreamOrThrow(input);
inputCounts.add(current, 1);
}
for (String input : allInputs.values()) {
DataStream<T> current = context.getDataStreamOrThrow(input);
final int timesRequired = inputCounts.count(current);
if (timesRequired > 1) {
current =
    current.flatMap(
        new FlatMapFunction<T, T>() {
          private static final long serialVersionUID = 1L;

          @Override
          public void flatMap(T t, Collector<T> collector) {
            collector.collect(t);
          }
        });
}
result = (result == null) ? current : result.union(current);
}
context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
}
}
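The duplicate-input handling above exists because Flink appears to swallow watermarks when a stream is unioned with itself; routing all but one copy through an identity flatMap makes the inputs distinct. A minimal sketch of that trick in isolation (names are illustrative):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.util.Collector;

public class SelfUnionSketch {
  // Pass one copy of the stream through an identity flatMap so the union
  // receives two distinct inputs instead of the same stream twice.
  static <T> DataStream<T> unionWithSelf(DataStream<T> stream) {
    SingleOutputStreamOperator<T> distinctCopy =
        stream
            .flatMap(
                new FlatMapFunction<T, T>() {
                  @Override
                  public void flatMap(T t, Collector<T> out) {
                    out.collect(t);
                  }
                })
            .returns(stream.getType()); // keep the original element type info
    return stream.union(distinctCopy);
  }
}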
Use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.
The class FlinkStreamingPortablePipelineTranslator, method translateStreamingImpulse.
private void translateStreamingImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);
TypeInformation<WindowedValue<byte[]>> typeInfo =
    new CoderTypeInformation<>(
        WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE),
        context.getPipelineOptions());
ObjectMapper objectMapper = new ObjectMapper();
final int intervalMillis;
final int messageCount;
try {
JsonNode config = objectMapper.readTree(pTransform.getSpec().getPayload().toByteArray());
intervalMillis = config.path("interval_ms").asInt(100);
messageCount = config.path("message_count").asInt(0);
} catch (IOException e) {
throw new RuntimeException("Failed to parse configuration for streaming impulse", e);
}
SingleOutputStreamOperator<WindowedValue<byte[]>> source =
    context
        .getExecutionEnvironment()
        .addSource(
            new StreamingImpulseSource(intervalMillis, messageCount),
            StreamingImpulseSource.class.getSimpleName())
        .returns(typeInfo);
context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
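The payload parsing above tolerates a missing or partial configuration by falling back to defaults (100 ms interval, unbounded message count). A self-contained sketch of that Jackson pattern (the JSON literal is illustrative):

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.nio.charset.StandardCharsets;

public class ImpulseConfigSketch {
  public static void main(String[] args) throws Exception {
    byte[] payload = "{\"interval_ms\": 250}".getBytes(StandardCharsets.UTF_8);
    JsonNode config = new ObjectMapper().readTree(payload);
    // path() never returns null, and asInt(default) supplies the fallback,
    // so absent fields silently take their defaults.
    int intervalMillis = config.path("interval_ms").asInt(100);
    int messageCount = config.path("message_count").asInt(0); // 0 = unbounded
    System.out.println(intervalMillis + " ms, count=" + messageCount);
  }
}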
Use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.
The class FlinkStreamingPortablePipelineTranslator, method translateExecutableStage.
private <InputT, OutputT> void translateExecutableStage(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
// TODO: Fail on splittable DoFns.
// TODO: Special-case single outputs to avoid multiplexing PCollections.
RunnerApi.Components components = pipeline.getComponents();
RunnerApi.PTransform transform = components.getTransformsOrThrow(id);
Map<String, String> outputs = transform.getOutputsMap();
final RunnerApi.ExecutableStagePayload stagePayload;
try {
stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getSpec().getPayload());
} catch (IOException e) {
throw new RuntimeException(e);
}
String inputPCollectionId = stagePayload.getInput();
final TransformedSideInputs transformedSideInputs;
if (stagePayload.getSideInputsCount() > 0) {
transformedSideInputs = transformSideInputs(stagePayload, components, context);
} else {
transformedSideInputs = new TransformedSideInputs(Collections.emptyMap(), null);
}
Map<TupleTag<?>, OutputTag<WindowedValue<?>>> tagsToOutputTags = Maps.newLinkedHashMap();
Map<TupleTag<?>, Coder<WindowedValue<?>>> tagsToCoders = Maps.newLinkedHashMap();
// TODO: does it matter which output we designate as "main"?
final TupleTag<OutputT> mainOutputTag =
    outputs.isEmpty() ? null : new TupleTag(outputs.keySet().iterator().next());
// associate output tags with ids, output manager uses these Integer ids to serialize state
BiMap<String, Integer> outputIndexMap = createOutputMap(outputs.keySet());
Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
Map<TupleTag<?>, Integer> tagsToIds = Maps.newHashMap();
Map<String, TupleTag<?>> collectionIdToTupleTag = Maps.newHashMap();
// order output names for deterministic mapping
for (String localOutputName : new TreeMap<>(outputIndexMap).keySet()) {
String collectionId = outputs.get(localOutputName);
Coder<WindowedValue<?>> windowCoder = (Coder) instantiateCoder(collectionId, components);
outputCoders.put(localOutputName, windowCoder);
TupleTag<?> tupleTag = new TupleTag<>(localOutputName);
CoderTypeInformation<WindowedValue<?>> typeInformation =
    new CoderTypeInformation(windowCoder, context.getPipelineOptions());
tagsToOutputTags.put(tupleTag, new OutputTag<>(localOutputName, typeInformation));
tagsToCoders.put(tupleTag, windowCoder);
tagsToIds.put(tupleTag, outputIndexMap.get(localOutputName));
collectionIdToTupleTag.put(collectionId, tupleTag);
}
final SingleOutputStreamOperator<WindowedValue<OutputT>> outputStream;
DataStream<WindowedValue<InputT>> inputDataStream = context.getDataStreamOrThrow(inputPCollectionId);
CoderTypeInformation<WindowedValue<OutputT>> outputTypeInformation =
    !outputs.isEmpty()
        ? new CoderTypeInformation(outputCoders.get(mainOutputTag.getId()), context.getPipelineOptions())
        : null;
ArrayList<TupleTag<?>> additionalOutputTags = Lists.newArrayList();
for (TupleTag<?> tupleTag : tagsToCoders.keySet()) {
if (!mainOutputTag.getId().equals(tupleTag.getId())) {
additionalOutputTags.add(tupleTag);
}
}
final Coder<WindowedValue<InputT>> windowedInputCoder =
    instantiateCoder(inputPCollectionId, components);
final boolean stateful =
    stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0;
final boolean hasSdfProcessFn =
    stagePayload.getComponents().getTransformsMap().values().stream()
        .anyMatch(
            pTransform ->
                pTransform
                    .getSpec()
                    .getUrn()
                    .equals(
                        PTransformTranslation
                            .SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN));
Coder keyCoder = null;
KeySelector<WindowedValue<InputT>, ?> keySelector = null;
if (stateful || hasSdfProcessFn) {
// Stateful/SDF stages are only allowed for KV inputs.
Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
if (!(valueCoder instanceof KvCoder)) {
throw new IllegalStateException(
    String.format(
        Locale.ENGLISH,
        "The element coder for stateful DoFn '%s' must be KvCoder but is: %s",
        inputPCollectionId,
        valueCoder.getClass().getSimpleName()));
}
if (stateful) {
keyCoder = ((KvCoder) valueCoder).getKeyCoder();
keySelector =
    new KvToByteBufferKeySelector(
        keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
} else {
// The input element is KV(KV(element, restriction), size); the element is used
// as the key.
if (!(((KvCoder) valueCoder).getKeyCoder() instanceof KvCoder)) {
throw new IllegalStateException(
    String.format(
        Locale.ENGLISH,
        "The element coder for splittable DoFn '%s' must be KVCoder(KvCoder, DoubleCoder) but is: %s",
        inputPCollectionId,
        valueCoder.getClass().getSimpleName()));
}
keyCoder = ((KvCoder) ((KvCoder) valueCoder).getKeyCoder()).getKeyCoder();
keySelector =
    new SdfByteBufferKeySelector(
        keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
}
inputDataStream = inputDataStream.keyBy(keySelector);
}
DoFnOperator.MultiOutputOutputManagerFactory<OutputT> outputManagerFactory =
    new DoFnOperator.MultiOutputOutputManagerFactory<>(
        mainOutputTag,
        tagsToOutputTags,
        tagsToCoders,
        tagsToIds,
        new SerializablePipelineOptions(context.getPipelineOptions()));
DoFnOperator<InputT, OutputT> doFnOperator =
    new ExecutableStageDoFnOperator<>(
        transform.getUniqueName(),
        windowedInputCoder,
        Collections.emptyMap(),
        mainOutputTag,
        additionalOutputTags,
        outputManagerFactory,
        transformedSideInputs.unionTagToView,
        new ArrayList<>(transformedSideInputs.unionTagToView.values()),
        getSideInputIdToPCollectionViewMap(stagePayload, components),
        context.getPipelineOptions(),
        stagePayload,
        context.getJobInfo(),
        FlinkExecutableStageContextFactory.getInstance(),
        collectionIdToTupleTag,
        getWindowingStrategy(inputPCollectionId, components),
        keyCoder,
        keySelector);
final String operatorName = generateNameFromStagePayload(stagePayload);
if (transformedSideInputs.unionTagToView.isEmpty()) {
outputStream = inputDataStream.transform(operatorName, outputTypeInformation, doFnOperator);
} else {
DataStream<RawUnionValue> sideInputStream = transformedSideInputs.unionedSideInputs.broadcast();
if (stateful || hasSdfProcessFn) {
// We have to manually construct the two-input transform because we're not
// allowed to have only one input keyed, normally. Since Flink 1.5.0 it's
// possible to use the Broadcast State Pattern which provides a more elegant
// way to process keyed main input with broadcast state, but it's not feasible
// here because it breaks the DoFnOperator abstraction.
TwoInputTransformation<WindowedValue<KV<?, InputT>>, RawUnionValue, WindowedValue<OutputT>> rawFlinkTransform =
    new TwoInputTransformation(
        inputDataStream.getTransformation(),
        sideInputStream.getTransformation(),
        transform.getUniqueName(),
        doFnOperator,
        outputTypeInformation,
        inputDataStream.getParallelism());
rawFlinkTransform.setStateKeyType(((KeyedStream) inputDataStream).getKeyType());
rawFlinkTransform.setStateKeySelectors(((KeyedStream) inputDataStream).getKeySelector(), null);
// we have to cheat around the ctor being protected
outputStream =
    new SingleOutputStreamOperator(
        inputDataStream.getExecutionEnvironment(), rawFlinkTransform) {};
} else {
outputStream = inputDataStream.connect(sideInputStream).transform(operatorName, outputTypeInformation, doFnOperator);
}
}
// Assign a unique but consistent id to re-map operator state
outputStream.uid(transform.getUniqueName());
if (mainOutputTag != null) {
context.addDataStream(outputs.get(mainOutputTag.getId()), outputStream);
}
for (TupleTag<?> tupleTag : additionalOutputTags) {
context.addDataStream(outputs.get(tupleTag.getId()), outputStream.getSideOutput(tagsToOutputTags.get(tupleTag)));
}
}
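The output bookkeeping above depends on createOutputMap (not shown here) producing the same name-to-id mapping on every worker. A hypothetical sketch of such a deterministic mapping, assuming sorted iteration is the only requirement:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class OutputMapSketch {
  // Hypothetical stand-in for createOutputMap: assign stable integer ids by
  // iterating local output names in sorted order, so all workers agree.
  static Map<String, Integer> createOutputMap(Set<String> localOutputNames) {
    Map<String, Integer> indexMap = new LinkedHashMap<>();
    int nextId = 0;
    for (String name : new TreeSet<>(localOutputNames)) {
      indexMap.put(name, nextId++);
    }
    return indexMap;
  }
}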