Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
The class FlinkStreamingPortablePipelineTranslator, method translateExecutableStage.
private <InputT, OutputT> void translateExecutableStage(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
// TODO: Fail on splittable DoFns.
// TODO: Special-case single outputs to avoid multiplexing PCollections.
RunnerApi.Components components = pipeline.getComponents();
RunnerApi.PTransform transform = components.getTransformsOrThrow(id);
Map<String, String> outputs = transform.getOutputsMap();
final RunnerApi.ExecutableStagePayload stagePayload;
try {
stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getSpec().getPayload());
} catch (IOException e) {
throw new RuntimeException(e);
}
String inputPCollectionId = stagePayload.getInput();
final TransformedSideInputs transformedSideInputs;
if (stagePayload.getSideInputsCount() > 0) {
transformedSideInputs = transformSideInputs(stagePayload, components, context);
} else {
transformedSideInputs = new TransformedSideInputs(Collections.emptyMap(), null);
}
Map<TupleTag<?>, OutputTag<WindowedValue<?>>> tagsToOutputTags = Maps.newLinkedHashMap();
Map<TupleTag<?>, Coder<WindowedValue<?>>> tagsToCoders = Maps.newLinkedHashMap();
// TODO: does it matter which output we designate as "main"?
final TupleTag<OutputT> mainOutputTag = outputs.isEmpty() ? null : new TupleTag(outputs.keySet().iterator().next());
// Associate output tags with ids; the output manager uses these Integer ids to serialize state.
BiMap<String, Integer> outputIndexMap = createOutputMap(outputs.keySet());
Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
Map<TupleTag<?>, Integer> tagsToIds = Maps.newHashMap();
Map<String, TupleTag<?>> collectionIdToTupleTag = Maps.newHashMap();
// order output names for deterministic mapping
for (String localOutputName : new TreeMap<>(outputIndexMap).keySet()) {
String collectionId = outputs.get(localOutputName);
Coder<WindowedValue<?>> windowCoder = (Coder) instantiateCoder(collectionId, components);
outputCoders.put(localOutputName, windowCoder);
TupleTag<?> tupleTag = new TupleTag<>(localOutputName);
CoderTypeInformation<WindowedValue<?>> typeInformation = new CoderTypeInformation(windowCoder, context.getPipelineOptions());
tagsToOutputTags.put(tupleTag, new OutputTag<>(localOutputName, typeInformation));
tagsToCoders.put(tupleTag, windowCoder);
tagsToIds.put(tupleTag, outputIndexMap.get(localOutputName));
collectionIdToTupleTag.put(collectionId, tupleTag);
}
final SingleOutputStreamOperator<WindowedValue<OutputT>> outputStream;
DataStream<WindowedValue<InputT>> inputDataStream = context.getDataStreamOrThrow(inputPCollectionId);
CoderTypeInformation<WindowedValue<OutputT>> outputTypeInformation =
    !outputs.isEmpty()
        ? new CoderTypeInformation(outputCoders.get(mainOutputTag.getId()), context.getPipelineOptions())
        : null;
ArrayList<TupleTag<?>> additionalOutputTags = Lists.newArrayList();
for (TupleTag<?> tupleTag : tagsToCoders.keySet()) {
if (!mainOutputTag.getId().equals(tupleTag.getId())) {
additionalOutputTags.add(tupleTag);
}
}
final Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
final boolean stateful = stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0;
final boolean hasSdfProcessFn =
    stagePayload.getComponents().getTransformsMap().values().stream()
        .anyMatch(pTransform -> pTransform.getSpec().getUrn()
            .equals(PTransformTranslation.SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN));
Coder keyCoder = null;
KeySelector<WindowedValue<InputT>, ?> keySelector = null;
if (stateful || hasSdfProcessFn) {
// Stateful/SDF stages are only allowed for KV inputs.
Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
if (!(valueCoder instanceof KvCoder)) {
throw new IllegalStateException(
    String.format(
        Locale.ENGLISH,
        "The element coder for stateful DoFn '%s' must be KvCoder but is: %s",
        inputPCollectionId,
        valueCoder.getClass().getSimpleName()));
}
if (stateful) {
keyCoder = ((KvCoder) valueCoder).getKeyCoder();
keySelector = new KvToByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
} else {
// For an SDF, the input element is expected to be KV<KV<element, restriction>, size>;
// we use the element as the key.
if (!(((KvCoder) valueCoder).getKeyCoder() instanceof KvCoder)) {
throw new IllegalStateException(
    String.format(
        Locale.ENGLISH,
        "The element coder for splittable DoFn '%s' must be KVCoder(KvCoder, DoubleCoder) but is: %s",
        inputPCollectionId,
        valueCoder.getClass().getSimpleName()));
}
keyCoder = ((KvCoder) ((KvCoder) valueCoder).getKeyCoder()).getKeyCoder();
keySelector = new SdfByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
}
inputDataStream = inputDataStream.keyBy(keySelector);
}
DoFnOperator.MultiOutputOutputManagerFactory<OutputT> outputManagerFactory =
    new DoFnOperator.MultiOutputOutputManagerFactory<>(
        mainOutputTag,
        tagsToOutputTags,
        tagsToCoders,
        tagsToIds,
        new SerializablePipelineOptions(context.getPipelineOptions()));
DoFnOperator<InputT, OutputT> doFnOperator =
    new ExecutableStageDoFnOperator<>(
        transform.getUniqueName(),
        windowedInputCoder,
        Collections.emptyMap(),
        mainOutputTag,
        additionalOutputTags,
        outputManagerFactory,
        transformedSideInputs.unionTagToView,
        new ArrayList<>(transformedSideInputs.unionTagToView.values()),
        getSideInputIdToPCollectionViewMap(stagePayload, components),
        context.getPipelineOptions(),
        stagePayload,
        context.getJobInfo(),
        FlinkExecutableStageContextFactory.getInstance(),
        collectionIdToTupleTag,
        getWindowingStrategy(inputPCollectionId, components),
        keyCoder,
        keySelector);
final String operatorName = generateNameFromStagePayload(stagePayload);
if (transformedSideInputs.unionTagToView.isEmpty()) {
outputStream = inputDataStream.transform(operatorName, outputTypeInformation, doFnOperator);
} else {
DataStream<RawUnionValue> sideInputStream = transformedSideInputs.unionedSideInputs.broadcast();
if (stateful || hasSdfProcessFn) {
// We have to manually construct the two-input transform because we're not
// allowed to have only one input keyed, normally. Since Flink 1.5.0 it's
// possible to use the Broadcast State Pattern which provides a more elegant
// way to process keyed main input with broadcast state, but it's not feasible
// here because it breaks the DoFnOperator abstraction.
TwoInputTransformation<WindowedValue<KV<?, InputT>>, RawUnionValue, WindowedValue<OutputT>> rawFlinkTransform =
    new TwoInputTransformation(
        inputDataStream.getTransformation(),
        sideInputStream.getTransformation(),
        transform.getUniqueName(),
        doFnOperator,
        outputTypeInformation,
        inputDataStream.getParallelism());
rawFlinkTransform.setStateKeyType(((KeyedStream) inputDataStream).getKeyType());
rawFlinkTransform.setStateKeySelectors(((KeyedStream) inputDataStream).getKeySelector(), null);
// we have to cheat around the ctor being protected
outputStream =
    new SingleOutputStreamOperator(inputDataStream.getExecutionEnvironment(), rawFlinkTransform) {};
} else {
outputStream = inputDataStream.connect(sideInputStream).transform(operatorName, outputTypeInformation, doFnOperator);
}
}
// Assign a unique but consistent id to re-map operator state
outputStream.uid(transform.getUniqueName());
if (mainOutputTag != null) {
context.addDataStream(outputs.get(mainOutputTag.getId()), outputStream);
}
for (TupleTag<?> tupleTag : additionalOutputTags) {
context.addDataStream(outputs.get(tupleTag.getId()), outputStream.getSideOutput(tagsToOutputTags.get(tupleTag)));
}
}
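For orientation, the translator above leans on the standard Beam pattern of one main TupleTag plus a TupleTagList of additional outputs. The following minimal, self-contained sketch shows that pattern at the SDK level; the pipeline, tag names, and values are illustrative, not taken from the translator.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class TupleTagBasics {
  // Anonymous subclasses let Beam recover the element type at runtime.
  static final TupleTag<Integer> EVENS = new TupleTag<Integer>() {};
  static final TupleTag<Integer> ODDS = new TupleTag<Integer>() {};

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    PCollectionTuple results =
        p.apply(Create.of(1, 2, 3, 4))
            .apply(
                ParDo.of(
                        new DoFn<Integer, Integer>() {
                          @ProcessElement
                          public void process(ProcessContext c) {
                            if (c.element() % 2 == 0) {
                              c.output(c.element()); // main output (EVENS)
                            } else {
                              c.output(ODDS, c.element()); // additional output
                            }
                          }
                        })
                    .withOutputTags(EVENS, TupleTagList.of(ODDS)));
    PCollection<Integer> evens = results.get(EVENS);
    PCollection<Integer> odds = results.get(ODDS);
    p.run().waitUntilFinish();
  }
}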
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
The class FlinkStatefulDoFnFunction, method reduce.
@Override
public void reduce(Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<RawUnionValue>> out) throws Exception {
RuntimeContext runtimeContext = getRuntimeContext();
DoFnRunners.OutputManager outputManager;
if (outputMap.size() == 1) {
outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
} else {
// the DoFn has additional outputs, so use the multiplexing output manager
outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager(out, outputMap);
}
final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
// Get the first value; we need it to initialize the state internals with the key.
// We are guaranteed a first value, otherwise reduce() would not have been called.
WindowedValue<KV<K, V>> currentValue = iterator.next();
final K key = currentValue.getValue().getKey();
final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
// In batch mode, all the data for this key is available. We can't use the timer
// manager from the context because it doesn't exist, so we create one and advance
// time to the end after processing all elements.
final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
timerInternals.advanceProcessingTime(Instant.now());
timerInternals.advanceSynchronizedProcessingTime(Instant.now());
final Set<BoundedWindow> windowsSeen = new HashSet<>();
List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
DoFnRunner<KV<K, V>, OutputT> doFnRunner =
    DoFnRunners.simpleRunner(
        serializedOptions.get(),
        dofn,
        new FlinkSideInputReader(sideInputs, runtimeContext),
        outputManager,
        mainOutputTag,
        additionalOutputTags,
        new FlinkNoOpStepContext() {
@Override
public StateInternals stateInternals() {
return stateInternals;
}
@Override
public TimerInternals timerInternals() {
return timerInternals;
}
}, inputCoder, outputCoderMap, windowingStrategy, doFnSchemaInformation, sideInputMapping);
FlinkPipelineOptions pipelineOptions = serializedOptions.get().as(FlinkPipelineOptions.class);
if (!pipelineOptions.getDisableMetrics()) {
doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, metricContainer);
}
doFnRunner.startBundle();
doFnRunner.processElement(currentValue);
if (usesOnWindowExpiration) {
windowsSeen.addAll(currentValue.getWindows());
}
while (iterator.hasNext()) {
currentValue = iterator.next();
if (usesOnWindowExpiration) {
windowsSeen.addAll(currentValue.getWindows());
}
doFnRunner.processElement(currentValue);
}
// Finish any pending windows by advancing the input watermark to infinity.
timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
// Finally, advance the processing time to infinity to fire any timers.
timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
fireEligibleTimers(key, timerInternals, doFnRunner);
if (usesOnWindowExpiration) {
for (BoundedWindow window : windowsSeen) {
doFnRunner.onWindowExpiration(window, window.maxTimestamp().minus(Duration.millis(1)), key);
}
}
doFnRunner.finishBundle();
}
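The method above delegates to fireEligibleTimers, which is not shown on this page. Below is a hedged sketch of what such a drain loop plausibly looks like, assuming the runners-core InMemoryTimerInternals API (removeNextEventTimer and friends) and the DoFnRunner.onTimer signature that appears later on this page; the helper fire is illustrative.
// Assumed imports: org.apache.beam.runners.core.{DoFnRunner, InMemoryTimerInternals,
// StateNamespaces, TimerInternals} and org.apache.beam.sdk.transforms.windowing.BoundedWindow.
private void fireEligibleTimers(K key, InMemoryTimerInternals timerInternals, DoFnRunner<KV<K, V>, OutputT> runner) {
  TimerInternals.TimerData timer;
  // Drain each time domain; every timer is eligible once the watermarks sit at the end of time.
  while ((timer = timerInternals.removeNextEventTimer()) != null) {
    fire(key, timer, runner);
  }
  while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
    fire(key, timer, runner);
  }
  while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
    fire(key, timer, runner);
  }
}

private void fire(K key, TimerInternals.TimerData timer, DoFnRunner<KV<K, V>, OutputT> runner) {
  // Timers set by a windowed DoFn live in a WindowNamespace, from which the window is recovered.
  BoundedWindow window = ((StateNamespaces.WindowNamespace) timer.getNamespace()).getWindow();
  runner.onTimer(timer.getTimerId(), timer.getTimerFamilyId(), key, window, timer.getTimestamp(), timer.getOutputTimestamp(), timer.getDomain());
}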
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
The class FlinkStreamingTransformTranslators, method transformSideInputs.
private static Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>> transformSideInputs(Collection<PCollectionView<?>> sideInputs, FlinkStreamingTranslationContext context) {
// collect all side inputs
Map<TupleTag<?>, Integer> tagToIntMapping = new HashMap<>();
Map<Integer, PCollectionView<?>> intToViewMapping = new HashMap<>();
int count = 0;
for (PCollectionView<?> sideInput : sideInputs) {
TupleTag<?> tag = sideInput.getTagInternal();
intToViewMapping.put(count, sideInput);
tagToIntMapping.put(tag, count);
count++;
}
List<Coder<?>> inputCoders = new ArrayList<>();
for (PCollectionView<?> sideInput : sideInputs) {
DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
TypeInformation<Object> tpe = sideInputStream.getType();
if (!(tpe instanceof CoderTypeInformation)) {
throw new IllegalStateException("Input Stream TypeInformation is no CoderTypeInformation.");
}
Coder<?> coder = ((CoderTypeInformation) tpe).getCoder();
inputCoders.add(coder);
}
UnionCoder unionCoder = UnionCoder.of(inputCoders);
CoderTypeInformation<RawUnionValue> unionTypeInformation = new CoderTypeInformation<>(unionCoder, context.getPipelineOptions());
// transform each side input to RawUnionValue and union them
DataStream<RawUnionValue> sideInputUnion = null;
for (PCollectionView<?> sideInput : sideInputs) {
TupleTag<?> tag = sideInput.getTagInternal();
final int intTag = tagToIntMapping.get(tag);
DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
DataStream<RawUnionValue> unionValueStream = sideInputStream.map(new ToRawUnion<>(intTag, context.getPipelineOptions())).returns(unionTypeInformation);
if (sideInputUnion == null) {
sideInputUnion = unionValueStream;
} else {
sideInputUnion = sideInputUnion.union(unionValueStream);
}
}
if (sideInputUnion == null) {
throw new IllegalStateException("No unioned side inputs, this indicates a bug.");
}
return new Tuple2<>(intToViewMapping, sideInputUnion);
}
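transformSideInputs assigns every side input's TupleTag a dense integer id and multiplexes all side-input streams into a single DataStream<RawUnionValue>. The sketch below illustrates just the tagging scheme, using plain Java collections instead of Flink streams; the values and the two hard-coded union tags are made up.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.beam.sdk.transforms.join.RawUnionValue;

public class UnionTagDemo {
  public static void main(String[] args) {
    // Tag each logical input with the integer its TupleTag was assigned (cf. tagToIntMapping).
    List<RawUnionValue> union =
        Stream.concat(
                Arrays.asList("a", "b").stream().map(v -> new RawUnionValue(0, v)),
                Arrays.asList(1, 2).stream().map(v -> new RawUnionValue(1, v)))
            .collect(Collectors.toList());
    // Downstream, the union tag tells the consumer which side input a value belongs to.
    for (RawUnionValue value : union) {
      System.out.println(value.getUnionTag() + " -> " + value.getValue());
    }
  }
}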
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
The class SplittableParDoProcessKeyedElementsOp, method open.
@Override
public void open(Config config, Context context, Scheduler<KeyedTimerData<byte[]>> timerRegistry, OpEmitter<RawUnionValue> emitter) {
this.pipelineOptions =
    Base64Serializer.deserializeUnchecked(config.get("beamPipelineOptions"), SerializablePipelineOptions.class)
        .get()
        .as(SamzaPipelineOptions.class);
final SamzaStoreStateInternals.Factory<?> nonKeyedStateInternalsFactory =
    SamzaStoreStateInternals.createNonKeyedStateInternalsFactory(transformId, context.getTaskContext(), pipelineOptions);
final DoFnRunners.OutputManager outputManager = outputManagerFactory.create(emitter);
this.stateInternalsFactory =
    new SamzaStoreStateInternals.Factory<>(
        transformId,
        Collections.singletonMap(SamzaStoreStateInternals.BEAM_STORE, SamzaStoreStateInternals.getBeamStore(context.getTaskContext())),
        ByteArrayCoder.of(),
        pipelineOptions.getStoreBatchGetSize());
this.timerInternalsFactory =
    SamzaTimerInternalsFactory.createTimerInternalFactory(
        ByteArrayCoder.of(),
        timerRegistry,
        TIMER_STATE_ID,
        nonKeyedStateInternalsFactory,
        windowingStrategy,
        isBounded,
        pipelineOptions);
final KeyedInternals<byte[]> keyedInternals = new KeyedInternals<>(stateInternalsFactory, timerInternalsFactory);
SplittableParDoViaKeyedWorkItems.ProcessFn<InputT, OutputT, RestrictionT, PositionT, WatermarkEstimatorStateT> processFn = processElements.newProcessFn(processElements.getFn());
DoFnInvokers.tryInvokeSetupFor(processFn, pipelineOptions);
processFn.setStateInternalsFactory(stateInternalsFactory);
processFn.setTimerInternalsFactory(timerInternalsFactory);
processFn.setSideInputReader(NullSideInputReader.empty());
processFn.setProcessElementInvoker(
    new OutputAndTimeBoundedSplittableProcessElementInvoker<>(
        processElements.getFn(),
        pipelineOptions,
        new OutputWindowedValue<OutputT>() {
@Override
public void outputWindowedValue(OutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
outputWindowedValue(mainOutputTag, output, timestamp, windows, pane);
}
@Override
public <AdditionalOutputT> void outputWindowedValue(TupleTag<AdditionalOutputT> tag, AdditionalOutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
}
},
NullSideInputReader.empty(),
Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()),
10000,
Duration.standardSeconds(10),
() -> {
throw new UnsupportedOperationException("BundleFinalizer unsupported in Samza");
}));
final StepContext stepContext = new StepContext() {
@Override
public StateInternals stateInternals() {
return keyedInternals.stateInternals();
}
@Override
public TimerInternals timerInternals() {
return keyedInternals.timerInternals();
}
};
this.fnRunner =
    DoFnRunners.simpleRunner(
        pipelineOptions,
        processFn,
        NullSideInputReader.of(Collections.emptyList()),
        outputManager,
        mainOutputTag,
        Collections.emptyList(),
        stepContext,
        null,
        Collections.emptyMap(),
        windowingStrategy,
        DoFnSchemaInformation.create(),
        Collections.emptyMap());
}
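outputManagerFactory.create(emitter) is opaque here. As a rough sketch of what such an output manager plausibly does, the class below routes TupleTag-addressed outputs to integer union tags; tagToIndex and downstream are illustrative stand-ins, not Samza runner API.
import java.util.Map;
import java.util.function.Consumer;
import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.sdk.transforms.join.RawUnionValue;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;

// Hedged sketch: translate TupleTag-addressed outputs into RawUnionValue elements.
class UnionOutputManager implements DoFnRunners.OutputManager {
  private final Map<TupleTag<?>, Integer> tagToIndex; // hypothetical tag-to-union-tag map
  private final Consumer<WindowedValue<RawUnionValue>> downstream; // stand-in for the emitter

  UnionOutputManager(Map<TupleTag<?>, Integer> tagToIndex, Consumer<WindowedValue<RawUnionValue>> downstream) {
    this.tagToIndex = tagToIndex;
    this.downstream = downstream;
  }

  @Override
  public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
    int unionTag = tagToIndex.get(tag);
    // Re-wrap the payload, preserving timestamp, windows, and pane.
    downstream.accept(output.withValue(new RawUnionValue(unionTag, output.getValue())));
  }
}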
Use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
The class DoFnOperatorTest, method testWatermarkUpdateAfterWatermarkHoldRelease.
@Test
public void testWatermarkUpdateAfterWatermarkHoldRelease() throws Exception {
Coder<WindowedValue<KV<String, String>>> coder = WindowedValue.getValueOnlyCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
TupleTag<KV<String, String>> outputTag = new TupleTag<>("main-output");
List<Long> emittedWatermarkHolds = new ArrayList<>();
KeySelector<WindowedValue<KV<String, String>>, ByteBuffer> keySelector = e -> FlinkKeyUtils.encodeKey(e.getValue().getKey(), StringUtf8Coder.of());
DoFnOperator<KV<String, String>, KV<String, String>> doFnOperator =
    new DoFnOperator<KV<String, String>, KV<String, String>>(
        new IdentityDoFn<>(),
        "stepName",
        coder,
        Collections.emptyMap(),
        outputTag,
        Collections.emptyList(),
        new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, coder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
        WindowingStrategy.globalDefault(),
        new HashMap<>(), /* side-input mapping */
        Collections.emptyList(), /* side inputs */
        FlinkPipelineOptions.defaults(),
        StringUtf8Coder.of(),
        keySelector,
        DoFnSchemaInformation.create(),
        Collections.emptyMap()) {
@Override
protected DoFnRunner<KV<String, String>, KV<String, String>> createWrappingDoFnRunner(DoFnRunner<KV<String, String>, KV<String, String>> wrappedRunner, StepContext stepContext) {
StateNamespace namespace = StateNamespaces.window(GlobalWindow.Coder.INSTANCE, GlobalWindow.INSTANCE);
StateTag<WatermarkHoldState> holdTag = StateTags.watermarkStateInternal("hold", TimestampCombiner.LATEST);
WatermarkHoldState holdState = stepContext.stateInternals().state(namespace, holdTag);
TimerInternals timerInternals = stepContext.timerInternals();
return new DoFnRunner<KV<String, String>, KV<String, String>>() {
@Override
public void startBundle() {
wrappedRunner.startBundle();
}
@Override
public void processElement(WindowedValue<KV<String, String>> elem) {
wrappedRunner.processElement(elem);
holdState.add(elem.getTimestamp());
timerInternals.setTimer(
    namespace,
    "timer",
    "family",
    elem.getTimestamp().plus(Duration.millis(1)),
    elem.getTimestamp().plus(Duration.millis(1)),
    TimeDomain.EVENT_TIME);
timerInternals.setTimer(
    namespace,
    "cleanup",
    "",
    GlobalWindow.INSTANCE.maxTimestamp(),
    GlobalWindow.INSTANCE.maxTimestamp(),
    TimeDomain.EVENT_TIME);
}
@Override
public <KeyT> void onTimer(String timerId, String timerFamilyId, KeyT key, BoundedWindow window, Instant timestamp, Instant outputTimestamp, TimeDomain timeDomain) {
if ("cleanup".equals(timerId)) {
holdState.clear();
} else {
holdState.add(outputTimestamp);
}
}
@Override
public void finishBundle() {
wrappedRunner.finishBundle();
}
@Override
public <KeyT> void onWindowExpiration(BoundedWindow window, Instant timestamp, KeyT key) {
wrappedRunner.onWindowExpiration(window, timestamp, key);
}
@Override
public DoFn<KV<String, String>, KV<String, String>> getFn() {
return doFn;
}
};
}
@Override
void emitWatermarkIfHoldChanged(long currentWatermarkHold) {
emittedWatermarkHolds.add(keyedStateInternals.minWatermarkHoldMs());
}
};
OneInputStreamOperatorTestHarness<WindowedValue<KV<String, String>>, WindowedValue<KV<String, String>>> testHarness =
    new KeyedOneInputStreamOperatorTestHarness<>(
        doFnOperator,
        keySelector,
        new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
testHarness.setup();
Instant now = Instant.now();
testHarness.open();
// process first element, set hold to `now', set up a timer for `now + 1'
testHarness.processElement(new StreamRecord<>(WindowedValue.timestampedValueInGlobalWindow(KV.of("Key", "Hello"), now)));
assertThat(emittedWatermarkHolds, is(equalTo(Collections.singletonList(now.getMillis()))));
// fire timer, change hold to `now + 1' (the timer's output timestamp)
testHarness.processWatermark(now.getMillis() + 2);
assertThat(emittedWatermarkHolds, is(equalTo(Arrays.asList(now.getMillis(), now.getMillis() + 1))));
// process second element, verify we emitted changed hold
testHarness.processElement(new StreamRecord<>(WindowedValue.timestampedValueInGlobalWindow(KV.of("Key", "Hello"), now.plus(Duration.millis(2)))));
assertThat(emittedWatermarkHolds, is(equalTo(Arrays.asList(now.getMillis(), now.getMillis() + 1, now.getMillis() + 2))));
testHarness.processWatermark(GlobalWindow.INSTANCE.maxTimestamp().plus(Duration.millis(1)).getMillis());
testHarness.processWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis());
testHarness.close();
}
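One detail both this test and the translators above depend on: a TupleTag's string id is its identity, so independently constructed tags with the same id address the same output, which is what lets maps like tagsToOutputTags be keyed by tag. A quick illustration:
import org.apache.beam.sdk.values.TupleTag;

public class TupleTagIdentity {
  public static void main(String[] args) {
    TupleTag<String> a = new TupleTag<>("main-output");
    TupleTag<Integer> b = new TupleTag<>("main-output");
    System.out.println(a.getId()); // main-output
    System.out.println(a.equals(b)); // true: equality is by id, not element type
  }
}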