Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class DoFnOperator, method open().
@Override
public void open() throws Exception {
  // WindowDoFnOperator needs state and timers to get its DoFn, so it must wait
  // until StateInternals and TimerInternals are ready. This method is called
  // after initializeState().
  this.doFn = getDoFn();
  FlinkPipelineOptions options = serializedOptions.get().as(FlinkPipelineOptions.class);
  doFnInvoker = DoFnInvokers.tryInvokeSetupFor(doFn, options);
  StepContext stepContext = new FlinkStepContext();
  doFnRunner =
      DoFnRunners.simpleRunner(
          options, doFn, sideInputReader, outputManager, mainOutputTag, additionalOutputTags,
          stepContext, getInputCoder(), outputCoders, windowingStrategy, doFnSchemaInformation,
          sideInputMapping);
  if (requiresStableInput) {
    // Put this in front of the root FnRunner, before any additional wrappers.
    doFnRunner =
        bufferingDoFnRunner =
            BufferingDoFnRunner.create(
                doFnRunner, "stable-input-buffer", windowedInputCoder,
                windowingStrategy.getWindowFn().windowCoder(), getOperatorStateBackend(),
                getKeyedStateBackend(), options.getNumConcurrentCheckpoints(), serializedOptions);
  }
  doFnRunner = createWrappingDoFnRunner(doFnRunner, stepContext);
  earlyBindStateIfNeeded();
  if (!options.getDisableMetrics()) {
    flinkMetricContainer = new FlinkMetricContainer(getRuntimeContext());
    doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, flinkMetricContainer);
    String checkpointMetricNamespace = options.getReportCheckpointDuration();
    if (checkpointMetricNamespace != null) {
      MetricName checkpointMetric =
          MetricName.named(checkpointMetricNamespace, "checkpoint_duration");
      checkpointStats =
          new CheckpointStats(
              () ->
                  flinkMetricContainer.getMetricsContainer(stepName).getDistribution(checkpointMetric));
    }
  }
  elementCount = 0L;
  lastFinishBundleTime = getProcessingTimeService().getCurrentProcessingTime();
  // Schedule a timer that periodically checks whether the current bundle has
  // exceeded its maximum lifetime and must be finished.
  long bundleCheckPeriod = Math.max(maxBundleTimeMills / 2, 1);
  checkFinishBundleTimer =
      getProcessingTimeService()
          .scheduleAtFixedRate(
              timestamp -> checkInvokeFinishBundleByTime(), bundleCheckPeriod, bundleCheckPeriod);
  if (doFn instanceof SplittableParDoViaKeyedWorkItems.ProcessFn) {
    pushbackDoFnRunner =
        new ProcessFnRunner<>((DoFnRunner) doFnRunner, sideInputs, sideInputHandler);
  } else {
    pushbackDoFnRunner =
        SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
  }
  bundleFinalizer = new InMemoryBundleFinalizer();
  pendingFinalizations = new LinkedHashMap<>();
}
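For reference, the windowingStrategy consumed here is an ordinary org.apache.beam.sdk.values.WindowingStrategy. A minimal sketch of assembling one with the public SDK builder API; the FixedWindows size, trigger, and allowed lateness below are illustrative assumptions, not values used by DoFnOperator:

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Duration;

WindowingStrategy<Object, IntervalWindow> strategy =
    WindowingStrategy.of(FixedWindows.of(Duration.standardMinutes(5))) // illustrative size
        .withTrigger(DefaultTrigger.of())
        .withAllowedLateness(Duration.standardMinutes(1)) // illustrative lateness
        .withMode(WindowingStrategy.AccumulationMode.DISCARDING_FIRED_PANES);
// The same expression the operator uses above for the stable-input buffer's window coder:
Coder<IntervalWindow> windowCoder = strategy.getWindowFn().windowCoder();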
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class InsertFetchAndFilterStreamingSideInputNodes, method forNetwork().
public MutableNetwork<Node, Edge> forNetwork(MutableNetwork<Node, Edge> network) {
  if (pipeline == null) {
    return network;
  }
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(pipeline.getComponents());
  for (ParallelInstructionNode node :
      ImmutableList.copyOf(Iterables.filter(network.nodes(), ParallelInstructionNode.class))) {
    // If this isn't a ParDo executed in the SDK harness, then we don't have
    // to worry about it.
    if (node.getParallelInstruction().getParDo() == null
        || !ExecutionLocation.SDK_HARNESS.equals(node.getExecutionLocation())) {
      continue;
    }
    ParDoInstruction parDoInstruction = node.getParallelInstruction().getParDo();
    CloudObject userFnSpec = CloudObject.fromSpec(parDoInstruction.getUserFn());
    String parDoPTransformId = getString(userFnSpec, PropertyNames.SERIALIZED_FN);
    // Skip ParDoInstruction nodes whose payloads cannot carry side inputs.
    String userFnClassName = userFnSpec.getClassName();
    if ("CombineValuesFn".equals(userFnClassName) || "KeyedCombineFn".equals(userFnClassName)) {
      // These nodes have CombinePayloads, which have no side inputs.
      continue;
    }
    RunnerApi.PTransform parDoPTransform =
        pipeline.getComponents().getTransformsOrDefault(parDoPTransformId, null);
    // TODO: only the non-null branch should exist; the null check is here for migration ease only.
    if (parDoPTransform == null) {
      continue;
    }
    RunnerApi.ParDoPayload parDoPayload;
    try {
      parDoPayload = RunnerApi.ParDoPayload.parseFrom(parDoPTransform.getSpec().getPayload());
    } catch (InvalidProtocolBufferException exc) {
      throw new RuntimeException("ParDo did not have a ParDoPayload", exc);
    }
    // Skip any ParDo that doesn't have a side input.
    if (parDoPayload.getSideInputsMap().isEmpty()) {
      continue;
    }
    String mainInputPCollectionLocalName =
        Iterables.getOnlyElement(
            Sets.difference(
                parDoPTransform.getInputsMap().keySet(),
                parDoPayload.getSideInputsMap().keySet()));
    RunnerApi.WindowingStrategy windowingStrategyProto =
        pipeline
            .getComponents()
            .getWindowingStrategiesOrThrow(
                pipeline
                    .getComponents()
                    .getPcollectionsOrThrow(
                        parDoPTransform.getInputsOrThrow(mainInputPCollectionLocalName))
                    .getWindowingStrategyId());
    WindowingStrategy windowingStrategy;
    try {
      windowingStrategy =
          WindowingStrategyTranslation.fromProto(windowingStrategyProto, rehydratedComponents);
    } catch (InvalidProtocolBufferException e) {
      throw new IllegalStateException(
          String.format("Unable to decode windowing strategy %s.", windowingStrategyProto), e);
    }
    // Gather all the side input window mapping fns, which we need to ask the SDK to map.
    ImmutableMap.Builder<PCollectionView<?>, RunnerApi.FunctionSpec>
        pCollectionViewsToWindowMapingsFns = ImmutableMap.builder();
    parDoPayload
        .getSideInputsMap()
        .forEach(
            (sideInputTag, sideInput) ->
                pCollectionViewsToWindowMapingsFns.put(
                    RegisterNodeFunction.transformSideInputForRunner(
                        pipeline, parDoPTransform, sideInputTag, sideInput),
                    sideInput.getWindowMappingFn()));
    Node streamingSideInputWindowHandlerNode =
        FetchAndFilterStreamingSideInputsNode.create(
            windowingStrategy,
            pCollectionViewsToWindowMapingsFns.build(),
            NameContext.create(
                null,
                node.getParallelInstruction().getOriginalName(),
                node.getParallelInstruction().getSystemName(),
                node.getParallelInstruction().getName()));
    // Rewire the graph so that each streaming side-input ParDo is preceded by a node
    // which filters out side inputs that aren't ready and fetches those that are.
    Edge mainInput = Iterables.getOnlyElement(network.inEdges(node));
    InstructionOutputNode predecessor =
        (InstructionOutputNode) network.incidentNodes(mainInput).source();
    InstructionOutputNode predecessorCopy =
        InstructionOutputNode.create(
            predecessor.getInstructionOutput(), predecessor.getPcollectionId());
    network.removeEdge(mainInput);
    network.addNode(streamingSideInputWindowHandlerNode);
    network.addNode(predecessorCopy);
    network.addEdge(predecessor, streamingSideInputWindowHandlerNode, mainInput.clone());
    network.addEdge(streamingSideInputWindowHandlerNode, predecessorCopy, mainInput.clone());
    network.addEdge(predecessorCopy, node, mainInput.clone());
  }
  return network;
}
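The main-input lookup above relies on an invariant: a ParDo's inputs map contains the main input plus every side input, so removing the side-input names leaves exactly one key. A minimal sketch of that set arithmetic, with made-up local names (plain Guava imports shown here; Beam's own code uses a vendored copy of Guava):

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

// Hypothetical local names, mirroring parDoPTransform.getInputsMap().keySet()
// and parDoPayload.getSideInputsMap().keySet() above.
ImmutableSet<String> allInputs = ImmutableSet.of("mainInput", "side0", "side1");
ImmutableSet<String> sideInputs = ImmutableSet.of("side0", "side1");
String mainInput = Iterables.getOnlyElement(Sets.difference(allInputs, sideInputs));
// mainInput == "mainInput"; getOnlyElement throws if the difference is not a singleton.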
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class ParDoEvaluator, method create().
public static <InputT, OutputT> ParDoEvaluator<InputT> create(
    EvaluationContext evaluationContext, PipelineOptions options, DirectStepContext stepContext,
    AppliedPTransform<?, ?, ?> application, Coder<InputT> inputCoder,
    WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy, DoFn<InputT, OutputT> fn,
    StructuralKey<?> key, List<PCollectionView<?>> sideInputs, TupleTag<OutputT> mainOutputTag,
    List<TupleTag<?>> additionalOutputTags, Map<TupleTag<?>, PCollection<?>> outputs,
    DoFnSchemaInformation doFnSchemaInformation, Map<String, PCollectionView<?>> sideInputMapping,
    DoFnRunnerFactory<InputT, OutputT> runnerFactory) {
  BundleOutputManager outputManager = createOutputManager(evaluationContext, key, outputs);
  ReadyCheckingSideInputReader sideInputReader =
      evaluationContext.createSideInputReader(sideInputs);
  Map<TupleTag<?>, Coder<?>> outputCoders =
      outputs.entrySet().stream()
          .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue().getCoder()));
  PushbackSideInputDoFnRunner<InputT, OutputT> runner =
      runnerFactory.createRunner(
          options, fn, sideInputs, sideInputReader, outputManager, mainOutputTag,
          additionalOutputTags, stepContext, inputCoder, outputCoders, windowingStrategy,
          doFnSchemaInformation, sideInputMapping);
  return create(runner, stepContext, application, outputManager);
}
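Callers obtain the inputCoder and windowingStrategy arguments from the transform's input PCollection. A minimal, self-contained sketch with the public SDK API (the window size is an arbitrary assumption):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Duration;

Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
PCollection<Integer> input =
    p.apply(Create.of(1, 2, 3))
        .apply(Window.into(FixedWindows.of(Duration.standardSeconds(30)))); // illustrative size
// These are the values a caller would pass through to create(...):
Coder<Integer> inputCoder = input.getCoder();
WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();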
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class SortingFlinkCombineRunner, method combine().
@Override
public void combine(
    FlinkCombiner<K, InputT, AccumT, OutputT> flinkCombiner,
    WindowingStrategy<Object, W> windowingStrategy,
    SideInputReader sideInputReader,
    PipelineOptions options,
    Iterable<WindowedValue<KV<K, InputT>>> elements,
    Collector<WindowedValue<KV<K, OutputT>>> out)
    throws Exception {
  @SuppressWarnings("unchecked")
  TimestampCombiner timestampCombiner =
      (TimestampCombiner) windowingStrategy.getTimestampCombiner();
  // Collect all elements so that we can sort them; the input therefore has to
  // fit into memory. This seems imprudent, but it is correct, for now.
  List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
  for (WindowedValue<KV<K, InputT>> inputValue : elements) {
    for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
      sortedInput.add(exploded);
    }
  }
  sortedInput.sort(
      Comparator.comparing(o -> Iterables.getOnlyElement(o.getWindows()).maxTimestamp()));
  if (windowingStrategy.needsMerge()) {
    // Merge windows in an extra pre-processing step; we can't do it as we go,
    // since the window of early elements would not be correct when calling the CombineFn.
    mergeWindow(sortedInput);
  }
  // Iterate over the elements, which are sorted by window timestamp.
  final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
  // Create the accumulator using the first element's key.
  WindowedValue<KV<K, InputT>> currentValue = iterator.next();
  K key = currentValue.getValue().getKey();
  W currentWindow = (W) Iterables.getOnlyElement(currentValue.getWindows());
  InputT firstValue = currentValue.getValue().getValue();
  AccumT accumulator =
      flinkCombiner.firstInput(key, firstValue, options, sideInputReader, currentValue.getWindows());
  // We use this to keep track of the timestamps assigned by the TimestampCombiner.
  Instant windowTimestamp = timestampCombiner.assign(currentWindow, currentValue.getTimestamp());
  while (iterator.hasNext()) {
    WindowedValue<KV<K, InputT>> nextValue = iterator.next();
    W nextWindow = (W) Iterables.getOnlyElement(nextValue.getWindows());
    if (currentWindow.equals(nextWindow)) {
      // Continue accumulating within the same window.
      InputT value = nextValue.getValue().getValue();
      accumulator =
          flinkCombiner.addInput(
              key, accumulator, value, options, sideInputReader, currentValue.getWindows());
      windowTimestamp =
          timestampCombiner.combine(
              windowTimestamp, timestampCombiner.assign(currentWindow, nextValue.getTimestamp()));
    } else {
      // Emit the value that we currently have.
      out.collect(
          WindowedValue.of(
              KV.of(
                  key,
                  flinkCombiner.extractOutput(
                      key, accumulator, options, sideInputReader, currentValue.getWindows())),
              windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
      currentWindow = nextWindow;
      currentValue = nextValue;
      InputT value = nextValue.getValue().getValue();
      accumulator =
          flinkCombiner.firstInput(key, value, options, sideInputReader, currentValue.getWindows());
      windowTimestamp = timestampCombiner.assign(currentWindow, nextValue.getTimestamp());
    }
  }
  // Emit the final accumulator.
  out.collect(
      WindowedValue.of(
          KV.of(
              key,
              flinkCombiner.extractOutput(
                  key, accumulator, options, sideInputReader, currentValue.getWindows())),
          windowTimestamp, currentWindow, PaneInfo.NO_FIRING));
}
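The windowTimestamp bookkeeping above is driven by the strategy's TimestampCombiner. A small illustrative sketch of its semantics, using the same assign/combine calls as the method (window bounds and timestamps are made up):

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.joda.time.Instant;

IntervalWindow window = new IntervalWindow(new Instant(0), new Instant(60_000));
// END_OF_WINDOW ignores the element timestamp and uses the window's maxTimestamp().
Instant atEnd = TimestampCombiner.END_OF_WINDOW.assign(window, new Instant(10_000));
// EARLIEST keeps element timestamps and picks the minimum when combining.
Instant early = TimestampCombiner.EARLIEST.assign(window, new Instant(10_000));
Instant combined = TimestampCombiner.EARLIEST.combine(atEnd, early); // -> early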
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class ExecutableStageDoFnOperatorTest, method testEnsureStateCleanupOnFinalWatermark().
@Test
public void testEnsureStateCleanupOnFinalWatermark() throws Exception {
  TupleTag<Integer> mainOutput = new TupleTag<>("main-output");
  DoFnOperator.MultiOutputOutputManagerFactory<Integer> outputManagerFactory =
      new DoFnOperator.MultiOutputOutputManagerFactory(
          mainOutput, VoidCoder.of(), new SerializablePipelineOptions(FlinkPipelineOptions.defaults()));
  StringUtf8Coder keyCoder = StringUtf8Coder.of();
  WindowingStrategy windowingStrategy = WindowingStrategy.globalDefault();
  Coder<BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
  KvCoder<String, Integer> kvCoder = KvCoder.of(keyCoder, VarIntCoder.of());
  ExecutableStageDoFnOperator<Integer, Integer> operator =
      getOperator(
          mainOutput, Collections.emptyList(), outputManagerFactory, windowingStrategy,
          keyCoder, WindowedValue.getFullCoder(kvCoder, windowCoder));
  KeyedOneInputStreamOperatorTestHarness<
          ByteBuffer, WindowedValue<KV<String, Integer>>, WindowedValue<Integer>>
      testHarness =
          new KeyedOneInputStreamOperatorTestHarness(
              operator, operator.keySelector,
              new CoderTypeInformation<>(
                  FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
  RemoteBundle bundle = Mockito.mock(RemoteBundle.class);
  when(bundle.getInputReceivers())
      .thenReturn(
          ImmutableMap.<String, FnDataReceiver<WindowedValue>>builder()
              .put("input", Mockito.mock(FnDataReceiver.class))
              .build());
  when(stageBundleFactory.getBundle(any(), any(), any(), any(), any(), any())).thenReturn(bundle);
  testHarness.open();
  KeyedStateBackend<ByteBuffer> keyedStateBackend = operator.getKeyedStateBackend();
  ByteBuffer key = FlinkKeyUtils.encodeKey("key1", keyCoder);
  keyedStateBackend.setCurrentKey(key);
  // Create some state which can be cleaned up.
  assertThat(testHarness.numKeyedStateEntries(), is(0));
  StateNamespace stateNamespace = StateNamespaces.window(windowCoder, GlobalWindow.INSTANCE);
  // State from the SDK harness is stored as ByteStrings.
  BagState<ByteString> state =
      operator.keyedStateInternals.state(stateNamespace, StateTags.bag(stateId, ByteStringCoder.of()));
  state.add(ByteString.copyFrom("userstate".getBytes(Charsets.UTF_8)));
  // No timers have been set for cleanup yet.
  assertThat(testHarness.numEventTimeTimers(), is(0));
  // State has been created.
  assertThat(testHarness.numKeyedStateEntries(), is(1));
  // Generate the final watermark to trigger state cleanup.
  testHarness.processWatermark(
      new Watermark(BoundedWindow.TIMESTAMP_MAX_VALUE.plus(Duration.millis(1)).getMillis()));
  assertThat(testHarness.numKeyedStateEntries(), is(0));
}
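For context, the WindowingStrategy.globalDefault() used by this test yields a single global window with the SDK's default trigger. A minimal sketch of what the test gets from it; the comment about the watermark is an interpretation of the assertions above, not an API guarantee:

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.values.WindowingStrategy;

// One global window per key; its coder backs the state namespace in the test.
WindowingStrategy<Object, GlobalWindow> strategy = WindowingStrategy.globalDefault();
Coder<GlobalWindow> windowCoder = strategy.getWindowFn().windowCoder();
// Advancing the watermark past BoundedWindow.TIMESTAMP_MAX_VALUE, as the test does,
// marks every window expired so the runner can garbage-collect its keyed state.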