Search in sources :

Example 1 with CoderTypeInformation

use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.

the class ExecutableStageDoFnOperatorTest method testWatermarkHandling.

@Test
public void testWatermarkHandling() throws Exception {
    TupleTag<Integer> mainOutput = new TupleTag<>("main-output");
    DoFnOperator.MultiOutputOutputManagerFactory<Integer> outputManagerFactory = new DoFnOperator.MultiOutputOutputManagerFactory(mainOutput, VoidCoder.of(), new SerializablePipelineOptions(FlinkPipelineOptions.defaults()));
    ExecutableStageDoFnOperator<KV<String, Integer>, Integer> operator = getOperator(mainOutput, Collections.emptyList(), outputManagerFactory, WindowingStrategy.of(FixedWindows.of(Duration.millis(10))), StringUtf8Coder.of(), WindowedValue.getFullCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), IntervalWindow.getCoder()));
    KeyedOneInputStreamOperatorTestHarness<String, WindowedValue<KV<String, Integer>>, WindowedValue<Integer>> testHarness = new KeyedOneInputStreamOperatorTestHarness<>(operator, val -> val.getValue().getKey(), new CoderTypeInformation<>(StringUtf8Coder.of(), FlinkPipelineOptions.defaults()));
    RemoteBundle bundle = Mockito.mock(RemoteBundle.class);
    when(bundle.getInputReceivers()).thenReturn(ImmutableMap.<String, FnDataReceiver<WindowedValue>>builder().put("input", Mockito.mock(FnDataReceiver.class)).build());
    when(bundle.getTimerReceivers()).thenReturn(ImmutableMap.<KV<String, String>, FnDataReceiver<WindowedValue>>builder().put(KV.of("transform", "timer"), Mockito.mock(FnDataReceiver.class)).put(KV.of("transform", "timer2"), Mockito.mock(FnDataReceiver.class)).put(KV.of("transform", "timer3"), Mockito.mock(FnDataReceiver.class)).build());
    when(stageBundleFactory.getBundle(any(), any(), any(), any(), any(), any())).thenReturn(bundle);
    testHarness.open();
    assertThat(operator.getCurrentOutputWatermark(), is(BoundedWindow.TIMESTAMP_MIN_VALUE.getMillis()));
    // No bundle has been started, watermark can be freely advanced
    testHarness.processWatermark(0);
    assertThat(operator.getCurrentOutputWatermark(), is(0L));
    // Trigger a new bundle
    IntervalWindow intervalWindow = new IntervalWindow(new Instant(0), new Instant(9));
    WindowedValue<KV<String, Integer>> windowedValue = WindowedValue.of(KV.of("one", 1), Instant.now(), intervalWindow, PaneInfo.NO_FIRING);
    testHarness.processElement(new StreamRecord<>(windowedValue));
    // The output watermark should be held back during the bundle
    testHarness.processWatermark(1);
    assertThat(operator.getEffectiveInputWatermark(), is(1L));
    assertThat(operator.getCurrentOutputWatermark(), is(0L));
    // After the bundle has been finished, the watermark should be advanced
    operator.invokeFinishBundle();
    assertThat(operator.getCurrentOutputWatermark(), is(1L));
    // Bundle finished, watermark can be freely advanced
    testHarness.processWatermark(2);
    assertThat(operator.getEffectiveInputWatermark(), is(2L));
    assertThat(operator.getCurrentOutputWatermark(), is(2L));
    // Trigger a new bundle
    testHarness.processElement(new StreamRecord<>(windowedValue));
    // cleanup timer
    assertThat(testHarness.numEventTimeTimers(), is(1));
    // Set at timer
    Instant timerTarget = new Instant(5);
    Instant timerTarget2 = new Instant(6);
    operator.getLockToAcquireForStateAccessDuringBundles().lock();
    BiConsumer<String, Instant> timerConsumer = (timerId, timestamp) -> operator.setTimer(Timer.of(windowedValue.getValue().getKey(), "", windowedValue.getWindows(), timestamp, timestamp, PaneInfo.NO_FIRING), TimerInternals.TimerData.of("", TimerReceiverFactory.encodeToTimerDataTimerId("transform", timerId), StateNamespaces.window(IntervalWindow.getCoder(), intervalWindow), timestamp, timestamp, TimeDomain.EVENT_TIME));
    timerConsumer.accept("timer", timerTarget);
    timerConsumer.accept("timer2", timerTarget2);
    assertThat(testHarness.numEventTimeTimers(), is(3));
    // Advance input watermark past the timer
    // Check the output watermark is held back
    long targetWatermark = timerTarget.getMillis() + 100;
    testHarness.processWatermark(targetWatermark);
    // Do not yet advance the output watermark because we are still processing a bundle
    assertThat(testHarness.numEventTimeTimers(), is(3));
    assertThat(operator.getCurrentOutputWatermark(), is(2L));
    // Check that the timers are fired but the output watermark is advanced no further than
    // the minimum timer timestamp of the previous bundle because we are still processing a
    // bundle which might contain more timers.
    // Timers can create loops if they keep rescheduling themselves when firing
    // Thus, we advance the watermark asynchronously to allow for checkpointing to run
    operator.invokeFinishBundle();
    assertThat(testHarness.numEventTimeTimers(), is(3));
    testHarness.setProcessingTime(testHarness.getProcessingTime() + 1);
    assertThat(testHarness.numEventTimeTimers(), is(0));
    assertThat(operator.getCurrentOutputWatermark(), is(5L));
    // Output watermark is advanced synchronously when the bundle finishes,
    // no more timers are scheduled
    operator.invokeFinishBundle();
    assertThat(operator.getCurrentOutputWatermark(), is(targetWatermark));
    assertThat(testHarness.numEventTimeTimers(), is(0));
    // Watermark is advanced in a blocking fashion on close, not via a timers
    // Create a bundle with a pending timer to simulate that
    testHarness.processElement(new StreamRecord<>(windowedValue));
    timerConsumer.accept("timer3", new Instant(targetWatermark));
    assertThat(testHarness.numEventTimeTimers(), is(1));
    // This should be blocking until the watermark reaches Long.MAX_VALUE.
    testHarness.close();
    assertThat(testHarness.numEventTimeTimers(), is(0));
    assertThat(operator.getCurrentOutputWatermark(), is(Long.MAX_VALUE));
}
Also used : Arrays(java.util.Arrays) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) InMemoryStateInternals(org.apache.beam.runners.core.InMemoryStateInternals) FlinkExecutableStageContextFactory(org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageContextFactory) CoderUtils(org.apache.beam.sdk.util.CoderUtils) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) TimerInternals(org.apache.beam.runners.core.TimerInternals) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) Mockito.doThrow(org.mockito.Mockito.doThrow) MockitoAnnotations(org.mockito.MockitoAnnotations) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Map(java.util.Map) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) KvCoder(org.apache.beam.sdk.coders.KvCoder) StageBundleFactory(org.apache.beam.runners.fnexecution.control.StageBundleFactory) PAR_DO_TRANSFORM_URN(org.apache.beam.runners.core.construction.PTransformTranslation.PAR_DO_TRANSFORM_URN) FnDataReceiver(org.apache.beam.sdk.fn.data.FnDataReceiver) BundleProgressHandler(org.apache.beam.runners.fnexecution.control.BundleProgressHandler) SerializationUtils(org.apache.beam.repackaged.core.org.apache.commons.lang3.SerializationUtils) Struct(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.Struct) OutputTag(org.apache.flink.util.OutputTag) BeamFnApi(org.apache.beam.model.fnexecution.v1.BeamFnApi) StandardCharsets(java.nio.charset.StandardCharsets) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) Matchers.any(org.mockito.Matchers.any) Matchers.is(org.hamcrest.Matchers.is) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) KV(org.apache.beam.sdk.values.KV) Mock(org.mockito.Mock) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) ExecutableStagePayload(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload) StateTags(org.apache.beam.runners.core.StateTags) PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) TupleTag(org.apache.beam.sdk.values.TupleTag) BiConsumer(java.util.function.BiConsumer) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) StateRequestHandler(org.apache.beam.runners.fnexecution.state.StateRequestHandler) Before(org.junit.Before) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) OutputReceiverFactory(org.apache.beam.runners.fnexecution.control.OutputReceiverFactory) StateRequestHandlers(org.apache.beam.runners.fnexecution.state.StateRequestHandlers) Test(org.junit.Test) BundleCheckpointHandler(org.apache.beam.runners.fnexecution.control.BundleCheckpointHandler) Assert.assertNotEquals(org.junit.Assert.assertNotEquals) NoopLock(org.apache.beam.sdk.util.NoopLock) Lock(java.util.concurrent.locks.Lock) InMemoryTimerInternals(org.apache.beam.runners.core.InMemoryTimerInternals) Timer(org.apache.beam.runners.core.construction.Timer) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) TimeDomain(org.apache.beam.sdk.state.TimeDomain) Assert.assertEquals(org.junit.Assert.assertEquals) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) StateNamespace(org.apache.beam.runners.core.StateNamespace) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) IsIterableContainingInOrder.contains(org.hamcrest.collection.IsIterableContainingInOrder.contains) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) ByteBuffer(java.nio.ByteBuffer) Mockito.verifyNoMoreInteractions(org.mockito.Mockito.verifyNoMoreInteractions) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) BundleFinalizationHandler(org.apache.beam.runners.fnexecution.control.BundleFinalizationHandler) JobInfo(org.apache.beam.runners.fnexecution.provisioning.JobInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) TimerReceiverFactory(org.apache.beam.runners.fnexecution.control.TimerReceiverFactory) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) Collection(java.util.Collection) DistributedCache(org.apache.flink.api.common.cache.DistributedCache) List(java.util.List) InstructionRequestHandler(org.apache.beam.runners.fnexecution.control.InstructionRequestHandler) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) Whitebox(org.powermock.reflect.Whitebox) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Coder(org.apache.beam.sdk.coders.Coder) Watermark(org.apache.flink.streaming.api.watermark.Watermark) HashMap(java.util.HashMap) MutableObject(org.apache.beam.repackaged.core.org.apache.commons.lang3.mutable.MutableObject) StateNamespaces(org.apache.beam.runners.core.StateNamespaces) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) RemoteBundle(org.apache.beam.runners.fnexecution.control.RemoteBundle) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) ExpectedException(org.junit.rules.ExpectedException) Nullable(org.checkerframework.checker.nullness.qual.Nullable) DoFnRunnerWithMetricsUpdate(org.apache.beam.runners.flink.metrics.DoFnRunnerWithMetricsUpdate) ProcessBundleDescriptors(org.apache.beam.runners.fnexecution.control.ProcessBundleDescriptors) ByteStringCoder(org.apache.beam.runners.fnexecution.wire.ByteStringCoder) Assert.assertNotNull(org.junit.Assert.assertNotNull) Charsets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Charsets) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) Mockito.when(org.mockito.Mockito.when) JUnit4(org.junit.runners.JUnit4) KeyedStateBackend(org.apache.flink.runtime.state.KeyedStateBackend) Mockito.verify(org.mockito.Mockito.verify) Mockito(org.mockito.Mockito) BagState(org.apache.beam.sdk.state.BagState) Rule(org.junit.Rule) Instant(org.joda.time.Instant) Collections(java.util.Collections) FlinkStateInternalsTest(org.apache.beam.runners.flink.streaming.FlinkStateInternalsTest) ExecutableStageContext(org.apache.beam.runners.fnexecution.control.ExecutableStageContext) FnDataReceiver(org.apache.beam.sdk.fn.data.FnDataReceiver) Instant(org.joda.time.Instant) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) RemoteBundle(org.apache.beam.runners.fnexecution.control.RemoteBundle) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Test(org.junit.Test) FlinkStateInternalsTest(org.apache.beam.runners.flink.streaming.FlinkStateInternalsTest)

Example 2 with CoderTypeInformation

use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.

the class FlinkBatchPortablePipelineTranslator method pruneOutput.

private static void pruneOutput(DataSet<RawUnionValue> taggedDataset, BatchTranslationContext context, int unionTag, Coder<WindowedValue<?>> outputCoder, String collectionId) {
    TypeInformation<WindowedValue<?>> outputType = new CoderTypeInformation<>(outputCoder, context.getPipelineOptions());
    FlinkExecutableStagePruningFunction pruningFunction = new FlinkExecutableStagePruningFunction(unionTag, context.getPipelineOptions());
    FlatMapOperator<RawUnionValue, WindowedValue<?>> pruningOperator = new FlatMapOperator<>(taggedDataset, outputType, pruningFunction, String.format("ExtractOutput[%s]", unionTag));
    context.addDataSet(collectionId, pruningOperator);
}
Also used : CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) FlatMapOperator(org.apache.flink.api.java.operators.FlatMapOperator) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) WindowedValue(org.apache.beam.sdk.util.WindowedValue) FlinkExecutableStagePruningFunction(org.apache.beam.runners.flink.translation.functions.FlinkExecutableStagePruningFunction)

Example 3 with CoderTypeInformation

use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.

the class FlinkStreamingPortablePipelineTranslator method translateFlatten.

private <T> void translateFlatten(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
    Map<String, String> allInputs = transform.getInputsMap();
    if (allInputs.isEmpty()) {
        // create an empty dummy source to satisfy downstream operations
        // we cannot create an empty source in Flink, therefore we have to
        // add the flatMap that simply never forwards the single element
        long shutdownAfterIdleSourcesMs = context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
        DataStreamSource<WindowedValue<byte[]>> dummySource = context.getExecutionEnvironment().addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
        DataStream<WindowedValue<T>> result = dummySource.<WindowedValue<T>>flatMap((s, collector) -> {
        // never return anything
        }).returns(new CoderTypeInformation<>(WindowedValue.getFullCoder((Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE), context.getPipelineOptions()));
        context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    } else {
        DataStream<T> result = null;
        // Determine DataStreams that we use as input several times. For those, we need to uniquify
        // input streams because Flink seems to swallow watermarks when we have a union of one and
        // the same stream.
        HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            inputCounts.add(current, 1);
        }
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            final int timesRequired = inputCounts.count(current);
            if (timesRequired > 1) {
                current = current.flatMap(new FlatMapFunction<T, T>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public void flatMap(T t, Collector<T> collector) {
                        collector.collect(t);
                    }
                });
            }
            result = (result == null) ? current : result.union(current);
        }
        context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    }
}
Also used : SingletonKeyedWorkItemCoder(org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) FlinkExecutableStageContextFactory(org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageContextFactory) CoderUtils(org.apache.beam.sdk.util.CoderUtils) WireCoders(org.apache.beam.runners.fnexecution.wire.WireCoders) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) SdkHarnessClient(org.apache.beam.runners.fnexecution.control.SdkHarnessClient) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) TestStreamSource(org.apache.beam.runners.flink.translation.wrappers.streaming.io.TestStreamSource) Map(java.util.Map) TestStreamTranslation(org.apache.beam.runners.core.construction.TestStreamTranslation) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) JsonNode(com.fasterxml.jackson.databind.JsonNode) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) KvCoder(org.apache.beam.sdk.coders.KvCoder) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) WindowDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator) Set(java.util.Set) OutputTag(org.apache.flink.util.OutputTag) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) ExecutableStageTranslation.generateNameFromStagePayload(org.apache.beam.runners.core.construction.ExecutableStageTranslation.generateNameFromStagePayload) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) CoderException(org.apache.beam.sdk.coders.CoderException) WindowingStrategyTranslation(org.apache.beam.runners.core.construction.WindowingStrategyTranslation) TestStream(org.apache.beam.sdk.testing.TestStream) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) ValueWithRecordId(org.apache.beam.sdk.values.ValueWithRecordId) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) StreamingImpulseSource(org.apache.beam.runners.flink.translation.wrappers.streaming.io.StreamingImpulseSource) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Collector(org.apache.flink.util.Collector) TupleTag(org.apache.beam.sdk.values.TupleTag) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) BiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.BiMap) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) IOException(java.io.IOException) DedupingOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.io.DedupingOperator) BoundedSource(org.apache.beam.sdk.io.BoundedSource) TreeMap(java.util.TreeMap) PCollectionView(org.apache.beam.sdk.values.PCollectionView) AutoService(com.google.auto.service.AutoService) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) UnboundedSourceWrapper(org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper) FileSystems(org.apache.beam.sdk.io.FileSystems) SystemReduceFn(org.apache.beam.runners.core.SystemReduceFn) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) WindowedValue(org.apache.beam.sdk.util.WindowedValue) PipelineTranslatorUtils.getWindowingStrategy(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.getWindowingStrategy) WorkItemKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.WorkItemKeySelector) KvToByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) ByteBuffer(java.nio.ByteBuffer) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) Locale(java.util.Locale) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) JobInfo(org.apache.beam.runners.fnexecution.provisioning.JobInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) KeySelector(org.apache.flink.api.java.functions.KeySelector) TwoInputTransformation(org.apache.flink.streaming.api.transformations.TwoInputTransformation) KeyedStream(org.apache.flink.streaming.api.datastream.KeyedStream) String.format(java.lang.String.format) ModelCoders(org.apache.beam.runners.core.construction.ModelCoders) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) JobExecutionResult(org.apache.flink.api.common.JobExecutionResult) List(java.util.List) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) HashMultiset(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultiset) PipelineTranslatorUtils.createOutputMap(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.createOutputMap) ReadTranslation(org.apache.beam.runners.core.construction.ReadTranslation) ExecutableStageDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.ExecutableStageDoFnOperator) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) DataStreamSource(org.apache.flink.streaming.api.datastream.DataStreamSource) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ImpulseSourceFunction(org.apache.beam.runners.flink.translation.functions.ImpulseSourceFunction) ViewFn(org.apache.beam.sdk.transforms.ViewFn) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) SdfByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.SdfByteBufferKeySelector) NativeTransforms(org.apache.beam.runners.core.construction.NativeTransforms) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Configuration(org.apache.flink.configuration.Configuration) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) DataStream(org.apache.flink.streaming.api.datastream.DataStream) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) SourceInputFormat(org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat) Collections(java.util.Collections) DataStream(org.apache.flink.streaming.api.datastream.DataStream) ImpulseSourceFunction(org.apache.beam.runners.flink.translation.functions.ImpulseSourceFunction) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Collector(org.apache.flink.util.Collector)

Example 4 with CoderTypeInformation

use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.

the class FlinkStreamingPortablePipelineTranslator method translateStreamingImpulse.

private void translateStreamingImpulse(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    RunnerApi.PTransform pTransform = pipeline.getComponents().getTransformsOrThrow(id);
    TypeInformation<WindowedValue<byte[]>> typeInfo = new CoderTypeInformation<>(WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE), context.getPipelineOptions());
    ObjectMapper objectMapper = new ObjectMapper();
    final int intervalMillis;
    final int messageCount;
    try {
        JsonNode config = objectMapper.readTree(pTransform.getSpec().getPayload().toByteArray());
        intervalMillis = config.path("interval_ms").asInt(100);
        messageCount = config.path("message_count").asInt(0);
    } catch (IOException e) {
        throw new RuntimeException("Failed to parse configuration for streaming impulse", e);
    }
    SingleOutputStreamOperator<WindowedValue<byte[]>> source = context.getExecutionEnvironment().addSource(new StreamingImpulseSource(intervalMillis, messageCount), StreamingImpulseSource.class.getSimpleName()).returns(typeInfo);
    context.addDataStream(Iterables.getOnlyElement(pTransform.getOutputsMap().values()), source);
}
Also used : CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamingImpulseSource(org.apache.beam.runners.flink.translation.wrappers.streaming.io.StreamingImpulseSource) JsonNode(com.fasterxml.jackson.databind.JsonNode) IOException(java.io.IOException) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Example 5 with CoderTypeInformation

use of org.apache.beam.runners.flink.translation.types.CoderTypeInformation in project beam by apache.

the class FlinkStreamingPortablePipelineTranslator method translateExecutableStage.

private <InputT, OutputT> void translateExecutableStage(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    // TODO: Fail on splittable DoFns.
    // TODO: Special-case single outputs to avoid multiplexing PCollections.
    RunnerApi.Components components = pipeline.getComponents();
    RunnerApi.PTransform transform = components.getTransformsOrThrow(id);
    Map<String, String> outputs = transform.getOutputsMap();
    final RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String inputPCollectionId = stagePayload.getInput();
    final TransformedSideInputs transformedSideInputs;
    if (stagePayload.getSideInputsCount() > 0) {
        transformedSideInputs = transformSideInputs(stagePayload, components, context);
    } else {
        transformedSideInputs = new TransformedSideInputs(Collections.emptyMap(), null);
    }
    Map<TupleTag<?>, OutputTag<WindowedValue<?>>> tagsToOutputTags = Maps.newLinkedHashMap();
    Map<TupleTag<?>, Coder<WindowedValue<?>>> tagsToCoders = Maps.newLinkedHashMap();
    // TODO: does it matter which output we designate as "main"
    final TupleTag<OutputT> mainOutputTag = outputs.isEmpty() ? null : new TupleTag(outputs.keySet().iterator().next());
    // associate output tags with ids, output manager uses these Integer ids to serialize state
    BiMap<String, Integer> outputIndexMap = createOutputMap(outputs.keySet());
    Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
    Map<TupleTag<?>, Integer> tagsToIds = Maps.newHashMap();
    Map<String, TupleTag<?>> collectionIdToTupleTag = Maps.newHashMap();
    // order output names for deterministic mapping
    for (String localOutputName : new TreeMap<>(outputIndexMap).keySet()) {
        String collectionId = outputs.get(localOutputName);
        Coder<WindowedValue<?>> windowCoder = (Coder) instantiateCoder(collectionId, components);
        outputCoders.put(localOutputName, windowCoder);
        TupleTag<?> tupleTag = new TupleTag<>(localOutputName);
        CoderTypeInformation<WindowedValue<?>> typeInformation = new CoderTypeInformation(windowCoder, context.getPipelineOptions());
        tagsToOutputTags.put(tupleTag, new OutputTag<>(localOutputName, typeInformation));
        tagsToCoders.put(tupleTag, windowCoder);
        tagsToIds.put(tupleTag, outputIndexMap.get(localOutputName));
        collectionIdToTupleTag.put(collectionId, tupleTag);
    }
    final SingleOutputStreamOperator<WindowedValue<OutputT>> outputStream;
    DataStream<WindowedValue<InputT>> inputDataStream = context.getDataStreamOrThrow(inputPCollectionId);
    CoderTypeInformation<WindowedValue<OutputT>> outputTypeInformation = !outputs.isEmpty() ? new CoderTypeInformation(outputCoders.get(mainOutputTag.getId()), context.getPipelineOptions()) : null;
    ArrayList<TupleTag<?>> additionalOutputTags = Lists.newArrayList();
    for (TupleTag<?> tupleTag : tagsToCoders.keySet()) {
        if (!mainOutputTag.getId().equals(tupleTag.getId())) {
            additionalOutputTags.add(tupleTag);
        }
    }
    final Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
    final boolean stateful = stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0;
    final boolean hasSdfProcessFn = stagePayload.getComponents().getTransformsMap().values().stream().anyMatch(pTransform -> pTransform.getSpec().getUrn().equals(PTransformTranslation.SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN));
    Coder keyCoder = null;
    KeySelector<WindowedValue<InputT>, ?> keySelector = null;
    if (stateful || hasSdfProcessFn) {
        // Stateful/SDF stages are only allowed of KV input.
        Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
        if (!(valueCoder instanceof KvCoder)) {
            throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for stateful DoFn '%s' must be KvCoder but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
        }
        if (stateful) {
            keyCoder = ((KvCoder) valueCoder).getKeyCoder();
            keySelector = new KvToByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
        } else {
            // as the key.
            if (!(((KvCoder) valueCoder).getKeyCoder() instanceof KvCoder)) {
                throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for splittable DoFn '%s' must be KVCoder(KvCoder, DoubleCoder) but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
            }
            keyCoder = ((KvCoder) ((KvCoder) valueCoder).getKeyCoder()).getKeyCoder();
            keySelector = new SdfByteBufferKeySelector(keyCoder, new SerializablePipelineOptions(context.getPipelineOptions()));
        }
        inputDataStream = inputDataStream.keyBy(keySelector);
    }
    DoFnOperator.MultiOutputOutputManagerFactory<OutputT> outputManagerFactory = new DoFnOperator.MultiOutputOutputManagerFactory<>(mainOutputTag, tagsToOutputTags, tagsToCoders, tagsToIds, new SerializablePipelineOptions(context.getPipelineOptions()));
    DoFnOperator<InputT, OutputT> doFnOperator = new ExecutableStageDoFnOperator<>(transform.getUniqueName(), windowedInputCoder, Collections.emptyMap(), mainOutputTag, additionalOutputTags, outputManagerFactory, transformedSideInputs.unionTagToView, new ArrayList<>(transformedSideInputs.unionTagToView.values()), getSideInputIdToPCollectionViewMap(stagePayload, components), context.getPipelineOptions(), stagePayload, context.getJobInfo(), FlinkExecutableStageContextFactory.getInstance(), collectionIdToTupleTag, getWindowingStrategy(inputPCollectionId, components), keyCoder, keySelector);
    final String operatorName = generateNameFromStagePayload(stagePayload);
    if (transformedSideInputs.unionTagToView.isEmpty()) {
        outputStream = inputDataStream.transform(operatorName, outputTypeInformation, doFnOperator);
    } else {
        DataStream<RawUnionValue> sideInputStream = transformedSideInputs.unionedSideInputs.broadcast();
        if (stateful || hasSdfProcessFn) {
            // We have to manually construct the two-input transform because we're not
            // allowed to have only one input keyed, normally. Since Flink 1.5.0 it's
            // possible to use the Broadcast State Pattern which provides a more elegant
            // way to process keyed main input with broadcast state, but it's not feasible
            // here because it breaks the DoFnOperator abstraction.
            TwoInputTransformation<WindowedValue<KV<?, InputT>>, RawUnionValue, WindowedValue<OutputT>> rawFlinkTransform = new TwoInputTransformation(inputDataStream.getTransformation(), sideInputStream.getTransformation(), transform.getUniqueName(), doFnOperator, outputTypeInformation, inputDataStream.getParallelism());
            rawFlinkTransform.setStateKeyType(((KeyedStream) inputDataStream).getKeyType());
            rawFlinkTransform.setStateKeySelectors(((KeyedStream) inputDataStream).getKeySelector(), null);
            outputStream = new SingleOutputStreamOperator(inputDataStream.getExecutionEnvironment(), // we have to cheat around the ctor being protected
            rawFlinkTransform) {
            };
        } else {
            outputStream = inputDataStream.connect(sideInputStream).transform(operatorName, outputTypeInformation, doFnOperator);
        }
    }
    // Assign a unique but consistent id to re-map operator state
    outputStream.uid(transform.getUniqueName());
    if (mainOutputTag != null) {
        context.addDataStream(outputs.get(mainOutputTag.getId()), outputStream);
    }
    for (TupleTag<?> tupleTag : additionalOutputTags) {
        context.addDataStream(outputs.get(tupleTag.getId()), outputStream.getSideOutput(tagsToOutputTags.get(tupleTag)));
    }
}
Also used : KvToByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector) TupleTag(org.apache.beam.sdk.values.TupleTag) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) OutputTag(org.apache.flink.util.OutputTag) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) WindowDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) ExecutableStageDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.ExecutableStageDoFnOperator) SdfByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.SdfByteBufferKeySelector) TwoInputTransformation(org.apache.flink.streaming.api.transformations.TwoInputTransformation) ExecutableStageDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.ExecutableStageDoFnOperator) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) SingletonKeyedWorkItemCoder(org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) Coder(org.apache.beam.sdk.coders.Coder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) IOException(java.io.IOException)

Aggregations

CoderTypeInformation (org.apache.beam.runners.flink.translation.types.CoderTypeInformation)19 WindowedValue (org.apache.beam.sdk.util.WindowedValue)17 KvCoder (org.apache.beam.sdk.coders.KvCoder)14 TupleTag (org.apache.beam.sdk.values.TupleTag)14 RawUnionValue (org.apache.beam.sdk.transforms.join.RawUnionValue)13 KV (org.apache.beam.sdk.values.KV)13 Coder (org.apache.beam.sdk.coders.Coder)12 ByteBuffer (java.nio.ByteBuffer)11 SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions)11 HashMap (java.util.HashMap)10 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)10 PCollectionView (org.apache.beam.sdk.values.PCollectionView)10 ArrayList (java.util.ArrayList)9 List (java.util.List)9 StreamRecordStripper.stripStreamRecordFromWindowedValue (org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue)9 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)9 Collections (java.util.Collections)8 IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow)8 Arrays (java.util.Arrays)7 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)7