Search in sources :

Example 11 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class DoFnOperatorTest method testStateGCForStatefulFn.

@Test
public void testStateGCForStatefulFn() throws Exception {
    WindowingStrategy<Object, IntervalWindow> windowingStrategy = WindowingStrategy.of(FixedWindows.of(new Duration(10))).withAllowedLateness(Duration.ZERO);
    final String timerId = "boo";
    final String stateId = "dazzle";
    final int offset = 5000;
    final int timerOutput = 4093;
    DoFn<KV<String, Integer>, KV<String, Integer>> fn = new DoFn<KV<String, Integer>, KV<String, Integer>>() {

        @TimerId(timerId)
        private final TimerSpec spec = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @StateId(stateId)
        private final StateSpec<ValueState<String>> stateSpec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void processElement(ProcessContext context, @TimerId(timerId) Timer timer, @StateId(stateId) ValueState<String> state, BoundedWindow window) {
            timer.set(window.maxTimestamp());
            state.write(context.element().getKey());
            context.output(KV.of(context.element().getKey(), context.element().getValue() + offset));
        }

        @OnTimer(timerId)
        public void onTimer(OnTimerContext context, @StateId(stateId) ValueState<String> state) {
            context.output(KV.of(state.read(), timerOutput));
        }
    };
    WindowedValue.FullWindowedValueCoder<KV<String, Integer>> windowedValueCoder = WindowedValue.getFullCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()), windowingStrategy.getWindowFn().windowCoder());
    TupleTag<KV<String, Integer>> outputTag = new TupleTag<>("main-output");
    DoFnOperator<KV<String, Integer>, KV<String, Integer>, WindowedValue<KV<String, Integer>>> doFnOperator = new DoFnOperator<>(fn, "stepName", windowedValueCoder, outputTag, Collections.<TupleTag<?>>emptyList(), new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<String, Integer>>>(), windowingStrategy, new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
    Collections.<PCollectionView<?>>emptyList(), /* side inputs */
    PipelineOptionsFactory.as(FlinkPipelineOptions.class), StringUtf8Coder.of());
    KeyedOneInputStreamOperatorTestHarness<String, WindowedValue<KV<String, Integer>>, WindowedValue<KV<String, Integer>>> testHarness = new KeyedOneInputStreamOperatorTestHarness<>(doFnOperator, new KeySelector<WindowedValue<KV<String, Integer>>, String>() {

        @Override
        public String getKey(WindowedValue<KV<String, Integer>> kvWindowedValue) throws Exception {
            return kvWindowedValue.getValue().getKey();
        }
    }, new CoderTypeInformation<>(StringUtf8Coder.of()));
    testHarness.open();
    testHarness.processWatermark(0);
    assertEquals(0, testHarness.numKeyedStateEntries());
    IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
    testHarness.processElement(new StreamRecord<>(WindowedValue.of(KV.of("key1", 5), new Instant(1), window1, PaneInfo.NO_FIRING)));
    testHarness.processElement(new StreamRecord<>(WindowedValue.of(KV.of("key2", 7), new Instant(3), window1, PaneInfo.NO_FIRING)));
    assertThat(this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(WindowedValue.of(KV.of("key1", 5 + offset), new Instant(1), window1, PaneInfo.NO_FIRING), WindowedValue.of(KV.of("key2", 7 + offset), new Instant(3), window1, PaneInfo.NO_FIRING)));
    assertEquals(2, testHarness.numKeyedStateEntries());
    testHarness.getOutput().clear();
    // this should trigger both the window.maxTimestamp() timer and the GC timer
    // this tests that the GC timer fires after the user timer
    testHarness.processWatermark(window1.maxTimestamp().plus(windowingStrategy.getAllowedLateness()).plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS).getMillis());
    assertThat(this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(WindowedValue.of(KV.of("key1", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING), WindowedValue.of(KV.of("key2", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING)));
    // ensure the state was garbage collected
    assertEquals(0, testHarness.numKeyedStateEntries());
    testHarness.close();
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) StateSpec(org.apache.beam.sdk.state.StateSpec) WindowedValue(org.apache.beam.sdk.util.WindowedValue) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) TimerSpec(org.apache.beam.sdk.state.TimerSpec) Instant(org.joda.time.Instant) Duration(org.joda.time.Duration) KV(org.apache.beam.sdk.values.KV) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) ValueState(org.apache.beam.sdk.state.ValueState) Timer(org.apache.beam.sdk.state.Timer) Test(org.junit.Test)

Example 12 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class FlinkDoFnFunction method mapPartition.

@Override
public void mapPartition(Iterable<WindowedValue<InputT>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(serializedOptions.getPipelineOptions(), doFn, new FlinkSideInputReader(sideInputs, runtimeContext), outputManager, mainOutputTag, additionalOutputTags, new FlinkNoOpStepContext(), windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    for (WindowedValue<InputT> value : values) {
        doFnRunner.processElement(value);
    }
    doFnRunner.finishBundle();
}
Also used : DoFnRunners(org.apache.beam.runners.core.DoFnRunners) TupleTag(org.apache.beam.sdk.values.TupleTag) Collector(org.apache.flink.util.Collector) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext)

Example 13 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class FlinkStatefulDoFnFunction method reduce.

@Override
public void reduce(Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional Outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
    // get the first value, we need this for initializing the state internals with the key.
    // we are guaranteed to have a first value, otherwise reduce() would not have been called.
    WindowedValue<KV<K, V>> currentValue = iterator.next();
    final K key = currentValue.getValue().getKey();
    final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
    // Used with Batch, we know that all the data is available for this key. We can't use the
    // timer manager from the context because it doesn't exist. So we create one and advance
    // time to the end after processing all elements.
    final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
    timerInternals.advanceProcessingTime(Instant.now());
    timerInternals.advanceSynchronizedProcessingTime(Instant.now());
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<KV<K, V>, OutputT> doFnRunner = DoFnRunners.simpleRunner(serializedOptions.getPipelineOptions(), dofn, new FlinkSideInputReader(sideInputs, runtimeContext), outputManager, mainOutputTag, additionalOutputTags, new FlinkNoOpStepContext() {

        @Override
        public StateInternals stateInternals() {
            return stateInternals;
        }

        @Override
        public TimerInternals timerInternals() {
            return timerInternals;
        }
    }, windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    doFnRunner.processElement(currentValue);
    while (iterator.hasNext()) {
        currentValue = iterator.next();
        doFnRunner.processElement(currentValue);
    }
    // Finish any pending windows by advancing the input watermark to infinity.
    timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
    // Finally, advance the processing time to infinity to fire any timers.
    timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    fireEligibleTimers(timerInternals, doFnRunner);
    doFnRunner.finishBundle();
}
Also used : DoFnRunners(org.apache.beam.runners.core.DoFnRunners) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) Collector(org.apache.flink.util.Collector) InMemoryTimerInternals(org.apache.beam.runners.core.InMemoryTimerInternals) KV(org.apache.beam.sdk.values.KV) TimerInternals(org.apache.beam.runners.core.TimerInternals) InMemoryTimerInternals(org.apache.beam.runners.core.InMemoryTimerInternals) InMemoryStateInternals(org.apache.beam.runners.core.InMemoryStateInternals) StateInternals(org.apache.beam.runners.core.StateInternals) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext)

Example 14 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ParDoTranslator method translate.

@Override
public void translate(ParDo.MultiOutput<InputT, OutputT> transform, TranslationContext context) {
    DoFn<InputT, OutputT> doFn = transform.getFn();
    DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
    if (signature.processElement().isSplittable()) {
        throw new UnsupportedOperationException(String.format("%s does not support splittable DoFn: %s", ApexRunner.class.getSimpleName(), doFn));
    }
    if (signature.stateDeclarations().size() > 0) {
        throw new UnsupportedOperationException(String.format("Found %s annotations on %s, but %s cannot yet be used with state in the %s.", DoFn.StateId.class.getSimpleName(), doFn.getClass().getName(), DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
    }
    if (signature.timerDeclarations().size() > 0) {
        throw new UnsupportedOperationException(String.format("Found %s annotations on %s, but %s cannot yet be used with timers in the %s.", DoFn.TimerId.class.getSimpleName(), doFn.getClass().getName(), DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
    }
    Map<TupleTag<?>, PValue> outputs = context.getOutputs();
    PCollection<InputT> input = context.getInput();
    List<PCollectionView<?>> sideInputs = transform.getSideInputs();
    Coder<InputT> inputCoder = input.getCoder();
    WindowedValueCoder<InputT> wvInputCoder = FullWindowedValueCoder.of(inputCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
    ApexParDoOperator<InputT, OutputT> operator = new ApexParDoOperator<>(context.getPipelineOptions(), doFn, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), input.getWindowingStrategy(), sideInputs, wvInputCoder, context.getStateBackend());
    Map<PCollection<?>, OutputPort<?>> ports = Maps.newHashMapWithExpectedSize(outputs.size());
    for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        checkArgument(output.getValue() instanceof PCollection, "%s %s outputs non-PCollection %s of type %s", ParDo.MultiOutput.class.getSimpleName(), context.getFullName(), output.getValue(), output.getValue().getClass().getSimpleName());
        PCollection<?> pc = (PCollection<?>) output.getValue();
        if (output.getKey().equals(transform.getMainOutputTag())) {
            ports.put(pc, operator.output);
        } else {
            int portIndex = 0;
            for (TupleTag<?> tag : transform.getAdditionalOutputTags().getAll()) {
                if (tag.equals(output.getKey())) {
                    ports.put(pc, operator.additionalOutputPorts[portIndex]);
                    break;
                }
                portIndex++;
            }
        }
    }
    context.addOperator(operator, ports);
    context.addStream(context.getInput(), operator.input);
    if (!sideInputs.isEmpty()) {
        addSideInputs(operator.sideInput1, sideInputs, context);
    }
}
Also used : OutputPort(com.datatorrent.api.Operator.OutputPort) TupleTag(org.apache.beam.sdk.values.TupleTag) ApexParDoOperator(org.apache.beam.runners.apex.translation.operators.ApexParDoOperator) PValue(org.apache.beam.sdk.values.PValue) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 15 with TupleTag

use of org.apache.beam.sdk.values.TupleTag in project beam by apache.

the class ParDoTranslatorTest method testMultiOutputParDoWithSideInputs.

@Test
public void testMultiOutputParDoWithSideInputs() throws Exception {
    ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
    // non-blocking run
    options.setRunner(ApexRunner.class);
    Pipeline pipeline = Pipeline.create(options);
    List<Integer> inputs = Arrays.asList(3, -42, 666);
    final TupleTag<String> mainOutputTag = new TupleTag<>("main");
    final TupleTag<Void> additionalOutputTag = new TupleTag<>("output");
    PCollectionView<Integer> sideInput1 = pipeline.apply("CreateSideInput1", Create.of(11)).apply("ViewSideInput1", View.<Integer>asSingleton());
    PCollectionView<Integer> sideInputUnread = pipeline.apply("CreateSideInputUnread", Create.of(-3333)).apply("ViewSideInputUnread", View.<Integer>asSingleton());
    PCollectionView<Integer> sideInput2 = pipeline.apply("CreateSideInput2", Create.of(222)).apply("ViewSideInput2", View.<Integer>asSingleton());
    PCollectionTuple outputs = pipeline.apply(Create.of(inputs)).apply(ParDo.of(new TestMultiOutputWithSideInputsFn(Arrays.asList(sideInput1, sideInput2), Arrays.<TupleTag<String>>asList())).withSideInputs(sideInput1).withSideInputs(sideInputUnread).withSideInputs(sideInput2).withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
    outputs.get(mainOutputTag).apply(ParDo.of(new EmbeddedCollector()));
    outputs.get(additionalOutputTag).setCoder(VoidCoder.of());
    ApexRunnerResult result = (ApexRunnerResult) pipeline.run();
    HashSet<String> expected = Sets.newHashSet("processing: 3: [11, 222]", "processing: -42: [11, 222]", "processing: 666: [11, 222]");
    long timeout = System.currentTimeMillis() + TIMEOUT_MILLIS;
    while (System.currentTimeMillis() < timeout) {
        if (EmbeddedCollector.RESULTS.containsAll(expected)) {
            break;
        }
        LOG.info("Waiting for expected results.");
        Thread.sleep(SLEEP_MILLIS);
    }
    result.cancel();
    Assert.assertEquals(Sets.newHashSet(expected), EmbeddedCollector.RESULTS);
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) ApexRunnerResult(org.apache.beam.runners.apex.ApexRunnerResult) Pipeline(org.apache.beam.sdk.Pipeline) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) ApexPipelineOptions(org.apache.beam.runners.apex.ApexPipelineOptions) Test(org.junit.Test)

Aggregations

TupleTag (org.apache.beam.sdk.values.TupleTag)185 Test (org.junit.Test)100 WindowedValue (org.apache.beam.sdk.util.WindowedValue)54 KV (org.apache.beam.sdk.values.KV)54 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)49 PCollection (org.apache.beam.sdk.values.PCollection)42 DoFn (org.apache.beam.sdk.transforms.DoFn)32 Instant (org.joda.time.Instant)32 SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions)30 Map (java.util.Map)29 Pipeline (org.apache.beam.sdk.Pipeline)29 PCollectionView (org.apache.beam.sdk.values.PCollectionView)29 HashMap (java.util.HashMap)27 Coder (org.apache.beam.sdk.coders.Coder)26 StreamRecordStripper.stripStreamRecordFromWindowedValue (org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue)25 Matchers.containsString (org.hamcrest.Matchers.containsString)25 List (java.util.List)24 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)23 KvCoder (org.apache.beam.sdk.coders.KvCoder)22 KeyedOneInputStreamOperatorTestHarness (org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness)22