
Example 36 with StateSpec

Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.

From class ParDoTest, method testValueStateDedup.

@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class })
public void testValueStateDedup() {
    final String stateId = "foo";
    DoFn<KV<Integer, Integer>, Integer> onePerKey = new DoFn<KV<Integer, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<Integer>> seenSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(stateId) ValueState<Integer> seenState) {
            Integer seen = MoreObjects.firstNonNull(seenState.read(), 0);
            if (seen == 0) {
                seenState.write(seen + 1);
                c.output(c.element().getValue());
            }
        }
    };
    int numKeys = 50;
    // A big enough list that we can see some deduping
    List<KV<Integer, Integer>> input = new ArrayList<>();
    // The output should have no dupes
    Set<Integer> expectedOutput = new HashSet<>();
    for (int key = 0; key < numKeys; ++key) {
        int output = 1000 + key;
        expectedOutput.add(output);
        for (int i = 0; i < 15; ++i) {
            input.add(KV.of(key, output));
        }
    }
    Collections.shuffle(input);
    PCollection<Integer> output = pipeline.apply(Create.of(input)).apply(ParDo.of(onePerKey));
    PAssert.that(output).containsInAnyOrder(expectedOutput);
    pipeline.run();
}
Also used : ArrayList(java.util.ArrayList) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) KV(org.apache.beam.sdk.values.KV) StateSpec(org.apache.beam.sdk.state.StateSpec) ValueState(org.apache.beam.sdk.state.ValueState) HashSet(java.util.HashSet) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
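
The test above keeps a per-key ValueState<Integer> as a seen-flag and emits each key's value only on the first element for that key. The following is a minimal sketch of the same pattern as a standalone illustration; the DoFn name, key/value types, and the use of BooleanCoder are assumptions, not taken from the Beam test suite.

// Hypothetical sketch: per-key deduplication via a boolean seen-flag
// (uses org.apache.beam.sdk.coders.BooleanCoder in addition to the classes listed above).
DoFn<KV<String, String>, String> dedupPerKey = new DoFn<KV<String, String>, String>() {

    @StateId("seen")
    private final StateSpec<ValueState<Boolean>> seenSpec = StateSpecs.value(BooleanCoder.of());

    @ProcessElement
    public void processElement(ProcessContext c, @StateId("seen") ValueState<Boolean> seen) {
        // State is scoped per key (and window), so this branch fires once per key.
        if (!Boolean.TRUE.equals(seen.read())) {
            seen.write(true);
            c.output(c.element().getValue());
        }
    }
};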

Example 37 with StateSpec

Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.

From class ParDoTest, method testSetState.

@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class, UsesSetState.class })
public void testSetState() {
    final String stateId = "foo";
    final String countStateId = "count";
    DoFn<KV<String, Integer>, Set<Integer>> fn = new DoFn<KV<String, Integer>, Set<Integer>>() {

        @StateId(stateId)
        private final StateSpec<SetState<Integer>> setState = StateSpecs.set(VarIntCoder.of());

        @StateId(countStateId)
        private final StateSpec<CombiningState<Integer, int[], Integer>> countState = StateSpecs.combiningFromInputInternal(VarIntCoder.of(), Sum.ofIntegers());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(stateId) SetState<Integer> state, @StateId(countStateId) CombiningState<Integer, int[], Integer> count) {
            state.add(c.element().getValue());
            count.add(1);
            if (count.read() >= 4) {
                Set<Integer> set = Sets.newHashSet(state.read());
                c.output(set);
            }
        }
    };
    PCollection<Set<Integer>> output = pipeline.apply(Create.of(KV.of("hello", 97), KV.of("hello", 42), KV.of("hello", 42), KV.of("hello", 12))).apply(ParDo.of(fn));
    PAssert.that(output).containsInAnyOrder(Sets.newHashSet(97, 42, 12));
    pipeline.run();
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) KV(org.apache.beam.sdk.values.KV) StateSpec(org.apache.beam.sdk.state.StateSpec) CombiningState(org.apache.beam.sdk.state.CombiningState) SetState(org.apache.beam.sdk.state.SetState) UsesSetState(org.apache.beam.sdk.testing.UsesSetState) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
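
In the test above, the SetState collects distinct values per key while the CombiningState acts as an element counter, and the set is emitted once all four inputs have been seen. The counter is declared with the internal factory StateSpecs.combiningFromInputInternal; user code would normally use the public factory instead, roughly as in this sketch (accumulator coder inference is assumed here; a coder can also be passed explicitly as the first argument).

// Sketch only: public factory for a combining counter state.
@StateId("count")
private final StateSpec<CombiningState<Integer, int[], Integer>> countSpec =
    StateSpecs.combining(Sum.ofIntegers());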

Example 38 with StateSpec

Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.

From class ParDoTest, method testCoderInferenceOfList.

@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class })
public void testCoderInferenceOfList() {
    final String stateId = "foo";
    MyIntegerCoder myIntegerCoder = MyIntegerCoder.of();
    pipeline.getCoderRegistry().registerCoderForClass(MyInteger.class, myIntegerCoder);
    DoFn<KV<String, Integer>, List<MyInteger>> fn = new DoFn<KV<String, Integer>, List<MyInteger>>() {

        @StateId(stateId)
        private final StateSpec<ValueState<List<MyInteger>>> intState = StateSpecs.value();

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(stateId) ValueState<List<MyInteger>> state) {
            MyInteger myInteger = new MyInteger(c.element().getValue());
            List<MyInteger> currentValue = state.read();
            List<MyInteger> newValue = currentValue != null ? ImmutableList.<MyInteger>builder().addAll(currentValue).add(myInteger).build() : Collections.singletonList(myInteger);
            c.output(newValue);
            state.write(newValue);
        }
    };
    pipeline.apply(Create.of(KV.of("hello", 42), KV.of("hello", 97), KV.of("hello", 84))).apply(ParDo.of(fn)).setCoder(ListCoder.of(myIntegerCoder));
    pipeline.run();
}
Also used : StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) Matchers.containsString(org.hamcrest.Matchers.containsString) KV(org.apache.beam.sdk.values.KV) StateSpec(org.apache.beam.sdk.state.StateSpec) ValueState(org.apache.beam.sdk.state.ValueState) ArrayList(java.util.ArrayList) List(java.util.List) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ImmutableList(com.google.common.collect.ImmutableList) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
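
The test above leans on coder inference: StateSpecs.value() is called without a coder, so the coder for ValueState<List<MyInteger>> is resolved from the registry at pipeline construction time, which is why MyIntegerCoder is registered first. An equivalent declaration can name the coder explicitly, as in this sketch (an alternative form, not part of the test above).

// Alternative sketch: spell out the state coder instead of relying on inference.
@StateId("foo")
private final StateSpec<ValueState<List<MyInteger>>> intState =
    StateSpecs.value(ListCoder.of(MyIntegerCoder.of()));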

Example 39 with StateSpec

Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.

From class DoFnOperatorTest, method testLateDroppingForStatefulFn.

@Test
public void testLateDroppingForStatefulFn() throws Exception {
    WindowingStrategy<Object, IntervalWindow> windowingStrategy = WindowingStrategy.of(FixedWindows.of(Duration.millis(10)));
    DoFn<Integer, String> fn = new DoFn<Integer, String>() {

        @StateId("state")
        private final StateSpec<ValueState<String>> stateSpec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void processElement(ProcessContext context) {
            context.output(context.element().toString());
        }
    };
    VarIntCoder keyCoder = VarIntCoder.of();
    Coder<WindowedValue<Integer>> inputCoder = WindowedValue.getFullCoder(keyCoder, windowingStrategy.getWindowFn().windowCoder());
    Coder<WindowedValue<String>> outputCoder = WindowedValue.getFullCoder(StringUtf8Coder.of(), windowingStrategy.getWindowFn().windowCoder());
    KeySelector<WindowedValue<Integer>, ByteBuffer> keySelector = e -> FlinkKeyUtils.encodeKey(e.getValue(), keyCoder);
    TupleTag<String> outputTag = new TupleTag<>("main-output");
    DoFnOperator<Integer, String> doFnOperator = new DoFnOperator<>(fn, "stepName", inputCoder, Collections.emptyMap(), outputTag, Collections.emptyList(), new DoFnOperator.MultiOutputOutputManagerFactory<>(outputTag, outputCoder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())), windowingStrategy, new HashMap<>(), /* side-input mapping */
    Collections.emptyList(), /* side inputs */
    FlinkPipelineOptions.defaults(), keyCoder, /* key coder */
    keySelector, DoFnSchemaInformation.create(), Collections.emptyMap());
    OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<String>> testHarness = new KeyedOneInputStreamOperatorTestHarness<>(doFnOperator, keySelector, new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));
    testHarness.open();
    testHarness.processWatermark(0);
    IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
    // this should not be late
    testHarness.processElement(new StreamRecord<>(WindowedValue.of(13, new Instant(0), window1, PaneInfo.NO_FIRING)));
    assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(WindowedValue.of("13", new Instant(0), window1, PaneInfo.NO_FIRING)));
    testHarness.getOutput().clear();
    testHarness.processWatermark(9);
    // this should still not be considered late
    testHarness.processElement(new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
    assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), contains(WindowedValue.of("17", new Instant(0), window1, PaneInfo.NO_FIRING)));
    testHarness.getOutput().clear();
    testHarness.processWatermark(10);
    // this should now be considered late
    testHarness.processElement(new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
    assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), emptyIterable());
    testHarness.close();
}
Also used : VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) TupleTag(org.apache.beam.sdk.values.TupleTag) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) StateSpec(org.apache.beam.sdk.state.StateSpec) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Instant(org.joda.time.Instant) ByteBuffer(java.nio.ByteBuffer) DoFn(org.apache.beam.sdk.transforms.DoFn) Test(org.junit.Test)
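
The test above drives the Flink DoFnOperator through a keyed test harness and checks that, once the input watermark passes the end of the 10 ms fixed window, further elements assigned to that window are dropped for the stateful DoFn. A small standalone sketch of the window arithmetic the assertions rely on (illustrative only, not part of the test):

// Sketch: FixedWindows of 10 ms place timestamp 0 into the window [0, 10);
// the window's maxTimestamp() is 9, so a watermark of 10 is already past it.
WindowingStrategy<Object, IntervalWindow> strategy =
    WindowingStrategy.of(FixedWindows.of(Duration.millis(10)));
IntervalWindow window = new IntervalWindow(new Instant(0), Duration.millis(10));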

Example 40 with StateSpec

Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.

From class PortableStateExecutionTest, method testExecution.

@Test(timeout = 120_000)
public void testExecution() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.fromArgs("--experiments=beam_fn_api").create();
    options.setRunner(CrashingRunner.class);
    options.as(FlinkPipelineOptions.class).setFlinkMaster("[local]");
    options.as(FlinkPipelineOptions.class).setStreaming(isStreaming);
    options.as(FlinkPipelineOptions.class).setParallelism(2);
    options.as(PortablePipelineOptions.class).setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
    Pipeline p = Pipeline.create(options);
    PCollection<KV<String, String>> output = p.apply(Impulse.create()).apply(ParDo.of(new DoFn<byte[], KV<String, Integer>>() {

        @ProcessElement
        public void process(ProcessContext ctx) {
            // Values == -1 will clear the state
            ctx.output(KV.of("clearedState", 1));
            ctx.output(KV.of("clearedState", CLEAR_STATE));
            // values >= 1 will be added on top of each other
            ctx.output(KV.of("bla1", 42));
            ctx.output(KV.of("bla", 23));
            ctx.output(KV.of("bla2", 64));
            ctx.output(KV.of("bla", 1));
            ctx.output(KV.of("bla", 1));
            // values == -2 will write the current state to the output
            ctx.output(KV.of("bla", WRITE_STATE));
            ctx.output(KV.of("bla1", WRITE_STATE));
            ctx.output(KV.of("bla2", WRITE_STATE));
            ctx.output(KV.of("clearedState", WRITE_STATE));
        }
    })).apply("statefulDoFn", ParDo.of(new DoFn<KV<String, Integer>, KV<String, String>>() {

        @StateId("valueState")
        private final StateSpec<ValueState<Integer>> valueStateSpec = StateSpecs.value(VarIntCoder.of());

        @StateId("valueState2")
        private final StateSpec<ValueState<Integer>> valueStateSpec2 = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void process(ProcessContext ctx, @StateId("valueState") ValueState<Integer> valueState, @StateId("valueState2") ValueState<Integer> valueState2) {
            performStateUpdates(ctx, valueState);
            performStateUpdates(ctx, valueState2);
        }

        private void performStateUpdates(ProcessContext ctx, ValueState<Integer> valueState) {
            Integer value = ctx.element().getValue();
            if (value == null) {
                throw new IllegalStateException();
            }
            switch(value) {
                case CLEAR_STATE:
                    valueState.clear();
                    break;
                case WRITE_STATE:
                    Integer read = valueState.read();
                    ctx.output(KV.of(ctx.element().getKey(), read == null ? "null" : read.toString()));
                    break;
                default:
                    Integer currentState = valueState.read();
                    if (currentState == null) {
                        currentState = value;
                    } else {
                        currentState += value;
                    }
                    valueState.write(currentState);
            }
        }
    }));
    PAssert.that(output).containsInAnyOrder(KV.of("bla", "25"), KV.of("bla1", "42"), KV.of("bla2", "64"), KV.of("clearedState", "null"), KV.of("bla", "25"), KV.of("bla1", "42"), KV.of("bla2", "64"), KV.of("clearedState", "null"));
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
    JobInvocation jobInvocation = FlinkJobInvoker.create(null).createJobInvocation("id", "none", flinkJobExecutor, pipelineProto, options.as(FlinkPipelineOptions.class), new FlinkPipelineRunner(options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
    jobInvocation.start();
    while (jobInvocation.getState() != JobState.Enum.DONE) {
        Thread.sleep(1000);
    }
}
Also used : JobInvocation(org.apache.beam.runners.jobsubmission.JobInvocation) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) StateSpec(org.apache.beam.sdk.state.StateSpec) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) ValueState(org.apache.beam.sdk.state.ValueState) PortablePipelineOptions(org.apache.beam.sdk.options.PortablePipelineOptions) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) PortablePipelineOptions(org.apache.beam.sdk.options.PortablePipelineOptions) Test(org.junit.Test)
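
The test above runs through the portable Flink runner with an embedded environment, and each expected KV appears twice in the PAssert because the stateful DoFn applies the same updates to two independent value states (valueState and valueState2), each of which emits on WRITE_STATE. The accumulation branch boils down to the running-sum pattern sketched below as a standalone illustration; the DoFn and state names are hypothetical, not taken from the test.

// Hypothetical sketch: a stateful DoFn keeping a running per-key sum in a ValueState.
DoFn<KV<String, Integer>, KV<String, Integer>> runningSum =
    new DoFn<KV<String, Integer>, KV<String, Integer>>() {

        @StateId("sum")
        private final StateSpec<ValueState<Integer>> sumSpec = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId("sum") ValueState<Integer> sum) {
            Integer current = sum.read();
            int updated = (current == null ? 0 : current) + c.element().getValue();
            sum.write(updated);
            c.output(KV.of(c.element().getKey(), updated));
        }
    };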

Aggregations

StateSpec (org.apache.beam.sdk.state.StateSpec) 47
Test (org.junit.Test) 38
KV (org.apache.beam.sdk.values.KV) 35
Matchers.containsString (org.hamcrest.Matchers.containsString) 24
Category (org.junit.experimental.categories.Category) 24
StringUtils.byteArrayToJsonString (org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) 23
ValueState (org.apache.beam.sdk.state.ValueState) 21
DoFn (org.apache.beam.sdk.transforms.DoFn) 19
ArrayList (java.util.ArrayList) 10
CombiningState (org.apache.beam.sdk.state.CombiningState) 10
Pipeline (org.apache.beam.sdk.Pipeline) 9
List (java.util.List) 8
TupleTag (org.apache.beam.sdk.values.TupleTag) 8
Instant (org.joda.time.Instant) 8
BagState (org.apache.beam.sdk.state.BagState) 7
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) 7
TupleTagList (org.apache.beam.sdk.values.TupleTagList) 7
MapState (org.apache.beam.sdk.state.MapState) 6
SetState (org.apache.beam.sdk.state.SetState) 6
Timer (org.apache.beam.sdk.state.Timer) 6