Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.
Class ParDoTest, method testValueStateDedup: a stateful ParDo deduplicates a shuffled input per key by keeping a "seen" flag in ValueState<Integer> and emitting only the first value observed for each key.
@Test
@Category({ValidatesRunner.class, UsesStatefulParDo.class})
public void testValueStateDedup() {
  final String stateId = "foo";

  DoFn<KV<Integer, Integer>, Integer> onePerKey =
      new DoFn<KV<Integer, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<Integer>> seenSpec =
            StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(
            ProcessContext c, @StateId(stateId) ValueState<Integer> seenState) {
          Integer seen = MoreObjects.firstNonNull(seenState.read(), 0);
          if (seen == 0) {
            seenState.write(seen + 1);
            c.output(c.element().getValue());
          }
        }
      };

  int numKeys = 50;
  // A big enough list that we can see some deduping
  List<KV<Integer, Integer>> input = new ArrayList<>();
  // The output should have no dupes
  Set<Integer> expectedOutput = new HashSet<>();
  for (int key = 0; key < numKeys; ++key) {
    int output = 1000 + key;
    expectedOutput.add(output);
    for (int i = 0; i < 15; ++i) {
      input.add(KV.of(key, output));
    }
  }
  Collections.shuffle(input);

  PCollection<Integer> output = pipeline.apply(Create.of(input)).apply(ParDo.of(onePerKey));
  PAssert.that(output).containsInAnyOrder(expectedOutput);
  pipeline.run();
}
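State in a stateful ParDo is scoped per key and window, so a runner keeps the "seen" flag until the window expires. When deduplicating over a long-lived window (or the global window), the usual companion pattern is an event-time timer that clears the state once the window can no longer receive data. Below is a minimal sketch of that pattern using Beam's TimerSpec/Timer API; the DoFn name onePerKeyWithGc and the timer id "gc" are illustrative, not from the test above.

// Hypothetical sketch: the dedup state above, plus an event-time timer to garbage-collect it.
DoFn<KV<Integer, Integer>, Integer> onePerKeyWithGc =
    new DoFn<KV<Integer, Integer>, Integer>() {

      @StateId("seen")
      private final StateSpec<ValueState<Integer>> seenSpec = StateSpecs.value(VarIntCoder.of());

      @TimerId("gc")
      private final TimerSpec gcSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME);

      @ProcessElement
      public void processElement(
          ProcessContext c,
          BoundedWindow window,
          @StateId("seen") ValueState<Integer> seenState,
          @TimerId("gc") Timer gc) {
        // Fire once the watermark passes the end of the window.
        gc.set(window.maxTimestamp());
        if (MoreObjects.firstNonNull(seenState.read(), 0) == 0) {
          seenState.write(1);
          c.output(c.element().getValue());
        }
      }

      @OnTimer("gc")
      public void onGc(@StateId("seen") ValueState<Integer> seenState) {
        // Drop the per-key flag so the runner does not retain it indefinitely.
        seenState.clear();
      }
    };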
Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.
Class ParDoTest, method testSetState: a SetState<Integer> collects values while a CombiningState counter tracks how many elements have arrived, so the deduplicated set is emitted once all four inputs for the key have been processed.
@Test
@Category({ValidatesRunner.class, UsesStatefulParDo.class, UsesSetState.class})
public void testSetState() {
  final String stateId = "foo";
  final String countStateId = "count";

  DoFn<KV<String, Integer>, Set<Integer>> fn =
      new DoFn<KV<String, Integer>, Set<Integer>>() {

        @StateId(stateId)
        private final StateSpec<SetState<Integer>> setState = StateSpecs.set(VarIntCoder.of());

        @StateId(countStateId)
        private final StateSpec<CombiningState<Integer, int[], Integer>> countState =
            StateSpecs.combiningFromInputInternal(VarIntCoder.of(), Sum.ofIntegers());

        @ProcessElement
        public void processElement(
            ProcessContext c,
            @StateId(stateId) SetState<Integer> state,
            @StateId(countStateId) CombiningState<Integer, int[], Integer> count) {
          state.add(c.element().getValue());
          count.add(1);
          // Once all four elements for the key have arrived, emit the set;
          // the duplicate 42 has been collapsed by SetState.
          if (count.read() >= 4) {
            Set<Integer> set = Sets.newHashSet(state.read());
            c.output(set);
          }
        }
      };

  PCollection<Set<Integer>> output =
      pipeline
          .apply(
              Create.of(
                  KV.of("hello", 97), KV.of("hello", 42), KV.of("hello", 42), KV.of("hello", 12)))
          .apply(ParDo.of(fn));

  PAssert.that(output).containsInAnyOrder(Sets.newHashSet(97, 42, 12));
  pipeline.run();
}
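Note that combiningFromInputInternal is an internal convenience used by Beam's own tests. Outside the SDK, the public factory is StateSpecs.combining, which takes the CombineFn directly and infers the accumulator coder. A minimal equivalent declaration for the counter above, assuming the same Sum.ofIntegers() combine function, would be:

@StateId("count")
private final StateSpec<CombiningState<Integer, int[], Integer>> countState =
    StateSpecs.combining(Sum.ofIntegers());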
Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.
Class ParDoTest, method testCoderInferenceOfList: a coder for MyInteger is registered with the coder registry, and the StateSpec is declared with StateSpecs.value() so the coder for ValueState<List<MyInteger>> must be inferred.
@Test
@Category({ValidatesRunner.class, UsesStatefulParDo.class})
public void testCoderInferenceOfList() {
  final String stateId = "foo";
  MyIntegerCoder myIntegerCoder = MyIntegerCoder.of();
  pipeline.getCoderRegistry().registerCoderForClass(MyInteger.class, myIntegerCoder);

  DoFn<KV<String, Integer>, List<MyInteger>> fn =
      new DoFn<KV<String, Integer>, List<MyInteger>>() {

        // No coder is given here; it must be inferred via the registered MyIntegerCoder.
        @StateId(stateId)
        private final StateSpec<ValueState<List<MyInteger>>> intState = StateSpecs.value();

        @ProcessElement
        public void processElement(
            ProcessContext c, @StateId(stateId) ValueState<List<MyInteger>> state) {
          MyInteger myInteger = new MyInteger(c.element().getValue());
          List<MyInteger> currentValue = state.read();
          List<MyInteger> newValue =
              currentValue != null
                  ? ImmutableList.<MyInteger>builder().addAll(currentValue).add(myInteger).build()
                  : Collections.singletonList(myInteger);
          c.output(newValue);
          state.write(newValue);
        }
      };

  pipeline
      .apply(Create.of(KV.of("hello", 42), KV.of("hello", 97), KV.of("hello", 84)))
      .apply(ParDo.of(fn))
      .setCoder(ListCoder.of(myIntegerCoder));
  pipeline.run();
}
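If coder inference is not what you want to exercise, the same state cell can be declared with an explicit coder, which also removes the dependence on the registry entry. A sketch, reusing ListCoder and the MyIntegerCoder from the test:

@StateId("foo")
private final StateSpec<ValueState<List<MyInteger>>> intState =
    StateSpecs.value(ListCoder.of(MyIntegerCoder.of()));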
Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.
Class DoFnOperatorTest, method testLateDroppingForStatefulFn: the Flink DoFnOperator is driven through a keyed test harness to verify that, for a stateful DoFn, elements arriving after the watermark has passed the end of their window are dropped.
@Test
public void testLateDroppingForStatefulFn() throws Exception {
  WindowingStrategy<Object, IntervalWindow> windowingStrategy =
      WindowingStrategy.of(FixedWindows.of(Duration.millis(10)));

  DoFn<Integer, String> fn =
      new DoFn<Integer, String>() {

        @StateId("state")
        private final StateSpec<ValueState<String>> stateSpec =
            StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void processElement(ProcessContext context) {
          context.output(context.element().toString());
        }
      };

  VarIntCoder keyCoder = VarIntCoder.of();
  Coder<WindowedValue<Integer>> inputCoder =
      WindowedValue.getFullCoder(keyCoder, windowingStrategy.getWindowFn().windowCoder());
  Coder<WindowedValue<String>> outputCoder =
      WindowedValue.getFullCoder(
          StringUtf8Coder.of(), windowingStrategy.getWindowFn().windowCoder());

  KeySelector<WindowedValue<Integer>, ByteBuffer> keySelector =
      e -> FlinkKeyUtils.encodeKey(e.getValue(), keyCoder);

  TupleTag<String> outputTag = new TupleTag<>("main-output");

  DoFnOperator<Integer, String> doFnOperator =
      new DoFnOperator<>(
          fn,
          "stepName",
          inputCoder,
          Collections.emptyMap(),
          outputTag,
          Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(
              outputTag,
              outputCoder,
              new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
          windowingStrategy,
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          FlinkPipelineOptions.defaults(),
          keyCoder, /* key coder */
          keySelector,
          DoFnSchemaInformation.create(),
          Collections.emptyMap());

  OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<String>> testHarness =
      new KeyedOneInputStreamOperatorTestHarness<>(
          doFnOperator,
          keySelector,
          new CoderTypeInformation<>(
              FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));

  testHarness.open();
  testHarness.processWatermark(0);

  IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));

  // this should not be late
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.of(13, new Instant(0), window1, PaneInfo.NO_FIRING)));
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(WindowedValue.of("13", new Instant(0), window1, PaneInfo.NO_FIRING)));
  testHarness.getOutput().clear();

  testHarness.processWatermark(9);

  // this should still not be considered late
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(WindowedValue.of("17", new Instant(0), window1, PaneInfo.NO_FIRING)));
  testHarness.getOutput().clear();

  testHarness.processWatermark(10);

  // this should now be considered late
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
  assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), emptyIterable());

  testHarness.close();
}
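The dropping boundary follows the windowing strategy's allowed lateness, which defaults to zero here, so the window closes exactly when the watermark reaches millisecond 10. In a pipeline, the equivalent knob is Window.withAllowedLateness. A sketch of how the same 10 ms windows would tolerate some late data; the PCollection named input and the 5 ms lateness are chosen for illustration:

PCollection<Integer> windowed =
    input.apply(
        Window.<Integer>into(FixedWindows.of(Duration.millis(10)))
            // Elements up to 5 ms behind the watermark are still processed
            // instead of being dropped.
            .withAllowedLateness(Duration.millis(5)));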
Use of org.apache.beam.sdk.state.StateSpec in project beam by apache.
Class PortableStateExecutionTest, method testExecution: a portable pipeline runs on the Flink runner in an embedded environment; a stateful DoFn interprets sentinel values to clear, accumulate into, or emit two ValueState cells.
@Test(timeout = 120_000)
public void testExecution() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.fromArgs("--experiments=beam_fn_api").create();
  options.setRunner(CrashingRunner.class);
  options.as(FlinkPipelineOptions.class).setFlinkMaster("[local]");
  options.as(FlinkPipelineOptions.class).setStreaming(isStreaming);
  options.as(FlinkPipelineOptions.class).setParallelism(2);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);

  Pipeline p = Pipeline.create(options);

  PCollection<KV<String, String>> output =
      p.apply(Impulse.create())
          .apply(
              ParDo.of(
                  new DoFn<byte[], KV<String, Integer>>() {
                    @ProcessElement
                    public void process(ProcessContext ctx) {
                      ctx.output(KV.of("clearedState", 1));
                      // Values == CLEAR_STATE (-1) will clear the state
                      ctx.output(KV.of("clearedState", CLEAR_STATE));
                      // Values >= 1 will be added on top of each other
                      ctx.output(KV.of("bla1", 42));
                      ctx.output(KV.of("bla", 23));
                      ctx.output(KV.of("bla2", 64));
                      ctx.output(KV.of("bla", 1));
                      ctx.output(KV.of("bla", 1));
                      // Values == WRITE_STATE (-2) will write the current state to the output
                      ctx.output(KV.of("bla", WRITE_STATE));
                      ctx.output(KV.of("bla1", WRITE_STATE));
                      ctx.output(KV.of("bla2", WRITE_STATE));
                      ctx.output(KV.of("clearedState", WRITE_STATE));
                    }
                  }))
          .apply(
              "statefulDoFn",
              ParDo.of(
                  new DoFn<KV<String, Integer>, KV<String, String>>() {

                    @StateId("valueState")
                    private final StateSpec<ValueState<Integer>> valueStateSpec =
                        StateSpecs.value(VarIntCoder.of());

                    @StateId("valueState2")
                    private final StateSpec<ValueState<Integer>> valueStateSpec2 =
                        StateSpecs.value(VarIntCoder.of());

                    @ProcessElement
                    public void process(
                        ProcessContext ctx,
                        @StateId("valueState") ValueState<Integer> valueState,
                        @StateId("valueState2") ValueState<Integer> valueState2) {
                      // The same updates are applied to both state cells, so every
                      // WRITE_STATE element produces two identical outputs per key.
                      performStateUpdates(ctx, valueState);
                      performStateUpdates(ctx, valueState2);
                    }

                    private void performStateUpdates(
                        ProcessContext ctx, ValueState<Integer> valueState) {
                      Integer value = ctx.element().getValue();
                      if (value == null) {
                        throw new IllegalStateException();
                      }
                      switch (value) {
                        case CLEAR_STATE:
                          valueState.clear();
                          break;
                        case WRITE_STATE:
                          Integer read = valueState.read();
                          ctx.output(
                              KV.of(
                                  ctx.element().getKey(),
                                  read == null ? "null" : read.toString()));
                          break;
                        default:
                          Integer currentState = valueState.read();
                          if (currentState == null) {
                            currentState = value;
                          } else {
                            currentState += value;
                          }
                          valueState.write(currentState);
                      }
                    }
                  }));

  // Each expected pair appears twice because both state cells emit on WRITE_STATE.
  PAssert.that(output)
      .containsInAnyOrder(
          KV.of("bla", "25"),
          KV.of("bla1", "42"),
          KV.of("bla2", "64"),
          KV.of("clearedState", "null"),
          KV.of("bla", "25"),
          KV.of("bla1", "42"),
          KV.of("bla2", "64"),
          KV.of("clearedState", "null"));

  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
  JobInvocation jobInvocation =
      FlinkJobInvoker.create(null)
          .createJobInvocation(
              "id",
              "none",
              flinkJobExecutor,
              pipelineProto,
              options.as(FlinkPipelineOptions.class),
              new FlinkPipelineRunner(
                  options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
  jobInvocation.start();
  while (jobInvocation.getState() != JobState.Enum.DONE) {
    Thread.sleep(1000);
  }
}
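The sentinel constants are declared elsewhere in PortableStateExecutionTest and are not part of this excerpt; from the comments in the producer DoFn they correspond to:

// Inferred from the comments above; the actual declarations live in the test class.
private static final int CLEAR_STATE = -1; // clears the state cell
private static final int WRITE_STATE = -2; // emits the current state as a string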