use of org.apache.beam.sdk.values.KV in project beam by apache.
the class GroupIntoBatchesTest method testInStreamingMode.
/**
 * Runs {@code GroupIntoBatches} over a {@code TestStream} of the shared {@code data} elements,
 * split into fixed windows of 6 seconds, and checks both the number of batches emitted per
 * window and the size of each batch.
 *
 * <p>Elements are stamped one second apart starting at instant 0. The expected values in the
 * assertions assume {@code BATCH_SIZE} is 5 and {@code data} holds 10 same-keyed elements; both
 * are declared outside this method (NOTE(review): confirm against the class constants).
 */
@Test
@Category({ NeedsRunner.class, UsesTimersInParDo.class, UsesTestStream.class, UsesStatefulParDo.class })
public void testInStreamingMode() {
    int timestampInterval = 1;
    Instant startInstant = new Instant(0L);
    TestStream.Builder<KV<String, String>> streamBuilder = TestStream.create(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())).advanceWatermarkTo(startInstant);
    // Give each element a timestamp one interval (1 second) after the previous one.
    long offset = 0L;
    for (KV<String, String> element : data) {
        streamBuilder = streamBuilder.addElements(TimestampedValue.of(element, startInstant.plus(Duration.standardSeconds(offset * timestampInterval))));
        offset++;
    }
    final long windowDuration = 6;
    // Step the watermark to just before the first window boundary, just past it, then to
    // NUM_ELEMENTS seconds, and finally to infinity to close the stream.
    TestStream<KV<String, String>> stream = streamBuilder.advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(windowDuration - 1))).advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(windowDuration + 1))).advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(NUM_ELEMENTS))).advanceWatermarkToInfinity();
    PCollection<KV<String, String>> inputCollection = pipeline.apply(stream).apply(Window.<KV<String, String>>into(FixedWindows.of(Duration.standardSeconds(windowDuration))).withAllowedLateness(Duration.millis(ALLOWED_LATENESS)));
    // Debug-only side branch: logs every element with its timestamp and window; its output is unused.
    inputCollection.apply(ParDo.of(new DoFn<KV<String, String>, Void>() {
        @ProcessElement
        public void processElement(ProcessContext c, BoundedWindow window) {
            // All four arguments use '{}' placeholders; the previous '%s' markers were never
            // substituted by the logger, so timestamp and window were missing from the message.
            LOG.debug("*** ELEMENT: ({},{}) *** with timestamp {} in window {}", c.element().getKey(), c.element().getValue(), c.timestamp(), window);
        }
    }));
    PCollection<KV<String, Iterable<String>>> outputCollection = inputCollection.apply(GroupIntoBatches.<String, String>ofSize(BATCH_SIZE)).setCoder(KvCoder.of(StringUtf8Coder.of(), IterableCoder.of(StringUtf8Coder.of())));
    // elements have the same key and collection is divided into windows,
    // so Count.perKey values are the number of elements in windows
    PCollection<KV<String, Long>> countOutput = outputCollection.apply("Count elements in windows after applying GroupIntoBatches", Count.<String, Iterable<String>>perKey());
    // NOTE(review): both checks below read the results in iteration order and expect the
    // first window's output to come first - confirm the runner guarantees that ordering.
    PAssert.that("Wrong number of elements in windows after GroupIntoBatches", countOutput).satisfies(new SerializableFunction<Iterable<KV<String, Long>>, Void>() {
        @Override
        public Void apply(Iterable<KV<String, Long>> input) {
            Iterator<KV<String, Long>> inputIterator = input.iterator();
            // first element
            long count0 = inputIterator.next().getValue();
            // window duration is 6 and batch size is 5, so there should be 2 elements in the
            // window (flush because batchSize reached and for end of window reached)
            assertEquals("Wrong number of elements in first window", 2, count0);
            // second element
            long count1 = inputIterator.next().getValue();
            // collection is 10 elements, there is only 4 elements left, so there should be only
            // one element in the window (flush because end of window/collection reached)
            assertEquals("Wrong number of elements in second window", 1, count1);
            return null;
        }
    });
    PAssert.that("Incorrect output collection after GroupIntoBatches", outputCollection).satisfies(new SerializableFunction<Iterable<KV<String, Iterable<String>>>, Void>() {
        @Override
        public Void apply(Iterable<KV<String, Iterable<String>>> input) {
            Iterator<KV<String, Iterable<String>>> inputIterator = input.iterator();
            // first element
            int size0 = Iterables.size(inputIterator.next().getValue());
            // window duration is 6 and batch size is 5, so output batch size should be 5
            // (flush because of batchSize reached)
            assertEquals("Wrong first element batch Size", 5, size0);
            // second element
            int size1 = Iterables.size(inputIterator.next().getValue());
            // there is only one element left in the window so batch size should be 1
            // (flush because of end of window reached)
            assertEquals("Wrong second element batch Size", 1, size1);
            // third element
            int size2 = Iterables.size(inputIterator.next().getValue());
            // collection is 10 elements, there is only 4 left, so batch size should be 4
            // (flush because end of collection reached)
            assertEquals("Wrong third element batch Size", 4, size2);
            return null;
        }
    });
    pipeline.run().waitUntilFinish();
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class ParDoTest method testValueStateFixedWindows.
/**
 * Checks that a {@code ValueState} cell used by a stateful {@code DoFn} is tracked separately
 * for each fixed window: the per-key counter starts again from 0 in the second window.
 */
@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class })
public void testValueStateFixedWindows() {
    final String counterId = "foo";

    // Emits the number of elements seen so far, then increments the stored counter.
    DoFn<KV<String, Integer>, Integer> countingFn = new DoFn<KV<String, Integer>, Integer>() {
        @StateId(counterId)
        private final StateSpec<ValueState<Integer>> intState = StateSpecs.value(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(counterId) ValueState<Integer> counter) {
            // An unset cell reads as null, which is treated as zero.
            Integer seenSoFar = MoreObjects.firstNonNull(counter.read(), 0);
            c.output(seenSoFar);
            counter.write(seenSoFar + 1);
        }
    };

    // Three elements with timestamps in [0, 10) and two with timestamps in [10, 20).
    PCollection<KV<String, Integer>> timestampedInput = pipeline.apply(Create.timestamped(
        TimestampedValue.of(KV.of("hello", 7), new Instant(1)),
        TimestampedValue.of(KV.of("hello", 14), new Instant(2)),
        TimestampedValue.of(KV.of("hello", 21), new Instant(3)),
        TimestampedValue.of(KV.of("hello", 28), new Instant(11)),
        TimestampedValue.of(KV.of("hello", 35), new Instant(13))));
    PCollection<KV<String, Integer>> windowedInput = timestampedInput.apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.millis(10))));
    PCollection<Integer> output = windowedInput.apply("Stateful ParDo", ParDo.of(countingFn));

    IntervalWindow windowOne = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow windowTwo = new IntervalWindow(new Instant(10), new Instant(20));
    PAssert.that(output).inWindow(windowOne).containsInAnyOrder(0, 1, 2);
    PAssert.that(output).inWindow(windowTwo).containsInAnyOrder(0, 1);

    pipeline.run();
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class ParDoTest method testSetStateCoderInferenceFailure.
/**
 * Expects pipeline construction/execution to fail with a {@code RuntimeException} when a
 * {@code SetState} spec is declared without a coder and no coder for {@code MyInteger} has been
 * registered with the pipeline's coder registry in this test.
 */
@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class, UsesSetState.class })
public void testSetStateCoderInferenceFailure() throws Exception {
    final String setId = "foo";
    final String counterId = "count";
    Coder<MyInteger> myIntegerCoder = MyIntegerCoder.of();

    DoFn<KV<String, Integer>, Set<MyInteger>> fn = new DoFn<KV<String, Integer>, Set<MyInteger>>() {
        // Deliberately created without an element coder.
        @StateId(setId)
        private final StateSpec<SetState<MyInteger>> setState = StateSpecs.set();

        @StateId(counterId)
        private final StateSpec<CombiningState<Integer, int[], Integer>> countState = StateSpecs.combiningFromInputInternal(VarIntCoder.of(), Sum.ofIntegers());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(setId) SetState<MyInteger> state, @StateId(counterId) CombiningState<Integer, int[], Integer> count) {
            state.add(new MyInteger(c.element().getValue()));
            count.add(1);
            // Nothing is emitted until at least four elements have been counted.
            if (count.read() < 4) {
                return;
            }
            c.output(Sets.newHashSet(state.read()));
        }
    };

    thrown.expect(RuntimeException.class);
    thrown.expectMessage("Unable to infer a coder for SetState and no Coder was specified.");

    PCollection<KV<String, Integer>> input = pipeline.apply(Create.of(KV.of("hello", 97), KV.of("hello", 42), KV.of("hello", 42), KV.of("hello", 12)));
    input.apply(ParDo.of(fn)).setCoder(SetCoder.of(myIntegerCoder));
    pipeline.run();
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class ParDoTest method testMapStateCoderInference.
/**
 * Checks that a {@code MapState} spec declared without explicit coders works once a coder for
 * {@code MyInteger} has been registered with the pipeline's coder registry.
 *
 * <p>Each input's inner key/value is stored in the map; after the fourth element the map
 * entries are emitted, so the duplicate "b" entry appears only once in the output.
 */
@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class, UsesMapState.class })
public void testMapStateCoderInference() {
    final String mapId = "foo";
    final String counterId = "count";
    Coder<MyInteger> myIntegerCoder = MyIntegerCoder.of();
    pipeline.getCoderRegistry().registerCoderForClass(MyInteger.class, myIntegerCoder);

    DoFn<KV<String, KV<String, Integer>>, KV<String, MyInteger>> fn = new DoFn<KV<String, KV<String, Integer>>, KV<String, MyInteger>>() {
        // No coders are passed here.
        @StateId(mapId)
        private final StateSpec<MapState<String, MyInteger>> mapState = StateSpecs.map();

        @StateId(counterId)
        private final StateSpec<CombiningState<Integer, int[], Integer>> countState = StateSpecs.combiningFromInputInternal(VarIntCoder.of(), Sum.ofIntegers());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(mapId) MapState<String, MyInteger> state, @StateId(counterId) CombiningState<Integer, int[], Integer> count) {
            KV<String, Integer> pair = c.element().getValue();
            state.put(pair.getKey(), new MyInteger(pair.getValue()));
            count.add(1);
            // Hold back output until at least four elements have been counted.
            if (count.read() < 4) {
                return;
            }
            for (Map.Entry<String, MyInteger> stored : state.entries().read()) {
                c.output(KV.of(stored.getKey(), stored.getValue()));
            }
        }
    };

    PCollection<KV<String, KV<String, Integer>>> input = pipeline.apply(Create.of(KV.of("hello", KV.of("a", 97)), KV.of("hello", KV.of("b", 42)), KV.of("hello", KV.of("b", 42)), KV.of("hello", KV.of("c", 12))));
    PCollection<KV<String, MyInteger>> output = input.apply(ParDo.of(fn)).setCoder(KvCoder.of(StringUtf8Coder.of(), myIntegerCoder));

    PAssert.that(output).containsInAnyOrder(KV.of("a", new MyInteger(97)), KV.of("b", new MyInteger(42)), KV.of("c", new MyInteger(12)));
    pipeline.run();
}
use of org.apache.beam.sdk.values.KV in project beam by apache.
the class ParDoTest method testCombiningStateCoderInference.
/**
 * Checks that a {@code CombiningState} whose accumulator type is {@code MyInteger} works when
 * only the {@code CombineFn} is supplied and a coder for {@code MyInteger} has been registered
 * with the pipeline's coder registry.
 *
 * <p>The inputs 3, 6 and 7 are summed per key; the DoFn emits "right on" whenever the running
 * sum read back from state equals 16.
 */
@Test
@Category({ ValidatesRunner.class, UsesStatefulParDo.class })
public void testCombiningStateCoderInference() {
    pipeline.getCoderRegistry().registerCoderForClass(MyInteger.class, MyIntegerCoder.of());
    final String sumId = "foo";

    DoFn<KV<String, Integer>, String> fn = new DoFn<KV<String, Integer>, String>() {
        private static final int EXPECTED_SUM = 16;

        // Integer-summing CombineFn that keeps its accumulator as a MyInteger.
        @StateId(sumId)
        private final StateSpec<CombiningState<Integer, MyInteger, Integer>> combiningState = StateSpecs.combining(new Combine.CombineFn<Integer, MyInteger, Integer>() {
            @Override
            public MyInteger createAccumulator() {
                return new MyInteger(0);
            }

            @Override
            public MyInteger addInput(MyInteger accumulator, Integer input) {
                int updated = accumulator.getValue() + input;
                return new MyInteger(updated);
            }

            @Override
            public MyInteger mergeAccumulators(Iterable<MyInteger> accumulators) {
                int total = 0;
                for (MyInteger partial : accumulators) {
                    total += partial.getValue();
                }
                return new MyInteger(total);
            }

            @Override
            public Integer extractOutput(MyInteger accumulator) {
                return accumulator.getValue();
            }
        });

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(sumId) CombiningState<Integer, MyInteger, Integer> state) {
            state.add(c.element().getValue());
            // Compare the running sum (unboxed) with the expected total.
            if (state.read() == EXPECTED_SUM) {
                c.output("right on");
            }
        }
    };

    PCollection<KV<String, Integer>> input = pipeline.apply(Create.of(KV.of("hello", 3), KV.of("hello", 6), KV.of("hello", 7)));
    PCollection<String> output = input.apply(ParDo.of(fn));

    // The running sum equals 16 exactly once: after all three inputs have been added.
    PAssert.that(output).containsInAnyOrder("right on");
    pipeline.run();
}
Aggregations