Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache:
the class DoFnOperatorTest, method testBundleKeyed.
@Test
public void testBundleKeyed() throws Exception {
  StringUtf8Coder keyCoder = StringUtf8Coder.of();
  KvToByteBufferKeySelector<String, String> keySelector =
      new KvToByteBufferKeySelector<>(
          keyCoder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults()));
  KvCoder<String, String> kvCoder = KvCoder.of(keyCoder, StringUtf8Coder.of());
  WindowedValue.ValueOnlyWindowedValueCoder<KV<String, String>> windowedValueCoder =
      WindowedValue.getValueOnlyCoder(kvCoder);
  TupleTag<String> outputTag = new TupleTag<>("main-output");

  FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
  options.setMaxBundleSize(2L);
  options.setMaxBundleTimeMills(10L);

  DoFn<KV<String, String>, String> doFn =
      new DoFn<KV<String, String>, String>() {
        @ProcessElement
        public void processElement(ProcessContext ctx) {
          // Change output type of element to test that we do not depend on the input keying
          ctx.output(ctx.element().getValue());
        }

        @FinishBundle
        public void finishBundle(FinishBundleContext context) {
          context.output("finishBundle", BoundedWindow.TIMESTAMP_MIN_VALUE, GlobalWindow.INSTANCE);
        }
      };

  DoFnOperator.MultiOutputOutputManagerFactory<String> outputManagerFactory =
      new DoFnOperator.MultiOutputOutputManagerFactory<>(
          outputTag,
          WindowedValue.getFullCoder(kvCoder.getValueCoder(), GlobalWindow.Coder.INSTANCE),
          new SerializablePipelineOptions(options));

  DoFnOperator<KV<String, String>, String> doFnOperator =
      new DoFnOperator<>(
          doFn, "stepName", windowedValueCoder, Collections.emptyMap(),
          outputTag, Collections.emptyList(), outputManagerFactory,
          WindowingStrategy.globalDefault(),
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          options, keyCoder, keySelector,
          DoFnSchemaInformation.create(), Collections.emptyMap());

  OneInputStreamOperatorTestHarness<WindowedValue<KV<String, String>>, WindowedValue<String>>
      testHarness =
          new KeyedOneInputStreamOperatorTestHarness<>(
              doFnOperator, keySelector, keySelector.getProducedType());

  testHarness.open();

  testHarness.processElement(
      new StreamRecord<>(WindowedValue.valueInGlobalWindow(KV.of("key", "a"))));
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.valueInGlobalWindow(KV.of("key", "b"))));
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.valueInGlobalWindow(KV.of("key", "c"))));

  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(
          WindowedValue.valueInGlobalWindow("a"),
          WindowedValue.valueInGlobalWindow("b"),
          WindowedValue.valueInGlobalWindow("finishBundle"),
          WindowedValue.valueInGlobalWindow("c")));

  // Take a snapshot
  OperatorSubtaskState snapshot = testHarness.snapshot(0, 0);

  // The finishBundle element will be buffered as part of finishing a bundle in snapshot()
  PushedBackElementsHandler<KV<Integer, WindowedValue<?>>> pushedBackElementsHandler =
      doFnOperator.outputManager.pushedBackElementsHandler;
  assertThat(pushedBackElementsHandler, instanceOf(NonKeyedPushedBackElementsHandler.class));
  List<KV<Integer, WindowedValue<?>>> bufferedElements =
      pushedBackElementsHandler.getElements().collect(Collectors.toList());
  assertThat(bufferedElements, contains(KV.of(0, WindowedValue.valueInGlobalWindow("finishBundle"))));

  testHarness.close();

  doFnOperator =
      new DoFnOperator<>(
          doFn, "stepName", windowedValueCoder, Collections.emptyMap(),
          outputTag, Collections.emptyList(), outputManagerFactory,
          WindowingStrategy.globalDefault(),
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          options, keyCoder, keySelector,
          DoFnSchemaInformation.create(), Collections.emptyMap());

  testHarness =
      new KeyedOneInputStreamOperatorTestHarness<>(
          doFnOperator, keySelector, keySelector.getProducedType());

  // Restore snapshot
  testHarness.initializeState(snapshot);
  testHarness.open();

  // startBundle will output the buffered elements.
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.valueInGlobalWindow(KV.of("key", "d"))));

  // check finishBundle by timeout
  testHarness.setProcessingTime(10);

  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(
          // The first finishBundle is restored from the checkpoint
          WindowedValue.valueInGlobalWindow("finishBundle"),
          WindowedValue.valueInGlobalWindow("d"),
          WindowedValue.valueInGlobalWindow("finishBundle")));

  testHarness.close();
}
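The asserted ordering follows from the two bundle options set above: with a maximum bundle size of 2, a bundle is finished after every second element, and with a maximum bundle time of 10 ms an open bundle is also finished by timeout. A minimal sketch of that decision, using only the option values from the test (this is illustrative, not DoFnOperator internals):

/**
 * Sketch of when a bundle finishes under the options above; illustrative
 * only, not the DoFnOperator implementation.
 */
static boolean shouldFinishBundle(long elementsInBundle, long millisSinceBundleStart) {
  long maxBundleSize = 2L;       // options.setMaxBundleSize(2L)
  long maxBundleTimeMills = 10L; // options.setMaxBundleTimeMills(10L)
  return elementsInBundle >= maxBundleSize || millisSinceBundleStart >= maxBundleTimeMills;
}

After "a" and "b", shouldFinishBundle(2, 0) is true, which is why "finishBundle" appears between "b" and "c" in the first assertion; after the restore, setProcessingTime(10) triggers the timeout branch and produces the final "finishBundle".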
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache:
the class DoFnOperatorTest, method testWatermarkContract.
/**
* This test specifically verifies that we correctly map Flink watermarks to Beam watermarks. In
* Beam, a watermark {@code T} guarantees there will not be elements with a timestamp {@code < T}
* in the future. In Flink, a watermark {@code T} guarantees there will not be elements with a
* timestamp {@code <= T} in the future. We have to make sure to take this into account when
* firing timers.
*
* <p>This does not test the timer API in general or processing-time timers because there are
* generic tests for this in {@code ParDoTest}.
*/
@Test
public void testWatermarkContract() throws Exception {
  final Instant timerTimestamp = new Instant(1000);
  final Instant timerOutputTimestamp = timerTimestamp.minus(Duration.millis(1));
  final String eventTimeMessage = "Event timer fired: ";
  final String processingTimeMessage = "Processing timer fired";

  WindowingStrategy<Object, IntervalWindow> windowingStrategy =
      WindowingStrategy.of(FixedWindows.of(Duration.millis(10_000)));

  final String eventTimerId = "eventTimer";
  final String eventTimerId2 = "eventTimer2";
  final String processingTimerId = "processingTimer";

  DoFn<Integer, String> fn =
      new DoFn<Integer, String>() {
        @TimerId(eventTimerId)
        private final TimerSpec eventTimer = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @TimerId(eventTimerId2)
        private final TimerSpec eventTimer2 = TimerSpecs.timer(TimeDomain.EVENT_TIME);

        @TimerId(processingTimerId)
        private final TimerSpec processingTimer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME);

        @ProcessElement
        public void processElement(
            ProcessContext context,
            @TimerId(eventTimerId) Timer eventTimer,
            @TimerId(eventTimerId2) Timer eventTimerWithOutputTimestamp,
            @TimerId(processingTimerId) Timer processingTimer) {
          eventTimer.set(timerTimestamp);
          eventTimerWithOutputTimestamp.withOutputTimestamp(timerOutputTimestamp).set(timerTimestamp);
          processingTimer.offset(Duration.millis(timerTimestamp.getMillis())).setRelative();
        }

        @OnTimer(eventTimerId)
        public void onEventTime(OnTimerContext context) {
          assertEquals("Timer timestamp must match set timestamp.", timerTimestamp, context.timestamp());
          context.outputWithTimestamp(eventTimeMessage + eventTimerId, context.timestamp());
        }

        @OnTimer(eventTimerId2)
        public void onEventTime2(OnTimerContext context) {
          assertEquals("Timer timestamp must match set timestamp.", timerTimestamp, context.fireTimestamp());
          context.output(eventTimeMessage + eventTimerId2);
        }

        @OnTimer(processingTimerId)
        public void onProcessingTime(OnTimerContext context) {
          assertEquals(
              // See SimpleDoFnRunner#onTimer
              "Timer timestamp must match current input watermark",
              timerTimestamp.plus(Duration.millis(1)),
              context.timestamp());
          context.outputWithTimestamp(processingTimeMessage, context.timestamp());
        }
      };

  VarIntCoder keyCoder = VarIntCoder.of();
  WindowedValue.FullWindowedValueCoder<Integer> inputCoder =
      WindowedValue.getFullCoder(keyCoder, windowingStrategy.getWindowFn().windowCoder());
  WindowedValue.FullWindowedValueCoder<String> outputCoder =
      WindowedValue.getFullCoder(StringUtf8Coder.of(), windowingStrategy.getWindowFn().windowCoder());
  KeySelector<WindowedValue<Integer>, ByteBuffer> keySelector =
      e -> FlinkKeyUtils.encodeKey(e.getValue(), keyCoder);
  TupleTag<String> outputTag = new TupleTag<>("main-output");

  DoFnOperator<Integer, String> doFnOperator =
      new DoFnOperator<>(
          fn, "stepName", inputCoder, Collections.emptyMap(),
          outputTag, Collections.emptyList(),
          new DoFnOperator.MultiOutputOutputManagerFactory<>(
              outputTag, outputCoder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())),
          windowingStrategy,
          new HashMap<>(), /* side-input mapping */
          Collections.emptyList(), /* side inputs */
          FlinkPipelineOptions.defaults(),
          keyCoder, /* key coder */
          keySelector, DoFnSchemaInformation.create(), Collections.emptyMap());

  OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<String>> testHarness =
      new KeyedOneInputStreamOperatorTestHarness<>(
          doFnOperator,
          keySelector,
          new CoderTypeInformation<>(FlinkKeyUtils.ByteBufferCoder.of(), FlinkPipelineOptions.defaults()));

  testHarness.setup(
      new CoderTypeSerializer<>(outputCoder, new SerializablePipelineOptions(FlinkPipelineOptions.defaults())));
  testHarness.open();

  testHarness.processWatermark(0);
  testHarness.setProcessingTime(0);

  IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10_000));

  // this should register the three timers above
  testHarness.processElement(
      new StreamRecord<>(WindowedValue.of(13, new Instant(0), window1, PaneInfo.NO_FIRING)));
  assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), emptyIterable());

  // this does not yet fire the timers (in vanilla Flink it would)
  testHarness.processWatermark(timerTimestamp.getMillis());
  testHarness.setProcessingTime(timerTimestamp.getMillis());
  assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), emptyIterable());
  assertThat(doFnOperator.keyedStateInternals.minWatermarkHoldMs(), is(timerOutputTimestamp.getMillis()));

  // this must fire the event timers
  testHarness.processWatermark(timerTimestamp.getMillis() + 1);
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      containsInAnyOrder(
          WindowedValue.of(eventTimeMessage + eventTimerId, timerTimestamp, window1, PaneInfo.NO_FIRING),
          WindowedValue.of(
              eventTimeMessage + eventTimerId2,
              timerTimestamp.minus(Duration.millis(1)),
              window1,
              PaneInfo.NO_FIRING)));

  testHarness.getOutput().clear();

  // this must fire the processing timer
  testHarness.setProcessingTime(timerTimestamp.getMillis() + 1);
  assertThat(
      stripStreamRecordFromWindowedValue(testHarness.getOutput()),
      contains(
          WindowedValue.of(
              // See SimpleDoFnRunner#onTimer
              processingTimeMessage, timerTimestamp.plus(Duration.millis(1)), window1, PaneInfo.NO_FIRING)));

  testHarness.close();
}
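The off-by-one that these watermark assertions exercise can be stated as a one-line predicate. A minimal sketch of the contract from the javadoc, with all names local to the sketch (this is not runner code):

/**
 * Sketch of the Flink-to-Beam watermark mapping: a Flink watermark T promises
 * "no future elements with timestamp <= T", which corresponds to a Beam
 * watermark of T + 1 ("no future elements with timestamp < T + 1"). A Beam
 * event-time timer may therefore only fire once the Flink watermark has
 * advanced strictly past the timer's timestamp.
 */
static boolean beamEventTimerMayFire(long beamTimerMillis, long flinkWatermarkMillis) {
  return flinkWatermarkMillis > beamTimerMillis;
}

This matches the assertions above: beamEventTimerMayFire(1000, 1000) is false, so the output stays empty at processWatermark(1000), while beamEventTimerMayFire(1000, 1001) is true, so processWatermark(1001) fires both event timers.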
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache:
the class ReadSourceTest, method runProgram.
private static void runProgram(String resultPath) throws Exception {
  Pipeline p = FlinkTestPipeline.createForBatch();

  PCollection<String> result =
      p.apply(GenerateSequence.from(0).to(10))
          .apply(
              ParDo.of(
                  new DoFn<Long, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      c.output(c.element().toString());
                    }
                  }));

  result.apply(TextIO.write().to(new URI(resultPath).getPath() + "/part"));

  p.run();
}
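Since runProgram parses its argument with new URI(...), the caller presumably supplies a URI-style path. A hypothetical invocation (the temporary-directory setup is an assumption for illustration, not code from ReadSourceTest):

String resultPath = java.nio.file.Files.createTempDirectory("readSource").toUri().toString();
runProgram(resultPath); // writes the strings "0" through "9" to shards under <tempDir>/part*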
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache:
the class TopWikipediaSessionsTest, method testProgram.
@Test
public void testProgram() throws Exception {
  Pipeline p = FlinkTestPipeline.createForStreaming();

  Long now = (System.currentTimeMillis() + 10000) / 1000;

  PCollection<KV<String, Long>> output =
      p.apply(
              Create.of(
                  Arrays.asList(
                      new TableRow().set("timestamp", now).set("contributor_username", "user1"),
                      new TableRow().set("timestamp", now + 10).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now).set("contributor_username", "user1"),
                      new TableRow().set("timestamp", now + 2).set("contributor_username", "user1"),
                      new TableRow().set("timestamp", now).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 1).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 5).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 7).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 8).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 200).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 230).set("contributor_username", "user1"),
                      new TableRow().set("timestamp", now + 230).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 240).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now + 245).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 235).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 236).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 237).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 238).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 239).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 240).set("contributor_username", "user3"),
                      new TableRow().set("timestamp", now + 241).set("contributor_username", "user2"),
                      new TableRow().set("timestamp", now).set("contributor_username", "user3"))))
          .apply(
              ParDo.of(
                  new DoFn<TableRow, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                      TableRow row = c.element();
                      long timestamp = (Integer) row.get("timestamp");
                      String userName = (String) row.get("contributor_username");
                      if (userName != null) {
                        // Sets the timestamp field to be used in windowing.
                        c.outputWithTimestamp(userName, new Instant(timestamp * 1000L));
                      }
                    }
                  }))
          .apply(Window.into(Sessions.withGapDuration(Duration.standardMinutes(1))))
          .apply(Count.perElement());

  PCollection<String> format =
      output.apply(
          ParDo.of(
              new DoFn<KV<String, Long>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  KV<String, Long> el = c.element();
                  String out = "user: " + el.getKey() + " value:" + el.getValue();
                  c.output(out);
                }
              }));

  format.apply(TextIO.write().to(resultPath));

  p.run();
}
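The one-minute session gap is what turns these timestamps into the counted sessions. A self-contained sketch of the merge rule on sorted event times (illustrative only, not the Sessions implementation):

/**
 * Counts session windows over sorted event times: a new session starts
 * whenever the gap to the previous event reaches the gap duration. This
 * mirrors Sessions.withGapDuration above but is not Beam's implementation.
 */
static int countSessions(long[] sortedEventSeconds, long gapSeconds) {
  int sessions = sortedEventSeconds.length == 0 ? 0 : 1;
  for (int i = 1; i < sortedEventSeconds.length; i++) {
    if (sortedEventSeconds[i] - sortedEventSeconds[i - 1] >= gapSeconds) {
      sessions++; // the previous window [t, t + gap) no longer overlaps
    }
  }
  return sessions;
}

For user1 the offsets from now are {0, 0, 2, 230}, so countSessions(new long[] {0, 0, 2, 230}, 60) == 2: user1 contributes two session windows to Count.perElement().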
Use of org.apache.beam.sdk.transforms.DoFn in project beam by apache:
the class SamzaStoreStateInternalsTest, method testIteratorClosed.
@Test
public void testIteratorClosed() {
  final String stateId = "foo";

  DoFn<KV<String, Integer>, Set<Integer>> fn =
      new DoFn<KV<String, Integer>, Set<Integer>>() {
        @StateId(stateId)
        private final StateSpec<SetState<Integer>> setState = StateSpecs.set(VarIntCoder.of());

        @ProcessElement
        public void processElement(ProcessContext c, @StateId(stateId) SetState<Integer> setState) {
          SamzaSetState<Integer> state = (SamzaSetState<Integer>) setState;
          state.add(c.element().getValue());

          // the iterator for size needs to be closed
          int size = Iterators.size(state.readIterator().read());
          if (size > 1) {
            final Iterator<Integer> iterator = state.readIterator().read();
            assertTrue(iterator.hasNext());
            // this iterator should be closed too
            iterator.next();
          }
        }
      };

  pipeline
      .apply(Create.of(KV.of("hello", 97), KV.of("hello", 42), KV.of("hello", 42), KV.of("hello", 12)))
      .apply(ParDo.of(fn));

  SamzaPipelineOptions options = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
  options.setRunner(TestSamzaRunner.class);

  Map<String, String> configs = new HashMap<>(ConfigBuilder.localRunConfig());
  configs.put("stores.foo.factory", TestStorageEngine.class.getName());
  pipeline.getOptions().as(SamzaPipelineOptions.class).setConfigOverride(configs);

  pipeline.run();

  // The test code creates 7 underlying iterators, and 1 more is created during state.clear().
  // Verify all of them are closed.
  assertEquals(8, TestStore.iterators.size());
  TestStore.iterators.forEach(iter -> assertTrue(iter.closed));
}
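TestStore presumably hands out iterators that record whether they were closed, which is what the final assertions inspect. A hypothetical sketch of such a tracking wrapper (class and field names are illustrative, not the Samza runner's):

/**
 * Hypothetical tracking wrapper in the spirit of TestStore.iterators: it
 * records whether close() was ever called on each iterator the store created.
 */
static final class TrackingIterator<T> implements java.util.Iterator<T>, AutoCloseable {
  private final java.util.Iterator<T> delegate;
  volatile boolean closed = false;

  TrackingIterator(java.util.Iterator<T> delegate) {
    this.delegate = delegate;
  }

  @Override
  public boolean hasNext() {
    return delegate.hasNext();
  }

  @Override
  public T next() {
    return delegate.next();
  }

  @Override
  public void close() {
    closed = true; // the test asserts this flag for every iterator handed out
  }
}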