Use of org.apache.flink.streaming.api.functions.KeyedProcessFunction in project flink by apache.
From the class KafkaConsumerTestBase, method runCollectingSchemaTest.
/**
* Test that ensures that DeserializationSchema can emit multiple records via a Collector.
*
* @throws Exception
*/
public void runCollectingSchemaTest() throws Exception {
    final int elementCount = 20;
    final String topic = writeSequence("testCollectingSchema", elementCount, 1, 1);

    // read using custom schema
    final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
    env1.setParallelism(1);
    env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    DataStream<Tuple2<Integer, String>> fromKafka =
            env1.addSource(
                    kafkaServer
                            .getConsumer(topic, new CollectingDeserializationSchema(elementCount), props)
                            .assignTimestampsAndWatermarks(
                                    new AscendingTimestampExtractor<Tuple2<Integer, String>>() {
                                        @Override
                                        public long extractAscendingTimestamp(Tuple2<Integer, String> element) {
                                            String string = element.f1;
                                            return Long.parseLong(string.substring(0, string.length() - 1));
                                        }
                                    }));

    fromKafka.keyBy(t -> t.f0)
            .process(new KeyedProcessFunction<Integer, Tuple2<Integer, String>, Void>() {
                private boolean registered = false;

                @Override
                public void processElement(Tuple2<Integer, String> value, Context ctx, Collector<Void> out)
                        throws Exception {
                    if (!registered) {
                        ctx.timerService().registerEventTimeTimer(elementCount - 2);
                        registered = true;
                    }
                }

                @Override
                public void onTimer(long timestamp, OnTimerContext ctx, Collector<Void> out) throws Exception {
                    throw new SuccessException();
                }
            });

    tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

    deleteTestTopic(topic);
}
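The CollectingDeserializationSchema referenced above is defined elsewhere in KafkaConsumerTestBase and is not shown on this page. As a rough, hypothetical sketch of the pattern the test exercises (class name and record layout assumed, not taken from the Flink sources), a schema can override the Collector-based deserialize overload of DeserializationSchema to emit more than one record per consumed Kafka message:

// Hypothetical sketch only; the real CollectingDeserializationSchema in Flink's
// KafkaConsumerTestBase may be implemented differently.
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class MultiRecordDeserializationSchema implements DeserializationSchema<Tuple2<Integer, String>> {

    @Override
    public Tuple2<Integer, String> deserialize(byte[] message) throws IOException {
        // Not used; the Collector-based overload below does the work.
        throw new UnsupportedOperationException("use deserialize(byte[], Collector)");
    }

    @Override
    public void deserialize(byte[] message, Collector<Tuple2<Integer, String>> out) throws IOException {
        // Emit two records for a single Kafka message, which is the behaviour
        // the test above verifies end to end.
        String payload = new String(message, StandardCharsets.UTF_8);
        out.collect(Tuple2.of(0, payload + "a"));
        out.collect(Tuple2.of(1, payload + "b"));
    }

    @Override
    public boolean isEndOfStream(Tuple2<Integer, String> nextElement) {
        return false;
    }

    @Override
    public TypeInformation<Tuple2<Integer, String>> getProducedType() {
        return TypeInformation.of(new TypeHint<Tuple2<Integer, String>>() {});
    }
}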
Use of org.apache.flink.streaming.api.functions.KeyedProcessFunction in project flink by apache.
From the class SortingBoundedInputITCase, method testBatchExecutionWithTimersOneInput.
@Test
public void testBatchExecutionWithTimersOneInput() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // set parallelism to 1 to have consistent order of results
    env.setParallelism(1);

    Configuration config = new Configuration();
    config.set(ExecutionOptions.RUNTIME_MODE, RuntimeExecutionMode.BATCH);
    env.configure(config, this.getClass().getClassLoader());

    WatermarkStrategy<Tuple2<Integer, Integer>> watermarkStrategy =
            WatermarkStrategy.forGenerator(ctx -> GENERATE_WATERMARK_AFTER_4_14_TIMESTAMP)
                    .withTimestampAssigner((r, previousTimestamp) -> r.f1);

    SingleOutputStreamOperator<Tuple2<Integer, Integer>> elements =
            env.fromElements(
                            Tuple2.of(1, 3),
                            Tuple2.of(1, 1),
                            Tuple2.of(2, 1),
                            Tuple2.of(1, 4),
                            // late element
                            Tuple2.of(2, 3),
                            // late element
                            Tuple2.of(1, 2),
                            Tuple2.of(1, 13),
                            Tuple2.of(1, 11),
                            Tuple2.of(2, 14),
                            // late element
                            Tuple2.of(1, 11))
                    .assignTimestampsAndWatermarks(watermarkStrategy);

    OutputTag<Integer> lateElements = new OutputTag<>("late_elements", BasicTypeInfo.INT_TYPE_INFO);

    SingleOutputStreamOperator<Tuple3<Long, Integer, Integer>> sums =
            elements.map(element -> element.f0)
                    .keyBy(element -> element)
                    .process(new KeyedProcessFunction<Integer, Integer, Tuple3<Long, Integer, Integer>>() {
                        private MapState<Long, Integer> countState;
                        private ValueState<Long> previousTimestampState;

                        @Override
                        public void open(Configuration parameters) {
                            countState = getRuntimeContext().getMapState(
                                    new MapStateDescriptor<>(
                                            "sum", BasicTypeInfo.LONG_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO));
                            previousTimestampState = getRuntimeContext().getState(
                                    new ValueStateDescriptor<>("previousTimestamp", BasicTypeInfo.LONG_TYPE_INFO));
                        }

                        @Override
                        public void processElement(Integer value, Context ctx, Collector<Tuple3<Long, Integer, Integer>> out)
                                throws Exception {
                            Long elementTimestamp = ctx.timestamp();
                            long nextTen = ((elementTimestamp + 10) / 10) * 10;
                            ctx.timerService().registerEventTimeTimer(nextTen);

                            if (elementTimestamp < ctx.timerService().currentWatermark()) {
                                ctx.output(lateElements, value);
                            } else {
                                Long previousTimestamp =
                                        Optional.ofNullable(previousTimestampState.value()).orElse(0L);
                                assertThat(elementTimestamp, greaterThanOrEqualTo(previousTimestamp));
                                previousTimestampState.update(elementTimestamp);

                                Integer currentCount = Optional.ofNullable(countState.get(nextTen)).orElse(0);
                                countState.put(nextTen, currentCount + 1);
                            }
                        }

                        @Override
                        public void onTimer(long timestamp, OnTimerContext ctx, Collector<Tuple3<Long, Integer, Integer>> out)
                                throws Exception {
                            out.collect(Tuple3.of(timestamp, ctx.getCurrentKey(), countState.get(timestamp)));
                            countState.remove(timestamp);

                            // this would go into an infinite loop if we did not quiesce the
                            // timer service
                            ctx.timerService().registerEventTimeTimer(timestamp + 1);
                        }
                    });

    DataStream<Integer> lateStream = sums.getSideOutput(lateElements);

    List<Integer> lateRecordsCollected = CollectionUtil.iteratorToList(DataStreamUtils.collect(lateStream));
    List<Tuple3<Long, Integer, Integer>> sumsCollected = CollectionUtil.iteratorToList(DataStreamUtils.collect(sums));

    assertTrue(lateRecordsCollected.isEmpty());
    assertThat(
            sumsCollected,
            equalTo(
                    Arrays.asList(
                            Tuple3.of(10L, 1, 4),
                            Tuple3.of(20L, 1, 3),
                            Tuple3.of(10L, 2, 2),
                            Tuple3.of(20L, 2, 1))));
}
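GENERATE_WATERMARK_AFTER_4_14_TIMESTAMP is a constant of SortingBoundedInputITCase that is not reproduced on this page. Judging by its name and the // late element comments, it presumably advances the watermark right after the records stamped 4 and 14; the sketch below is an assumption, not the actual Flink implementation. In BATCH runtime mode such mid-stream watermarks are not applied while records are processed (only a final watermark at end of input), so no record is flagged late, which matches the assertTrue(lateRecordsCollected.isEmpty()) assertion.

// Hypothetical sketch; the actual generator in SortingBoundedInputITCase may differ.
import org.apache.flink.api.common.eventtime.Watermark;
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
import org.apache.flink.api.common.eventtime.WatermarkOutput;
import org.apache.flink.api.java.tuple.Tuple2;

public class WatermarkAfter4And14 implements WatermarkGenerator<Tuple2<Integer, Integer>> {

    @Override
    public void onEvent(Tuple2<Integer, Integer> event, long eventTimestamp, WatermarkOutput output) {
        // Advance the watermark only after the records stamped 4 and 14, so the
        // following records with smaller timestamps become late in STREAMING mode.
        if (eventTimestamp == 4 || eventTimestamp == 14) {
            output.emitWatermark(new Watermark(eventTimestamp));
        }
    }

    @Override
    public void onPeriodicEmit(WatermarkOutput output) {
        // No periodic watermarks in this sketch.
    }
}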
Use of org.apache.flink.streaming.api.functions.KeyedProcessFunction in project flink by apache.
From the class DataStreamJavaITCase, method getComplexUnifiedPipeline.
// --------------------------------------------------------------------------------------------
// Helper methods
// --------------------------------------------------------------------------------------------
private Table getComplexUnifiedPipeline(StreamExecutionEnvironment env) {
    final DataStream<String> allowedNamesStream = env.fromElements("Bob", "Alice");

    final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
    tableEnv.createTemporaryView(
            "AllowedNamesTable", tableEnv.fromDataStream(allowedNamesStream).as("allowedName"));

    final Table nameCountTable =
            tableEnv.sqlQuery(
                    "SELECT name, COUNT(*) AS c "
                            + "FROM (VALUES ('Bob'), ('Alice'), ('Greg'), ('Bob')) AS NameTable(name) "
                            + "WHERE name IN (SELECT allowedName FROM AllowedNamesTable) "
                            + "GROUP BY name");

    final DataStream<Row> nameCountStream = tableEnv.toChangelogStream(nameCountTable);

    final DataStream<Tuple2<String, Long>> updatesPerNameStream =
            nameCountStream
                    .keyBy(r -> r.<String>getFieldAs("name"))
                    .process(new KeyedProcessFunction<String, Row, Tuple2<String, Long>>() {
                        ValueState<Long> count;

                        @Override
                        public void open(Configuration parameters) {
                            count = getRuntimeContext().getState(new ValueStateDescriptor<>("count", Long.class));
                        }

                        @Override
                        public void processElement(Row r, Context ctx, Collector<Tuple2<String, Long>> out)
                                throws IOException {
                            Long currentCount = count.value();
                            if (currentCount == null) {
                                currentCount = 0L;
                            }
                            final long updatedCount = currentCount + 1;
                            count.update(updatedCount);
                            out.collect(Tuple2.of(ctx.getCurrentKey(), updatedCount));
                        }
                    });

    tableEnv.createTemporaryView("UpdatesPerName", updatesPerNameStream);

    return tableEnv.sqlQuery("SELECT DISTINCT f0, f1 FROM UpdatesPerName");
}
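A possible way to run this helper, shown only as an assumed usage sketch rather than the original test's own assertion logic, is to materialize the resulting Table directly:

// Assumed usage sketch: execute the mixed DataStream/Table pipeline and print
// the distinct (name, update count) rows it produces.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Table result = getComplexUnifiedPipeline(env);
result.execute().print();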
Use of org.apache.flink.streaming.api.functions.KeyedProcessFunction in project flink by apache.
From the class DataStreamTest, method testKeyedStreamKeyedProcessTranslation.
/**
* Verify that a {@link KeyedStream#process(KeyedProcessFunction)} call is correctly translated
* to an operator.
*/
@Test
public void testKeyedStreamKeyedProcessTranslation() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStreamSource<Long> src = env.generateSequence(0, 0);

    KeyedProcessFunction<Long, Long, Integer> keyedProcessFunction =
            new KeyedProcessFunction<Long, Long, Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(Long value, Context ctx, Collector<Integer> out) throws Exception {
                    // Do nothing
                }

                @Override
                public void onTimer(long timestamp, OnTimerContext ctx, Collector<Integer> out) throws Exception {
                    // Do nothing
                }
            };

    DataStream<Integer> processed = src.keyBy(new IdentityKeySelector<Long>()).process(keyedProcessFunction);
    processed.addSink(new DiscardingSink<Integer>());

    assertEquals(keyedProcessFunction, getFunctionForDataStream(processed));
    assertTrue(getOperatorForDataStream(processed) instanceof KeyedProcessOperator);
}
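IdentityKeySelector is a small helper in DataStreamTest that is not shown on this page. A sketch of such a selector, which keys each record by itself, looks like this:

// Sketch of an identity key selector: the record itself is used as the key.
import org.apache.flink.api.java.functions.KeySelector;

public class IdentityKeySelector<T> implements KeySelector<T, T> {

    private static final long serialVersionUID = 1L;

    @Override
    public T getKey(T value) {
        return value;
    }
}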
Use of org.apache.flink.streaming.api.functions.KeyedProcessFunction in project flink by apache.
From the class SavepointITCase, method testTriggerSavepointAndResumeWithNoClaim.
@Test
@Ignore("Disabling this test because it regularly fails on AZP. See FLINK-25427.")
public void testTriggerSavepointAndResumeWithNoClaim() throws Exception {
    final int numTaskManagers = 2;
    final int numSlotsPerTaskManager = 2;
    final int parallelism = numTaskManagers * numSlotsPerTaskManager;

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStateBackend(new EmbeddedRocksDBStateBackend(true));
    env.getCheckpointConfig()
            .enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    env.getCheckpointConfig().setCheckpointStorage(folder.newFolder().toURI());
    env.setParallelism(parallelism);

    final SharedReference<CountDownLatch> counter = sharedObjects.add(new CountDownLatch(10_000));

    env.fromSequence(1, Long.MAX_VALUE)
            .keyBy(i -> i % parallelism)
            .process(new KeyedProcessFunction<Long, Long, Long>() {
                private ListState<Long> last;

                @Override
                public void open(Configuration parameters) {
                    // we use list state here to create sst files of a significant size
                    // if sst files do not reach certain thresholds they are not stored
                    // in files, but as a byte stream in checkpoints metadata
                    last = getRuntimeContext().getListState(
                            new ListStateDescriptor<>("last", BasicTypeInfo.LONG_TYPE_INFO));
                }

                @Override
                public void processElement(Long value, KeyedProcessFunction<Long, Long, Long>.Context ctx, Collector<Long> out)
                        throws Exception {
                    last.add(value);
                    out.collect(value);
                }
            })
            .addSink(new SinkFunction<Long>() {
                @Override
                public void invoke(Long value) {
                    counter.consumeSync(CountDownLatch::countDown);
                }
            })
            .setParallelism(1);

    final JobGraph jobGraph = env.getStreamGraph().getJobGraph();

    MiniClusterWithClientResource cluster =
            new MiniClusterWithClientResource(
                    new MiniClusterResourceConfiguration.Builder()
                            .setNumberTaskManagers(numTaskManagers)
                            .setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
                            .build());
    cluster.before();

    try {
        final JobID jobID1 = new JobID();
        jobGraph.setJobID(jobID1);
        cluster.getClusterClient().submitJob(jobGraph).get();
        CommonTestUtils.waitForAllTaskRunning(cluster.getMiniCluster(), jobID1, false);

        // wait for some records to be processed before taking the checkpoint
        counter.get().await();

        final String firstCheckpoint = cluster.getMiniCluster().triggerCheckpoint(jobID1).get();
        cluster.getClusterClient().cancel(jobID1).get();

        jobGraph.setSavepointRestoreSettings(
                SavepointRestoreSettings.forPath(firstCheckpoint, false, RestoreMode.NO_CLAIM));
        final JobID jobID2 = new JobID();
        jobGraph.setJobID(jobID2);
        cluster.getClusterClient().submitJob(jobGraph).get();
        CommonTestUtils.waitForAllTaskRunning(cluster.getMiniCluster(), jobID2, false);

        String secondCheckpoint = cluster.getMiniCluster().triggerCheckpoint(jobID2).get();
        cluster.getClusterClient().cancel(jobID2).get();

        // delete the checkpoint we restored from
        FileUtils.deleteDirectory(Paths.get(new URI(firstCheckpoint)).getParent().toFile());

        // we should be able to restore from the second checkpoint even though it has been built
        // on top of the first checkpoint
        jobGraph.setSavepointRestoreSettings(
                SavepointRestoreSettings.forPath(secondCheckpoint, false, RestoreMode.NO_CLAIM));
        final JobID jobID3 = new JobID();
        jobGraph.setJobID(jobID3);
        cluster.getClusterClient().submitJob(jobGraph).get();
        CommonTestUtils.waitForAllTaskRunning(cluster.getMiniCluster(), jobID3, false);
    } finally {
        cluster.after();
    }
}
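For contrast, a hedged sketch of the CLAIM counterpart to the NO_CLAIM restore used above: in CLAIM mode Flink takes ownership of the restored checkpoint and may delete it once it is subsumed, so the manual deletion of firstCheckpoint performed in the test would neither be needed nor safe to rely on.

// Assumed contrast sketch (not part of the test above): restore in CLAIM mode,
// handing ownership of the checkpoint to Flink, which may delete it once it is
// subsumed by later checkpoints.
jobGraph.setSavepointRestoreSettings(
        SavepointRestoreSettings.forPath(firstCheckpoint, false, RestoreMode.CLAIM));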