use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.
the class Kafka09FetcherTest method ensureOffsetsGetCommitted.
@Test
public void ensureOffsetsGetCommitted() throws Exception {
// test data
final KafkaTopicPartition testPartition1 = new KafkaTopicPartition("test", 42);
final KafkaTopicPartition testPartition2 = new KafkaTopicPartition("another", 99);
final Map<KafkaTopicPartition, Long> testCommitData1 = new HashMap<>();
testCommitData1.put(testPartition1, 11L);
testCommitData1.put(testPartition2, 18L);
final Map<KafkaTopicPartition, Long> testCommitData2 = new HashMap<>();
testCommitData2.put(testPartition1, 19L);
testCommitData2.put(testPartition2, 28L);
final BlockingQueue<Map<TopicPartition, OffsetAndMetadata>> commitStore = new LinkedBlockingQueue<>();
// ----- the mock consumer with poll(), wakeup(), and commit(A)sync calls ----
final MultiShotLatch blockerLatch = new MultiShotLatch();
KafkaConsumer<?, ?> mockConsumer = mock(KafkaConsumer.class);
when(mockConsumer.poll(anyLong())).thenAnswer(new Answer<ConsumerRecords<?, ?>>() {
@Override
public ConsumerRecords<?, ?> answer(InvocationOnMock invocation) throws InterruptedException {
blockerLatch.await();
return ConsumerRecords.empty();
}
});
doAnswer(new Answer<Void>() {
@Override
public Void answer(InvocationOnMock invocation) {
blockerLatch.trigger();
return null;
}
}).when(mockConsumer).wakeup();
doAnswer(new Answer<Void>() {
@Override
public Void answer(InvocationOnMock invocation) {
@SuppressWarnings("unchecked") Map<TopicPartition, OffsetAndMetadata> offsets = (Map<TopicPartition, OffsetAndMetadata>) invocation.getArguments()[0];
OffsetCommitCallback callback = (OffsetCommitCallback) invocation.getArguments()[1];
commitStore.add(offsets);
callback.onComplete(offsets, null);
return null;
}
}).when(mockConsumer).commitAsync(Mockito.<Map<TopicPartition, OffsetAndMetadata>>any(), any(OffsetCommitCallback.class));
// make sure the fetcher creates the mock consumer
whenNew(KafkaConsumer.class).withAnyArguments().thenReturn(mockConsumer);
// ----- create the test fetcher -----
@SuppressWarnings("unchecked") SourceContext<String> sourceContext = mock(SourceContext.class);
Map<KafkaTopicPartition, Long> partitionsWithInitialOffsets = Collections.singletonMap(new KafkaTopicPartition("test", 42), KafkaTopicPartitionStateSentinel.GROUP_OFFSET);
KeyedDeserializationSchema<String> schema = new KeyedDeserializationSchemaWrapper<>(new SimpleStringSchema());
final Kafka09Fetcher<String> fetcher = new Kafka09Fetcher<>(sourceContext, partitionsWithInitialOffsets, null, /* periodic watermark extractor */
null, /* punctuated watermark extractor */
new TestProcessingTimeService(), 10, /* watermark interval */
this.getClass().getClassLoader(), "task_name", new UnregisteredMetricsGroup(), schema, new Properties(), 0L, false);
// ----- run the fetcher -----
final AtomicReference<Throwable> error = new AtomicReference<>();
final Thread fetcherRunner = new Thread("fetcher runner") {
@Override
public void run() {
try {
fetcher.runFetchLoop();
} catch (Throwable t) {
error.set(t);
}
}
};
fetcherRunner.start();
// ----- trigger the first offset commit -----
fetcher.commitInternalOffsetsToKafka(testCommitData1);
Map<TopicPartition, OffsetAndMetadata> result1 = commitStore.take();
for (Entry<TopicPartition, OffsetAndMetadata> entry : result1.entrySet()) {
TopicPartition partition = entry.getKey();
if (partition.topic().equals("test")) {
assertEquals(42, partition.partition());
assertEquals(12L, entry.getValue().offset());
} else if (partition.topic().equals("another")) {
assertEquals(99, partition.partition());
assertEquals(17L, entry.getValue().offset());
}
}
// ----- trigger the second offset commit -----
fetcher.commitInternalOffsetsToKafka(testCommitData2);
Map<TopicPartition, OffsetAndMetadata> result2 = commitStore.take();
for (Entry<TopicPartition, OffsetAndMetadata> entry : result2.entrySet()) {
TopicPartition partition = entry.getKey();
if (partition.topic().equals("test")) {
assertEquals(42, partition.partition());
assertEquals(20L, entry.getValue().offset());
} else if (partition.topic().equals("another")) {
assertEquals(99, partition.partition());
assertEquals(27L, entry.getValue().offset());
}
}
// ----- test done, wait till the fetcher is done for a clean shutdown -----
fetcher.cancel();
fetcherRunner.join();
// check that there were no errors in the fetcher
final Throwable caughtError = error.get();
if (caughtError != null && !(caughtError instanceof Handover.ClosedException)) {
throw new Exception("Exception in the fetcher", caughtError);
}
}
use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.
the class KafkaConsumerTestBase method writeSequence.
protected String writeSequence(String baseTopicName, final int numElements, final int parallelism, final int replicationFactor) throws Exception {
LOG.info("\n===================================\n" + "== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" + "===================================");
final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
});
final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema = new KeyedSerializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
final KeyedDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KeyedDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
final int maxNumAttempts = 10;
for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {
final String topicName = baseTopicName + '-' + attempt;
LOG.info("Writing attempt #1");
// -------- Write the Sequence --------
createTestTopic(topicName, parallelism, replicationFactor);
StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
writeEnv.getConfig().disableSysoutLogging();
DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
int cnt = 0;
int partition = getRuntimeContext().getIndexOfThisSubtask();
while (running && cnt < numElements) {
ctx.collect(new Tuple2<>(partition, cnt));
cnt++;
}
}
@Override
public void cancel() {
running = false;
}
}).setParallelism(parallelism);
// the producer must not produce duplicates
Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
producerProperties.setProperty("retries", "0");
producerProperties.putAll(secureProps);
kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2Partitioner(parallelism)).setParallelism(parallelism);
try {
writeEnv.execute("Write sequence");
} catch (Exception e) {
LOG.error("Write attempt failed, trying again", e);
deleteTestTopic(topicName);
JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
continue;
}
LOG.info("Finished writing sequence");
// -------- Validate the Sequence --------
// we need to validate the sequence, because kafka's producers are not exactly once
LOG.info("Validating sequence");
JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
final StreamExecutionEnvironment readEnv = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
readEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
readEnv.getConfig().disableSysoutLogging();
readEnv.setParallelism(parallelism);
Properties readProps = (Properties) standardProps.clone();
readProps.setProperty("group.id", "flink-tests-validator");
readProps.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deserSchema, readProps);
readEnv.addSource(consumer).map(new RichMapFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
private final int totalCount = parallelism * numElements;
private int count = 0;
@Override
public Tuple2<Integer, Integer> map(Tuple2<Integer, Integer> value) throws Exception {
if (++count == totalCount) {
throw new SuccessException();
} else {
return value;
}
}
}).setParallelism(1).addSink(new DiscardingSink<Tuple2<Integer, Integer>>()).setParallelism(1);
final AtomicReference<Throwable> errorRef = new AtomicReference<>();
Thread runner = new Thread() {
@Override
public void run() {
try {
tryExecute(readEnv, "sequence validation");
} catch (Throwable t) {
errorRef.set(t);
}
}
};
runner.start();
final long deadline = System.nanoTime() + 10_000_000_000L;
long delay;
while (runner.isAlive() && (delay = deadline - System.nanoTime()) > 0) {
runner.join(delay / 1_000_000L);
}
boolean success;
if (runner.isAlive()) {
// did not finish in time, maybe the producer dropped one or more records and
// the validation did not reach the exit point
success = false;
JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
} else {
Throwable error = errorRef.get();
if (error != null) {
success = false;
LOG.info("Attempt " + attempt + " failed with exception", error);
} else {
success = true;
}
}
JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
if (success) {
// everything is good!
return topicName;
} else {
deleteTestTopic(topicName);
// fall through the loop
}
}
throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.
the class KafkaConsumerTestBase method runStartFromLatestOffsets.
/**
* This test ensures that when explicitly set to start from latest record, the consumer
* ignores the "auto.offset.reset" behaviour as well as any committed group offsets in Kafka.
*/
public void runStartFromLatestOffsets() throws Exception {
// 50 records written to each of 3 partitions before launching a latest-starting consuming job
final int parallelism = 3;
final int recordsInEachPartition = 50;
// each partition will be written an extra 200 records
final int extraRecordsInEachPartition = 200;
// all already existing data in the topic, before the consuming topology has started, should be ignored
final String topicName = writeSequence("testStartFromLatestOffsetsTopic", recordsInEachPartition, parallelism, 1);
// the committed offsets should be ignored
KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);
// job names for the topologies for writing and consuming the extra records
final String consumeExtraRecordsJobName = "Consume Extra Records Job";
final String writeExtraRecordsJobName = "Write Extra Records Job";
// seriliazation / deserialization schemas for writing and consuming the extra records
final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
});
final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema = new KeyedSerializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
final KeyedDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KeyedDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
// setup and run the latest-consuming job
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.getConfig().disableSysoutLogging();
env.setParallelism(parallelism);
final Properties readProps = new Properties();
readProps.putAll(standardProps);
// this should be ignored
readProps.setProperty("auto.offset.reset", "earliest");
FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> latestReadingConsumer = kafkaServer.getConsumer(topicName, deserSchema, readProps);
latestReadingConsumer.setStartFromLatest();
env.addSource(latestReadingConsumer).setParallelism(parallelism).flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Object>() {
@Override
public void flatMap(Tuple2<Integer, Integer> value, Collector<Object> out) throws Exception {
if (value.f1 - recordsInEachPartition < 0) {
throw new RuntimeException("test failed; consumed a record that was previously written: " + value);
}
}
}).setParallelism(1).addSink(new DiscardingSink<>());
final AtomicReference<Throwable> error = new AtomicReference<>();
Thread consumeThread = new Thread(new Runnable() {
@Override
public void run() {
try {
env.execute(consumeExtraRecordsJobName);
} catch (Throwable t) {
if (!(t.getCause() instanceof JobCancellationException)) {
error.set(t);
}
}
}
});
consumeThread.start();
// wait until the consuming job has started, to be extra safe
JobManagerCommunicationUtils.waitUntilJobIsRunning(flink.getLeaderGateway(timeout), consumeExtraRecordsJobName);
// setup the extra records writing job
final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
DataStream<Tuple2<Integer, Integer>> extraRecordsStream = env2.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
// the extra records should start from the last written value
int count = recordsInEachPartition;
int partition = getRuntimeContext().getIndexOfThisSubtask();
while (running && count < recordsInEachPartition + extraRecordsInEachPartition) {
ctx.collect(new Tuple2<>(partition, count));
count++;
}
}
@Override
public void cancel() {
running = false;
}
}).setParallelism(parallelism);
kafkaServer.produceIntoKafka(extraRecordsStream, topicName, serSchema, readProps, null);
try {
env2.execute(writeExtraRecordsJobName);
} catch (Exception e) {
throw new RuntimeException("Writing extra records failed", e);
}
// cancel the consume job after all extra records are written
JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout), consumeExtraRecordsJobName);
consumeThread.join();
kafkaOffsetHandler.close();
deleteTestTopic(topicName);
// check whether the consuming thread threw any test errors;
// test will fail here if the consume job had incorrectly read any records other than the extra records
final Throwable consumerError = error.get();
if (consumerError != null) {
throw new Exception("Exception in the consuming thread", consumerError);
}
}
Aggregations