use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.
the class KafkaConsumerTestBase method runAutoOffsetRetrievalAndCommitToKafka.
/**
* This test ensures that when the consumers retrieve a start offset from Kafka (earliest, latest), this offset
* is committed to Kafka, even if some partitions are not read.
*
* Test:
* - Create 3 partitions
* - write 50 messages into each.
* - Start three consumers with auto.offset.reset='latest' and wait until they have committed offsets to Kafka.
* - Check if the offsets in Kafka are set to 50 for the three partitions
*
* See FLINK-3440 as well
*/
public void runAutoOffsetRetrievalAndCommitToKafka() throws Exception {
// 3 partitions with 50 records each (0-49, so the expected commit offset of each partition should be 50)
final int parallelism = 3;
final int recordsInEachPartition = 50;
final String topicName = writeSequence("testAutoOffsetRetrievalAndCommitToKafkaTopic", recordsInEachPartition, parallelism, 1);
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.getConfig().disableSysoutLogging();
env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
env.setParallelism(parallelism);
env.enableCheckpointing(200);
Properties readProps = new Properties();
readProps.putAll(standardProps);
// reset to 'latest', so that the partitions are initially not read
readProps.setProperty("auto.offset.reset", "latest");
DataStream<String> stream = env.addSource(kafkaServer.getConsumer(topicName, new SimpleStringSchema(), readProps));
stream.addSink(new DiscardingSink<String>());
final AtomicReference<Throwable> errorRef = new AtomicReference<>();
final Thread runner = new Thread("runner") {
@Override
public void run() {
try {
env.execute();
} catch (Throwable t) {
if (!(t.getCause() instanceof JobCancellationException)) {
errorRef.set(t);
}
}
}
};
runner.start();
KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
// the final committed offset in Kafka should be 50
final Long l50 = 50L;
final long deadline = 30_000_000_000L + System.nanoTime(); // wait for at most 30 seconds
do {
Long o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
Long o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
Long o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
if (l50.equals(o1) && l50.equals(o2) && l50.equals(o3)) {
break;
}
Thread.sleep(100);
} while (System.nanoTime() < deadline);
// cancel the job
JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
final Throwable t = errorRef.get();
if (t != null) {
throw new RuntimeException("Job failed with an exception", t);
}
// final check to see if offsets are correctly in Kafka
Long o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
Long o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
Long o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
Assert.assertEquals(Long.valueOf(50L), o1);
Assert.assertEquals(Long.valueOf(50L), o2);
Assert.assertEquals(Long.valueOf(50L), o3);
kafkaOffsetHandler.close();
deleteTestTopic(topicName);
}
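Outside of this test harness, the same behaviour can be reproduced directly in a user job: with checkpointing enabled, the Flink Kafka consumer commits whatever offsets it resolved (here via auto.offset.reset='latest') back to Kafka when a checkpoint completes. A minimal sketch, assuming the Flink 1.x connector classes used by this test base; the broker address, group id, topic name and the version-specific FlinkKafkaConsumer09 class are illustrative choices, not taken from the test:

import java.util.Properties;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;

public class LatestOffsetCommitSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // offsets are committed back to Kafka whenever a checkpoint completes
        env.enableCheckpointing(200);

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker address
        props.setProperty("group.id", "offset-commit-sketch");    // illustrative group id
        // start from the latest offset when no committed group offset exists
        props.setProperty("auto.offset.reset", "latest");

        FlinkKafkaConsumer09<String> consumer =
                new FlinkKafkaConsumer09<>("someTopic", new SimpleStringSchema(), props);
        // this is the default; shown explicitly to match what the test verifies
        consumer.setCommitOffsetsOnCheckpoints(true);

        DataStream<String> stream = env.addSource(consumer);
        stream.print();

        env.execute("latest-offset commit sketch");
    }
}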
use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.
the class KafkaConsumerTestBase method runStartFromSpecificOffsets.
/**
* This test ensures that the consumer correctly uses user-supplied specific offsets when explicitly configured to
* start from specific offsets. For partitions for which no specific offset is provided, the starting position
* should fall back to the group offsets behaviour.
*
* 4 partitions will have 50 records with offsets 0 to 49. The supplied specific offsets map is:
* partition 0 --> start from offset 19
* partition 1 --> not set
* partition 2 --> start from offset 22
* partition 3 --> not set
* partition 4 --> start from offset 26 (this should be ignored because the partition does not exist)
*
* The partitions and their committed group offsets are set up as follows:
* partition 0 --> committed offset 23
* partition 1 --> committed offset 31
* partition 2 --> committed offset 43
* partition 3 --> no commit offset
*
* When configured to start from these specific offsets, each partition should read:
* partition 0 --> start from offset 19, read to offset 49 (31 records)
* partition 1 --> fallback to group offsets, so start from offset 31, read to offset 49 (19 records)
* partition 2 --> start from offset 22, read to offset 49 (28 records)
* partition 3 --> fall back to group offsets, but since there is no group offset for this partition,
* it will default to "auto.offset.reset" (set to "earliest"),
* so start from offset 0, read to offset 49 (50 records)
*/
public void runStartFromSpecificOffsets() throws Exception {
// 4 partitions with 50 records each (offsets 0-49)
final int parallelism = 4;
final int recordsInEachPartition = 50;
final String topicName = writeSequence("testStartFromSpecificOffsetsTopic", recordsInEachPartition, parallelism, 1);
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.getConfig().disableSysoutLogging();
env.setParallelism(parallelism);
Properties readProps = new Properties();
readProps.putAll(standardProps);
// partition 3 should default back to this behaviour
readProps.setProperty("auto.offset.reset", "earliest");
Map<KafkaTopicPartition, Long> specificStartupOffsets = new HashMap<>();
specificStartupOffsets.put(new KafkaTopicPartition(topicName, 0), 19L);
specificStartupOffsets.put(new KafkaTopicPartition(topicName, 2), 22L);
// non-existing partition, should be ignored
specificStartupOffsets.put(new KafkaTopicPartition(topicName, 4), 26L);
// only the committed offset for partition 1 should be used, because partition 1 has no entry in the specific offsets map
KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);
Map<Integer, Tuple2<Integer, Integer>> partitionsToValueCountAndStartOffsets = new HashMap<>();
// partition 0 should read offset 19-49
partitionsToValueCountAndStartOffsets.put(0, new Tuple2<>(31, 19));
// partition 1 should read offset 31-49
partitionsToValueCountAndStartOffsets.put(1, new Tuple2<>(19, 31));
// partition 2 should read offset 22-49
partitionsToValueCountAndStartOffsets.put(2, new Tuple2<>(28, 22));
// partition 3 should read offset 0-49
partitionsToValueCountAndStartOffsets.put(3, new Tuple2<>(50, 0));
readSequence(env, StartupMode.SPECIFIC_OFFSETS, specificStartupOffsets, readProps, topicName, partitionsToValueCountAndStartOffsets);
kafkaOffsetHandler.close();
deleteTestTopic(topicName);
}
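The SPECIFIC_OFFSETS startup mode exercised by readSequence is also exposed on the consumer itself through setStartFromSpecificOffsets. A minimal sketch of that user-facing configuration, again assuming the Flink 1.x connector classes; the topic name, broker address, group id and the FlinkKafkaConsumer09 class are illustrative:

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;

public class SpecificOffsetsSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker address
        props.setProperty("group.id", "specific-offsets-sketch"); // illustrative group id
        // partitions without a specific offset and without a group offset fall back to this
        props.setProperty("auto.offset.reset", "earliest");

        Map<KafkaTopicPartition, Long> specificOffsets = new HashMap<>();
        specificOffsets.put(new KafkaTopicPartition("someTopic", 0), 19L);
        specificOffsets.put(new KafkaTopicPartition("someTopic", 2), 22L);

        FlinkKafkaConsumer09<String> consumer =
                new FlinkKafkaConsumer09<>("someTopic", new SimpleStringSchema(), props);
        // partitions 0 and 2 start at the given offsets; all other partitions fall back to group offsets
        consumer.setStartFromSpecificOffsets(specificOffsets);

        env.addSource(consumer).print();
        env.execute("specific-offsets sketch");
    }
}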
use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.
the class KafkaConsumerTestBase method runKeyValueTest.
public void runKeyValueTest() throws Exception {
final String topic = "keyvaluetest";
createTestTopic(topic, 1, 1);
final int ELEMENT_COUNT = 5000;
// ----------- Write some data into Kafka -------------------
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setParallelism(1);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
@Override
public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
Random rnd = new Random(1337);
for (long i = 0; i < ELEMENT_COUNT; i++) {
PojoValue pojo = new PojoValue();
pojo.when = new Date(rnd.nextLong());
pojo.lon = rnd.nextLong();
pojo.lat = i;
// make every second key null to ensure proper "null" serialization
Long key = (i % 2 == 0) ? null : i;
ctx.collect(new Tuple2<>(key, pojo));
}
}
@Override
public void cancel() {
}
});
KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
producerProperties.setProperty("retries", "3");
kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
env.execute("Write KV to Kafka");
// ----------- Read the data again -------------------
env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setParallelism(1);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
KeyedDeserializationSchema<Tuple2<Long, PojoValue>> readSchema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
Properties props = new Properties();
props.putAll(standardProps);
props.putAll(secureProps);
DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));
fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
long counter = 0;
@Override
public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
// the elements should be in order.
Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
if (value.f1.lat % 2 == 0) {
assertNull("key was not null", value.f0);
} else {
Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
}
counter++;
if (counter == ELEMENT_COUNT) {
// we got the right number of elements
throw new SuccessException();
}
}
});
tryExecute(env, "Read KV from Kafka");
deleteTestTopic(topic);
}
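TypeInformationKeyValueSerializationSchema is what carries both the Kafka key and the Kafka value through the pipeline above. A minimal standalone sketch of reading keyed records with it, assuming a Long key and a String value; the topic name, broker address, group id and the FlinkKafkaConsumer09 class are illustrative:

import java.util.Properties;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
import org.apache.flink.streaming.util.serialization.TypeInformationKeyValueSerializationSchema;

public class KeyValueSchemaSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // (de)serializes the Kafka key as Long and the value as String with Flink's own serializers;
        // a null key (as produced in the test above) round-trips as null
        TypeInformationKeyValueSerializationSchema<Long, String> schema =
                new TypeInformationKeyValueSerializationSchema<>(Long.class, String.class, env.getConfig());

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker address
        props.setProperty("group.id", "kv-schema-sketch");        // illustrative group id

        // the schema is a KeyedDeserializationSchema, so it plugs into the consumer directly
        DataStream<Tuple2<Long, String>> records =
                env.addSource(new FlinkKafkaConsumer09<>("keyedTopic", schema, props));

        records.print();
        env.execute("key/value schema sketch");
    }
}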
use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.
the class KafkaConsumerTestBase method runBigRecordTestTopology.
/**
* Tests Flink's Kafka integration with very large records (the topology below produces records of 7-14 MB each);
* see http://stackoverflow.com/questions/21020347/kafka-sending-a-15mb-message
*
*/
public void runBigRecordTestTopology() throws Exception {
final String topic = "bigRecordTestTopic";
// keep parallelism at 1; otherwise, the kafka mini clusters may run out of heap space
final int parallelism = 1;
createTestTopic(topic, parallelism, 1);
final TypeInformation<Tuple2<Long, byte[]>> longBytesInfo = TypeInfoParser.parse("Tuple2<Long, byte[]>");
final TypeInformationSerializationSchema<Tuple2<Long, byte[]>> serSchema = new TypeInformationSerializationSchema<>(longBytesInfo, new ExecutionConfig());
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
env.enableCheckpointing(100);
env.setParallelism(parallelism);
// add consuming topology:
Properties consumerProps = new Properties();
consumerProps.putAll(standardProps);
consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
// for the new fetcher
consumerProps.setProperty("max.partition.fetch.bytes", Integer.toString(1024 * 1024 * 14));
consumerProps.setProperty("queued.max.message.chunks", "1");
consumerProps.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Long, byte[]>> source = kafkaServer.getConsumer(topic, serSchema, consumerProps);
DataStreamSource<Tuple2<Long, byte[]>> consuming = env.addSource(source);
consuming.addSink(new SinkFunction<Tuple2<Long, byte[]>>() {
private int elCnt = 0;
@Override
public void invoke(Tuple2<Long, byte[]> value) throws Exception {
elCnt++;
if (value.f0 == -1) {
// we should have seen 11 elements now.
if (elCnt == 11) {
throw new SuccessException();
} else {
throw new RuntimeException("There have been " + elCnt + " elements");
}
}
if (elCnt > 10) {
throw new RuntimeException("More than 10 elements seen: " + elCnt);
}
}
});
// add producing topology
Properties producerProps = new Properties();
producerProps.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
producerProps.setProperty("retries", "3");
producerProps.putAll(secureProps);
producerProps.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerConnectionStrings);
DataStream<Tuple2<Long, byte[]>> stream = env.addSource(new RichSourceFunction<Tuple2<Long, byte[]>>() {
private boolean running;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
running = true;
}
@Override
public void run(SourceContext<Tuple2<Long, byte[]>> ctx) throws Exception {
Random rnd = new Random();
long cnt = 0;
int sevenMb = 1024 * 1024 * 7;
while (running) {
byte[] wl = new byte[sevenMb + rnd.nextInt(sevenMb)];
ctx.collect(new Tuple2<>(cnt++, wl));
Thread.sleep(100);
if (cnt == 10) {
// signal end
ctx.collect(new Tuple2<>(-1L, new byte[] { 1 }));
break;
}
}
}
@Override
public void cancel() {
running = false;
}
});
kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(serSchema), producerProps, null);
tryExecute(env, "big topology test");
deleteTestTopic(topic);
}
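Moving records of this size through Kafka requires raising size limits on the producer, the consumer and the broker; the test only sets the client-side properties because the embedded broker is configured elsewhere. A sketch of the client-side knobs, mirroring the values used above (the broker address is illustrative, and broker-side limits such as message.max.bytes must be raised in the broker configuration itself):

import java.util.Properties;

public class BigRecordPropertiesSketch {

    public static void main(String[] args) {
        // producer side: allow requests of up to 15 MB and retry transient broker errors
        Properties producerProps = new Properties();
        producerProps.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker address
        producerProps.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
        producerProps.setProperty("retries", "3");

        // consumer side: allow fetches of up to 14 MB per partition
        Properties consumerProps = new Properties();
        consumerProps.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker address
        // setting for the old high-level consumer
        consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
        // setting for the new consumer
        consumerProps.setProperty("max.partition.fetch.bytes", Integer.toString(1024 * 1024 * 14));

        System.out.println("producer properties: " + producerProps);
        System.out.println("consumer properties: " + consumerProps);
    }
}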
use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.
the class KafkaConsumerTestBase method runCancelingOnEmptyInputTest.
/**
* Tests that the source can be properly canceled when reading empty partitions.
*/
public void runCancelingOnEmptyInputTest() throws Exception {
final String topic = "cancelingOnEmptyInputTopic";
final int parallelism = 3;
createTestTopic(topic, parallelism, 1);
final AtomicReference<Throwable> error = new AtomicReference<>();
final Runnable jobRunner = new Runnable() {
@Override
public void run() {
try {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setParallelism(parallelism);
env.enableCheckpointing(100);
env.getConfig().disableSysoutLogging();
Properties props = new Properties();
props.putAll(standardProps);
props.putAll(secureProps);
FlinkKafkaConsumerBase<String> source = kafkaServer.getConsumer(topic, new SimpleStringSchema(), props);
env.addSource(source).addSink(new DiscardingSink<String>());
env.execute("CancelingOnEmptyInputTest");
} catch (Throwable t) {
LOG.error("Job Runner failed with exception", t);
error.set(t);
}
}
};
Thread runnerThread = new Thread(jobRunner, "program runner thread");
runnerThread.start();
// wait a bit before canceling
Thread.sleep(2000);
Throwable failureCause = error.get();
if (failureCause != null) {
failureCause.printStackTrace();
Assert.fail("Test failed prematurely with: " + failureCause.getMessage());
}
// cancel
JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
// wait for the program to be done and validate that we failed with the right exception
runnerThread.join();
failureCause = error.get();
assertNotNull("program did not fail properly due to canceling", failureCause);
assertTrue(failureCause.getMessage().contains("Job was cancelled"));
deleteTestTopic(topic);
}
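Both this test and runAutoOffsetRetrievalAndCommitToKafka need to tell a deliberate cancellation apart from a genuine failure. A small hypothetical helper sketching that check by walking the cause chain; it is not part of KafkaConsumerTestBase:

import org.apache.flink.runtime.client.JobCancellationException;

public final class CancellationCheck {

    private CancellationCheck() {
    }

    // returns true if any throwable in the cause chain signals that the job was deliberately cancelled
    public static boolean isCancellation(Throwable t) {
        for (Throwable cause = t; cause != null; cause = cause.getCause()) {
            if (cause instanceof JobCancellationException) {
                return true;
            }
        }
        return false;
    }
}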