use of org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method readSequence.
// ------------------------------------------------------------------------
// Reading writing test data sets
// ------------------------------------------------------------------------
/**
* Runs a job using the provided environment to read a sequence of records from a single Kafka topic.
* The method allows to individually specify the expected starting offset and total read value count of each partition.
* The job will be considered successful only if all partition read results match the start offset and value count criteria.
*/
protected void readSequence(final StreamExecutionEnvironment env, final StartupMode startupMode, final Map<KafkaTopicPartition, Long> specificStartupOffsets, final Properties cc, final String topicName, final Map<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset) throws Exception {
final int sourceParallelism = partitionsToValuesCountAndStartOffset.keySet().size();
int finalCountTmp = 0;
for (Map.Entry<Integer, Tuple2<Integer, Integer>> valuesCountAndStartOffset : partitionsToValuesCountAndStartOffset.entrySet()) {
finalCountTmp += valuesCountAndStartOffset.getValue().f0;
}
final int finalCount = finalCountTmp;
final TypeInformation<Tuple2<Integer, Integer>> intIntTupleType = TypeInfoParser.parse("Tuple2<Integer, Integer>");
final TypeInformationSerializationSchema<Tuple2<Integer, Integer>> deser = new TypeInformationSerializationSchema<>(intIntTupleType, env.getConfig());
// create the consumer
cc.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deser, cc);
switch(startupMode) {
case EARLIEST:
consumer.setStartFromEarliest();
break;
case LATEST:
consumer.setStartFromLatest();
break;
case SPECIFIC_OFFSETS:
consumer.setStartFromSpecificOffsets(specificStartupOffsets);
break;
case GROUP_OFFSETS:
consumer.setStartFromGroupOffsets();
break;
}
DataStream<Tuple2<Integer, Integer>> source = env.addSource(consumer).setParallelism(sourceParallelism).map(new ThrottledMapper<Tuple2<Integer, Integer>>(20)).setParallelism(sourceParallelism);
// verify data
source.flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() {
private HashMap<Integer, BitSet> partitionsToValueCheck;
private int count = 0;
@Override
public void open(Configuration parameters) throws Exception {
partitionsToValueCheck = new HashMap<>();
for (Integer partition : partitionsToValuesCountAndStartOffset.keySet()) {
partitionsToValueCheck.put(partition, new BitSet());
}
}
@Override
public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception {
int partition = value.f0;
int val = value.f1;
BitSet bitSet = partitionsToValueCheck.get(partition);
if (bitSet == null) {
throw new RuntimeException("Got a record from an unknown partition");
} else {
bitSet.set(val - partitionsToValuesCountAndStartOffset.get(partition).f1);
}
count++;
LOG.info("Received message {}, total {} messages", value, count);
// verify if we've seen everything
if (count == finalCount) {
for (Map.Entry<Integer, BitSet> partitionsToValueCheck : this.partitionsToValueCheck.entrySet()) {
BitSet check = partitionsToValueCheck.getValue();
int expectedValueCount = partitionsToValuesCountAndStartOffset.get(partitionsToValueCheck.getKey()).f0;
if (check.cardinality() != expectedValueCount) {
throw new RuntimeException("Expected cardinality to be " + expectedValueCount + ", but was " + check.cardinality());
} else if (check.nextClearBit(0) != expectedValueCount) {
throw new RuntimeException("Expected next clear bit to be " + expectedValueCount + ", but was " + check.cardinality());
}
}
// test has passed
throw new SuccessException();
}
}
}).setParallelism(1);
tryExecute(env, "Read data from Kafka");
LOG.info("Successfully read sequence for verification");
}
use of org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runSimpleConcurrentProducerConsumerTopology.
/**
* Ensure Kafka is working on both producer and consumer side.
* This executes a job that contains two Flink pipelines.
*
* <pre>
* (generator source) --> (kafka sink)-[KAFKA-TOPIC]-(kafka source) --> (validating sink)
* </pre>
*
* We need to externally retry this test. We cannot let Flink's retry mechanism do it, because the Kafka producer
* does not guarantee exactly-once output. Hence a recovery would introduce duplicates that
* cause the test to fail.
*
* This test also ensures that FLINK-3156 doesn't happen again:
*
* The following situation caused a NPE in the FlinkKafkaConsumer
*
* topic-1 <-- elements are only produced into topic1.
* topic-2
*
* Therefore, this test is consuming as well from an empty topic.
*
*/
@RetryOnException(times = 2, exception = kafka.common.NotLeaderForPartitionException.class)
public void runSimpleConcurrentProducerConsumerTopology() throws Exception {
final String topic = "concurrentProducerConsumerTopic_" + UUID.randomUUID().toString();
final String additionalEmptyTopic = "additionalEmptyTopic_" + UUID.randomUUID().toString();
final int parallelism = 3;
final int elementsPerPartition = 100;
final int totalElements = parallelism * elementsPerPartition;
createTestTopic(topic, parallelism, 2);
// create an empty topic which will remain empty all the time
createTestTopic(additionalEmptyTopic, parallelism, 1);
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setParallelism(parallelism);
env.enableCheckpointing(500);
// fail immediately
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
TypeInformation<Tuple2<Long, String>> longStringType = TypeInfoParser.parse("Tuple2<Long, String>");
TypeInformationSerializationSchema<Tuple2<Long, String>> sourceSchema = new TypeInformationSerializationSchema<>(longStringType, env.getConfig());
TypeInformationSerializationSchema<Tuple2<Long, String>> sinkSchema = new TypeInformationSerializationSchema<>(longStringType, env.getConfig());
// ----------- add producer dataflow ----------
DataStream<Tuple2<Long, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple2<Long, String>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Long, String>> ctx) throws InterruptedException {
int cnt = getRuntimeContext().getIndexOfThisSubtask() * elementsPerPartition;
int limit = cnt + elementsPerPartition;
while (running && cnt < limit) {
ctx.collect(new Tuple2<>(1000L + cnt, "kafka-" + cnt));
cnt++;
// we delay data generation a bit so that we are sure that some checkpoints are
// triggered (for FLINK-3156)
Thread.sleep(50);
}
}
@Override
public void cancel() {
running = false;
}
});
Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
producerProperties.setProperty("retries", "3");
producerProperties.putAll(secureProps);
kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(sinkSchema), producerProperties, null);
// ----------- add consumer dataflow ----------
List<String> topics = new ArrayList<>();
topics.add(topic);
topics.add(additionalEmptyTopic);
Properties props = new Properties();
props.putAll(standardProps);
props.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Long, String>> source = kafkaServer.getConsumer(topics, sourceSchema, props);
DataStreamSource<Tuple2<Long, String>> consuming = env.addSource(source).setParallelism(parallelism);
consuming.addSink(new RichSinkFunction<Tuple2<Long, String>>() {
private int elCnt = 0;
private BitSet validator = new BitSet(totalElements);
@Override
public void invoke(Tuple2<Long, String> value) throws Exception {
String[] sp = value.f1.split("-");
int v = Integer.parseInt(sp[1]);
assertEquals(value.f0 - 1000, (long) v);
assertFalse("Received tuple twice", validator.get(v));
validator.set(v);
elCnt++;
if (elCnt == totalElements) {
// check if everything in the bitset is set to true
int nc;
if ((nc = validator.nextClearBit(0)) != totalElements) {
fail("The bitset was not set to 1 on all elements. Next clear:" + nc + " Set: " + validator);
}
throw new SuccessException();
}
}
@Override
public void close() throws Exception {
super.close();
}
}).setParallelism(1);
try {
tryExecutePropagateExceptions(env, "runSimpleConcurrentProducerConsumerTopology");
} catch (ProgramInvocationException | JobExecutionException e) {
// look for NotLeaderForPartitionException
Throwable cause = e.getCause();
// search for nested SuccessExceptions
int depth = 0;
while (cause != null && depth++ < 20) {
if (cause instanceof kafka.common.NotLeaderForPartitionException) {
throw (Exception) cause;
}
cause = cause.getCause();
}
throw e;
}
deleteTestTopic(topic);
}
use of org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runMetricsTest.
/**
* Test metrics reporting for consumer
*
* @throws Exception
*/
public void runMetricsTest() throws Throwable {
// create a stream with 5 topics
final String topic = "metricsStream";
createTestTopic(topic, 5, 1);
final Tuple1<Throwable> error = new Tuple1<>(null);
Runnable job = new Runnable() {
@Override
public void run() {
try {
// start job writing & reading data.
final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env1.setParallelism(1);
env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
env1.getConfig().disableSysoutLogging();
// let the source read everything into the network buffers
env1.disableOperatorChaining();
Properties props = new Properties();
props.putAll(standardProps);
props.putAll(secureProps);
TypeInformationSerializationSchema<Tuple2<Integer, Integer>> schema = new TypeInformationSerializationSchema<>(TypeInfoParser.<Tuple2<Integer, Integer>>parse("Tuple2<Integer, Integer>"), env1.getConfig());
DataStream<Tuple2<Integer, Integer>> fromKafka = env1.addSource(kafkaServer.getConsumer(topic, schema, standardProps));
fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
@Override
public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
// no op
}
});
DataStream<Tuple2<Integer, Integer>> fromGen = env1.addSource(new RichSourceFunction<Tuple2<Integer, Integer>>() {
boolean running = true;
@Override
public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
int i = 0;
while (running) {
ctx.collect(Tuple2.of(i++, getRuntimeContext().getIndexOfThisSubtask()));
Thread.sleep(1);
}
}
@Override
public void cancel() {
running = false;
}
});
kafkaServer.produceIntoKafka(fromGen, topic, new KeyedSerializationSchemaWrapper<>(schema), standardProps, null);
env1.execute("Metrics test job");
} catch (Throwable t) {
LOG.warn("Got exception during execution", t);
if (!(t.getCause() instanceof JobCancellationException)) {
// we'll cancel the job
error.f0 = t;
}
}
}
};
Thread jobThread = new Thread(job);
jobThread.start();
try {
// connect to JMX
MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
// wait until we've found all 5 offset metrics
Set<ObjectName> offsetMetrics = mBeanServer.queryNames(new ObjectName("*current-offsets*:*"), null);
while (offsetMetrics.size() < 5) {
// test will time out if metrics are not properly working
if (error.f0 != null) {
// fail test early
throw error.f0;
}
offsetMetrics = mBeanServer.queryNames(new ObjectName("*current-offsets*:*"), null);
Thread.sleep(50);
}
Assert.assertEquals(5, offsetMetrics.size());
// The test will fail if we never meet the condition
while (true) {
int numPosOffsets = 0;
// check that offsets are correctly reported
for (ObjectName object : offsetMetrics) {
Object offset = mBeanServer.getAttribute(object, "Value");
if ((long) offset >= 0) {
numPosOffsets++;
}
}
if (numPosOffsets == 5) {
break;
}
// wait for the consumer to consume on all partitions
Thread.sleep(50);
}
// check if producer metrics are also available.
Set<ObjectName> producerMetrics = mBeanServer.queryNames(new ObjectName("*KafkaProducer*:*"), null);
Assert.assertTrue("No producer metrics found", producerMetrics.size() > 30);
LOG.info("Found all JMX metrics. Cancelling job.");
} finally {
// cancel
JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
}
while (jobThread.isAlive()) {
Thread.sleep(50);
}
if (error.f0 != null) {
throw error.f0;
}
deleteTestTopic(topic);
}
use of org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runBigRecordTestTopology.
/**
* Test Flink's Kafka integration also with very big records (30MB)
* see http://stackoverflow.com/questions/21020347/kafka-sending-a-15mb-message
*
*/
public void runBigRecordTestTopology() throws Exception {
final String topic = "bigRecordTestTopic";
// otherwise, the kafka mini clusters may run out of heap space
final int parallelism = 1;
createTestTopic(topic, parallelism, 1);
final TypeInformation<Tuple2<Long, byte[]>> longBytesInfo = TypeInfoParser.parse("Tuple2<Long, byte[]>");
final TypeInformationSerializationSchema<Tuple2<Long, byte[]>> serSchema = new TypeInformationSerializationSchema<>(longBytesInfo, new ExecutionConfig());
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
env.enableCheckpointing(100);
env.setParallelism(parallelism);
// add consuming topology:
Properties consumerProps = new Properties();
consumerProps.putAll(standardProps);
consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
// for the new fetcher
consumerProps.setProperty("max.partition.fetch.bytes", Integer.toString(1024 * 1024 * 14));
consumerProps.setProperty("queued.max.message.chunks", "1");
consumerProps.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Long, byte[]>> source = kafkaServer.getConsumer(topic, serSchema, consumerProps);
DataStreamSource<Tuple2<Long, byte[]>> consuming = env.addSource(source);
consuming.addSink(new SinkFunction<Tuple2<Long, byte[]>>() {
private int elCnt = 0;
@Override
public void invoke(Tuple2<Long, byte[]> value) throws Exception {
elCnt++;
if (value.f0 == -1) {
// we should have seen 11 elements now.
if (elCnt == 11) {
throw new SuccessException();
} else {
throw new RuntimeException("There have been " + elCnt + " elements");
}
}
if (elCnt > 10) {
throw new RuntimeException("More than 10 elements seen: " + elCnt);
}
}
});
// add producing topology
Properties producerProps = new Properties();
producerProps.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
producerProps.setProperty("retries", "3");
producerProps.putAll(secureProps);
producerProps.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerConnectionStrings);
DataStream<Tuple2<Long, byte[]>> stream = env.addSource(new RichSourceFunction<Tuple2<Long, byte[]>>() {
private boolean running;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
running = true;
}
@Override
public void run(SourceContext<Tuple2<Long, byte[]>> ctx) throws Exception {
Random rnd = new Random();
long cnt = 0;
int sevenMb = 1024 * 1024 * 7;
while (running) {
byte[] wl = new byte[sevenMb + rnd.nextInt(sevenMb)];
ctx.collect(new Tuple2<>(cnt++, wl));
Thread.sleep(100);
if (cnt == 10) {
// signal end
ctx.collect(new Tuple2<>(-1L, new byte[] { 1 }));
break;
}
}
}
@Override
public void cancel() {
running = false;
}
});
kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(serSchema), producerProps, null);
tryExecute(env, "big topology test");
deleteTestTopic(topic);
}
use of org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaProducerTestBase method runCustomPartitioningTest.
/**
*
* <pre>
* +------> (sink) --+--> [KAFKA-1] --> (source) -> (map) --+
* / | \
* / | \
* (source) ----------> (sink) --+--> [KAFKA-2] --> (source) -> (map) -----+-> (sink)
* \ | /
* \ | /
* +------> (sink) --+--> [KAFKA-3] --> (source) -> (map) --+
* </pre>
*
* The mapper validates that the values come consistently from the correct Kafka partition.
*
* The final sink validates that there are no duplicates and that all partitions are present.
*/
public void runCustomPartitioningTest() {
try {
LOG.info("Starting KafkaProducerITCase.testCustomPartitioning()");
final String topic = "customPartitioningTestTopic";
final int parallelism = 3;
createTestTopic(topic, parallelism, 1);
TypeInformation<Tuple2<Long, String>> longStringInfo = TypeInfoParser.parse("Tuple2<Long, String>");
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
TypeInformationSerializationSchema<Tuple2<Long, String>> serSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
TypeInformationSerializationSchema<Tuple2<Long, String>> deserSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
// ------ producing topology ---------
// source has DOP 1 to make sure it generates no duplicates
DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception {
long cnt = 0;
while (running) {
ctx.collect(new Tuple2<Long, String>(cnt, "kafka-" + cnt));
cnt++;
}
}
@Override
public void cancel() {
running = false;
}
}).setParallelism(1);
Properties props = new Properties();
props.putAll(FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings));
props.putAll(secureProps);
// sink partitions into
kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(serSchema), props, new CustomPartitioner(parallelism)).setParallelism(parallelism);
// ------ consuming topology ---------
Properties consumerProps = new Properties();
consumerProps.putAll(standardProps);
consumerProps.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Long, String>> source = kafkaServer.getConsumer(topic, deserSchema, consumerProps);
env.addSource(source).setParallelism(parallelism).map(new RichMapFunction<Tuple2<Long, String>, Integer>() {
private int ourPartition = -1;
@Override
public Integer map(Tuple2<Long, String> value) {
int partition = value.f0.intValue() % parallelism;
if (ourPartition != -1) {
assertEquals("inconsistent partitioning", ourPartition, partition);
} else {
ourPartition = partition;
}
return partition;
}
}).setParallelism(parallelism).addSink(new SinkFunction<Integer>() {
private int[] valuesPerPartition = new int[parallelism];
@Override
public void invoke(Integer value) throws Exception {
valuesPerPartition[value]++;
boolean missing = false;
for (int i : valuesPerPartition) {
if (i < 100) {
missing = true;
break;
}
}
if (!missing) {
throw new SuccessException();
}
}
}).setParallelism(1);
tryExecute(env, "custom partitioning test");
deleteTestTopic(topic);
LOG.info("Finished KafkaProducerITCase.testCustomPartitioning()");
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
Aggregations