use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runMetricsTest.
/**
* Tests metrics reporting for the consumer.
*
* @throws Throwable if the job fails or the expected metrics never show up
*/
public void runMetricsTest() throws Throwable {
// create one topic with 5 partitions
final String topic = "metricsStream";
createTestTopic(topic, 5, 1);
final Tuple1<Throwable> error = new Tuple1<>(null);
// start job writing & reading data.
final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
env1.setParallelism(1);
env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
// let the source read everything into the network buffers
env1.disableOperatorChaining();
TypeInformationSerializationSchema<Tuple2<Integer, Integer>> schema = new TypeInformationSerializationSchema<>(TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
}), env1.getConfig());
DataStream<Tuple2<Integer, Integer>> fromKafka = getStream(env1, topic, schema, standardProps);
fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
@Override
public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
// no op
}
});
DataStream<Tuple2<Integer, Integer>> fromGen = env1.addSource(new RichSourceFunction<Tuple2<Integer, Integer>>() {
boolean running = true;
@Override
public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
int i = 0;
while (running) {
ctx.collect(Tuple2.of(i++, getRuntimeContext().getIndexOfThisSubtask()));
Thread.sleep(1);
}
}
@Override
public void cancel() {
running = false;
}
});
kafkaServer.produceIntoKafka(fromGen, topic, schema, standardProps, null);
JobGraph jobGraph = StreamingJobGraphGenerator.createJobGraph(env1.getStreamGraph());
final JobID jobId = jobGraph.getJobID();
Thread jobThread = new Thread(() -> {
try {
submitJobAndWaitForResult(client, jobGraph, getClass().getClassLoader());
} catch (Throwable t) {
if (!ExceptionUtils.findThrowable(t, JobCancellationException.class).isPresent()) {
LOG.warn("Got exception during execution", t);
error.f0 = t;
}
}
});
jobThread.start();
try {
// connect to JMX
MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
// wait until we've found all 5 offset metrics
Set<ObjectName> offsetMetrics = mBeanServer.queryNames(new ObjectName("*current-offsets*:*"), null);
while (offsetMetrics.size() < 5) {
// test will time out if metrics are not properly working
if (error.f0 != null) {
// fail test early
throw error.f0;
}
offsetMetrics = mBeanServer.queryNames(new ObjectName("*current-offsets*:*"), null);
Thread.sleep(50);
}
Assert.assertEquals(5, offsetMetrics.size());
// The test will fail if we never meet the condition
while (true) {
int numPosOffsets = 0;
// check that offsets are correctly reported
for (ObjectName object : offsetMetrics) {
Object offset = mBeanServer.getAttribute(object, "Value");
if ((long) offset >= 0) {
numPosOffsets++;
}
}
if (numPosOffsets == 5) {
break;
}
// wait for the consumer to consume on all partitions
Thread.sleep(50);
}
// check if producer metrics are also available.
Set<ObjectName> producerMetrics = mBeanServer.queryNames(new ObjectName("*KafkaProducer*:*"), null);
Assert.assertTrue("No producer metrics found", producerMetrics.size() > 30);
LOG.info("Found all JMX metrics. Cancelling job.");
} finally {
// cancel
client.cancel(jobId).get();
// wait for the job to finish (it should due to the cancel command above)
jobThread.join();
}
if (error.f0 != null) {
throw error.f0;
}
deleteTestTopic(topic);
}
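For reference, here is a minimal, self-contained sketch of the serialization round-trip the test relies on; the same TypeInformationSerializationSchema type is handed to both the Kafka source and the Kafka producer above. The class name SchemaRoundTripSketch is illustrative, not part of the Flink tests.
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;

import java.io.IOException;

public class SchemaRoundTripSketch {

    public static void main(String[] args) throws IOException {
        // build the schema from the type information, exactly as the test does
        TypeInformationSerializationSchema<Tuple2<Integer, Integer>> schema =
                new TypeInformationSerializationSchema<>(
                        TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {}),
                        new ExecutionConfig());

        // serialize() produces the byte[] written to Kafka; deserialize() restores the record
        byte[] bytes = schema.serialize(Tuple2.of(42, 0));
        Tuple2<Integer, Integer> restored = schema.deserialize(bytes);

        System.out.println(restored); // prints (42,0)
    }
}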
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runBigRecordTestTopology.
/**
* Tests Flink's Kafka integration also with very big records (up to roughly 14 MB each).
*
* <p>see http://stackoverflow.com/questions/21020347/kafka-sending-a-15mb-message
*/
public void runBigRecordTestTopology() throws Exception {
final String topic = "bigRecordTestTopic";
// use a parallelism of 1; otherwise, the Kafka mini clusters may run out of heap space
final int parallelism = 1;
createTestTopic(topic, parallelism, 1);
final TypeInformation<Tuple2<Long, byte[]>> longBytesInfo = TypeInformation.of(new TypeHint<Tuple2<Long, byte[]>>() {
});
final TypeInformationSerializationSchema<Tuple2<Long, byte[]>> serSchema = new TypeInformationSerializationSchema<>(longBytesInfo, new ExecutionConfig());
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRestartStrategy(RestartStrategies.noRestart());
env.enableCheckpointing(100);
env.setParallelism(parallelism);
// add consuming topology:
Properties consumerProps = new Properties();
consumerProps.putAll(standardProps);
consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
consumerProps.setProperty("max.partition.fetch.bytes", // for the new fetcher
Integer.toString(1024 * 1024 * 14));
consumerProps.setProperty("queued.max.message.chunks", "1");
consumerProps.putAll(secureProps);
DataStreamSource<Tuple2<Long, byte[]>> consuming = getStream(env, topic, serSchema, consumerProps);
consuming.addSink(new SinkFunction<Tuple2<Long, byte[]>>() {
private int elCnt = 0;
@Override
public void invoke(Tuple2<Long, byte[]> value) throws Exception {
elCnt++;
if (value.f0 == -1) {
// we should have seen 11 elements now.
if (elCnt == 11) {
throw new SuccessException();
} else {
throw new RuntimeException("There have been " + elCnt + " elements");
}
}
if (elCnt > 10) {
throw new RuntimeException("More than 10 elements seen: " + elCnt);
}
}
});
// add producing topology
Properties producerProps = new Properties();
producerProps.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
producerProps.setProperty("retries", "3");
producerProps.putAll(secureProps);
producerProps.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerConnectionStrings);
DataStream<Tuple2<Long, byte[]>> stream = env.addSource(new RichSourceFunction<Tuple2<Long, byte[]>>() {
private boolean running;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
running = true;
}
@Override
public void run(SourceContext<Tuple2<Long, byte[]>> ctx) throws Exception {
Random rnd = new Random();
long cnt = 0;
int sevenMb = 1024 * 1024 * 7;
while (running) {
byte[] wl = new byte[sevenMb + rnd.nextInt(sevenMb)];
ctx.collect(new Tuple2<>(cnt++, wl));
Thread.sleep(100);
if (cnt == 10) {
// signal end
ctx.collect(new Tuple2<>(-1L, new byte[] { 1 }));
break;
}
}
}
@Override
public void cancel() {
running = false;
}
});
kafkaServer.produceIntoKafka(stream, topic, serSchema, producerProps, null);
tryExecute(env, "big topology test");
deleteTestTopic(topic);
}
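The test only tunes the consumer fetch size and the producer request size; for records this large the broker must be configured to accept them as well. A hedged sketch of the size-related settings involved follows; the broker-side keys are standard Kafka options that are not part of the test code above, and the values are illustrative.
import java.util.Properties;

public class BigRecordConfigSketch {

    public static void main(String[] args) {
        int maxRecordBytes = 1024 * 1024 * 15;

        // broker: must accept and replicate messages of this size
        Properties brokerProps = new Properties();
        brokerProps.setProperty("message.max.bytes", Integer.toString(maxRecordBytes));
        brokerProps.setProperty("replica.fetch.max.bytes", Integer.toString(maxRecordBytes));

        // producer: must be allowed to send requests of this size
        Properties producerProps = new Properties();
        producerProps.setProperty("max.request.size", Integer.toString(maxRecordBytes));

        // consumer: must be able to fetch a whole record from a single partition
        Properties consumerProps = new Properties();
        consumerProps.setProperty("max.partition.fetch.bytes", Integer.toString(maxRecordBytes));

        System.out.println(brokerProps + "\n" + producerProps + "\n" + consumerProps);
    }
}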
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runSimpleConcurrentProducerConsumerTopology.
/**
* Ensure Kafka is working on both producer and consumer side. This executes a job that contains
* two Flink pipelines.
*
* <pre>
* (generator source) --> (kafka sink)-[KAFKA-TOPIC]-(kafka source) --> (validating sink)
* </pre>
*
* <p>We need to externally retry this test. We cannot let Flink's retry mechanism do it,
* because the Kafka producer does not guarantee exactly-once output. Hence a recovery would
* introduce duplicates that cause the test to fail.
*
* <p>This test also ensures that FLINK-3156 doesn't happen again:
*
* <p>The following situation caused an NPE in the FlinkKafkaConsumer: elements are only
* produced into topic-1, while topic-2 stays empty.
*
* <p>Therefore, this test also consumes from an empty topic.
*/
@RetryOnException(times = 2, exception = NotLeaderForPartitionException.class)
public void runSimpleConcurrentProducerConsumerTopology() throws Exception {
final String topic = "concurrentProducerConsumerTopic_" + UUID.randomUUID().toString();
final String additionalEmptyTopic = "additionalEmptyTopic_" + UUID.randomUUID().toString();
final int parallelism = 3;
final int elementsPerPartition = 100;
final int totalElements = parallelism * elementsPerPartition;
createTestTopic(topic, parallelism, 1);
// create an empty topic which will remain empty all the time
createTestTopic(additionalEmptyTopic, parallelism, 1);
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(parallelism);
env.enableCheckpointing(500);
// fail immediately
env.setRestartStrategy(RestartStrategies.noRestart());
TypeInformation<Tuple2<Long, String>> longStringType = TypeInformation.of(new TypeHint<Tuple2<Long, String>>() {
});
TypeInformationSerializationSchema<Tuple2<Long, String>> sourceSchema = new TypeInformationSerializationSchema<>(longStringType, env.getConfig());
TypeInformationSerializationSchema<Tuple2<Long, String>> sinkSchema = new TypeInformationSerializationSchema<>(longStringType, env.getConfig());
// ----------- add producer dataflow ----------
DataStream<Tuple2<Long, String>> stream = env.addSource(new RichParallelSourceFunction<Tuple2<Long, String>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Long, String>> ctx) throws InterruptedException {
int cnt = getRuntimeContext().getIndexOfThisSubtask() * elementsPerPartition;
int limit = cnt + elementsPerPartition;
while (running && cnt < limit) {
ctx.collect(new Tuple2<>(1000L + cnt, "kafka-" + cnt));
cnt++;
// we delay data generation a bit so that we are sure that some checkpoints are triggered (for FLINK-3156)
Thread.sleep(50);
}
}
@Override
public void cancel() {
running = false;
}
});
Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
producerProperties.setProperty("retries", "3");
producerProperties.putAll(secureProps);
kafkaServer.produceIntoKafka(stream, topic, sinkSchema, producerProperties, null);
// ----------- add consumer dataflow ----------
List<String> topics = new ArrayList<>();
topics.add(topic);
topics.add(additionalEmptyTopic);
Properties props = new Properties();
props.putAll(standardProps);
props.putAll(secureProps);
DataStreamSource<Tuple2<Long, String>> consuming = getStream(env, topics, sourceSchema, props);
consuming.addSink(new RichSinkFunction<Tuple2<Long, String>>() {
private int elCnt = 0;
private BitSet validator = new BitSet(totalElements);
@Override
public void invoke(Tuple2<Long, String> value) throws Exception {
String[] sp = value.f1.split("-");
int v = Integer.parseInt(sp[1]);
assertEquals(value.f0 - 1000, (long) v);
assertFalse("Received tuple twice", validator.get(v));
validator.set(v);
elCnt++;
if (elCnt == totalElements) {
// check if everything in the bitset is set to true
int nc;
if ((nc = validator.nextClearBit(0)) != totalElements) {
fail("The bitset was not set to 1 on all elements. Next clear:" + nc + " Set: " + validator);
}
throw new SuccessException();
}
}
@Override
public void close() throws Exception {
super.close();
}
}).setParallelism(1);
try {
tryExecutePropagateExceptions(env, "runSimpleConcurrentProducerConsumerTopology");
} catch (ProgramInvocationException | JobExecutionException e) {
// look for a nested NotLeaderForPartitionException by walking the cause chain up to a bounded depth
Throwable cause = e.getCause();
int depth = 0;
while (cause != null && depth++ < 20) {
if (cause instanceof NotLeaderForPartitionException) {
throw (Exception) cause;
}
cause = cause.getCause();
}
throw e;
}
deleteTestTopic(topic);
}
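The catch block above walks the cause chain by hand. The same check can be expressed with Flink's ExceptionUtils, which the metrics test earlier on this page already uses for JobCancellationException; the helper method below is a sketch with an illustrative name, not part of the test base.
import org.apache.flink.util.ExceptionUtils;
import org.apache.kafka.common.errors.NotLeaderForPartitionException;

import java.util.Optional;

public class CauseChainSketch {

    /** Rethrows a nested NotLeaderForPartitionException if present, otherwise the original error. */
    static void rethrowRetryableCause(Exception e) throws Exception {
        Optional<NotLeaderForPartitionException> notLeader =
                ExceptionUtils.findThrowable(e, NotLeaderForPartitionException.class);
        if (notLeader.isPresent()) {
            // surface the retryable exception directly, e.g. for a @RetryOnException rule
            throw notLeader.get();
        }
        throw e;
    }
}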
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaITCase method testTimestamps.
/**
* Kafka 2.0 specific test, ensuring timestamps are properly written to and read from Kafka.
*/
@Test(timeout = 60000)
public void testTimestamps() throws Exception {
final String topic = "tstopic";
createTestTopic(topic, 3, 1);
// ---------- Produce an event time stream into Kafka -------------------
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
private static final long serialVersionUID = -2255115836471289626L;
boolean running = true;
@Override
public void run(SourceContext<Long> ctx) throws Exception {
long i = 0;
while (running) {
ctx.collectWithTimestamp(i, i * 2);
if (i++ == 1110L) {
running = false;
}
}
}
@Override
public void cancel() {
running = false;
}
});
final TypeInformationSerializationSchema<Long> longSer = new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());
FlinkKafkaProducer<Long> prod = new FlinkKafkaProducer<>(topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps, Optional.of(new FlinkKafkaPartitioner<Long>() {
private static final long serialVersionUID = -6730989584364230617L;
@Override
public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
return (int) (next % 3);
}
}));
prod.setWriteTimestampToKafka(true);
streamWithTimestamps.addSink(prod).setParallelism(3);
env.execute("Produce some");
// ---------- Consume stream from Kafka -------------------
env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
FlinkKafkaConsumer<Long> kafkaSource = new FlinkKafkaConsumer<>(topic, new KafkaITCase.LimitedLongDeserializer(), standardProps);
kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
private static final long serialVersionUID = -4834111173247835189L;
@Nullable
@Override
public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
if (lastElement % 11 == 0) {
return new Watermark(lastElement);
}
return null;
}
@Override
public long extractTimestamp(Long element, long previousElementTimestamp) {
return previousElementTimestamp;
}
});
DataStream<Long> stream = env.addSource(kafkaSource);
GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);
env.execute("Consume again");
deleteTestTopic(topic);
}
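The consuming side plugs in KafkaITCase.LimitedLongDeserializer, which is not reproduced on this page. Below is a sketch of what such a bounded deserializer can look like, assuming it stops after the 1111 elements (0 through 1110) emitted by the producing source above; this is an illustration, not the original class.
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.core.memory.DataInputViewStreamWrapper;
import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerRecord;

import java.io.ByteArrayInputStream;
import java.io.IOException;

public class BoundedLongDeserializer implements KafkaDeserializationSchema<Long> {

    private static final long serialVersionUID = 1L;

    private final TypeInformation<Long> typeInfo = Types.LONG;
    private final TypeSerializer<Long> serializer = typeInfo.createSerializer(new ExecutionConfig());

    private long count = 0;

    @Override
    public boolean isEndOfStream(Long nextElement) {
        // stop consuming once all 1111 produced elements (0..1110) have been seen
        return count > 1110L;
    }

    @Override
    public Long deserialize(ConsumerRecord<byte[], byte[]> record) throws IOException {
        count++;
        // decode the value bytes with Flink's own Long serializer
        return serializer.deserialize(
                new DataInputViewStreamWrapper(new ByteArrayInputStream(record.value())));
    }

    @Override
    public TypeInformation<Long> getProducedType() {
        return typeInfo;
    }
}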
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaProducerTestBase method testCustomPartitioning.
/**
* This test verifies that custom partitioning works correctly, with a default topic and a
* dynamic topic. The number of partitions for each topic is deliberately different.
*
* <p>Test topology:
*
* <pre>
*             +--> (sink) --> [DEFAULT_TOPIC-1] --> (source) --> (map) --+
*             |                                                          |
*             +--> (sink) --> [DEFAULT_TOPIC-2] --> (source) --> (map) --+--> (sink)
*             |
* (source) ---+
*             |
*             +--> (sink) --> [DYNAMIC_TOPIC-1] --> (source) --> (map) --+
*             |                                                          |
*             +--> (sink) --> [DYNAMIC_TOPIC-2] --> (source) --> (map) --+--> (sink)
*             |                                                          |
*             +--> (sink) --> [DYNAMIC_TOPIC-3] --> (source) --> (map) --+
* </pre>
*
* <p>Each topic has an independent mapper that validates that the values arrive consistently
* from the correct Kafka partition of the topic it is responsible for.
*
* <p>Each topic also has a final sink that validates that there are no duplicates and that all
* partitions are present.
*/
@Test
public void testCustomPartitioning() {
try {
LOG.info("Starting KafkaProducerITCase.testCustomPartitioning()");
final String defaultTopic = "defaultTopic";
final int defaultTopicPartitions = 2;
final String dynamicTopic = "dynamicTopic";
final int dynamicTopicPartitions = 3;
createTestTopic(defaultTopic, defaultTopicPartitions, 1);
createTestTopic(dynamicTopic, dynamicTopicPartitions, 1);
Map<String, Integer> expectedTopicsToNumPartitions = new HashMap<>(2);
expectedTopicsToNumPartitions.put(defaultTopic, defaultTopicPartitions);
expectedTopicsToNumPartitions.put(dynamicTopic, dynamicTopicPartitions);
TypeInformation<Tuple2<Long, String>> longStringInfo = TypeInformation.of(new TypeHint<Tuple2<Long, String>>() {
});
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRestartStrategy(RestartStrategies.noRestart());
TypeInformationSerializationSchema<Tuple2<Long, String>> serSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
TypeInformationSerializationSchema<Tuple2<Long, String>> deserSchema = new TypeInformationSerializationSchema<>(longStringInfo, env.getConfig());
// ------ producing topology ---------
// source has DOP 1 to make sure it generates no duplicates
DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() {
private boolean running = true;
@Override
public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception {
long cnt = 0;
while (running) {
ctx.collect(new Tuple2<Long, String>(cnt, "kafka-" + cnt));
cnt++;
if (cnt % 100 == 0) {
Thread.sleep(1);
}
}
}
@Override
public void cancel() {
running = false;
}
}).setParallelism(1);
Properties props = new Properties();
props.putAll(FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings));
props.putAll(secureProps);
// the producing sink writes into both the default topic and the dynamic topic
kafkaServer.produceIntoKafka(stream, defaultTopic, new CustomKeyedSerializationSchemaWrapper(serSchema, defaultTopic, dynamicTopic), props, new CustomPartitioner(expectedTopicsToNumPartitions)).setParallelism(Math.max(defaultTopicPartitions, dynamicTopicPartitions));
// ------ consuming topology ---------
Properties consumerProps = new Properties();
consumerProps.putAll(standardProps);
consumerProps.putAll(secureProps);
FlinkKafkaConsumerBase<Tuple2<Long, String>> defaultTopicSource = kafkaServer.getConsumer(defaultTopic, deserSchema, consumerProps);
FlinkKafkaConsumerBase<Tuple2<Long, String>> dynamicTopicSource = kafkaServer.getConsumer(dynamicTopic, deserSchema, consumerProps);
env.addSource(defaultTopicSource).setParallelism(defaultTopicPartitions).map(new PartitionValidatingMapper(defaultTopicPartitions)).setParallelism(defaultTopicPartitions).addSink(new PartitionValidatingSink(defaultTopicPartitions)).setParallelism(1);
env.addSource(dynamicTopicSource).setParallelism(dynamicTopicPartitions).map(new PartitionValidatingMapper(dynamicTopicPartitions)).setParallelism(dynamicTopicPartitions).addSink(new PartitionValidatingSink(dynamicTopicPartitions)).setParallelism(1);
tryExecute(env, "custom partitioning test");
deleteTestTopic(defaultTopic);
deleteTestTopic(dynamicTopic);
LOG.info("Finished KafkaProducerITCase.testCustomPartitioning()");
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
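The CustomPartitioner and CustomKeyedSerializationSchemaWrapper used above are defined elsewhere in KafkaProducerTestBase and are not shown on this page. Below is a hedged sketch of a partitioner in the same spirit: it validates the expected per-topic partition count it was constructed with and spreads records across those partitions. The class name and the modulo scheme are assumptions for illustration only.
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;

import java.util.Map;

public class ExpectedPartitionsPartitioner extends FlinkKafkaPartitioner<Tuple2<Long, String>> {

    private static final long serialVersionUID = 1L;

    private final Map<String, Integer> expectedTopicsToNumPartitions;

    public ExpectedPartitionsPartitioner(Map<String, Integer> expectedTopicsToNumPartitions) {
        this.expectedTopicsToNumPartitions = expectedTopicsToNumPartitions;
    }

    @Override
    public int partition(Tuple2<Long, String> record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
        // fail fast if the topic's partition count is not what the test expects
        Integer expected = expectedTopicsToNumPartitions.get(targetTopic);
        if (expected == null || expected != partitions.length) {
            throw new IllegalStateException(
                    "Unexpected partition count for topic " + targetTopic + ": " + partitions.length);
        }
        // spread records across all partitions by the record's counter value
        return (int) (record.f0 % partitions.length);
    }
}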