Use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
Class: KafkaConsumerTestBase, method: writeAppendSequence.
protected void writeAppendSequence(
        String topicName,
        final int originalNumElements,
        final int numElementsToAppend,
        final int parallelism)
        throws Exception {
    LOG.info(
            "\n===================================\n"
                    + "== Appending sequence of "
                    + numElementsToAppend
                    + " into "
                    + topicName
                    + "===================================");
    final TypeInformation<Tuple2<Integer, Integer>> resultType =
            TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});
    final SerializationSchema<Tuple2<Integer, Integer>> serSchema =
            new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
            new KafkaDeserializationSchemaWrapper<>(
                    new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

    // -------- Write the append sequence --------
    StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    DataStream<Tuple2<Integer, Integer>> stream =
            writeEnv.addSource(
                            new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

                                private boolean running = true;

                                @Override
                                public void run(SourceContext<Tuple2<Integer, Integer>> ctx)
                                        throws Exception {
                                    int cnt = originalNumElements;
                                    int partition = getRuntimeContext().getIndexOfThisSubtask();
                                    while (running
                                            && cnt < numElementsToAppend + originalNumElements) {
                                        ctx.collect(new Tuple2<>(partition, cnt));
                                        cnt++;
                                    }
                                }

                                @Override
                                public void cancel() {
                                    running = false;
                                }
                            })
                    .setParallelism(parallelism);

    // the producer must not produce duplicates
    Properties producerProperties =
            FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "0");
    producerProperties.putAll(secureProps);
    kafkaServer
            .produceIntoKafka(
                    stream,
                    topicName,
                    serSchema,
                    producerProperties,
                    new Tuple2FlinkPartitioner(parallelism))
            .setParallelism(parallelism);

    try {
        writeEnv.execute("Write sequence");
    } catch (Exception e) {
        throw new Exception("Failed to append sequence to Kafka; append job failed.", e);
    }
    LOG.info("Finished writing append sequence");

    // we need to validate the sequence, because kafka's producers are not exactly once
    LOG.info("Validating sequence");
    while (!getRunningJobs(client).isEmpty()) {
        Thread.sleep(50);
    }
    if (!validateSequence(
            topicName, parallelism, deserSchema, originalNumElements + numElementsToAppend)) {
        throw new Exception("Could not append a valid sequence to Kafka.");
    }
}
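The snippet builds the same TypeInformationSerializationSchema twice: once as the SerializationSchema handed to the Kafka producer and once wrapped in a KafkaDeserializationSchemaWrapper for validateSequence. The standalone sketch below (not part of the test; the class name and the record values are invented for illustration) shows how such a schema round-trips a Tuple2<Integer, Integer> through serialize and deserialize.

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;

public class TupleSchemaRoundTrip {

    public static void main(String[] args) throws Exception {
        TypeInformation<Tuple2<Integer, Integer>> resultType =
                TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

        // One instance implements both SerializationSchema and DeserializationSchema,
        // so it can back the producing job as well as the validating consumer.
        TypeInformationSerializationSchema<Tuple2<Integer, Integer>> schema =
                new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

        byte[] bytes = schema.serialize(new Tuple2<>(2, 42)); // (partition, element)
        Tuple2<Integer, Integer> restored = schema.deserialize(bytes);

        System.out.println(restored); // expected to print (2,42)
    }
}

Because the schema derives its TypeSerializer from the TypeInformation in the constructor, a direct round trip like this should not need any additional setup.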
Use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
Class: KafkaConsumerTestBase, method: runStartFromLatestOffsets.
/**
* This test ensures that when explicitly set to start from latest record, the consumer ignores
* the "auto.offset.reset" behaviour as well as any committed group offsets in Kafka.
*/
public void runStartFromLatestOffsets() throws Exception {
    // 50 records written to each of 3 partitions before launching a latest-starting consuming job
    final int parallelism = 3;
    final int recordsInEachPartition = 50;
    // each partition will be written an extra 200 records
    final int extraRecordsInEachPartition = 200;

    // all already existing data in the topic, before the consuming topology has started, should be ignored
    final String topicName =
            writeSequence("testStartFromLatestOffsetsTopic", recordsInEachPartition, parallelism, 1);

    // the committed offsets should be ignored
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
    kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
    kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);

    // job names for the topologies for writing and consuming the extra records
    final String consumeExtraRecordsJobName = "Consume Extra Records Job";
    final String writeExtraRecordsJobName = "Write Extra Records Job";

    // serialization / deserialization schemas for writing and consuming the extra records
    final TypeInformation<Tuple2<Integer, Integer>> resultType =
            TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});
    final SerializationSchema<Tuple2<Integer, Integer>> serSchema =
            new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
            new KafkaDeserializationSchemaWrapper<>(
                    new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

    // setup and run the latest-consuming job
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    final Properties readProps = new Properties();
    readProps.putAll(standardProps);
    // this should be ignored
    readProps.setProperty("auto.offset.reset", "earliest");

    DataStreamSource<Tuple2<Integer, Integer>> stream;
    if (useNewSource) {
        KafkaSource<Tuple2<Integer, Integer>> source =
                kafkaServer
                        .getSourceBuilder(topicName, deserSchema, readProps)
                        .setStartingOffsets(OffsetsInitializer.latest())
                        .build();
        stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "KafkaSource");
    } else {
        FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> latestReadingConsumer =
                kafkaServer.getConsumer(topicName, deserSchema, readProps);
        latestReadingConsumer.setStartFromLatest();
        stream = env.addSource(latestReadingConsumer);
    }

    stream.setParallelism(parallelism)
            .flatMap(
                    new FlatMapFunction<Tuple2<Integer, Integer>, Object>() {
                        @Override
                        public void flatMap(
                                Tuple2<Integer, Integer> value, Collector<Object> out)
                                throws Exception {
                            if (value.f1 - recordsInEachPartition < 0) {
                                throw new RuntimeException(
                                        "test failed; consumed a record that was previously written: "
                                                + value);
                            }
                        }
                    })
            .setParallelism(1)
            .addSink(new DiscardingSink<>());

    JobGraph jobGraph = StreamingJobGraphGenerator.createJobGraph(env.getStreamGraph());
    final JobID consumeJobId = jobGraph.getJobID();

    final AtomicReference<Throwable> error = new AtomicReference<>();
    Thread consumeThread =
            new Thread(
                    () -> {
                        try {
                            submitJobAndWaitForResult(client, jobGraph, getClass().getClassLoader());
                        } catch (Throwable t) {
                            if (!ExceptionUtils.findThrowable(t, JobCancellationException.class)
                                    .isPresent()) {
                                error.set(t);
                            }
                        }
                    });
    consumeThread.start();

    // wait until the consuming job has started, to be extra safe
    waitUntilJobIsRunning(client);

    // setup the extra records writing job
    final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.getExecutionEnvironment();
    env2.setParallelism(parallelism);
    DataStream<Tuple2<Integer, Integer>> extraRecordsStream =
            env2.addSource(
                    new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

                        private boolean running = true;

                        @Override
                        public void run(SourceContext<Tuple2<Integer, Integer>> ctx)
                                throws Exception {
                            // the extra records should start from the last written value
                            int count = recordsInEachPartition;
                            int partition = getRuntimeContext().getIndexOfThisSubtask();
                            while (running
                                    && count < recordsInEachPartition + extraRecordsInEachPartition) {
                                ctx.collect(new Tuple2<>(partition, count));
                                count++;
                            }
                        }

                        @Override
                        public void cancel() {
                            running = false;
                        }
                    });
    kafkaServer.produceIntoKafka(extraRecordsStream, topicName, serSchema, readProps, null);

    try {
        env2.execute(writeExtraRecordsJobName);
    } catch (Exception e) {
        throw new RuntimeException("Writing extra records failed", e);
    }

    // cancel the consume job after all extra records are written
    client.cancel(consumeJobId).get();
    consumeThread.join();

    kafkaOffsetHandler.close();
    deleteTestTopic(topicName);

    // check whether the consuming thread threw any test errors;
    // test will fail here if the consume job had incorrectly read any records other than the extra records
    final Throwable consumerError = error.get();
    if (consumerError != null) {
        throw new Exception("Exception in the consuming thread", consumerError);
    }
}
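When useNewSource is enabled, the test obtains its source from the kafkaServer test harness via getSourceBuilder(...). Outside the harness, the same start-from-latest configuration can be expressed with the public KafkaSourceBuilder; the sketch below is only an illustration under that assumption. The broker address and group id are placeholder values, and a value-only TypeInformationSerializationSchema stands in for the wrapped KafkaDeserializationSchema used by the test.

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class StartFromLatestSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        KafkaSource<Integer> source =
                KafkaSource.<Integer>builder()
                        .setBootstrapServers("localhost:9092")        // placeholder broker list
                        .setTopics("testStartFromLatestOffsetsTopic")
                        .setGroupId("latest-reading-group")           // placeholder group id
                        // start from the latest offsets; committed group offsets and any
                        // "auto.offset.reset" property are not consulted
                        .setStartingOffsets(OffsetsInitializer.latest())
                        .setValueOnlyDeserializer(
                                new TypeInformationSerializationSchema<>(
                                        BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig()))
                        .build();

        DataStream<Integer> stream =
                env.fromSource(source, WatermarkStrategy.noWatermarks(), "KafkaSource");
        stream.print();
        env.execute("start-from-latest sketch");
    }
}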
Use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
Class: KafkaConsumerTestBase, method: runMultipleSourcesOnePartitionExactlyOnceTest.
/**
* Tests the proper consumption when having more Flink sources than Kafka partitions, which
* means that some Flink sources will read no partitions.
*/
public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception {
    final String topic = "manyToOneTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;
    final int parallelism = 8;

    createTestTopic(topic, numPartitions, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            numPartitions,
            numElementsPerPartition,
            true);

    // run the topology that fails and recovers
    DeserializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    // set the number of restarts to one. The failing mapper will fail once, then it's only
    // success exceptions.
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    env.setBufferTimeout(0);

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    getStream(env, topic, schema, props)
            .map(new PartitionValidatingMapper(numPartitions, 1))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements, true))
            .setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "multi-source-one-partitions exactly once test");

    deleteTestTopic(topic);
}
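In this test, the TypeInformationSerializationSchema built from BasicTypeInfo.INT_TYPE_INFO is used purely as a DeserializationSchema<Integer>, which is what lets getStream(...) produce a DataStream typed as Integer. A minimal standalone sketch of that aspect (the class name is invented for illustration):

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;

public class ProducedTypeSketch {

    public static void main(String[] args) {
        // The schema is constructed from type information plus an ExecutionConfig,
        // which controls how the underlying TypeSerializer is created.
        TypeInformationSerializationSchema<Integer> schema =
                new TypeInformationSerializationSchema<>(
                        BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

        // Usable wherever a DeserializationSchema<Integer> is expected, e.g. by the
        // Kafka consumer behind getStream(...); it also reports its produced type.
        DeserializationSchema<Integer> deserializer = schema;
        System.out.println(deserializer.getProducedType()); // the Integer type information
    }
}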
Use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
Class: KafkaConsumerTestBase, method: runBrokerFailureTest.
public void runBrokerFailureTest() throws Exception {
    final String topic = "brokerFailureTestTopic";
    final int parallelism = 2;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    createTestTopic(topic, parallelism, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            parallelism,
            numElementsPerPartition,
            true);

    // find leader to shut down
    int leaderId = kafkaServer.getLeaderToShutDown(topic);
    LOG.info("Leader to shutdown {}", leaderId);

    // run the topology (the consumers must handle the failures)
    DeserializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.enableCheckpointing(500);
    env.setRestartStrategy(RestartStrategies.noRestart());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    getStream(env, topic, schema, props)
            .map(new PartitionValidatingMapper(parallelism, 1))
            .map(new BrokerKillingMapper<Integer>(leaderId, failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);

    try {
        BrokerKillingMapper.killedLeaderBefore = false;
        tryExecute(env, "Broker failure once test");
    } finally {
        // start a new broker:
        kafkaServer.restartBroker(leaderId);
    }
}
Use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
Class: KafkaConsumerTestBase, method: runOneSourceMultiplePartitionsExactlyOnceTest.
/**
* Tests the proper consumption when having fewer Flink sources than Kafka partitions, so one
* Flink source will read multiple Kafka partitions.
*/
public void runOneSourceMultiplePartitionsExactlyOnceTest() throws Exception {
    final String topic = "oneToManyTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;
    final int parallelism = 2;

    createTestTopic(topic, numPartitions, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            numPartitions,
            numElementsPerPartition,
            true);

    // run the topology that fails and recovers
    DeserializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    getStream(env, topic, schema, props)
            .map(new PartitionValidatingMapper(numPartitions, 3))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "One-source-multi-partitions exactly once test");

    deleteTestTopic(topic);
}
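On the write side of these exactly-once tests, DataGenerators.generateRandomizedIntegerSequence uses the test harness to produce the integer sequence. As a rough, hedged sketch only: the same TypeInformationSerializationSchema could serve as the value serializer of the newer KafkaSink API (flink-connector-kafka 1.14+). This is not what the test itself does, and the broker address, element values, and transactional-id prefix below are placeholders.

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ExactlyOnceSinkSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // transactional exactly-once writes are committed on checkpoints
        env.enableCheckpointing(500);

        KafkaSink<Integer> sink =
                KafkaSink.<Integer>builder()
                        .setBootstrapServers("localhost:9092") // placeholder broker list
                        .setRecordSerializer(
                                KafkaRecordSerializationSchema.builder()
                                        .setTopic("oneToManyTopic")
                                        .setValueSerializationSchema(
                                                new TypeInformationSerializationSchema<>(
                                                        BasicTypeInfo.INT_TYPE_INFO,
                                                        new ExecutionConfig()))
                                        .build())
                        .setDeliveryGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
                        .setTransactionalIdPrefix("sketch-prefix") // placeholder prefix
                        .build();

        env.fromElements(1, 2, 3, 4, 5).sinkTo(sink);
        env.execute("exactly-once sink sketch");
    }
}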