use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method runOneToOneExactlyOnceTest.
/**
 * Tests proper consumption when there is a 1:1 correspondence between Kafka partitions and
 * Flink sources.
 */
public void runOneToOneExactlyOnceTest() throws Exception {
    final String topic = "oneToOneTopic";
    final int parallelism = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    createTestTopic(topic, parallelism, 1);

    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer, topic, parallelism, numElementsPerPartition, true);

    // run the topology that fails and recovers
    DeserializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    getStream(env, topic, schema, props)
            .map(new PartitionValidatingMapper(parallelism, 1))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "One-to-one exactly once test");

    deleteTestTopic(topic);
}
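The schema in this test is built from BasicTypeInfo.INT_TYPE_INFO plus an ExecutionConfig and handed to the Kafka source. Since TypeInformationSerializationSchema implements both SerializationSchema and DeserializationSchema, the same instance can round-trip a value through Flink's own serializer stack. A minimal standalone sketch of that round trip (the class name IntRoundTrip and the sample value 42 are illustrative, not part of the test base):

import java.io.IOException;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;

public class IntRoundTrip {
    public static void main(String[] args) throws IOException {
        // same constructor call as in the test: type information + execution config
        TypeInformationSerializationSchema<Integer> schema =
                new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

        byte[] bytes = schema.serialize(42);       // SerializationSchema side
        Integer back = schema.deserialize(bytes);  // DeserializationSchema side

        System.out.println(back); // 42
    }
}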
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method writeSequence.
protected String writeSequence(String baseTopicName, final int numElements, final int parallelism, final int replicationFactor) throws Exception {
    LOG.info(
            "\n===================================\n"
                    + "== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n"
                    + "===================================");

    final TypeInformation<Tuple2<Integer, Integer>> resultType =
            TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

    final SerializationSchema<Tuple2<Integer, Integer>> serSchema =
            new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
            new KafkaDeserializationSchemaWrapper<>(
                    new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

    final int maxNumAttempts = 10;

    for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {
        final String topicName = baseTopicName + '-' + attempt;
        LOG.info("Writing attempt #" + attempt);

        // -------- Write the Sequence --------

        createTestTopic(topicName, parallelism, replicationFactor);

        StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());

        DataStream<Tuple2<Integer, Integer>> stream =
                writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {
                    private boolean running = true;

                    @Override
                    public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                        int cnt = 0;
                        int partition = getRuntimeContext().getIndexOfThisSubtask();
                        while (running && cnt < numElements) {
                            ctx.collect(new Tuple2<>(partition, cnt));
                            cnt++;
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                }).setParallelism(parallelism);

        // the producer must not produce duplicates
        Properties producerProperties =
                FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
        producerProperties.setProperty("retries", "0");
        producerProperties.putAll(secureProps);

        kafkaServer
                .produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
                .setParallelism(parallelism);

        try {
            writeEnv.execute("Write sequence");
        } catch (Exception e) {
            LOG.error("Write attempt failed, trying again", e);
            deleteTestTopic(topicName);
            waitUntilNoJobIsRunning(client);
            continue;
        }

        LOG.info("Finished writing sequence");

        // -------- Validate the Sequence --------
        // we need to validate the sequence, because kafka's producers are not exactly once
        LOG.info("Validating sequence");

        waitUntilNoJobIsRunning(client);

        if (validateSequence(topicName, parallelism, deserSchema, numElements)) {
            // everything is good!
            return topicName;
        } else {
            deleteTestTopic(topicName);
            // fall through the loop
        }
    }

    throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
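The write/validate loop above works because serSchema and deserSchema are two TypeInformationSerializationSchema instances created from the same TypeInformation, so the bytes written by the producer can be decoded again during validation. A rough standalone sketch of that agreement for the Tuple2<Integer, Integer> records (the class name Tuple2RoundTrip and the sample values are illustrative):

import java.io.IOException;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;

public class Tuple2RoundTrip {
    public static void main(String[] args) throws IOException {
        // the anonymous TypeHint captures the generic parameters that would otherwise be erased
        TypeInformation<Tuple2<Integer, Integer>> resultType =
                TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

        TypeInformationSerializationSchema<Tuple2<Integer, Integer>> writerSide =
                new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
        TypeInformationSerializationSchema<Tuple2<Integer, Integer>> readerSide =
                new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

        byte[] bytes = writerSide.serialize(new Tuple2<>(3, 17));      // what the producer writes
        Tuple2<Integer, Integer> back = readerSide.deserialize(bytes); // what the validator reads

        System.out.println(back); // (3,17)
    }
}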
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaConsumerTestBase method readSequence.
// ------------------------------------------------------------------------
// Reading writing test data sets
// ------------------------------------------------------------------------
/**
 * Runs a job using the provided environment to read a sequence of records from a single Kafka
 * topic. The expected starting offset and total number of values to read can be specified
 * individually for each partition. The job is considered successful only if every partition's
 * read result matches its start offset and value count criteria.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Long startupTimestamp,
        final Properties cc,
        final String topicName,
        final Map<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset) throws Exception {
    final int sourceParallelism = partitionsToValuesCountAndStartOffset.keySet().size();

    int finalCountTmp = 0;
    for (Map.Entry<Integer, Tuple2<Integer, Integer>> valuesCountAndStartOffset : partitionsToValuesCountAndStartOffset.entrySet()) {
        finalCountTmp += valuesCountAndStartOffset.getValue().f0;
    }
    final int finalCount = finalCountTmp;

    final TypeInformation<Tuple2<Integer, Integer>> intIntTupleType =
            TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

    final TypeInformationSerializationSchema<Tuple2<Integer, Integer>> deser =
            new TypeInformationSerializationSchema<>(intIntTupleType, env.getConfig());

    // create the consumer
    cc.putAll(secureProps);
    DataStreamSource<Tuple2<Integer, Integer>> source;
    if (useNewSource) {
        KafkaSourceBuilder<Tuple2<Integer, Integer>> sourceBuilder =
                kafkaServer.getSourceBuilder(topicName, deser, cc);
        Map<TopicPartition, Long> startOffsets = new HashMap<>();
        if (specificStartupOffsets != null) {
            specificStartupOffsets.forEach(
                    (ktp, offset) ->
                            startOffsets.put(new TopicPartition(ktp.getTopic(), ktp.getPartition()), offset));
        }
        setKafkaSourceOffset(startupMode, sourceBuilder, startOffsets, startupTimestamp);
        source = env.fromSource(sourceBuilder.build(), WatermarkStrategy.noWatermarks(), "KafkaSource");
    } else {
        FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer =
                kafkaServer.getConsumer(topicName, deser, cc);
        setKafkaConsumerOffset(startupMode, consumer, specificStartupOffsets, startupTimestamp);
        source = env.addSource(consumer);
    }

    source.setParallelism(sourceParallelism)
            .map(new ThrottledMapper<>(20))
            .setParallelism(sourceParallelism)
            .flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() {
                private HashMap<Integer, BitSet> partitionsToValueCheck;
                private int count = 0;

                @Override
                public void open(Configuration parameters) throws Exception {
                    partitionsToValueCheck = new HashMap<>();
                    for (Integer partition : partitionsToValuesCountAndStartOffset.keySet()) {
                        partitionsToValueCheck.put(partition, new BitSet());
                    }
                }

                @Override
                public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception {
                    int partition = value.f0;
                    int val = value.f1;
                    BitSet bitSet = partitionsToValueCheck.get(partition);
                    if (bitSet == null) {
                        throw new RuntimeException("Got a record from an unknown partition");
                    } else {
                        bitSet.set(val - partitionsToValuesCountAndStartOffset.get(partition).f1);
                    }
                    count++;
                    LOG.info("Received message {}, total {} messages", value, count);

                    // verify if we've seen everything
                    if (count == finalCount) {
                        for (Map.Entry<Integer, BitSet> partitionsToValueCheck : this.partitionsToValueCheck.entrySet()) {
                            BitSet check = partitionsToValueCheck.getValue();
                            int expectedValueCount = partitionsToValuesCountAndStartOffset.get(partitionsToValueCheck.getKey()).f0;
                            if (check.cardinality() != expectedValueCount) {
                                throw new RuntimeException("Expected cardinality to be " + expectedValueCount + ", but was " + check.cardinality());
                            } else if (check.nextClearBit(0) != expectedValueCount) {
                                throw new RuntimeException("Expected next clear bit to be " + expectedValueCount + ", but was " + check.nextClearBit(0));
                            }
                        }
                        // test has passed
                        throw new SuccessException();
                    }
                }
            })
            .setParallelism(1);

    tryExecute(env, "Read data from Kafka");
    LOG.info("Successfully read sequence for verification");
}
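The flatMap validator above relies on two BitSet properties: cardinality() confirms that exactly the expected number of distinct values arrived for a partition, and nextClearBit(0) confirms there is no gap below that count. A tiny JDK-only illustration of those two checks (all names and values here are illustrative, not taken from the test base):

import java.util.BitSet;

public class BitSetCompletenessCheck {
    public static void main(String[] args) {
        int expectedValueCount = 5;
        BitSet check = new BitSet();

        // mark values 0..4 as seen, in arbitrary order (duplicates would not inflate the count)
        for (int v : new int[] {3, 0, 4, 1, 2}) {
            check.set(v);
        }

        System.out.println(check.cardinality() == expectedValueCount);   // true: 5 distinct values seen
        System.out.println(check.nextClearBit(0) == expectedValueCount); // true: no gap before index 5
    }
}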
use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.
the class KafkaProducerTestBase method testExactlyOnce.
/**
 * This test configures the KafkaProducer to automatically flush the data, then fails the
 * broker to check that records flushed since the last checkpoint are not duplicated.
 */
protected void testExactlyOnce(boolean regularSink, int sinksCount) throws Exception {
    final String topic =
            (regularSink ? "exactlyOnceTopicRegularSink" : "exactlyTopicCustomOperator") + sinksCount;
    final int partition = 0;
    final int numElements = 1000;
    final int failAfterElements = 333;

    for (int i = 0; i < sinksCount; i++) {
        createTestTopic(topic + i, 1, 1);
    }

    TypeInformationSerializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

    Properties properties = new Properties();
    properties.putAll(standardProps);
    properties.putAll(secureProps);

    // process exactly failAfterElements number of elements and then shutdown Kafka broker and
    // fail application
    List<Integer> expectedElements = getIntegersSequence(numElements);

    DataStream<Integer> inputStream =
            env.addSource(new IntegerSource(numElements))
                    .map(new FailingIdentityMapper<Integer>(failAfterElements));

    for (int i = 0; i < sinksCount; i++) {
        FlinkKafkaPartitioner<Integer> partitioner =
                new FlinkKafkaPartitioner<Integer>() {
                    @Override
                    public int partition(Integer record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
                        return partition;
                    }
                };

        if (regularSink) {
            StreamSink<Integer> kafkaSink =
                    kafkaServer.getProducerSink(topic + i, schema, properties, partitioner);
            inputStream.addSink(kafkaSink.getUserFunction());
        } else {
            kafkaServer.produceIntoKafka(inputStream, topic + i, schema, properties, partitioner);
        }
    }

    FailingIdentityMapper.failedBefore = false;
    TestUtils.tryExecute(env, "Exactly once test");

    for (int i = 0; i < sinksCount; i++) {
        // assert that before failure we successfully snapshot/flushed all expected elements
        assertExactlyOnceForTopic(properties, topic + i, expectedElements);
        deleteTestTopic(topic + i);
    }
}
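assertExactlyOnceForTopic is part of the test base and not shown here; it asserts that the topic contains exactly the expected elements. A rough sketch of the kind of read-back check this implies, assuming a plain Kafka consumer with byte-array deserializers, a reachable broker at localhost:9092, and an illustrative topic name (all of these are assumptions, not details from the test base):

import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class TopicContentCheck {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092"); // assumption: local test broker
        props.put("group.id", "verification");
        props.put("auto.offset.reset", "earliest");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

        // the same schema the job used for writing is reused to decode the raw record bytes
        TypeInformationSerializationSchema<Integer> schema =
                new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

        String topicToCheck = "some-test-topic"; // illustrative: stands for topic + i in the test
        List<Integer> actual = new ArrayList<>();
        try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList(topicToCheck));
            for (ConsumerRecord<byte[], byte[]> record : consumer.poll(Duration.ofSeconds(5))) {
                actual.add(schema.deserialize(record.value()));
            }
        }

        // a duplicated or missing element would make 'actual' differ from the expected sequence
        System.out.println(actual.size());
    }
}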