Search in sources :

Example 11 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method runOneToOneExactlyOnceTest.

/**
 * Tests proper consumption with a 1:1 correspondence between Kafka partitions and Flink
 * sources.
 */
public void runOneToOneExactlyOnceTest() throws Exception {
    final String topic = "oneToOneTopic";
    final int parallelism = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;
    createTestTopic(topic, parallelism, 1);
    DataGenerators.generateRandomizedIntegerSequence(StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic, parallelism, numElementsPerPartition, true);
    // run the topology that fails and recovers
    DeserializationSchema<Integer> schema = new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    getStream(env, topic, schema, props)
            .map(new PartitionValidatingMapper(parallelism, 1))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);
    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "One-to-one exactly once test");
    deleteTestTopic(topic);
}
Also used : TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) ValidatingExactlyOnceSink(org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink) PartitionValidatingMapper(org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint)
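
As a quick orientation before the next example: TypeInformationSerializationSchema implements both SerializationSchema and DeserializationSchema on top of Flink's own type serializers, which is why the test above can hand it straight to the consumer as a DeserializationSchema<Integer>. A minimal stand-alone sketch of that round trip (the class name and sample value are illustrative, not taken from the test):

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;

public class IntSchemaRoundTrip {

    public static void main(String[] args) throws Exception {
        // One schema object serves both directions: serialize() and deserialize().
        TypeInformationSerializationSchema<Integer> schema =
                new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

        byte[] bytes = schema.serialize(42);      // Flink's internal binary format for Integer
        Integer back = schema.deserialize(bytes); // the same schema reads it back
        System.out.println(back);                 // prints 42
    }
}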

Example 12 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method writeSequence.

protected String writeSequence(String baseTopicName, final int numElements, final int parallelism, final int replicationFactor) throws Exception {
    LOG.info("\n===================================\n" + "== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" + "===================================");
    final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
    });
    final SerializationSchema<Tuple2<Integer, Integer>> serSchema = new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
            new KafkaDeserializationSchemaWrapper<>(
                    new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
    final int maxNumAttempts = 10;
    for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {
        final String topicName = baseTopicName + '-' + attempt;
        LOG.info("Writing attempt #" + attempt);
        // -------- Write the Sequence --------
        createTestTopic(topicName, parallelism, replicationFactor);
        StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

            // volatile: cancel() flips this flag from a different thread than run()
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                int cnt = 0;
                int partition = getRuntimeContext().getIndexOfThisSubtask();
                while (running && cnt < numElements) {
                    ctx.collect(new Tuple2<>(partition, cnt));
                    cnt++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).setParallelism(parallelism);
        // the producer must not produce duplicates
        Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
        producerProperties.setProperty("retries", "0");
        producerProperties.putAll(secureProps);
        kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism)).setParallelism(parallelism);
        try {
            writeEnv.execute("Write sequence");
        } catch (Exception e) {
            LOG.error("Write attempt failed, trying again", e);
            deleteTestTopic(topicName);
            waitUntilNoJobIsRunning(client);
            continue;
        }
        LOG.info("Finished writing sequence");
        // -------- Validate the Sequence --------
        // we need to validate the sequence, because Kafka's producers are not exactly-once
        LOG.info("Validating sequence");
        waitUntilNoJobIsRunning(client);
        if (validateSequence(topicName, parallelism, deserSchema, numElements)) {
            // everything is good!
            return topicName;
        } else {
            deleteTestTopic(topicName);
        // fall through the loop
        }
    }
    throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
Also used : KafkaDeserializationSchemaWrapper(org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) NotLeaderForPartitionException(org.apache.kafka.common.errors.NotLeaderForPartitionException) SuccessException(org.apache.flink.test.util.SuccessException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) Tuple2FlinkPartitioner(org.apache.flink.streaming.connectors.kafka.testutils.Tuple2FlinkPartitioner) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
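
The key detail in writeSequence is how the Tuple2<Integer, Integer> type information is captured: generics are erased at runtime, so an anonymous TypeHint subclass is used to rebuild the full type before constructing the schema. A minimal sketch under the same assumptions (the class name and sample tuple are hypothetical):

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;

public class TupleSchemaSketch {

    public static void main(String[] args) throws Exception {
        // The anonymous TypeHint subclass preserves the generic parameters.
        TypeInformation<Tuple2<Integer, Integer>> resultType =
                TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

        TypeInformationSerializationSchema<Tuple2<Integer, Integer>> schema =
                new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

        byte[] bytes = schema.serialize(new Tuple2<>(3, 17));
        Tuple2<Integer, Integer> back = schema.deserialize(bytes);
        System.out.println(back); // prints (3,17)
    }
}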

Example 13 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method readSequence.

// ------------------------------------------------------------------------
// Reading / writing test data sets
// ------------------------------------------------------------------------
/**
 * Runs a job using the provided environment to read a sequence of records from a single Kafka
 * topic. The expected starting offset and total read value count can be specified
 * individually for each partition. The job is considered successful only if all partition
 * read results match the start offset and value count criteria.
 */
protected void readSequence(
        final StreamExecutionEnvironment env,
        final StartupMode startupMode,
        final Map<KafkaTopicPartition, Long> specificStartupOffsets,
        final Long startupTimestamp,
        final Properties cc,
        final String topicName,
        final Map<Integer, Tuple2<Integer, Integer>> partitionsToValuesCountAndStartOffset)
        throws Exception {
    final int sourceParallelism = partitionsToValuesCountAndStartOffset.keySet().size();
    int finalCountTmp = 0;
    for (Map.Entry<Integer, Tuple2<Integer, Integer>> valuesCountAndStartOffset : partitionsToValuesCountAndStartOffset.entrySet()) {
        finalCountTmp += valuesCountAndStartOffset.getValue().f0;
    }
    final int finalCount = finalCountTmp;
    final TypeInformation<Tuple2<Integer, Integer>> intIntTupleType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
    });
    final TypeInformationSerializationSchema<Tuple2<Integer, Integer>> deser = new TypeInformationSerializationSchema<>(intIntTupleType, env.getConfig());
    // create the consumer
    cc.putAll(secureProps);
    DataStreamSource<Tuple2<Integer, Integer>> source;
    if (useNewSource) {
        KafkaSourceBuilder<Tuple2<Integer, Integer>> sourceBuilder = kafkaServer.getSourceBuilder(topicName, deser, cc);
        Map<TopicPartition, Long> startOffsets = new HashMap<>();
        if (specificStartupOffsets != null) {
            specificStartupOffsets.forEach((ktp, offset) -> startOffsets.put(new TopicPartition(ktp.getTopic(), ktp.getPartition()), offset));
        }
        setKafkaSourceOffset(startupMode, sourceBuilder, startOffsets, startupTimestamp);
        source = env.fromSource(sourceBuilder.build(), WatermarkStrategy.noWatermarks(), "KafkaSource");
    } else {
        FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deser, cc);
        setKafkaConsumerOffset(startupMode, consumer, specificStartupOffsets, startupTimestamp);
        source = env.addSource(consumer);
    }
    source.setParallelism(sourceParallelism)
            .map(new ThrottledMapper<>(20))
            .setParallelism(sourceParallelism)
            .flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() {

        private HashMap<Integer, BitSet> partitionsToValueCheck;

        private int count = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            partitionsToValueCheck = new HashMap<>();
            for (Integer partition : partitionsToValuesCountAndStartOffset.keySet()) {
                partitionsToValueCheck.put(partition, new BitSet());
            }
        }

        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception {
            int partition = value.f0;
            int val = value.f1;
            BitSet bitSet = partitionsToValueCheck.get(partition);
            if (bitSet == null) {
                throw new RuntimeException("Got a record from an unknown partition");
            } else {
                bitSet.set(val - partitionsToValuesCountAndStartOffset.get(partition).f1);
            }
            count++;
            LOG.info("Received message {}, total {} messages", value, count);
            // verify if we've seen everything
            if (count == finalCount) {
                for (Map.Entry<Integer, BitSet> partitionsToValueCheck : this.partitionsToValueCheck.entrySet()) {
                    BitSet check = partitionsToValueCheck.getValue();
                    int expectedValueCount = partitionsToValuesCountAndStartOffset.get(partitionsToValueCheck.getKey()).f0;
                    if (check.cardinality() != expectedValueCount) {
                        throw new RuntimeException("Expected cardinality to be " + expectedValueCount + ", but was " + check.cardinality());
                    } else if (check.nextClearBit(0) != expectedValueCount) {
                        throw new RuntimeException("Expected next clear bit to be " + expectedValueCount + ", but was " + check.cardinality());
                    }
                }
                // test has passed
                throw new SuccessException();
            }
        }
    }).setParallelism(1);
    tryExecute(env, "Read data from Kafka");
    LOG.info("Successfully read sequence for verification");
}
Also used : Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) ThrottledMapper(org.apache.flink.streaming.connectors.kafka.testutils.ThrottledMapper) Collector(org.apache.flink.util.Collector) BitSet(java.util.BitSet) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) Tuple2(org.apache.flink.api.java.tuple.Tuple2) TopicPartition(org.apache.kafka.common.TopicPartition) KafkaTopicPartition(org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) SuccessException(org.apache.flink.test.util.SuccessException) Map(java.util.Map) HashMap(java.util.HashMap)
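
The validation logic in the flatMap above keeps one BitSet per partition and declares success only when each BitSet is both full (cardinality) and gap-free (nextClearBit). The same idea in isolation, with hypothetical partition and element counts and no Flink dependencies:

import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

public class BitSetCompletenessCheck {

    public static void main(String[] args) {
        final int partitions = 2;
        final int expectedPerPartition = 5;

        // Simulate receiving values 0..4 from every partition (offset-adjusted in the real test).
        Map<Integer, BitSet> partitionsToValueCheck = new HashMap<>();
        for (int partition = 0; partition < partitions; partition++) {
            BitSet bits = new BitSet();
            for (int value = 0; value < expectedPerPartition; value++) {
                bits.set(value);
            }
            partitionsToValueCheck.put(partition, bits);
        }

        for (Map.Entry<Integer, BitSet> entry : partitionsToValueCheck.entrySet()) {
            BitSet check = entry.getValue();
            // Full: exactly the expected number of distinct values was seen.
            boolean full = check.cardinality() == expectedPerPartition;
            // Gap-free: the first unset bit sits right after the expected range.
            boolean gapFree = check.nextClearBit(0) == expectedPerPartition;
            System.out.println("partition " + entry.getKey() + " valid: " + (full && gapFree));
        }
    }
}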

Example 14 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaProducerTestBase method testExactlyOnce.

/**
 * This test configures the KafkaProducer to automatically flush the data and fails the
 * broker to check that records flushed since the last checkpoint are not duplicated.
 */
protected void testExactlyOnce(boolean regularSink, int sinksCount) throws Exception {
    final String topic = (regularSink ? "exactlyOnceTopicRegularSink" : "exactlyTopicCustomOperator") + sinksCount;
    final int partition = 0;
    final int numElements = 1000;
    final int failAfterElements = 333;
    for (int i = 0; i < sinksCount; i++) {
        createTestTopic(topic + i, 1, 1);
    }
    TypeInformationSerializationSchema<Integer> schema = new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    Properties properties = new Properties();
    properties.putAll(standardProps);
    properties.putAll(secureProps);
    // process exactly failAfterElements elements, then shut down the Kafka broker and fail
    // the application
    List<Integer> expectedElements = getIntegersSequence(numElements);
    DataStream<Integer> inputStream = env.addSource(new IntegerSource(numElements)).map(new FailingIdentityMapper<Integer>(failAfterElements));
    for (int i = 0; i < sinksCount; i++) {
        FlinkKafkaPartitioner<Integer> partitioner = new FlinkKafkaPartitioner<Integer>() {

            @Override
            public int partition(Integer record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
                return partition;
            }
        };
        if (regularSink) {
            StreamSink<Integer> kafkaSink = kafkaServer.getProducerSink(topic + i, schema, properties, partitioner);
            inputStream.addSink(kafkaSink.getUserFunction());
        } else {
            kafkaServer.produceIntoKafka(inputStream, topic + i, schema, properties, partitioner);
        }
    }
    FailingIdentityMapper.failedBefore = false;
    TestUtils.tryExecute(env, "Exactly once test");
    for (int i = 0; i < sinksCount; i++) {
        // assert that before the failure we successfully snapshotted/flushed all expected elements
        assertExactlyOnceForTopic(properties, topic + i, expectedElements);
        deleteTestTopic(topic + i);
    }
}
Also used : ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) FlinkKafkaPartitioner(org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) IntegerSource(org.apache.flink.streaming.connectors.kafka.testutils.IntegerSource)
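
The anonymous FlinkKafkaPartitioner in testExactlyOnce pins every record to partition 0 so the exactly-once assertion can inspect a single partition. For contrast, a minimal sketch of a custom partitioner that spreads records by their hash; the class name is hypothetical and not part of the test:

import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;

// Hypothetical example: route each record by its hash so equal values
// always land in the same Kafka partition of the target topic.
public class HashFlinkKafkaPartitioner<T> extends FlinkKafkaPartitioner<T> {

    @Override
    public int partition(T record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
        // 'partitions' holds the ids of the target topic's partitions.
        return partitions[Math.floorMod(record.hashCode(), partitions.length)];
    }
}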

Aggregations

TypeInformationSerializationSchema (org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) 14
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint) 13
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 13
Properties (java.util.Properties) 11
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig) 9
CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString) 9
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 8
SuccessException (org.apache.flink.test.util.SuccessException) 8
IOException (java.io.IOException) 6
ProgramInvocationException (org.apache.flink.client.program.ProgramInvocationException) 6
JobExecutionException (org.apache.flink.runtime.client.JobExecutionException) 6
RetryOnException (org.apache.flink.testutils.junit.RetryOnException) 6
NotLeaderForPartitionException (org.apache.kafka.common.errors.NotLeaderForPartitionException) 6
JobCancellationException (org.apache.flink.runtime.client.JobCancellationException) 5
TimeoutException (org.apache.kafka.common.errors.TimeoutException) 5
PartitionValidatingMapper (org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper) 4
ValidatingExactlyOnceSink (org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink) 4
KafkaDeserializationSchemaWrapper (org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper) 3
BitSet (java.util.BitSet) 2
HashMap (java.util.HashMap) 2