Search in sources :

Example 6 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method writeAppendSequence.

/**
 * Appends a further run of records to an existing topic and then validates the topic contents.
 *
 * <p>Each parallel source subtask emits tuples {@code (subtaskIndex, value)} for every {@code
 * value} in {@code [originalNumElements, originalNumElements + numElementsToAppend)}. The records
 * are written with a producer configured with {@code retries=0} and a {@link
 * Tuple2FlinkPartitioner}. After the write job has finished and no job is reported as running
 * any more, the topic is validated against {@code originalNumElements + numElementsToAppend}
 * elements.
 *
 * @param topicName topic to append to
 * @param originalNumElements first value to emit, i.e. the number of elements assumed to be
 *     present per partition already
 * @param numElementsToAppend number of additional values each subtask emits
 * @param parallelism parallelism of the source and of the Kafka producer
 * @throws Exception if the append job fails or the resulting sequence does not validate
 */
protected void writeAppendSequence(String topicName, final int originalNumElements, final int numElementsToAppend, final int parallelism) throws Exception {
    // The closing rule goes on its own line so that it is not glued to the topic name.
    LOG.info("\n===================================\n" + "== Appending sequence of " + numElementsToAppend + " into " + topicName + "\n" + "===================================");
    final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
    });
    final SerializationSchema<Tuple2<Integer, Integer>> serSchema = new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KafkaDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
    // -------- Write the append sequence --------
    StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

        // volatile: cancel() may be invoked from a different thread than the one executing
        // run(), so the write to this flag must be visible to the emitting loop.
        private volatile boolean running = true;

        @Override
        public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
            int cnt = originalNumElements;
            int partition = getRuntimeContext().getIndexOfThisSubtask();
            while (running && cnt < numElementsToAppend + originalNumElements) {
                ctx.collect(new Tuple2<>(partition, cnt));
                cnt++;
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }).setParallelism(parallelism);
    // the producer must not produce duplicates
    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "0");
    producerProperties.putAll(secureProps);
    kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism)).setParallelism(parallelism);
    try {
        writeEnv.execute("Write sequence");
    } catch (Exception e) {
        throw new Exception("Failed to append sequence to Kafka; append job failed.", e);
    }
    LOG.info("Finished writing append sequence");
    // we need to validate the sequence, because kafka's producers are not exactly once
    LOG.info("Validating sequence");
    // poll until no job is reported as running any more before reading the topic back
    while (!getRunningJobs(client).isEmpty()) {
        Thread.sleep(50);
    }
    if (!validateSequence(topicName, parallelism, deserSchema, originalNumElements + numElementsToAppend)) {
        throw new Exception("Could not append a valid sequence to Kafka.");
    }
}
Also used : KafkaDeserializationSchemaWrapper(org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) NotLeaderForPartitionException(org.apache.kafka.common.errors.NotLeaderForPartitionException) SuccessException(org.apache.flink.test.util.SuccessException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) Tuple2FlinkPartitioner(org.apache.flink.streaming.connectors.kafka.testutils.Tuple2FlinkPartitioner) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Example 7 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method runStartFromLatestOffsets.

/**
 * This test ensures that when explicitly set to start from latest record, the consumer ignores
 * the "auto.offset.reset" behaviour as well as any committed group offsets in Kafka.
 *
 * <p>Steps performed:
 *
 * <ol>
 *   <li>write an initial sequence into a 3-partition topic and set committed offsets for it;
 *   <li>start a consuming job (new {@code KafkaSource} or legacy consumer, depending on {@code
 *       useNewSource}) that is configured to start from the latest offsets and fails if it sees
 *       a value below {@code recordsInEachPartition};
 *   <li>once the consuming job is running, write the extra records with a second job;
 *   <li>cancel the consuming job, clean up, and rethrow any error the consuming job reported
 *       that is not caused by a {@link JobCancellationException}.
 * </ol>
 *
 * @throws Exception if writing the records fails or the consuming job reported an error
 */
public void runStartFromLatestOffsets() throws Exception {
    // 50 records written to each of 3 partitions before launching a latest-starting consuming
    // job
    final int parallelism = 3;
    final int recordsInEachPartition = 50;
    // each partition will be written an extra 200 records
    final int extraRecordsInEachPartition = 200;
    // all already existing data in the topic, before the consuming topology has started, should
    // be ignored
    final String topicName = writeSequence("testStartFromLatestOffsetsTopic", recordsInEachPartition, parallelism, 1);
    // collects a failure of the consuming job so that it can be rethrown from the test thread
    final AtomicReference<Throwable> error = new AtomicReference<>();
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    try {
        // the committed offsets should be ignored
        kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
        kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
        kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);
        // job name for the topology writing the extra records
        final String writeExtraRecordsJobName = "Write Extra Records Job";
        // serialization / deserialization schemas for writing and consuming the extra records
        final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
        });
        final SerializationSchema<Tuple2<Integer, Integer>> serSchema = new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());
        final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KafkaDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
        // setup and run the latest-consuming job
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        final Properties readProps = new Properties();
        readProps.putAll(standardProps);
        // this should be ignored
        readProps.setProperty("auto.offset.reset", "earliest");
        DataStreamSource<Tuple2<Integer, Integer>> stream;
        if (useNewSource) {
            KafkaSource<Tuple2<Integer, Integer>> source = kafkaServer.getSourceBuilder(topicName, deserSchema, readProps).setStartingOffsets(OffsetsInitializer.latest()).build();
            stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "KafkaSource");
        } else {
            FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> latestReadingConsumer = kafkaServer.getConsumer(topicName, deserSchema, readProps);
            latestReadingConsumer.setStartFromLatest();
            stream = env.addSource(latestReadingConsumer);
        }
        stream.setParallelism(parallelism).flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Object>() {

            @Override
            public void flatMap(Tuple2<Integer, Integer> value, Collector<Object> out) throws Exception {
                // any value below the initial record count belongs to the pre-existing data
                if (value.f1 < recordsInEachPartition) {
                    throw new RuntimeException("test failed; consumed a record that was previously written: " + value);
                }
            }
        }).setParallelism(1).addSink(new DiscardingSink<>());
        JobGraph jobGraph = StreamingJobGraphGenerator.createJobGraph(env.getStreamGraph());
        final JobID consumeJobId = jobGraph.getJobID();
        Thread consumeThread = new Thread(() -> {
            try {
                submitJobAndWaitForResult(client, jobGraph, getClass().getClassLoader());
            } catch (Throwable t) {
                // the job is cancelled on purpose below; only other failures count as errors
                if (!ExceptionUtils.findThrowable(t, JobCancellationException.class).isPresent()) {
                    error.set(t);
                }
            }
        });
        consumeThread.start();
        // wait until the consuming job has started, to be extra safe
        waitUntilJobIsRunning(client);
        // setup the extra records writing job
        final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.getExecutionEnvironment();
        env2.setParallelism(parallelism);
        DataStream<Tuple2<Integer, Integer>> extraRecordsStream = env2.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

            // volatile: cancel() may be invoked from a different thread than the one
            // executing run(), so the write to this flag must be visible to the loop.
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                // the extra records should start from the last written value
                int count = recordsInEachPartition;
                int partition = getRuntimeContext().getIndexOfThisSubtask();
                while (running && count < recordsInEachPartition + extraRecordsInEachPartition) {
                    ctx.collect(new Tuple2<>(partition, count));
                    count++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        });
        kafkaServer.produceIntoKafka(extraRecordsStream, topicName, serSchema, readProps, null);
        try {
            env2.execute(writeExtraRecordsJobName);
        } catch (Exception e) {
            throw new RuntimeException("Writing extra records failed", e);
        }
        // cancel the consume job after all extra records are written
        client.cancel(consumeJobId).get();
        consumeThread.join();
    } finally {
        // release the offset handler even when one of the steps above failed
        kafkaOffsetHandler.close();
    }
    deleteTestTopic(topicName);
    // check whether the consuming thread threw any test errors;
    // test will fail here if the consume job had incorrectly read any records other than the
    // extra records
    final Throwable consumerError = error.get();
    if (consumerError != null) {
        throw new Exception("Exception in the consuming thread", consumerError);
    }
}
Also used : KafkaDeserializationSchemaWrapper(org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) AtomicReference(java.util.concurrent.atomic.AtomicReference) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) NotLeaderForPartitionException(org.apache.kafka.common.errors.NotLeaderForPartitionException) SuccessException(org.apache.flink.test.util.SuccessException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) Tuple2(org.apache.flink.api.java.tuple.Tuple2) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) JobID(org.apache.flink.api.common.JobID)

Example 8 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method runMultipleSourcesOnePartitionExactlyOnceTest.

/**
 * Verifies consumption when the job runs more parallel Flink sources (8) than the topic has
 * Kafka partitions (5), which means that some of the sources end up reading no partition.
 *
 * <p>The pipeline contains a mapper that fails after a third of one partition's elements; the
 * job is allowed a single restart and the final sink validates the received elements.
 */
public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception {
    final String topic = "manyToOneTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int parallelism = 8;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    // create the topic and fill it with the test data
    createTestTopic(topic, numPartitions, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            numPartitions,
            numElementsPerPartition,
            true);

    // run the topology that fails and recovers
    DeserializationSchema<Integer> intSchema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment consumeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    consumeEnv.enableCheckpointing(500);
    consumeEnv.setParallelism(parallelism);
    // set the number of restarts to one. The failing mapper will fail once, then it's only
    // success exceptions.
    consumeEnv.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    consumeEnv.setBufferTimeout(0);

    // secure properties are applied last and therefore win over the standard ones
    Properties consumerProps = new Properties();
    consumerProps.putAll(standardProps);
    consumerProps.putAll(secureProps);

    getStream(consumeEnv, topic, intSchema, consumerProps)
            .map(new PartitionValidatingMapper(numPartitions, 1))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements, true))
            .setParallelism(1);

    // reset the static failure marker before the job is executed
    FailingIdentityMapper.failedBefore = false;
    tryExecute(consumeEnv, "multi-source-one-partitions exactly once test");

    deleteTestTopic(topic);
}
Also used : TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) ValidatingExactlyOnceSink(org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink) PartitionValidatingMapper(org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint)

Example 9 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method runBrokerFailureTest.

/**
 * Runs a consuming topology whose pipeline contains a {@code BrokerKillingMapper} configured
 * with the id of the broker selected via {@code getLeaderToShutDown} and with a failure
 * threshold of a third of one partition's elements.
 *
 * <p>The job runs without a restart strategy, so the consumers must handle the failures. The
 * selected broker is restarted afterwards, whether or not the job execution succeeded.
 */
public void runBrokerFailureTest() throws Exception {
    final String topic = "brokerFailureTestTopic";
    final int parallelism = 2;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    // the topic gets one partition per parallel consumer
    createTestTopic(topic, parallelism, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            parallelism,
            numElementsPerPartition,
            true);

    // find leader to shut down
    final int leaderId = kafkaServer.getLeaderToShutDown(topic);
    LOG.info("Leader to shutdown {}", leaderId);

    // run the topology (the consumers must handle the failures)
    DeserializationSchema<Integer> intSchema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment consumeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    consumeEnv.setParallelism(parallelism);
    consumeEnv.enableCheckpointing(500);
    consumeEnv.setRestartStrategy(RestartStrategies.noRestart());

    // secure properties are applied last and therefore win over the standard ones
    Properties consumerProps = new Properties();
    consumerProps.putAll(standardProps);
    consumerProps.putAll(secureProps);

    getStream(consumeEnv, topic, intSchema, consumerProps)
            .map(new PartitionValidatingMapper(parallelism, 1))
            .map(new BrokerKillingMapper<Integer>(leaderId, failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);

    try {
        // reset the static marker before the job is executed
        BrokerKillingMapper.killedLeaderBefore = false;
        tryExecute(consumeEnv, "Broker failure once test");
    } finally {
        // start a new broker:
        kafkaServer.restartBroker(leaderId);
    }
}
Also used : TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) ValidatingExactlyOnceSink(org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink) PartitionValidatingMapper(org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint)

Example 10 with TypeInformationSerializationSchema

use of org.apache.flink.api.common.serialization.TypeInformationSerializationSchema in project flink by apache.

the class KafkaConsumerTestBase method runOneSourceMultiplePartitionsExactlyOnceTest.

/**
 * Verifies consumption when the job runs fewer parallel Flink sources (2) than the topic has
 * Kafka partitions (5), so that a single Flink source reads several Kafka partitions.
 *
 * <p>The pipeline contains a mapper that fails after a third of one partition's elements; the
 * job is allowed a single restart and the final sink validates the received elements.
 */
public void runOneSourceMultiplePartitionsExactlyOnceTest() throws Exception {
    final String topic = "oneToManyTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int parallelism = 2;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    // create the topic and fill it with the test data
    createTestTopic(topic, numPartitions, 1);
    DataGenerators.generateRandomizedIntegerSequence(
            StreamExecutionEnvironment.getExecutionEnvironment(),
            kafkaServer,
            topic,
            numPartitions,
            numElementsPerPartition,
            true);

    // run the topology that fails and recovers
    DeserializationSchema<Integer> intSchema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment consumeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    consumeEnv.enableCheckpointing(500);
    consumeEnv.setParallelism(parallelism);
    // exactly one restart, without delay
    consumeEnv.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

    // secure properties are applied last and therefore win over the standard ones
    Properties consumerProps = new Properties();
    consumerProps.putAll(standardProps);
    consumerProps.putAll(secureProps);

    getStream(consumeEnv, topic, intSchema, consumerProps)
            .map(new PartitionValidatingMapper(numPartitions, 3))
            .map(new FailingIdentityMapper<Integer>(failAfterElements))
            .addSink(new ValidatingExactlyOnceSink(totalElements))
            .setParallelism(1);

    // reset the static failure marker before the job is executed
    FailingIdentityMapper.failedBefore = false;
    tryExecute(consumeEnv, "One-source-multi-partitions exactly once test");

    deleteTestTopic(topic);
}
Also used : TypeInformationSerializationSchema(org.apache.flink.api.common.serialization.TypeInformationSerializationSchema) ValidatingExactlyOnceSink(org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink) PartitionValidatingMapper(org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint)

Aggregations

TypeInformationSerializationSchema (org.apache.flink.api.common.serialization.TypeInformationSerializationSchema)14 TypeHint (org.apache.flink.api.common.typeinfo.TypeHint)13 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)13 Properties (java.util.Properties)11 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)9 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)9 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)8 SuccessException (org.apache.flink.test.util.SuccessException)8 IOException (java.io.IOException)6 ProgramInvocationException (org.apache.flink.client.program.ProgramInvocationException)6 JobExecutionException (org.apache.flink.runtime.client.JobExecutionException)6 RetryOnException (org.apache.flink.testutils.junit.RetryOnException)6 NotLeaderForPartitionException (org.apache.kafka.common.errors.NotLeaderForPartitionException)6 JobCancellationException (org.apache.flink.runtime.client.JobCancellationException)5 TimeoutException (org.apache.kafka.common.errors.TimeoutException)5 PartitionValidatingMapper (org.apache.flink.streaming.connectors.kafka.testutils.PartitionValidatingMapper)4 ValidatingExactlyOnceSink (org.apache.flink.streaming.connectors.kafka.testutils.ValidatingExactlyOnceSink)4 KafkaDeserializationSchemaWrapper (org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper)3 BitSet (java.util.BitSet)2 HashMap (java.util.HashMap)2