Search in sources:

Example 31 with StreamExecutionEnvironment

use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.

the class KafkaConsumerTestBase method runAutoOffsetRetrievalAndCommitToKafka.

/**
	 * This test ensures that when the consumers retrieve a start offset from Kafka (earliest, latest), this offset
	 * is committed to Kafka, even if some partitions are not read.
	 *
	 * Test:
	 * - Create 3 partitions
	 * - write 50 messages into each.
	 * - Start three consumers with auto.offset.reset='latest' and wait until they have committed offsets to Kafka.
	 * - Check if the offsets in Kafka are set to 50 for the three partitions
	 *
	 * See FLINK-3440 as well
	 */
public void runAutoOffsetRetrievalAndCommitToKafka() throws Exception {
    // 3 partitions with 50 records each (0-49, so the expected commit offset of each partition should be 50)
    final int parallelism = 3;
    final int recordsInEachPartition = 50;
    final String topicName = writeSequence("testAutoOffsetRetrievalAndCommitToKafkaTopic", recordsInEachPartition, parallelism, 1);
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();
    env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env.setParallelism(parallelism);
    env.enableCheckpointing(200);
    Properties readProps = new Properties();
    readProps.putAll(standardProps);
    // reset to 'latest', so that the partitions are initially not read
    readProps.setProperty("auto.offset.reset", "latest");
    DataStream<String> stream = env.addSource(kafkaServer.getConsumer(topicName, new SimpleStringSchema(), readProps));
    stream.addSink(new DiscardingSink<String>());
    final AtomicReference<Throwable> errorRef = new AtomicReference<>();
    final Thread runner = new Thread("runner") {

        @Override
        public void run() {
            try {
                env.execute();
            } catch (Throwable t) {
                if (!(t.getCause() instanceof JobCancellationException)) {
                    errorRef.set(t);
                }
            }
        }
    };
    runner.start();
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    // the final committed offset in Kafka should be 50
    final Long l50 = 50L;
    // wait up to 30 seconds for the offsets to show up in Kafka
    final long deadline = 30_000_000_000L + System.nanoTime();
    do {
        Long o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
        Long o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
        Long o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
        if (l50.equals(o1) && l50.equals(o2) && l50.equals(o3)) {
            break;
        }
        Thread.sleep(100);
    } while (System.nanoTime() < deadline);
    // cancel the job
    JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
    final Throwable t = errorRef.get();
    if (t != null) {
        throw new RuntimeException("Job failed with an exception", t);
    }
    // final check to see if offsets are correctly in Kafka
    Long o1 = kafkaOffsetHandler.getCommittedOffset(topicName, 0);
    Long o2 = kafkaOffsetHandler.getCommittedOffset(topicName, 1);
    Long o3 = kafkaOffsetHandler.getCommittedOffset(topicName, 2);
    Assert.assertEquals(Long.valueOf(50L), o1);
    Assert.assertEquals(Long.valueOf(50L), o2);
    Assert.assertEquals(Long.valueOf(50L), o3);
    kafkaOffsetHandler.close();
    deleteTestTopic(topicName);
}
Also used: AtomicReference (java.util.concurrent.atomic.AtomicReference), Properties (java.util.Properties), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), JobCancellationException (org.apache.flink.runtime.client.JobCancellationException), SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
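The KafkaOffsetHandler used above is a utility from the test harness. Outside of it, committed group offsets can be checked directly with the plain Kafka consumer client. A minimal sketch, assuming a broker at localhost:9092, a placeholder group id, and a placeholder topic name (all of which would need to match the job under test):

import java.util.Properties;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;

public class CommittedOffsetCheck {

    public static void main(String[] args) {
        Properties props = new Properties();
        // placeholder connection settings; use the job's broker list and group id
        props.setProperty("bootstrap.servers", "localhost:9092");
        props.setProperty("group.id", "flink-test-group");
        props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
            for (int partition = 0; partition < 3; partition++) {
                // committed() returns null if the group has no offset for the partition
                OffsetAndMetadata committed = consumer.committed(new TopicPartition("testTopic", partition));
                System.out.println("partition " + partition + " -> " + (committed == null ? "none" : committed.offset()));
            }
        }
    }
}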

Example 32 with StreamExecutionEnvironment

use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.

the class KafkaConsumerTestBase method runStartFromSpecificOffsets.

/**
	 * This test ensures that the consumer correctly uses user-supplied specific offsets when explicitly configured to
	 * start from specific offsets. For partitions for which no specific offset is supplied, the starting position
	 * should fall back to the group offsets behaviour.
	 *
	 * 4 partitions will have 50 records with offsets 0 to 49. The supplied specific offsets map is:
	 * 	partition 0 --> start from offset 19
	 * 	partition 1 --> not set
	 * 	partition 2 --> start from offset 22
	 * 	partition 3 --> not set
	 * 	partition 4 --> start from offset 26 (this should be ignored because the partition does not exist)
	 *
	 * The partitions and their committed group offsets are set up as:
	 * 	partition 0 --> committed offset 23
	 * 	partition 1 --> committed offset 31
	 * 	partition 2 --> committed offset 43
	 * 	partition 3 --> no commit offset
	 *
	 * When configured to start from these specific offsets, each partition should read:
	 * 	partition 0 --> start from offset 19, read to offset 49 (31 records)
	 * 	partition 1 --> fallback to group offsets, so start from offset 31, read to offset 49 (19 records)
	 * 	partition 2 --> start from offset 22, read to offset 49 (28 records)
	 * 	partition 3 --> fallback to group offsets, but since there is no group offset for this partition,
	 * 	                will default to "auto.offset.reset" (set to "earliest"),
	 * 	                so start from offset 0, read to offset 49 (50 records)
	 */
public void runStartFromSpecificOffsets() throws Exception {
    // 4 partitions with 50 records each (offsets 0-49)
    final int parallelism = 4;
    final int recordsInEachPartition = 50;
    final String topicName = writeSequence("testStartFromSpecificOffsetsTopic", recordsInEachPartition, parallelism, 1);
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(parallelism);
    Properties readProps = new Properties();
    readProps.putAll(standardProps);
    // partition 3 should default back to this behaviour
    readProps.setProperty("auto.offset.reset", "earliest");
    Map<KafkaTopicPartition, Long> specificStartupOffsets = new HashMap<>();
    specificStartupOffsets.put(new KafkaTopicPartition(topicName, 0), 19L);
    specificStartupOffsets.put(new KafkaTopicPartition(topicName, 2), 22L);
    // non-existing partition, should be ignored
    specificStartupOffsets.put(new KafkaTopicPartition(topicName, 4), 26L);
    // only the committed offset for partition 1 should be used, because partition 1 has no entry in the specific offsets map
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
    kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
    kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);
    Map<Integer, Tuple2<Integer, Integer>> partitionsToValueCountAndStartOffsets = new HashMap<>();
    // partition 0 should read offset 19-49
    partitionsToValueCountAndStartOffsets.put(0, new Tuple2<>(31, 19));
    // partition 1 should read offset 31-49
    partitionsToValueCountAndStartOffsets.put(1, new Tuple2<>(19, 31));
    // partition 2 should read offset 22-49
    partitionsToValueCountAndStartOffsets.put(2, new Tuple2<>(28, 22));
    // partition 3 should read offset 0-49
    partitionsToValueCountAndStartOffsets.put(3, new Tuple2<>(50, 0));
    readSequence(env, StartupMode.SPECIFIC_OFFSETS, specificStartupOffsets, readProps, topicName, partitionsToValueCountAndStartOffsets);
    kafkaOffsetHandler.close();
    deleteTestTopic(topicName);
}
Also used: HashMap (java.util.HashMap), KafkaTopicPartition (org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition), Properties (java.util.Properties), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
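Outside the test base, the same startup mode is exposed on the consumer itself through setStartFromSpecificOffsets. A minimal sketch assuming a Flink 1.3-era connector; the concrete consumer class (FlinkKafkaConsumer010 here), broker address, group id, and topic name are placeholders:

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;

public class SpecificOffsetsJob {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // placeholder
        props.setProperty("group.id", "specific-offsets-group"); // placeholder
        // partitions with neither a specific offset nor a group offset fall back here
        props.setProperty("auto.offset.reset", "earliest");

        Map<KafkaTopicPartition, Long> specificOffsets = new HashMap<>();
        specificOffsets.put(new KafkaTopicPartition("myTopic", 0), 19L);
        specificOffsets.put(new KafkaTopicPartition("myTopic", 2), 22L);

        FlinkKafkaConsumer010<String> consumer =
                new FlinkKafkaConsumer010<>("myTopic", new SimpleStringSchema(), props);
        // partitions missing from the map start from the committed group offset
        consumer.setStartFromSpecificOffsets(specificOffsets);

        env.addSource(consumer).print();
        env.execute("start from specific offsets");
    }
}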

Example 33 with StreamExecutionEnvironment

use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.

the class KafkaConsumerTestBase method runKeyValueTest.

public void runKeyValueTest() throws Exception {
    final String topic = "keyvaluetest";
    createTestTopic(topic, 1, 1);
    final int ELEMENT_COUNT = 5000;
    // ----------- Write some data into Kafka -------------------
    StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();
    DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {

        @Override
        public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
            Random rnd = new Random(1337);
            for (long i = 0; i < ELEMENT_COUNT; i++) {
                PojoValue pojo = new PojoValue();
                pojo.when = new Date(rnd.nextLong());
                pojo.lon = rnd.nextLong();
                pojo.lat = i;
                // make every second key null to ensure proper "null" serialization
                Long key = (i % 2 == 0) ? null : i;
                ctx.collect(new Tuple2<>(key, pojo));
            }
        }

        @Override
        public void cancel() {
        }
    });
    KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "3");
    kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
    env.execute("Write KV to Kafka");
    // ----------- Read the data again -------------------
    env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();
    KeyedDeserializationSchema<Tuple2<Long, PojoValue>> readSchema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));
    fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {

        long counter = 0;

        @Override
        public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
            // the elements should be in order.
            Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
            if (value.f1.lat % 2 == 0) {
                assertNull("key was not null", value.f0);
            } else {
                Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
            }
            counter++;
            if (counter == ELEMENT_COUNT) {
                // we got the right number of elements
                throw new SuccessException();
            }
        }
    });
    tryExecute(env, "Read KV from Kafka");
    deleteTestTopic(topic);
}
Also used: TypeInformationKeyValueSerializationSchema (org.apache.flink.streaming.util.serialization.TypeInformationKeyValueSerializationSchema), Properties (java.util.Properties), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), RetryOnException (org.apache.flink.testutils.junit.RetryOnException), ProgramInvocationException (org.apache.flink.client.program.ProgramInvocationException), SuccessException (org.apache.flink.test.util.SuccessException), NoResourceAvailableException (org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException), JobExecutionException (org.apache.flink.runtime.client.JobExecutionException), TimeoutException (org.apache.kafka.common.errors.TimeoutException), JobCancellationException (org.apache.flink.runtime.client.JobCancellationException), IOException (java.io.IOException), Date (java.util.Date), Random (java.util.Random), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
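The PojoValue type is defined elsewhere in the test class. From the fields the snippet accesses (when, lon, lat), a minimal matching POJO could look like the following sketch; the field types are inferred from the assignments above, and public fields plus a no-argument constructor satisfy Flink's POJO serialization rules:

import java.util.Date;

public class PojoValue {
    // 'when' is assigned a Date, 'lon' a random long, 'lat' the loop counter
    public Date when;
    public long lon;
    public long lat;

    public PojoValue() {
    }
}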

Example 34 with StreamExecutionEnvironment

use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.

the class KafkaConsumerTestBase method runBigRecordTestTopology.

/**
	 * Tests Flink's Kafka integration with very big records (7-14 MB each);
	 * see http://stackoverflow.com/questions/21020347/kafka-sending-a-15mb-message
	 *
	 */
public void runBigRecordTestTopology() throws Exception {
    final String topic = "bigRecordTestTopic";
    // otherwise, the kafka mini clusters may run out of heap space
    final int parallelism = 1;
    createTestTopic(topic, parallelism, 1);
    final TypeInformation<Tuple2<Long, byte[]>> longBytesInfo = TypeInfoParser.parse("Tuple2<Long, byte[]>");
    final TypeInformationSerializationSchema<Tuple2<Long, byte[]>> serSchema = new TypeInformationSerializationSchema<>(longBytesInfo, new ExecutionConfig());
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();
    env.enableCheckpointing(100);
    env.setParallelism(parallelism);
    // add consuming topology:
    Properties consumerProps = new Properties();
    consumerProps.putAll(standardProps);
    consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
    // for the new fetcher
    consumerProps.setProperty("max.partition.fetch.bytes", Integer.toString(1024 * 1024 * 14));
    consumerProps.setProperty("queued.max.message.chunks", "1");
    consumerProps.putAll(secureProps);
    FlinkKafkaConsumerBase<Tuple2<Long, byte[]>> source = kafkaServer.getConsumer(topic, serSchema, consumerProps);
    DataStreamSource<Tuple2<Long, byte[]>> consuming = env.addSource(source);
    consuming.addSink(new SinkFunction<Tuple2<Long, byte[]>>() {

        private int elCnt = 0;

        @Override
        public void invoke(Tuple2<Long, byte[]> value) throws Exception {
            elCnt++;
            if (value.f0 == -1) {
                // we should have seen 11 elements now.
                if (elCnt == 11) {
                    throw new SuccessException();
                } else {
                    throw new RuntimeException("There have been " + elCnt + " elements");
                }
            }
            if (elCnt > 10) {
                throw new RuntimeException("More than 10 elements seen: " + elCnt);
            }
        }
    });
    // add producing topology
    Properties producerProps = new Properties();
    producerProps.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
    producerProps.setProperty("retries", "3");
    producerProps.putAll(secureProps);
    producerProps.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerConnectionStrings);
    DataStream<Tuple2<Long, byte[]>> stream = env.addSource(new RichSourceFunction<Tuple2<Long, byte[]>>() {

        private boolean running;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            running = true;
        }

        @Override
        public void run(SourceContext<Tuple2<Long, byte[]>> ctx) throws Exception {
            Random rnd = new Random();
            long cnt = 0;
            int sevenMb = 1024 * 1024 * 7;
            while (running) {
                byte[] wl = new byte[sevenMb + rnd.nextInt(sevenMb)];
                ctx.collect(new Tuple2<>(cnt++, wl));
                Thread.sleep(100);
                if (cnt == 10) {
                    // signal end
                    ctx.collect(new Tuple2<>(-1L, new byte[] { 1 }));
                    break;
                }
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    });
    kafkaServer.produceIntoKafka(stream, topic, new KeyedSerializationSchemaWrapper<>(serSchema), producerProps, null);
    tryExecute(env, "big topology test");
    deleteTestTopic(topic);
}
Also used: Configuration (org.apache.flink.configuration.Configuration), ExecutionConfig (org.apache.flink.api.common.ExecutionConfig), Properties (java.util.Properties), Random (java.util.Random), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), RetryOnException (org.apache.flink.testutils.junit.RetryOnException), ProgramInvocationException (org.apache.flink.client.program.ProgramInvocationException), SuccessException (org.apache.flink.test.util.SuccessException), NoResourceAvailableException (org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException), JobExecutionException (org.apache.flink.runtime.client.JobExecutionException), TimeoutException (org.apache.kafka.common.errors.TimeoutException), JobCancellationException (org.apache.flink.runtime.client.JobCancellationException), IOException (java.io.IOException), TypeInformationSerializationSchema (org.apache.flink.streaming.util.serialization.TypeInformationSerializationSchema), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
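The size settings in this test have to line up across the pipeline: the producer's max.request.size caps what can be sent, the consumer's fetch limit must be at least as large as the biggest record, and the broker's own limits (message.max.bytes, replica.fetch.max.bytes) must allow the records through. A condensed sketch of just the client-side properties, using the sizes from the test:

import java.util.Properties;

public class LargeMessageConfig {

    // consumer side: the per-partition fetch limit must cover the largest record
    static Properties consumerProps() {
        Properties p = new Properties();
        p.setProperty("max.partition.fetch.bytes", Integer.toString(1024 * 1024 * 14));
        // equivalent setting for the legacy (pre-0.9) fetcher
        p.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 14));
        return p;
    }

    // producer side: max.request.size caps the serialized record size
    static Properties producerProps() {
        Properties p = new Properties();
        p.setProperty("max.request.size", Integer.toString(1024 * 1024 * 15));
        p.setProperty("retries", "3");
        return p;
    }
}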

Example 35 with StreamExecutionEnvironment

use of org.apache.flink.streaming.api.environment.StreamExecutionEnvironment in project flink by apache.

the class KafkaConsumerTestBase method runCancelingOnEmptyInputTest.

/**
	 * Tests that the source can be properly canceled when reading empty partitions. 
	 */
public void runCancelingOnEmptyInputTest() throws Exception {
    final String topic = "cancelingOnEmptyInputTopic";
    final int parallelism = 3;
    createTestTopic(topic, parallelism, 1);
    final AtomicReference<Throwable> error = new AtomicReference<>();
    final Runnable jobRunner = new Runnable() {

        @Override
        public void run() {
            try {
                final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
                env.setParallelism(parallelism);
                env.enableCheckpointing(100);
                env.getConfig().disableSysoutLogging();
                Properties props = new Properties();
                props.putAll(standardProps);
                props.putAll(secureProps);
                FlinkKafkaConsumerBase<String> source = kafkaServer.getConsumer(topic, new SimpleStringSchema(), props);
                env.addSource(source).addSink(new DiscardingSink<String>());
                env.execute("CancelingOnEmptyInputTest");
            } catch (Throwable t) {
                LOG.error("Job Runner failed with exception", t);
                error.set(t);
            }
        }
    };
    Thread runnerThread = new Thread(jobRunner, "program runner thread");
    runnerThread.start();
    // wait a bit before canceling
    Thread.sleep(2000);
    Throwable failureCause = error.get();
    if (failureCause != null) {
        failureCause.printStackTrace();
        Assert.fail("Test failed prematurely with: " + failureCause.getMessage());
    }
    // cancel
    JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
    // wait for the program to be done and validate that we failed with the right exception
    runnerThread.join();
    failureCause = error.get();
    assertNotNull("program did not fail properly due to canceling", failureCause);
    assertTrue(failureCause.getMessage().contains("Job was cancelled"));
    deleteTestTopic(topic);
}
Also used: SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema), AtomicReference (java.util.concurrent.atomic.AtomicReference), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), Properties (java.util.Properties), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint)
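The pattern in this test generalizes to any cancellation check: run the job on a separate thread, cancel it externally, join the thread, and assert on the captured failure cause. A stripped-down sketch of that skeleton, with the job submission and the cancellation call left as placeholders:

import java.util.concurrent.atomic.AtomicReference;

public class CancelPatternSketch {

    public static void main(String[] args) throws Exception {
        final AtomicReference<Throwable> error = new AtomicReference<>();
        Thread runner = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    runJob(); // placeholder: submit the streaming job, e.g. env.execute(...)
                } catch (Throwable t) {
                    // cancellation surfaces as an exception thrown by execute()
                    error.set(t);
                }
            }
        }, "job runner");
        runner.start();

        Thread.sleep(2000); // give the job time to start
        cancelJob(); // placeholder: cancel via the JobManager gateway or REST API
        runner.join();

        if (error.get() == null) {
            throw new AssertionError("job did not fail with the expected cancellation exception");
        }
    }

    private static void runJob() throws Exception { /* placeholder */ }

    private static void cancelJob() { /* placeholder */ }
}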

Aggregations

StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 383
Test (org.junit.Test): 286
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 192
OneInputTransformation (org.apache.flink.streaming.api.transformations.OneInputTransformation): 81
TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow): 75
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 48
EventTimeTrigger (org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger): 42
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 34
Properties (java.util.Properties): 32
ListStateDescriptor (org.apache.flink.api.common.state.ListStateDescriptor): 31
TumblingEventTimeWindows (org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows): 30
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint): 27
SuccessException (org.apache.flink.test.util.SuccessException): 27
IOException (java.io.IOException): 24
Configuration (org.apache.flink.configuration.Configuration): 24
SlidingEventTimeWindows (org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows): 24
KeySelector (org.apache.flink.api.java.functions.KeySelector): 22
ProcessingTimeTrigger (org.apache.flink.streaming.api.windowing.triggers.ProcessingTimeTrigger): 21
ReducingStateDescriptor (org.apache.flink.api.common.state.ReducingStateDescriptor): 20
MapFunction (org.apache.flink.api.common.functions.MapFunction): 19