Search in sources :

Example 1 with KafkaShuffleWatermark

use of org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark in project flink by apache.

In the class KafkaShuffleITCase, method testWatermarkBroadcasting.

/**
 * To test watermark broadcasting with time characteristic: EventTime. Verifies value and
 * watermark serialization and deserialization, and that within every partition the watermarks
 * of each producer subtask seen there form the complete, ordered sequence.
 *
 * <p>Producer Parallelism = 2; Kafka Partition # = 3.
 */
@Test
public void testWatermarkBroadcasting() throws Exception {
    final int numberOfPartitions = 3;
    final int producerParallelism = 2;
    final int numElementsPerProducer = 1000;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    Map<Integer, Collection<ConsumerRecord<byte[], byte[]>>> results = testKafkaShuffleProducer(topic("test_watermark_broadcast", EventTime), env, numberOfPartitions, producerParallelism, numElementsPerProducer, EventTime);
    TypeSerializer<Tuple3<Integer, Long, Integer>> typeSerializer = createTypeSerializer(env);
    KafkaShuffleElementDeserializer deserializer = new KafkaShuffleElementDeserializer<>(typeSerializer);
    // Records in a single partition are kept in order
    for (int p = 0; p < numberOfPartitions; p++) {
        Collection<ConsumerRecord<byte[], byte[]>> records = results.get(p);
        // Fail with a readable message instead of an NPE if a partition is missing.
        assertNotNull("no records consumed from partition " + p, records);
        // Watermarks seen in this partition, grouped by the producer subtask that emitted them.
        Map<Integer, List<KafkaShuffleWatermark>> watermarks = new HashMap<>();
        for (ConsumerRecord<byte[], byte[]> consumerRecord : records) {
            assertNull(consumerRecord.key());
            KafkaShuffleElement element = deserializer.deserialize(consumerRecord);
            if (element.isRecord()) {
                KafkaShuffleRecord<Tuple3<Integer, Long, Integer>> record = element.asRecord();
                // f1 carries the event timestamp, which is derived from the element index f0.
                assertEquals(INIT_TIMESTAMP + record.getValue().f0, record.getValue().f1.longValue());
                assertEquals(record.getValue().f1.longValue(), record.getTimestamp().longValue());
            } else if (element.isWatermark()) {
                KafkaShuffleWatermark watermark = element.asWatermark();
                watermarks.computeIfAbsent(watermark.getSubtask(), k -> new ArrayList<>()).add(watermark);
            } else {
                fail("KafkaShuffleElement is either record or watermark");
            }
        }
        // Besides, watermarks from the same producer sub task should keep in order.
        for (List<KafkaShuffleWatermark> subTaskWatermarks : watermarks.values()) {
            int index = 0;
            // One watermark per element plus the final end-of-event-time watermark.
            assertEquals(numElementsPerProducer + 1, subTaskWatermarks.size());
            for (KafkaShuffleWatermark watermark : subTaskWatermarks) {
                if (index == numElementsPerProducer) {
                    // the last element is the watermark that signifies end-of-event-time
                    assertEquals(Watermark.MAX_WATERMARK.getTimestamp(), watermark.getWatermark());
                } else {
                    assertEquals(INIT_TIMESTAMP + index++, watermark.getWatermark());
                }
            }
        }
    }
}
Also used : KafkaShuffleElementDeserializer(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElementDeserializer) PARTITION_NUMBER(org.apache.flink.streaming.connectors.kafka.shuffle.FlinkKafkaShuffle.PARTITION_NUMBER) Tuple3(org.apache.flink.api.java.tuple.Tuple3) EventTime(org.apache.flink.streaming.api.TimeCharacteristic.EventTime) IngestionTime(org.apache.flink.streaming.api.TimeCharacteristic.IngestionTime) TupleTypeInfo(org.apache.flink.api.java.typeutils.TupleTypeInfo) Watermark(org.apache.flink.streaming.api.watermark.Watermark) HashMap(java.util.HashMap) RestartStrategies(org.apache.flink.api.common.restartstrategy.RestartStrategies) PropertiesUtil(org.apache.flink.util.PropertiesUtil) Lists(org.apache.flink.shaded.guava30.com.google.common.collect.Lists) ArrayList(java.util.ArrayList) BasicTypeInfo(org.apache.flink.api.common.typeinfo.BasicTypeInfo) Map(java.util.Map) Timeout(org.junit.rules.Timeout) Assert.fail(org.junit.Assert.fail) Iterables(org.apache.flink.shaded.guava30.com.google.common.collect.Iterables) PRODUCER_PARALLELISM(org.apache.flink.streaming.connectors.kafka.shuffle.FlinkKafkaShuffle.PRODUCER_PARALLELISM) Tuple(org.apache.flink.api.java.tuple.Tuple) TestUtils.tryExecute(org.apache.flink.test.util.TestUtils.tryExecute) TimeCharacteristic(org.apache.flink.streaming.api.TimeCharacteristic) TypeSerializer(org.apache.flink.api.common.typeutils.TypeSerializer) Properties(java.util.Properties) ProcessingTime(org.apache.flink.streaming.api.TimeCharacteristic.ProcessingTime) Assert.assertNotNull(org.junit.Assert.assertNotNull) KafkaShuffleElement(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElement) Collection(java.util.Collection) KeyedStream(org.apache.flink.streaming.api.datastream.KeyedStream) Test(org.junit.Test) KafkaShuffleElementDeserializer(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElementDeserializer) 
KafkaShuffleWatermark(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark) DataStream(org.apache.flink.streaming.api.datastream.DataStream) List(java.util.List) Rule(org.junit.Rule) Assert.assertNull(org.junit.Assert.assertNull) KafkaShuffleRecord(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleRecord) ConsumerRecord(org.apache.kafka.clients.consumer.ConsumerRecord) Assert.assertEquals(org.junit.Assert.assertEquals) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) KafkaShuffleWatermark(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark) HashMap(java.util.HashMap) KafkaShuffleElement(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElement) ArrayList(java.util.ArrayList) ConsumerRecord(org.apache.kafka.clients.consumer.ConsumerRecord) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collection(java.util.Collection) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ArrayList(java.util.ArrayList) List(java.util.List) Test(org.junit.Test)

Example 2 with KafkaShuffleWatermark

use of org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark in project flink by apache.

In the class KafkaShuffleITCase, method testRecordSerDe.

/**
 * Produces {@code numElementsPerProducer} elements through {@code testKafkaShuffleProducer} into
 * a topic that is expected to yield exactly one partition, then checks the raw Kafka records:
 * the total element count, that all keys are null, and that every deserialized record and
 * watermark carries the values expected for the given time characteristic.
 *
 * @param timeCharacteristic the time characteristic the producer job runs with; it decides
 *     which count, timestamp and watermark assertions apply
 * @throws Exception if producing, consuming or deserializing the records fails
 */
private void testRecordSerDe(TimeCharacteristic timeCharacteristic) throws Exception {
    final int numElementsPerProducer = 2000;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Records in a single partition are kept in order
    Collection<ConsumerRecord<byte[], byte[]>> records = Iterables.getOnlyElement(testKafkaShuffleProducer(topic("test_serde", timeCharacteristic), env, 1, 1, numElementsPerProducer, timeCharacteristic).values());
    switch(timeCharacteristic) {
        case ProcessingTime:
            // NonTimestampContext, no intermediate watermarks, and one end-of-event-time
            // watermark
            assertEquals(numElementsPerProducer + 1, records.size());
            break;
        case IngestionTime:
            // with default interval 200, hence difficult to control the number of watermarks
            break;
        case EventTime:
            // ManualWatermarkContext
            // `numElementsPerProducer` records, `numElementsPerProducer` watermarks, and one
            // end-of-event-time watermark
            assertEquals(numElementsPerProducer * 2 + 1, records.size());
            break;
        default:
            fail("unknown TimeCharacteristic type");
    }
    TypeSerializer<Tuple3<Integer, Long, Integer>> typeSerializer = createTypeSerializer(env);
    KafkaShuffleElementDeserializer deserializer = new KafkaShuffleElementDeserializer<>(typeSerializer);
    // Running counts of the records / watermarks seen so far; they double as expected indices.
    int recordIndex = 0;
    int watermarkIndex = 0;
    for (ConsumerRecord<byte[], byte[]> consumerRecord : records) {
        assertNull(consumerRecord.key());
        KafkaShuffleElement element = deserializer.deserialize(consumerRecord);
        if (element.isRecord()) {
            KafkaShuffleRecord<Tuple3<Integer, Long, Integer>> record = element.asRecord();
            switch(timeCharacteristic) {
                case ProcessingTime:
                    assertNull(record.getTimestamp());
                    break;
                case IngestionTime:
                    assertNotNull(record.getTimestamp());
                    break;
                case EventTime:
                    assertEquals(record.getValue().f1.longValue(), record.getTimestamp().longValue());
                    break;
                default:
                    fail("unknown TimeCharacteristic type");
            }
            assertEquals(recordIndex, record.getValue().f0.intValue());
            assertEquals(INIT_TIMESTAMP + recordIndex, record.getValue().f1.longValue());
            assertEquals(0, record.getValue().f2.intValue());
            recordIndex++;
        } else if (element.isWatermark()) {
            KafkaShuffleWatermark watermark = element.asWatermark();
            switch(timeCharacteristic) {
                case ProcessingTime:
                    assertEquals(0, watermark.getSubtask());
                    // the last element is the watermark that signifies end-of-event-time
                    assertEquals(numElementsPerProducer, recordIndex);
                    assertEquals(Watermark.MAX_WATERMARK.getTimestamp(), watermark.getWatermark());
                    break;
                case IngestionTime:
                    break;
                case EventTime:
                    assertEquals(0, watermark.getSubtask());
                    // The counts are equal only when a watermark arrives with no new record
                    // before it, i.e. for the trailing end-of-event-time watermark.
                    if (watermarkIndex == recordIndex) {
                        // the last element is the watermark that signifies end-of-event-time
                        assertEquals(Watermark.MAX_WATERMARK.getTimestamp(), watermark.getWatermark());
                    } else {
                        assertEquals(INIT_TIMESTAMP + watermarkIndex, watermark.getWatermark());
                    }
                    break;
                default:
                    fail("unknown TimeCharacteristic type");
            }
            watermarkIndex++;
        } else {
            fail("KafkaShuffleElement is either record or watermark");
        }
    }
}
Also used : KafkaShuffleElementDeserializer(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElementDeserializer) KafkaShuffleWatermark(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark) Tuple3(org.apache.flink.api.java.tuple.Tuple3) KafkaShuffleElement(org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElement) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ConsumerRecord(org.apache.kafka.clients.consumer.ConsumerRecord)

Aggregations

Tuple3 (org.apache.flink.api.java.tuple.Tuple3)2 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)2 KafkaShuffleElement (org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElement)2 KafkaShuffleElementDeserializer (org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleElementDeserializer)2 KafkaShuffleWatermark (org.apache.flink.streaming.connectors.kafka.internals.KafkaShuffleFetcher.KafkaShuffleWatermark)2 ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord)2 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Properties (java.util.Properties)1 RestartStrategies (org.apache.flink.api.common.restartstrategy.RestartStrategies)1 BasicTypeInfo (org.apache.flink.api.common.typeinfo.BasicTypeInfo)1 TypeSerializer (org.apache.flink.api.common.typeutils.TypeSerializer)1 Tuple (org.apache.flink.api.java.tuple.Tuple)1 TupleTypeInfo (org.apache.flink.api.java.typeutils.TupleTypeInfo)1 Iterables (org.apache.flink.shaded.guava30.com.google.common.collect.Iterables)1 Lists (org.apache.flink.shaded.guava30.com.google.common.collect.Lists)1 TimeCharacteristic (org.apache.flink.streaming.api.TimeCharacteristic)1