Search in sources :

Example 6 with KeyedDeserializationSchemaWrapper

use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.

The method ensureOffsetsGetCommitted of the class Kafka09FetcherTest.

/**
 * Verifies that offsets passed to {@code commitInternalOffsetsToKafka} are forwarded to the
 * consumer's {@code commitAsync} call while the fetch loop is running.
 *
 * <p>The Kafka consumer is replaced by a mock whose {@code poll()} blocks until {@code wakeup()}
 * is called and whose {@code commitAsync()} records the committed offsets in a queue. Two commits
 * are triggered; for the subscribed partition ("test", 42) the test expects the committed offset
 * to be one higher than the offset handed to the fetcher (11 -> 12, 19 -> 20).
 *
 * @throws Exception if the fetcher thread failed with anything other than a
 *         {@code Handover.ClosedException}, or if the test thread is interrupted
 */
@Test
public void ensureOffsetsGetCommitted() throws Exception {
    // test data
    final KafkaTopicPartition testPartition1 = new KafkaTopicPartition("test", 42);
    final KafkaTopicPartition testPartition2 = new KafkaTopicPartition("another", 99);
    final Map<KafkaTopicPartition, Long> testCommitData1 = new HashMap<>();
    testCommitData1.put(testPartition1, 11L);
    testCommitData1.put(testPartition2, 18L);
    final Map<KafkaTopicPartition, Long> testCommitData2 = new HashMap<>();
    testCommitData2.put(testPartition1, 19L);
    testCommitData2.put(testPartition2, 28L);
    // every offset map passed to the mocked commitAsync() ends up in this queue
    final BlockingQueue<Map<TopicPartition, OffsetAndMetadata>> commitStore = new LinkedBlockingQueue<>();
    // ----- the mock consumer with poll(), wakeup(), and commit(A)sync calls ----
    final MultiShotLatch blockerLatch = new MultiShotLatch();
    KafkaConsumer<?, ?> mockConsumer = mock(KafkaConsumer.class);
    // poll() blocks until wakeup() triggers the latch, then returns no records
    when(mockConsumer.poll(anyLong())).thenAnswer(new Answer<ConsumerRecords<?, ?>>() {

        @Override
        public ConsumerRecords<?, ?> answer(InvocationOnMock invocation) throws InterruptedException {
            blockerLatch.await();
            return ConsumerRecords.empty();
        }
    });
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) {
            blockerLatch.trigger();
            return null;
        }
    }).when(mockConsumer).wakeup();
    // commitAsync() stores the offsets and immediately reports success to the callback
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) {
            @SuppressWarnings("unchecked") Map<TopicPartition, OffsetAndMetadata> offsets = (Map<TopicPartition, OffsetAndMetadata>) invocation.getArguments()[0];
            OffsetCommitCallback callback = (OffsetCommitCallback) invocation.getArguments()[1];
            commitStore.add(offsets);
            callback.onComplete(offsets, null);
            return null;
        }
    }).when(mockConsumer).commitAsync(Mockito.<Map<TopicPartition, OffsetAndMetadata>>any(), any(OffsetCommitCallback.class));
    // make sure the fetcher creates the mock consumer
    whenNew(KafkaConsumer.class).withAnyArguments().thenReturn(mockConsumer);
    // ----- create the test fetcher -----
    @SuppressWarnings("unchecked") SourceContext<String> sourceContext = mock(SourceContext.class);
    // the fetcher is only subscribed to partition ("test", 42)
    Map<KafkaTopicPartition, Long> partitionsWithInitialOffsets = Collections.singletonMap(testPartition1, KafkaTopicPartitionStateSentinel.GROUP_OFFSET);
    KeyedDeserializationSchema<String> schema = new KeyedDeserializationSchemaWrapper<>(new SimpleStringSchema());
    final Kafka09Fetcher<String> fetcher = new Kafka09Fetcher<>(sourceContext, partitionsWithInitialOffsets, null, /* periodic watermark extractor */
    null, /* punctuated watermark extractor */
    new TestProcessingTimeService(), 10, /* watermark interval */
    this.getClass().getClassLoader(), "task_name", new UnregisteredMetricsGroup(), schema, new Properties(), 0L, false);
    // ----- run the fetcher -----
    final AtomicReference<Throwable> error = new AtomicReference<>();
    final Thread fetcherRunner = new Thread("fetcher runner") {

        @Override
        public void run() {
            try {
                fetcher.runFetchLoop();
            } catch (Throwable t) {
                error.set(t);
            }
        }
    };
    fetcherRunner.start();
    try {
        // ----- trigger the first offset commit -----
        fetcher.commitInternalOffsetsToKafka(testCommitData1);
        Map<TopicPartition, OffsetAndMetadata> result1 = commitStore.take();
        for (Entry<TopicPartition, OffsetAndMetadata> entry : result1.entrySet()) {
            TopicPartition partition = entry.getKey();
            if (partition.topic().equals("test")) {
                assertEquals(42, partition.partition());
                assertEquals(12L, entry.getValue().offset());
            } else if (partition.topic().equals("another")) {
                // NOTE(review): the fetcher is only subscribed to ("test", 42), so this branch is
                // presumably never reached; the expected value 17 does not follow the "+1" pattern
                // of the "test" branch (18 was handed in) - confirm the intended expectation.
                assertEquals(99, partition.partition());
                assertEquals(17L, entry.getValue().offset());
            }
        }
        // ----- trigger the second offset commit -----
        fetcher.commitInternalOffsetsToKafka(testCommitData2);
        Map<TopicPartition, OffsetAndMetadata> result2 = commitStore.take();
        for (Entry<TopicPartition, OffsetAndMetadata> entry : result2.entrySet()) {
            TopicPartition partition = entry.getKey();
            if (partition.topic().equals("test")) {
                assertEquals(42, partition.partition());
                assertEquals(20L, entry.getValue().offset());
            } else if (partition.topic().equals("another")) {
                // NOTE(review): same caveat as above (28 was handed in, 27 is expected) - confirm.
                assertEquals(99, partition.partition());
                assertEquals(27L, entry.getValue().offset());
            }
        }
    } finally {
        // ----- test done, wait till the fetcher is done for a clean shutdown -----
        // done in a finally block so that a failed assertion does not leave the fetcher
        // thread running (blocked in the mocked poll()) after the test has ended
        fetcher.cancel();
        fetcherRunner.join();
    }
    // check that there were no errors in the fetcher
    final Throwable caughtError = error.get();
    if (caughtError != null && !(caughtError instanceof Handover.ClosedException)) {
        throw new Exception("Exception in the fetcher", caughtError);
    }
}
Also used : UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) HashMap(java.util.HashMap) MultiShotLatch(org.apache.flink.core.testutils.MultiShotLatch) KafkaTopicPartition(org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Properties(java.util.Properties) ConsumerRecords(org.apache.kafka.clients.consumer.ConsumerRecords) KeyedDeserializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) Handover(org.apache.flink.streaming.connectors.kafka.internal.Handover) AtomicReference(java.util.concurrent.atomic.AtomicReference) KafkaConsumerThread(org.apache.flink.streaming.connectors.kafka.internal.KafkaConsumerThread) Kafka09Fetcher(org.apache.flink.streaming.connectors.kafka.internal.Kafka09Fetcher) InvocationOnMock(org.mockito.invocation.InvocationOnMock) TopicPartition(org.apache.kafka.common.TopicPartition) KafkaTopicPartition(org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition) Mockito.anyLong(org.mockito.Mockito.anyLong) SimpleStringSchema(org.apache.flink.streaming.util.serialization.SimpleStringSchema) TestProcessingTimeService(org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService) HashMap(java.util.HashMap) Map(java.util.Map) OffsetCommitCallback(org.apache.kafka.clients.consumer.OffsetCommitCallback) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 7 with KeyedDeserializationSchemaWrapper

use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.

The method writeSequence of the class KafkaConsumerTestBase.

/**
 * Writes a sequence of records into a freshly created Kafka topic and validates, by reading the
 * topic back, that the expected number of records arrived.
 *
 * <p>Each of the {@code parallelism} source subtasks emits the tuples
 * {@code (subtaskIndex, 0)} to {@code (subtaskIndex, numElements - 1)}. Because the write may
 * fail or the validation may not complete, the whole procedure is attempted up to 10 times;
 * every attempt uses its own topic named {@code baseTopicName + '-' + attempt}, and the topic of
 * a failed attempt is deleted before retrying.
 *
 * @param baseTopicName prefix of the topic name; the attempt number is appended to it
 * @param numElements number of records emitted by each source subtask
 * @param parallelism parallelism of the writing job, also used as the number of topic partitions
 * @param replicationFactor replication factor passed to the topic creation
 * @return the name of the topic that holds the validated sequence
 * @throws Exception if no valid sequence could be written within the maximum number of attempts
 */
protected String writeSequence(String baseTopicName, final int numElements, final int parallelism, final int replicationFactor) throws Exception {
    LOG.info("\n===================================\n" + "== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" + "===================================");
    final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
    });
    final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema = new KeyedSerializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
    final KeyedDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KeyedDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
    final int maxNumAttempts = 10;
    for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {
        final String topicName = baseTopicName + '-' + attempt;
        // log the actual attempt number (was hard-coded to "#1" for every attempt)
        LOG.info("Writing attempt #" + attempt);
        // -------- Write the Sequence --------
        createTestTopic(topicName, parallelism, replicationFactor);
        StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        writeEnv.getConfig().disableSysoutLogging();
        DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

            // volatile: cancel() may be invoked from a different thread than run()
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                int cnt = 0;
                int partition = getRuntimeContext().getIndexOfThisSubtask();
                while (running && cnt < numElements) {
                    ctx.collect(new Tuple2<>(partition, cnt));
                    cnt++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).setParallelism(parallelism);
        // the producer must not produce duplicates
        Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
        producerProperties.setProperty("retries", "0");
        producerProperties.putAll(secureProps);
        kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2Partitioner(parallelism)).setParallelism(parallelism);
        try {
            writeEnv.execute("Write sequence");
        } catch (Exception e) {
            LOG.error("Write attempt failed, trying again", e);
            deleteTestTopic(topicName);
            JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
            continue;
        }
        LOG.info("Finished writing sequence");
        // -------- Validate the Sequence --------
        // we need to validate the sequence, because kafka's producers are not exactly once
        LOG.info("Validating sequence");
        JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
        final StreamExecutionEnvironment readEnv = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        readEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        readEnv.getConfig().disableSysoutLogging();
        readEnv.setParallelism(parallelism);
        Properties readProps = (Properties) standardProps.clone();
        readProps.setProperty("group.id", "flink-tests-validator");
        readProps.putAll(secureProps);
        FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> consumer = kafkaServer.getConsumer(topicName, deserSchema, readProps);
        // the single-parallelism mapper counts all records and signals completion by
        // throwing a SuccessException once the expected total has been seen
        readEnv.addSource(consumer).map(new RichMapFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {

            private final int totalCount = parallelism * numElements;

            private int count = 0;

            @Override
            public Tuple2<Integer, Integer> map(Tuple2<Integer, Integer> value) throws Exception {
                if (++count == totalCount) {
                    throw new SuccessException();
                } else {
                    return value;
                }
            }
        }).setParallelism(1).addSink(new DiscardingSink<Tuple2<Integer, Integer>>()).setParallelism(1);
        final AtomicReference<Throwable> errorRef = new AtomicReference<>();
        // run the validation job in a separate thread so that it can be bounded by a deadline
        Thread runner = new Thread() {

            @Override
            public void run() {
                try {
                    tryExecute(readEnv, "sequence validation");
                } catch (Throwable t) {
                    errorRef.set(t);
                }
            }
        };
        runner.start();
        // give the validation job at most 10 seconds
        final long deadline = System.nanoTime() + 10_000_000_000L;
        long delay;
        while (runner.isAlive() && (delay = deadline - System.nanoTime()) > 0) {
            // Thread.join(0) waits forever, so a remaining time below one millisecond
            // must be rounded up instead of being truncated to 0
            runner.join(Math.max(1L, delay / 1_000_000L));
        }
        boolean success;
        if (runner.isAlive()) {
            // did not finish in time, maybe the producer dropped one or more records and
            // the validation did not reach the exit point
            success = false;
            JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout));
        } else {
            Throwable error = errorRef.get();
            if (error != null) {
                success = false;
                LOG.info("Attempt " + attempt + " failed with exception", error);
            } else {
                success = true;
            }
        }
        JobManagerCommunicationUtils.waitUntilNoJobIsRunning(flink.getLeaderGateway(timeout));
        if (success) {
            // everything is good!
            return topicName;
        } else {
            deleteTestTopic(topicName);
        // fall through the loop
        }
    }
    throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
Also used : KeyedSerializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper) DiscardingSink(org.apache.flink.streaming.api.functions.sink.DiscardingSink) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) KeyedDeserializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper) Tuple2Partitioner(org.apache.flink.streaming.connectors.kafka.testutils.Tuple2Partitioner) AtomicReference(java.util.concurrent.atomic.AtomicReference) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) SuccessException(org.apache.flink.test.util.SuccessException) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) IOException(java.io.IOException) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) SuccessException(org.apache.flink.test.util.SuccessException) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Example 8 with KeyedDeserializationSchemaWrapper

use of org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper in project flink by apache.

The method runStartFromLatestOffsets of the class KafkaConsumerTestBase.

/**
 * This test ensures that when explicitly set to start from latest record, the consumer
 * ignores the "auto.offset.reset" behaviour as well as any committed group offsets in Kafka.
 *
 * <p>The test first writes 50 records into each of 3 partitions, sets committed group offsets
 * for those partitions, and then starts a consuming job configured with
 * {@code setStartFromLatest()}. While that job is running, a second job writes 200 extra
 * records per partition (values 50 to 249). The consuming job fails the test if it ever sees a
 * record whose value is below 50, i.e. one that existed before the consumer was started.
 *
 * @throws Exception if writing the records fails or the consuming job reported an error
 */
public void runStartFromLatestOffsets() throws Exception {
    // 50 records written to each of 3 partitions before launching a latest-starting consuming job
    final int parallelism = 3;
    final int recordsInEachPartition = 50;
    // each partition will be written an extra 200 records
    final int extraRecordsInEachPartition = 200;
    // all already existing data in the topic, before the consuming topology has started, should be ignored
    final String topicName = writeSequence("testStartFromLatestOffsetsTopic", recordsInEachPartition, parallelism, 1);
    // collects an unexpected failure of the consuming job
    final AtomicReference<Throwable> error = new AtomicReference<>();
    KafkaTestEnvironment.KafkaOffsetHandler kafkaOffsetHandler = kafkaServer.createOffsetHandler();
    try {
        // the committed offsets should be ignored
        kafkaOffsetHandler.setCommittedOffset(topicName, 0, 23);
        kafkaOffsetHandler.setCommittedOffset(topicName, 1, 31);
        kafkaOffsetHandler.setCommittedOffset(topicName, 2, 43);
        // job names for the topologies for writing and consuming the extra records
        final String consumeExtraRecordsJobName = "Consume Extra Records Job";
        final String writeExtraRecordsJobName = "Write Extra Records Job";
        // serialization / deserialization schemas for writing and consuming the extra records
        final TypeInformation<Tuple2<Integer, Integer>> resultType = TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {
        });
        final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema = new KeyedSerializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
        final KeyedDeserializationSchema<Tuple2<Integer, Integer>> deserSchema = new KeyedDeserializationSchemaWrapper<>(new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));
        // setup and run the latest-consuming job
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(parallelism);
        final Properties readProps = new Properties();
        readProps.putAll(standardProps);
        // this should be ignored
        readProps.setProperty("auto.offset.reset", "earliest");
        FlinkKafkaConsumerBase<Tuple2<Integer, Integer>> latestReadingConsumer = kafkaServer.getConsumer(topicName, deserSchema, readProps);
        latestReadingConsumer.setStartFromLatest();
        env.addSource(latestReadingConsumer).setParallelism(parallelism).flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Object>() {

            @Override
            public void flatMap(Tuple2<Integer, Integer> value, Collector<Object> out) throws Exception {
                // values below recordsInEachPartition belong to the initially written sequence
                if (value.f1 - recordsInEachPartition < 0) {
                    throw new RuntimeException("test failed; consumed a record that was previously written: " + value);
                }
            }
        }).setParallelism(1).addSink(new DiscardingSink<>());
        Thread consumeThread = new Thread(new Runnable() {

            @Override
            public void run() {
                try {
                    env.execute(consumeExtraRecordsJobName);
                } catch (Throwable t) {
                    // the job is cancelled on purpose below, so a cancellation is not an error
                    if (!(t.getCause() instanceof JobCancellationException)) {
                        error.set(t);
                    }
                }
            }
        });
        consumeThread.start();
        // wait until the consuming job has started, to be extra safe
        JobManagerCommunicationUtils.waitUntilJobIsRunning(flink.getLeaderGateway(timeout), consumeExtraRecordsJobName);
        // setup the extra records writing job
        final StreamExecutionEnvironment env2 = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort);
        DataStream<Tuple2<Integer, Integer>> extraRecordsStream = env2.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

            // volatile: cancel() may be invoked from a different thread than run()
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                // the extra records should start from the last written value
                int count = recordsInEachPartition;
                int partition = getRuntimeContext().getIndexOfThisSubtask();
                while (running && count < recordsInEachPartition + extraRecordsInEachPartition) {
                    ctx.collect(new Tuple2<>(partition, count));
                    count++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).setParallelism(parallelism);
        kafkaServer.produceIntoKafka(extraRecordsStream, topicName, serSchema, readProps, null);
        try {
            env2.execute(writeExtraRecordsJobName);
        } catch (Exception e) {
            throw new RuntimeException("Writing extra records failed", e);
        }
        // cancel the consume job after all extra records are written
        JobManagerCommunicationUtils.cancelCurrentJob(flink.getLeaderGateway(timeout), consumeExtraRecordsJobName);
        consumeThread.join();
    } finally {
        // release the offset handler even if one of the steps above failed
        kafkaOffsetHandler.close();
    }
    deleteTestTopic(topicName);
    // check whether the consuming thread threw any test errors;
    // test will fail here if the consume job had incorrectly read any records other than the extra records
    final Throwable consumerError = error.get();
    if (consumerError != null) {
        throw new Exception("Exception in the consuming thread", consumerError);
    }
}
Also used : KeyedSerializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Properties(java.util.Properties) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) KeyedDeserializationSchemaWrapper(org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper) AtomicReference(java.util.concurrent.atomic.AtomicReference) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) RetryOnException(org.apache.flink.testutils.junit.RetryOnException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) SuccessException(org.apache.flink.test.util.SuccessException) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) JobCancellationException(org.apache.flink.runtime.client.JobCancellationException) IOException(java.io.IOException) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Aggregations

Properties (java.util.Properties)8 AtomicReference (java.util.concurrent.atomic.AtomicReference)8 KeyedDeserializationSchemaWrapper (org.apache.flink.streaming.util.serialization.KeyedDeserializationSchemaWrapper)8 HashMap (java.util.HashMap)6 UnregisteredMetricsGroup (org.apache.flink.metrics.groups.UnregisteredMetricsGroup)6 KafkaConsumerThread (org.apache.flink.streaming.connectors.kafka.internal.KafkaConsumerThread)6 KafkaTopicPartition (org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition)6 TestProcessingTimeService (org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService)6 SimpleStringSchema (org.apache.flink.streaming.util.serialization.SimpleStringSchema)6 ConsumerRecords (org.apache.kafka.clients.consumer.ConsumerRecords)6 Test (org.junit.Test)6 Mockito.anyLong (org.mockito.Mockito.anyLong)6 InvocationOnMock (org.mockito.invocation.InvocationOnMock)6 PrepareForTest (org.powermock.core.classloader.annotations.PrepareForTest)6 MultiShotLatch (org.apache.flink.core.testutils.MultiShotLatch)4 Handover (org.apache.flink.streaming.connectors.kafka.internal.Handover)4 TopicPartition (org.apache.kafka.common.TopicPartition)4 Kafka010Fetcher (org.apache.flink.streaming.connectors.kafka.internal.Kafka010Fetcher)3 Kafka09Fetcher (org.apache.flink.streaming.connectors.kafka.internal.Kafka09Fetcher)3 IOException (java.io.IOException)2