Example 6 with TaskCorruptedException

Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.

From the class ProcessorStateManager, method initializeStoreOffsetsFromCheckpoint.

// package-private for test only
void initializeStoreOffsetsFromCheckpoint(final boolean storeDirIsEmpty) {
    try {
        final Map<TopicPartition, Long> loadedCheckpoints = checkpointFile.read();
        log.trace("Loaded offsets from the checkpoint file: {}", loadedCheckpoints);
        for (final StateStoreMetadata store : stores.values()) {
            if (store.corrupted) {
                log.error("Tried to initialize store offsets for corrupted store {}", store);
                throw new IllegalStateException("Should not initialize offsets for a corrupted task");
            }
            if (store.changelogPartition == null) {
                log.info("State store {} is not logged and hence would not be restored", store.stateStore.name());
            } else if (!store.stateStore.persistent()) {
                log.info("Initializing to the starting offset for changelog {} of in-memory state store {}", store.changelogPartition, store.stateStore.name());
            } else if (store.offset() == null) {
                if (loadedCheckpoints.containsKey(store.changelogPartition)) {
                    final Long offset = changelogOffsetFromCheckpointedOffset(loadedCheckpoints.remove(store.changelogPartition));
                    store.setOffset(offset);
                    log.debug("State store {} initialized from checkpoint with offset {} at changelog {}", store.stateStore.name(), store.offset, store.changelogPartition);
                } else {
                    // with EOS, if the previous run did not shut down gracefully, we may have
                    // lost the checkpoint file, and hence cannot be sure the current local state
                    // only contains committed data; in that case we need to treat it as a
                    // task-corrupted exception
                    if (eosEnabled && !storeDirIsEmpty) {
                        log.warn("State store {} did not find checkpoint offsets while stores are not empty, " +
                            "since under EOS it has the risk of getting uncommitted data in stores we have to " +
                            "treat it as a task corruption error and wipe out the local state of task {} " +
                            "before re-bootstrapping", store.stateStore.name(), taskId);
                        throw new TaskCorruptedException(Collections.singleton(taskId));
                    } else {
                        log.info("State store {} did not find checkpoint offset, hence would " +
                            "default to the starting offset at changelog {}", store.stateStore.name(), store.changelogPartition);
                    }
                    }
                }
            } else {
                loadedCheckpoints.remove(store.changelogPartition);
                log.debug("Skipping re-initialization of offset from checkpoint for recycled store {}", store.stateStore.name());
            }
        }
        if (!loadedCheckpoints.isEmpty()) {
            log.warn("Some loaded checkpoint offsets cannot find their corresponding state stores: {}", loadedCheckpoints);
        }
        if (eosEnabled) {
            checkpointFile.delete();
        }
    } catch (final TaskCorruptedException e) {
        throw e;
    } catch (final IOException | RuntimeException e) {
        // both IOException and runtime exceptions (e.g. from number parsing) can be thrown here
        throw new ProcessorStateException(format("%sError loading and deleting checkpoint file when creating the state manager", logPrefix), e);
    }
}
Also used : TaskCorruptedException(org.apache.kafka.streams.errors.TaskCorruptedException) TopicPartition(org.apache.kafka.common.TopicPartition) IOException(java.io.IOException) ProcessorStateException(org.apache.kafka.streams.errors.ProcessorStateException)
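
The snippet above is the canonical producer of this exception: a missing checkpoint under EOS means the local state may contain uncommitted data, so the whole task is flagged for wiping. As a minimal, self-contained sketch of throwing and inspecting the exception (the TaskId values are made up for illustration):

import java.util.Collections;
import java.util.Set;

import org.apache.kafka.streams.errors.TaskCorruptedException;
import org.apache.kafka.streams.processor.TaskId;

public class TaskCorruptedDemo {
    public static void main(final String[] args) {
        final TaskId taskId = new TaskId(0, 1); // subtopology 0, partition 1 (illustrative)
        try {
            // mirror what initializeStoreOffsetsFromCheckpoint does on a missing checkpoint
            throw new TaskCorruptedException(Collections.singleton(taskId));
        } catch (final TaskCorruptedException e) {
            // the set of task ids whose local state must be wiped and re-bootstrapped
            final Set<TaskId> corrupted = e.corruptedTasks();
            System.out.println("Corrupted tasks: " + corrupted);
        }
    }
}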

Example 7 with TaskCorruptedException

Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.

From the class StreamThread, method runLoop.

/**
 * Main event loop for polling, and processing records through topologies.
 *
 * @throws IllegalStateException If store gets registered after initialized is already finished
 * @throws StreamsException      if the store's change log does not contain the partition
 */
// Needed to include StreamsConfig.EXACTLY_ONCE_BETA in error log for UnsupportedVersionException
@SuppressWarnings("deprecation")
boolean runLoop() {
    subscribeConsumer();
    // if the thread is still in the middle of a rebalance, we should keep polling
    // until the rebalance is completed before we close and commit the tasks
    while (isRunning() || taskManager.isRebalanceInProgress()) {
        try {
            checkForTopologyUpdates();
            // stop polling regardless of the rebalance status since we know there are no tasks left
            if (!isRunning() && topologyMetadata.isEmpty()) {
                log.info("Shutting down thread with empty topology.");
                break;
            }
            maybeSendShutdown();
            final long size = cacheResizeSize.getAndSet(-1L);
            if (size != -1L) {
                cacheResizer.accept(size);
            }
            runOnce();
            if (nextProbingRebalanceMs.get() < time.milliseconds()) {
                log.info("Triggering the followup rebalance scheduled for {} ms.", nextProbingRebalanceMs.get());
                mainConsumer.enforceRebalance();
                nextProbingRebalanceMs.set(Long.MAX_VALUE);
            }
        } catch (final TaskCorruptedException e) {
            log.warn("Detected the states of tasks " + e.corruptedTasks() + " are corrupted. " + "Will close the task as dirty and re-create and bootstrap from scratch.", e);
            try {
                // check if any active task got corrupted; if so, we will trigger
                // a rebalance once the task corruptions have been handled
                final boolean enforceRebalance = taskManager.handleCorruption(e.corruptedTasks());
                if (enforceRebalance && eosEnabled) {
                    log.info("Active task(s) got corrupted. Triggering a rebalance.");
                    mainConsumer.enforceRebalance();
                }
            } catch (final TaskMigratedException taskMigrated) {
                handleTaskMigrated(taskMigrated);
            }
        } catch (final TaskMigratedException e) {
            handleTaskMigrated(e);
        } catch (final UnsupportedVersionException e) {
            final String errorMessage = e.getMessage();
            if (errorMessage != null && errorMessage.startsWith("Broker unexpectedly doesn't support requireStable flag on version ")) {
                log.error("Shutting down because the Kafka cluster seems to be on a too old version. " + "Setting {}=\"{}\"/\"{}\" requires broker version 2.5 or higher.", StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2, StreamsConfig.EXACTLY_ONCE_BETA);
            }
            failedStreamThreadSensor.record();
            this.streamsUncaughtExceptionHandler.accept(new StreamsException(e), false);
            return false;
        } catch (final StreamsException e) {
            throw e;
        } catch (final Exception e) {
            throw new StreamsException(e);
        }
    }
    return true;
}
Also used : TaskCorruptedException(org.apache.kafka.streams.errors.TaskCorruptedException) StreamsException(org.apache.kafka.streams.errors.StreamsException) KafkaException(org.apache.kafka.common.KafkaException) TaskMigratedException(org.apache.kafka.streams.errors.TaskMigratedException) UnsupportedVersionException(org.apache.kafka.common.errors.UnsupportedVersionException) InvalidOffsetException(org.apache.kafka.clients.consumer.InvalidOffsetException)
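
A usage note on the catch block above: runLoop recovers from TaskCorruptedException internally by closing the corrupted tasks dirty and re-bootstrapping them, so application code only sees errors the loop cannot handle, wrapped as StreamsException and routed to the uncaught exception handler. A minimal sketch of wiring such a handler, assuming placeholder topic names and config values:

import java.util.Properties;

import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.errors.StreamsUncaughtExceptionHandler.StreamThreadExceptionResponse;

public class UncaughtHandlerSketch {
    public static void main(final String[] args) {
        final StreamsBuilder builder = new StreamsBuilder();
        builder.stream("input-topic").to("output-topic"); // placeholder topics

        final Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "uncaught-handler-sketch"); // placeholder
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");       // placeholder

        final KafkaStreams streams = new KafkaStreams(builder.build(), props);
        streams.setUncaughtExceptionHandler(exception -> {
            // TaskCorruptedException normally never reaches this handler; anything
            // that lands here was fatal to its stream thread
            return StreamThreadExceptionResponse.REPLACE_THREAD;
        });
        streams.start();
        Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
    }
}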

Example 8 with TaskCorruptedException

Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.

From the class TaskManagerTest, method shouldThrowTaskCorruptedExceptionForTimeoutExceptionOnCommitWithEosV2.

@Test
public void shouldThrowTaskCorruptedExceptionForTimeoutExceptionOnCommitWithEosV2() {
    setUpTaskManager(ProcessingMode.EXACTLY_ONCE_V2);
    final StreamsProducer producer = mock(StreamsProducer.class);
    expect(activeTaskCreator.threadProducer()).andReturn(producer).andReturn(producer);
    final Map<TopicPartition, OffsetAndMetadata> offsetsT00 = singletonMap(t1p0, new OffsetAndMetadata(0L, null));
    final Map<TopicPartition, OffsetAndMetadata> offsetsT01 = singletonMap(t1p1, new OffsetAndMetadata(1L, null));
    final Map<TopicPartition, OffsetAndMetadata> allOffsets = new HashMap<>(offsetsT00);
    allOffsets.putAll(offsetsT01);
    producer.commitTransaction(allOffsets, null);
    expectLastCall().andThrow(new TimeoutException("KABOOM!"));
    producer.commitTransaction(allOffsets, null);
    expectLastCall();
    final StateMachineTask task00 = new StateMachineTask(taskId00, taskId00Partitions, true);
    task00.setCommittableOffsetsAndMetadata(offsetsT00);
    final StateMachineTask task01 = new StateMachineTask(taskId01, taskId01Partitions, true);
    task01.setCommittableOffsetsAndMetadata(offsetsT01);
    final StateMachineTask task02 = new StateMachineTask(taskId02, taskId02Partitions, true);
    expect(consumer.groupMetadata()).andStubReturn(null);
    replay(producer, activeTaskCreator, consumer);
    task00.setCommitNeeded();
    task01.setCommitNeeded();
    final TaskCorruptedException exception = assertThrows(TaskCorruptedException.class, () -> taskManager.commit(mkSet(task00, task01, task02)));
    assertThat(exception.corruptedTasks(), equalTo(mkSet(taskId00, taskId01)));
}
Also used : TaskCorruptedException(org.apache.kafka.streams.errors.TaskCorruptedException) HashMap(java.util.HashMap) TopicPartition(org.apache.kafka.common.TopicPartition) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) TimeoutException(org.apache.kafka.common.errors.TimeoutException) Test(org.junit.Test)
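
The design choice this test pins down: with EOS-v2 a TimeoutException from a transactional commit leaves the outcome unknown, so instead of retrying, the tasks whose offsets were part of the transaction are reported as corrupted and rebuilt from the changelog. A reduced sketch of that mapping; commitOrMarkCorrupted and its arguments are illustrative, not part of the real TaskManager:

import java.util.Set;

import org.apache.kafka.common.errors.TimeoutException;
import org.apache.kafka.streams.errors.TaskCorruptedException;
import org.apache.kafka.streams.processor.TaskId;

final class EosCommitSketch {
    // under EOS a timed-out transactional commit may or may not have gone through,
    // so the safe reaction is to wipe the involved tasks' state and re-bootstrap
    static void commitOrMarkCorrupted(final Runnable commitTransaction, final Set<TaskId> taskIds) {
        try {
            commitTransaction.run();
        } catch (final TimeoutException timeout) {
            throw new TaskCorruptedException(taskIds);
        }
    }
}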

Example 9 with TaskCorruptedException

Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.

From the class TaskManager, method handleCorruption.

/**
 * @throws TaskMigratedException
 */
boolean handleCorruption(final Set<TaskId> corruptedTasks) {
    final Set<Task> corruptedActiveTasks = new HashSet<>();
    final Set<Task> corruptedStandbyTasks = new HashSet<>();
    for (final TaskId taskId : corruptedTasks) {
        final Task task = tasks.task(taskId);
        if (task.isActive()) {
            corruptedActiveTasks.add(task);
        } else {
            corruptedStandbyTasks.add(task);
        }
    }
    // Make sure to clean up any corrupted standby tasks in their entirety before committing
    // since TaskMigrated can be thrown and the resulting handleLostAll will only clean up active tasks
    closeDirtyAndRevive(corruptedStandbyTasks, true);
    // We need to commit before closing the corrupted active tasks since this will force the ongoing txn to abort
    try {
        final Collection<Task> tasksToCommit = tasks().values().stream().filter(t -> t.state() == Task.State.RUNNING || t.state() == Task.State.RESTORING).filter(t -> !corruptedTasks.contains(t.id())).collect(Collectors.toSet());
        commitTasksAndMaybeUpdateCommittableOffsets(tasksToCommit, new HashMap<>());
    } catch (final TaskCorruptedException e) {
        log.info("Some additional tasks were found corrupted while trying to commit, these will be added to the " + "tasks to clean and revive: {}", e.corruptedTasks());
        corruptedActiveTasks.addAll(tasks.tasks(e.corruptedTasks()));
    } catch (final TimeoutException e) {
        log.info("Hit TimeoutException when committing all non-corrupted tasks, these will be closed and revived");
        final Collection<Task> uncorruptedTasks = new HashSet<>(tasks.activeTasks());
        uncorruptedTasks.removeAll(corruptedActiveTasks);
        // Those tasks which just timed out can just be closed dirty without marking changelogs as corrupted
        closeDirtyAndRevive(uncorruptedTasks, false);
    }
    closeDirtyAndRevive(corruptedActiveTasks, true);
    return !corruptedActiveTasks.isEmpty();
}
Also used : TaskCorruptedException(org.apache.kafka.streams.errors.TaskCorruptedException) TaskMigratedException(org.apache.kafka.streams.errors.TaskMigratedException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) TaskId(org.apache.kafka.streams.processor.TaskId) State(org.apache.kafka.streams.processor.internals.Task.State) Set(java.util.Set) HashSet(java.util.HashSet) Collection(java.util.Collection) HashMap(java.util.HashMap) Collectors(java.util.stream.Collectors)
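
The commit filter in the middle of handleCorruption is worth reading in isolation: commit only tasks that are RUNNING or RESTORING and not themselves corrupted, because the commit forces any ongoing EOS transaction to complete before the corrupted tasks are closed dirty. A standalone sketch of the same filter with a simplified Task stand-in (the nested class and enum are illustrative, not the internal Task interface):

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class CommitFilterSketch {
    enum State { CREATED, RESTORING, RUNNING, SUSPENDED, CLOSED }

    static final class Task {
        final String id;
        final State state;
        Task(final String id, final State state) { this.id = id; this.state = state; }
        @Override public String toString() { return id; }
    }

    // same shape as the filter inside handleCorruption
    static Set<Task> tasksToCommit(final List<Task> all, final Set<String> corruptedIds) {
        return all.stream()
            .filter(t -> t.state == State.RUNNING || t.state == State.RESTORING)
            .filter(t -> !corruptedIds.contains(t.id))
            .collect(Collectors.toSet());
    }

    public static void main(final String[] args) {
        final List<Task> all = Arrays.asList(
            new Task("0_0", State.RUNNING),
            new Task("0_1", State.RESTORING),
            new Task("1_0", State.CREATED));
        // 0_1 is corrupted and 1_0 is not yet running, so only 0_0 qualifies
        System.out.println(tasksToCommit(all, new HashSet<>(Arrays.asList("0_1"))));
    }
}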

Example 10 with TaskCorruptedException

Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.

From the class TaskManager, method commit.

/**
 * @throws TaskMigratedException if committing offsets failed (non-EOS)
 *                               or if the task producer got fenced (EOS)
 * @throws TimeoutException if task.timeout.ms has been exceeded (non-EOS)
 * @throws TaskCorruptedException if committing offsets failed due to TimeoutException (EOS)
 * @return number of committed offsets, or -1 if we are in the middle of a rebalance and cannot commit
 */
int commit(final Collection<Task> tasksToCommit) {
    int committed = 0;
    final Map<Task, Map<TopicPartition, OffsetAndMetadata>> consumedOffsetsAndMetadataPerTask = new HashMap<>();
    try {
        committed = commitTasksAndMaybeUpdateCommittableOffsets(tasksToCommit, consumedOffsetsAndMetadataPerTask);
    } catch (final TimeoutException timeoutException) {
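        // swallow the timeout here: each task tracks its own task.timeout.ms budget,
        // and maybeInitTaskTimeoutOrThrow only rethrows once that budget is exhausted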
        consumedOffsetsAndMetadataPerTask.keySet().forEach(t -> t.maybeInitTaskTimeoutOrThrow(time.milliseconds(), timeoutException));
    }
    return committed;
}
Also used : TaskCorruptedException(org.apache.kafka.streams.errors.TaskCorruptedException) TaskMigratedException(org.apache.kafka.streams.errors.TaskMigratedException) TimeoutException(org.apache.kafka.common.errors.TimeoutException) TopicPartition(org.apache.kafka.common.TopicPartition) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) Collection(java.util.Collection) Map(java.util.Map) HashMap(java.util.HashMap)
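
A follow-up on the swallowed TimeoutException above: it is not lost. Each task records it via maybeInitTaskTimeoutOrThrow and only escalates once its task.timeout.ms budget has elapsed (KIP-572). A simplified stand-alone version of that bookkeeping; TaskTimeoutTracker is illustrative, not the internal Task API:

import org.apache.kafka.common.errors.TimeoutException;
import org.apache.kafka.streams.errors.StreamsException;

public class TaskTimeoutTracker {
    private final long taskTimeoutMs;
    private long deadlineMs = -1L; // -1 means no timeout currently in progress

    public TaskTimeoutTracker(final long taskTimeoutMs) {
        this.taskTimeoutMs = taskTimeoutMs;
    }

    // the first timeout starts the clock; later timeouts only fail the task
    // once the task.timeout.ms budget has been exhausted
    public void maybeInitOrThrow(final long nowMs, final TimeoutException cause) {
        if (deadlineMs == -1L) {
            deadlineMs = nowMs + taskTimeoutMs;
        } else if (nowMs > deadlineMs) {
            throw new StreamsException("task.timeout.ms exceeded", cause);
        }
    }

    // call after a successful commit to clear any in-progress timeout
    public void reset() {
        deadlineMs = -1L;
    }
}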

Aggregations

TaskCorruptedException (org.apache.kafka.streams.errors.TaskCorruptedException) 17
TaskId (org.apache.kafka.streams.processor.TaskId) 11
TopicPartition (org.apache.kafka.common.TopicPartition) 10
LogContext (org.apache.kafka.common.utils.LogContext) 9
Test (org.junit.Test) 9
TimeoutException (org.apache.kafka.common.errors.TimeoutException) 8
LinkedList (java.util.LinkedList) 7
StreamsException (org.apache.kafka.streams.errors.StreamsException) 7
StreamsMetricsImpl (org.apache.kafka.streams.processor.internals.metrics.StreamsMetricsImpl) 7
HashMap (java.util.HashMap) 6
TaskMigratedException (org.apache.kafka.streams.errors.TaskMigratedException) 6
HashSet (java.util.HashSet) 5
Set (java.util.Set) 5
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 5
AtomicLong (java.util.concurrent.atomic.AtomicLong) 5
ConsumerGroupMetadata (org.apache.kafka.clients.consumer.ConsumerGroupMetadata) 5
OffsetAndMetadata (org.apache.kafka.clients.consumer.OffsetAndMetadata) 5
KafkaException (org.apache.kafka.common.KafkaException) 5
ArrayList (java.util.ArrayList) 4
Collection (java.util.Collection) 4