use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
the class ProcessorStateManager method initializeStoreOffsetsFromCheckpoint.
/**
 * Seeds the changelog offset of every registered state store from the checkpoint file.
 *
 * <p>For each store the outcome is one of:
 * <ul>
 *   <li>store has no changelog partition, or is not persistent: nothing is set, only logged;</li>
 *   <li>store already carries an offset (recycled store): its checkpoint entry is discarded;</li>
 *   <li>store has no offset and a checkpoint entry exists: the entry is consumed and converted via
 *       {@code changelogOffsetFromCheckpointedOffset} into the store's offset;</li>
 *   <li>store has no offset and no checkpoint entry: either the task is declared corrupted
 *       (EOS enabled and store directory not empty) or the store falls back to the starting offset.</li>
 * </ul>
 * When EOS is enabled the checkpoint file is deleted once all stores have been visited.
 *
 * @param storeDirIsEmpty whether the task's store directory was empty; only consulted when a
 *                        checkpoint entry is missing
 * @throws TaskCorruptedException  if EOS is enabled, the store directory is not empty and a persistent,
 *                                 logged store without an offset has no checkpoint entry
 * @throws ProcessorStateException wrapping any {@link IOException} or other {@link RuntimeException}
 *                                 raised while loading or deleting the checkpoint file, including the
 *                                 {@link IllegalStateException} raised for a store flagged as corrupted
 */
// package-private for test only
void initializeStoreOffsetsFromCheckpoint(final boolean storeDirIsEmpty) {
    try {
        final Map<TopicPartition, Long> checkpointedOffsets = checkpointFile.read();
        log.trace("Loaded offsets from the checkpoint file: {}", checkpointedOffsets);

        for (final StateStoreMetadata metadata : stores.values()) {
            // a corrupted store must have been wiped before we ever get here
            if (metadata.corrupted) {
                log.error("Tried to initialize store offsets for corrupted store {}", metadata);
                throw new IllegalStateException("Should not initialize offsets for a corrupted task");
            }

            // stores without a changelog have nothing to restore from
            if (metadata.changelogPartition == null) {
                log.info("State store {} is not logged and hence would not be restored", metadata.stateStore.name());
                continue;
            }

            // in-memory stores leave their offset untouched
            if (!metadata.stateStore.persistent()) {
                log.info("Initializing to the starting offset for changelog {} of in-memory state store {}",
                    metadata.changelogPartition, metadata.stateStore.name());
                continue;
            }

            // recycled store: keep the offset it already has, but consume its checkpoint entry
            // so it is not reported as orphaned below
            if (metadata.offset() != null) {
                checkpointedOffsets.remove(metadata.changelogPartition);
                log.debug("Skipping re-initialization of offset from checkpoint for recycled store {}",
                    metadata.stateStore.name());
                continue;
            }

            if (checkpointedOffsets.containsKey(metadata.changelogPartition)) {
                final Long restoredOffset =
                    changelogOffsetFromCheckpointedOffset(checkpointedOffsets.remove(metadata.changelogPartition));
                metadata.setOffset(restoredOffset);
                log.debug("State store {} initialized from checkpoint with offset {} at changelog {}",
                    metadata.stateStore.name(), metadata.offset, metadata.changelogPartition);
                continue;
            }

            // No checkpoint entry for this store. Under EOS a non-empty store directory may hold
            // uncommitted data, so in that case we need to treat it as a task-corrupted exception.
            if (eosEnabled && !storeDirIsEmpty) {
                log.warn("State store {} did not find checkpoint offsets while stores are not empty, " + "since under EOS it has the risk of getting uncommitted data in stores we have to " + "treat it as a task corruption error and wipe out the local state of task {} " + "before re-bootstrapping",
                    metadata.stateStore.name(), taskId);
                throw new TaskCorruptedException(Collections.singleton(taskId));
            }

            log.info("State store {} did not find checkpoint offset, hence would " + "default to the starting offset at changelog {}",
                metadata.stateStore.name(), metadata.changelogPartition);
        }

        // whatever is left over belongs to no registered store
        if (!checkpointedOffsets.isEmpty()) {
            log.warn("Some loaded checkpoint offsets cannot find their corresponding state stores: {}", checkpointedOffsets);
        }

        if (eosEnabled) {
            checkpointFile.delete();
        }
    } catch (final TaskCorruptedException corruption) {
        // must reach the caller unwrapped, so rethrow before the generic RuntimeException handler
        throw corruption;
    } catch (final IOException | RuntimeException failure) {
        // both IOException or runtime exception like number parsing can throw
        throw new ProcessorStateException(format("%sError loading and deleting checkpoint file when creating the state manager", logPrefix), failure);
    }
}
use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
the class StreamThread method runLoop.
/**
 * Main event loop for polling, and processing records through topologies.
 *
 * <p>Each iteration checks for topology updates, applies a pending cache resize, calls
 * {@code runOnce()} and enforces a rebalance when the scheduled probing-rebalance time has passed.
 * Task corruption and task migration are handled inside the loop; any other exception ends it.
 *
 * @return {@code true} when the loop ends normally; {@code false} when an
 *         {@code UnsupportedVersionException} was handed to the uncaught exception handler
 * @throws IllegalStateException If store gets registered after initialized is already finished
 * @throws StreamsException if the store's change log does not contain the partition, or wrapping
 *                          any other non-{@code StreamsException} failure raised in the loop
 */
// Needed to include StreamsConfig.EXACTLY_ONCE_BETA in error log for UnsupportedVersionException
@SuppressWarnings("deprecation")
boolean runLoop() {
    subscribeConsumer();

    // keep looping while a rebalance is still in progress, even if the thread is no longer running,
    // until the rebalance is completed before we close and commit the tasks
    while (isRunning() || taskManager.isRebalanceInProgress()) {
        try {
            checkForTopologyUpdates();

            // if the thread is no longer running and the topology is empty, we can
            // stop polling regardless of the rebalance status since we know there are no tasks left
            final boolean stoppedWithEmptyTopology = !isRunning() && topologyMetadata.isEmpty();
            if (stoppedWithEmptyTopology) {
                log.info("Shutting down thread with empty topology.");
                break;
            }

            maybeSendShutdown();

            // -1 is the "no resize pending" marker; reading also resets the request
            final long requestedCacheSize = cacheResizeSize.getAndSet(-1L);
            if (requestedCacheSize != -1L) {
                cacheResizer.accept(requestedCacheSize);
            }

            runOnce();

            if (nextProbingRebalanceMs.get() < time.milliseconds()) {
                log.info("Triggering the followup rebalance scheduled for {} ms.", nextProbingRebalanceMs.get());
                mainConsumer.enforceRebalance();
                nextProbingRebalanceMs.set(Long.MAX_VALUE);
            }
        } catch (final TaskCorruptedException corruption) {
            log.warn("Detected the states of tasks " + corruption.corruptedTasks() + " are corrupted. " + "Will close the task as dirty and re-create and bootstrap from scratch.", corruption);
            try {
                // check if any active task got corrupted. We will trigger a rebalance in that case.
                // once the task corruptions have been handled
                final boolean activeTaskCorrupted = taskManager.handleCorruption(corruption.corruptedTasks());
                if (activeTaskCorrupted && eosEnabled) {
                    log.info("Active task(s) got corrupted. Triggering a rebalance.");
                    mainConsumer.enforceRebalance();
                }
            } catch (final TaskMigratedException migratedWhileHandling) {
                handleTaskMigrated(migratedWhileHandling);
            }
        } catch (final TaskMigratedException migrated) {
            handleTaskMigrated(migrated);
        } catch (final UnsupportedVersionException unsupportedVersion) {
            final String reason = unsupportedVersion.getMessage();
            final boolean requireStableUnsupported = reason != null
                && reason.startsWith("Broker unexpectedly doesn't support requireStable flag on version ");
            if (requireStableUnsupported) {
                log.error("Shutting down because the Kafka cluster seems to be on a too old version. " + "Setting {}=\"{}\"/\"{}\" requires broker version 2.5 or higher.",
                    StreamsConfig.PROCESSING_GUARANTEE_CONFIG,
                    StreamsConfig.EXACTLY_ONCE_V2,
                    StreamsConfig.EXACTLY_ONCE_BETA);
            }
            failedStreamThreadSensor.record();
            this.streamsUncaughtExceptionHandler.accept(new StreamsException(unsupportedVersion), false);
            return false;
        } catch (final StreamsException alreadyWrapped) {
            // rethrow as-is so it is not wrapped a second time by the generic handler below
            throw alreadyWrapped;
        } catch (final Exception unexpected) {
            throw new StreamsException(unexpected);
        }
    }
    return true;
}
use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
the class TaskManagerTest method shouldThrowTaskCorruptedExceptionForTimeoutExceptionOnCommitWithEosV2.
/**
 * With exactly-once v2, a {@code TimeoutException} from the producer's transaction commit must
 * surface from {@code TaskManager#commit} as a {@code TaskCorruptedException} naming exactly the
 * tasks that had a commit pending (task00 and task01, but not task02).
 */
@Test
public void shouldThrowTaskCorruptedExceptionForTimeoutExceptionOnCommitWithEosV2() {
    setUpTaskManager(ProcessingMode.EXACTLY_ONCE_V2);

    final StreamsProducer threadProducer = mock(StreamsProducer.class);
    expect(activeTaskCreator.threadProducer()).andReturn(threadProducer).andReturn(threadProducer);

    // per-task committable offsets, plus the union expected in the single transactional commit
    final Map<TopicPartition, OffsetAndMetadata> firstTaskOffsets = singletonMap(t1p0, new OffsetAndMetadata(0L, null));
    final Map<TopicPartition, OffsetAndMetadata> secondTaskOffsets = singletonMap(t1p1, new OffsetAndMetadata(1L, null));
    final Map<TopicPartition, OffsetAndMetadata> combinedOffsets = new HashMap<>();
    combinedOffsets.putAll(firstTaskOffsets);
    combinedOffsets.putAll(secondTaskOffsets);

    // first commit attempt times out, a second one is recorded as succeeding
    threadProducer.commitTransaction(combinedOffsets, null);
    expectLastCall().andThrow(new TimeoutException("KABOOM!"));
    threadProducer.commitTransaction(combinedOffsets, null);
    expectLastCall();

    final StateMachineTask firstTask = new StateMachineTask(taskId00, taskId00Partitions, true);
    firstTask.setCommittableOffsetsAndMetadata(firstTaskOffsets);
    final StateMachineTask secondTask = new StateMachineTask(taskId01, taskId01Partitions, true);
    secondTask.setCommittableOffsetsAndMetadata(secondTaskOffsets);
    // third task is part of the commit call but is never marked as needing a commit
    final StateMachineTask idleTask = new StateMachineTask(taskId02, taskId02Partitions, true);

    expect(consumer.groupMetadata()).andStubReturn(null);
    replay(threadProducer, activeTaskCreator, consumer);

    firstTask.setCommitNeeded();
    secondTask.setCommitNeeded();

    final TaskCorruptedException thrown = assertThrows(
        TaskCorruptedException.class,
        () -> taskManager.commit(mkSet(firstTask, secondTask, idleTask))
    );

    assertThat(thrown.corruptedTasks(), equalTo(mkSet(taskId00, taskId01)));
}
use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
the class TaskManager method handleCorruption.
/**
 * Closes the given corrupted tasks dirty and revives them, committing the remaining healthy
 * tasks in between.
 *
 * <p>Order of operations: corrupted standby tasks are closed and revived first, then all
 * RUNNING/RESTORING tasks that are not in {@code corruptedTasks} are committed, and finally the
 * corrupted active tasks (including any found corrupted during that commit) are closed and revived.
 *
 * @param corruptedTasks ids of the tasks reported as corrupted
 * @return {@code true} if at least one active task was closed and revived as corrupted
 * @throws TaskMigratedException may propagate from committing the non-corrupted tasks
 *                               (per the inline note below; the throwing call is not visible here)
 */
boolean handleCorruption(final Set<TaskId> corruptedTasks) {
    final Set<Task> activeToRevive = new HashSet<>();
    final Set<Task> standbyToRevive = new HashSet<>();

    // split the corrupted tasks by role
    for (final TaskId corruptedId : corruptedTasks) {
        final Task corrupted = tasks.task(corruptedId);
        (corrupted.isActive() ? activeToRevive : standbyToRevive).add(corrupted);
    }

    // Make sure to clean up any corrupted standby tasks in their entirety before committing
    // since TaskMigrated can be thrown and the resulting handleLostAll will only clean up active tasks
    closeDirtyAndRevive(standbyToRevive, true);

    // We need to commit before closing the corrupted active tasks since this will force the ongoing txn to abort
    try {
        final Collection<Task> healthyTasks = new HashSet<>();
        for (final Task candidate : tasks().values()) {
            final boolean runningOrRestoring =
                candidate.state() == Task.State.RUNNING || candidate.state() == Task.State.RESTORING;
            if (runningOrRestoring && !corruptedTasks.contains(candidate.id())) {
                healthyTasks.add(candidate);
            }
        }
        commitTasksAndMaybeUpdateCommittableOffsets(healthyTasks, new HashMap<>());
    } catch (final TaskCorruptedException moreCorruption) {
        log.info("Some additional tasks were found corrupted while trying to commit, these will be added to the " + "tasks to clean and revive: {}", moreCorruption.corruptedTasks());
        activeToRevive.addAll(tasks.tasks(moreCorruption.corruptedTasks()));
    } catch (final TimeoutException commitTimedOut) {
        log.info("Hit TimeoutException when committing all non-corrupted tasks, these will be closed and revived");
        final Collection<Task> timedOutTasks = new HashSet<>(tasks.activeTasks());
        timedOutTasks.removeAll(activeToRevive);
        // Those tasks which just timed out can just be closed dirty without marking changelogs as corrupted
        closeDirtyAndRevive(timedOutTasks, false);
    }

    closeDirtyAndRevive(activeToRevive, true);
    return !activeToRevive.isEmpty();
}
use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
the class TaskManager method commit.
/**
 * Commits the given tasks.
 *
 * <p>If the commit hits a {@code TimeoutException}, every task that was recorded in the
 * per-task offsets map gets {@code maybeInitTaskTimeoutOrThrow} invoked with the current time and
 * that exception; if none of those calls throws, the timeout is absorbed and 0 is returned.
 *
 * @param tasksToCommit the tasks to attempt to commit
 * @throws TaskMigratedException if committing offsets failed (non-EOS)
 *                               or if the task producer got fenced (EOS)
 * @throws TimeoutException if task.timeout.ms has been exceeded (non-EOS)
 * @throws TaskCorruptedException if committing offsets failed due to TimeoutException (EOS)
 * @return number of committed offsets, or -1 if we are in the middle of a rebalance and cannot commit
 */
int commit(final Collection<Task> tasksToCommit) {
    // filled in by the commit call below; its keys are the tasks consulted on timeout
    final Map<Task, Map<TopicPartition, OffsetAndMetadata>> offsetsPerTask = new HashMap<>();

    try {
        return commitTasksAndMaybeUpdateCommittableOffsets(tasksToCommit, offsetsPerTask);
    } catch (final TimeoutException commitTimeout) {
        for (final Task pendingTask : offsetsPerTask.keySet()) {
            // the clock is read per task, not once up front
            pendingTask.maybeInitTaskTimeoutOrThrow(time.milliseconds(), commitTimeout);
        }
        // nothing was committed on this attempt
        return 0;
    }
}
Aggregations