Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
The class TaskExecutor, method commitOffsetsOrTransaction.
/**
 * Caution: do not invoke this directly if it's possible a rebalance is occurring, as the commit will fail. If
 * this is a possibility, prefer the {@link #commitTasksAndMaybeUpdateCommittableOffsets} instead.
 *
 * @throws TaskMigratedException if committing offsets failed due to CommitFailedException (non-EOS)
 * @throws TimeoutException if committing offsets failed due to TimeoutException (non-EOS)
 * @throws TaskCorruptedException if committing offsets failed due to TimeoutException (EOS)
 */
void commitOffsetsOrTransaction(final Map<Task, Map<TopicPartition, OffsetAndMetadata>> offsetsPerTask) {
    // avoid logging actual Task objects
    log.debug("Committing task offsets {}",
        offsetsPerTask.entrySet().stream().collect(Collectors.toMap(t -> t.getKey().id(), Entry::getValue)));

    final Set<TaskId> corruptedTasks = new HashSet<>();

    if (!offsetsPerTask.isEmpty()) {
        if (processingMode == EXACTLY_ONCE_ALPHA) {
            for (final Map.Entry<Task, Map<TopicPartition, OffsetAndMetadata>> taskToCommit : offsetsPerTask.entrySet()) {
                final Task task = taskToCommit.getKey();
                try {
                    tasks.streamsProducerForTask(task.id())
                        .commitTransaction(taskToCommit.getValue(), tasks.mainConsumer().groupMetadata());
                    updateTaskCommitMetadata(taskToCommit.getValue());
                } catch (final TimeoutException timeoutException) {
                    log.error(String.format("Committing task %s failed.", task.id()), timeoutException);
                    corruptedTasks.add(task.id());
                }
            }
        } else {
            final Map<TopicPartition, OffsetAndMetadata> allOffsets = offsetsPerTask.values().stream()
                .flatMap(e -> e.entrySet().stream())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

            if (processingMode == EXACTLY_ONCE_V2) {
                try {
                    tasks.threadProducer().commitTransaction(allOffsets, tasks.mainConsumer().groupMetadata());
                    updateTaskCommitMetadata(allOffsets);
                } catch (final TimeoutException timeoutException) {
                    log.error(String.format("Committing task(s) %s failed.",
                        offsetsPerTask.keySet().stream().map(t -> t.id().toString()).collect(Collectors.joining(", "))),
                        timeoutException);
                    offsetsPerTask.keySet().forEach(task -> corruptedTasks.add(task.id()));
                }
            } else {
                try {
                    tasks.mainConsumer().commitSync(allOffsets);
                    updateTaskCommitMetadata(allOffsets);
                } catch (final CommitFailedException error) {
                    throw new TaskMigratedException("Consumer committing offsets failed, " +
                        "indicating the corresponding thread is no longer part of the group", error);
                } catch (final TimeoutException timeoutException) {
                    log.error(String.format("Committing task(s) %s failed.",
                        offsetsPerTask.keySet().stream().map(t -> t.id().toString()).collect(Collectors.joining(", "))),
                        timeoutException);
                    throw timeoutException;
                } catch (final KafkaException error) {
                    throw new StreamsException("Error encountered committing offsets via consumer", error);
                }
            }
        }

        if (!corruptedTasks.isEmpty()) {
            throw new TaskCorruptedException(corruptedTasks);
        }
    }
}
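As a usage note, here is a hedged sketch of how a caller might react to the TaskCorruptedException thrown above; the method and field names (commitAndHandleCorruption, taskExecutor, taskManager) are assumptions, not the actual StreamThread code. The corrupted task ids are handed back to the task manager so their local state can be wiped and restored from the changelogs, the same pattern the StreamThreadTest excerpt further down exercises.

// Sketch only: taskExecutor and taskManager are assumed collaborators of the caller.
void commitAndHandleCorruption(final Map<Task, Map<TopicPartition, OffsetAndMetadata>> offsetsPerTask) {
    try {
        taskExecutor.commitOffsetsOrTransaction(offsetsPerTask);
    } catch (final TaskCorruptedException e) {
        // revive the corrupted tasks: wipe their local state and restore it from the changelogs
        taskManager.handleCorruption(e.corruptedTasks());
    }
}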
Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
The class StoreChangelogReader, method restore.
// 1. if there are any registered changelogs that need initialization, try to initialize them first;
// 2. if all changelogs have finished, return early;
// 3. if there are any restoring changelogs, try to read from the restore consumer and process them.
@Override
public void restore(final Map<TaskId, Task> tasks) {
    initializeChangelogs(tasks, registeredChangelogs());

    if (!activeRestoringChangelogs().isEmpty() && state == ChangelogReaderState.STANDBY_UPDATING) {
        throw new IllegalStateException("Should not be in standby updating state if there are still un-completed active changelogs");
    }

    if (allChangelogsCompleted()) {
        log.debug("Finished restoring all changelogs {}", changelogs.keySet());
        return;
    }

    final Set<TopicPartition> restoringChangelogs = restoringChangelogs();
    if (!restoringChangelogs.isEmpty()) {
        final ConsumerRecords<byte[], byte[]> polledRecords;

        try {
            // for restoring active and updating standby we may prefer different poll time
            // in order to make sure we call the main consumer#poll in time.
            // TODO: once we move ChangelogReader to a separate thread this may no longer be a concern
            polledRecords = restoreConsumer.poll(state == ChangelogReaderState.STANDBY_UPDATING ? Duration.ZERO : pollTime);

            // TODO (?) If we cannot fetch records during restore, should we trigger `task.timeout.ms` ?
            // TODO (?) If we cannot fetch records for standby task, should we trigger `task.timeout.ms` ?
        } catch (final InvalidOffsetException e) {
            log.warn("Encountered " + e.getClass().getName() + " fetching records from restore consumer for partitions " +
                e.partitions() + ", it is likely that " +
                "the consumer's position has fallen out of the topic partition offset range because the topic was " +
                "truncated or compacted on the broker, marking the corresponding tasks as corrupted and re-initializing" +
                " it later.", e);

            final Set<TaskId> corruptedTasks = new HashSet<>();
            e.partitions().forEach(partition -> corruptedTasks.add(changelogs.get(partition).stateManager.taskId()));
            throw new TaskCorruptedException(corruptedTasks, e);
        } catch (final KafkaException e) {
            throw new StreamsException("Restore consumer get unexpected error polling records.", e);
        }

        for (final TopicPartition partition : polledRecords.partitions()) {
            bufferChangelogRecords(restoringChangelogByPartition(partition), polledRecords.records(partition));
        }

        for (final TopicPartition partition : restoringChangelogs) {
            // even if some partitions do not have any accumulated data, we still trigger
            // restoring since some changelogs may not need to restore anything at all, and the
            // restore-to-end check still needs to be executed.
            // TODO: we always try to restore as a batch when some records are accumulated, which may result in
            // small batches; this can be optimized in the future, e.g. wait longer for larger batches.
            final TaskId taskId = changelogs.get(partition).stateManager.taskId();
            try {
                if (restoreChangelog(changelogs.get(partition))) {
                    tasks.get(taskId).clearTaskTimeout();
                }
            } catch (final TimeoutException timeoutException) {
                tasks.get(taskId).maybeInitTaskTimeoutOrThrow(time.milliseconds(), timeoutException);
            }
        }

        maybeUpdateLimitOffsetsForStandbyChangelogs(tasks);
        maybeLogRestorationProgress();
    }
}
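Both constructor shapes of TaskCorruptedException seen in these excerpts are worth calling out: one takes only the set of corrupted TaskIds (used in StreamTask#process below), the other additionally wraps the InvalidOffsetException that triggered the corruption (used in restore() above). A minimal sketch of the two call shapes, with hypothetical local variable names:

// `corrupted` is a Set<TaskId>; `invalidOffsetException` is the consumer exception; both are hypothetical locals here.
if (invalidOffsetException == null) {
    // e.g. the EOS timeout path in StreamTask#process
    throw new TaskCorruptedException(corrupted);
} else {
    // e.g. the InvalidOffsetException path in StoreChangelogReader#restore above
    throw new TaskCorruptedException(corrupted, invalidOffsetException);
}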
Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
The class StreamTask, method process.
/**
 * Process one record.
 *
 * @return true if this method processes a record, false if it does not process a record.
 * @throws TaskMigratedException if the task producer got fenced (EOS only)
 */
@SuppressWarnings("unchecked")
public boolean process(final long wallClockTime) {
    if (record == null) {
        if (!isProcessable(wallClockTime)) {
            return false;
        }

        // get the next record to process
        record = partitionGroup.nextRecord(recordInfo, wallClockTime);

        // if there is no record to process, return immediately
        if (record == null) {
            return false;
        }
    }

    try {
        final TopicPartition partition = recordInfo.partition();

        if (!(record instanceof CorruptedRecord)) {
            doProcess(wallClockTime);
        }

        // update the consumed offset map after processing is done
        consumedOffsets.put(partition, record.offset());
        commitNeeded = true;

        // after processing this record, if its partition queue's buffered size has been
        // decreased to the threshold, we can then resume the consumption on this partition
        if (recordInfo.queue().size() == maxBufferedSize) {
            mainConsumer.resume(singleton(partition));
        }

        record = null;
    } catch (final TimeoutException timeoutException) {
        if (!eosEnabled) {
            throw timeoutException;
        } else {
            record = null;
            throw new TaskCorruptedException(Collections.singleton(id));
        }
    } catch (final StreamsException exception) {
        record = null;
        throw exception;
    } catch (final RuntimeException e) {
        final StreamsException error = new StreamsException(String.format(
            "Exception caught in process. taskId=%s, processor=%s, topic=%s, partition=%d, offset=%d, stacktrace=%s",
            id(),
            processorContext.currentNode().name(),
            record.topic(),
            record.partition(),
            record.offset(),
            getStacktraceString(e)), e);
        record = null;
        throw error;
    } finally {
        processorContext.setCurrentNode(null);
    }

    return true;
}
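Since process() returns true only when a record was actually handled, a caller can drain the buffered records in a simple loop. A minimal sketch, assuming `task` is a StreamTask and `time` an org.apache.kafka.common.utils.Time instance already in scope:

// keep processing until the task reports that nothing is processable right now
while (task.process(time.milliseconds())) {
    // a record was consumed; commitNeeded is now true until the next commit
}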
Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
The class ProcessorStateManagerTest, method shouldThrowTaskCorruptedWithoutPersistentStoreCheckpointAndNonEmptyDir.
@Test
public void shouldThrowTaskCorruptedWithoutPersistentStoreCheckpointAndNonEmptyDir() throws IOException {
    final long checkpointOffset = 10L;

    final Map<TopicPartition, Long> offsets = mkMap(
        mkEntry(persistentStorePartition, checkpointOffset),
        mkEntry(nonPersistentStorePartition, checkpointOffset),
        mkEntry(irrelevantPartition, 999L)
    );
    checkpoint.write(offsets);

    final ProcessorStateManager stateMgr = getStateManager(Task.TaskType.ACTIVE, true);

    try {
        stateMgr.registerStore(persistentStore, persistentStore.stateRestoreCallback, null);
        stateMgr.registerStore(persistentStoreTwo, persistentStoreTwo.stateRestoreCallback, null);
        stateMgr.registerStore(nonPersistentStore, nonPersistentStore.stateRestoreCallback, null);

        final TaskCorruptedException exception = assertThrows(
            TaskCorruptedException.class,
            () -> stateMgr.initializeStoreOffsetsFromCheckpoint(false)
        );
        assertEquals(Collections.singleton(taskId), exception.corruptedTasks());
    } finally {
        stateMgr.close();
    }
}
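For reference, the `checkpoint` object written at the top of the test is presumably an org.apache.kafka.streams.state.internals.OffsetCheckpoint over the task's checkpoint file. A minimal sketch of creating and writing one under that assumption; `taskStateDirectory` is a hypothetical File, the partitions reuse the test's fields, and mkMap/mkEntry are the usual org.apache.kafka.common.utils.Utils helpers:

// Assumption: mirrors the test's checkpoint.write(offsets) call in isolation.
final OffsetCheckpoint checkpoint = new OffsetCheckpoint(new File(taskStateDirectory, ".checkpoint"));
checkpoint.write(mkMap(
    mkEntry(persistentStorePartition, 10L),
    mkEntry(nonPersistentStorePartition, 10L)
));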
Use of org.apache.kafka.streams.errors.TaskCorruptedException in project kafka by apache.
The class StreamThreadTest, method shouldReinitializeRevivedTasksInAnyState.
@Test
public void shouldReinitializeRevivedTasksInAnyState() {
    final StreamThread thread = createStreamThread(CLIENT_ID, new StreamsConfig(configProps(false)), false);

    final String storeName = "store";
    final String storeChangelog = "stream-thread-test-store-changelog";
    final TopicPartition storeChangelogTopicPartition = new TopicPartition(storeChangelog, 1);

    internalTopologyBuilder.addSource(null, "name", null, null, null, topic1);

    final AtomicBoolean shouldThrow = new AtomicBoolean(false);
    final AtomicBoolean processed = new AtomicBoolean(false);
    internalTopologyBuilder.addProcessor("proc", () -> record -> {
        if (shouldThrow.get()) {
            throw new TaskCorruptedException(singleton(task1));
        } else {
            processed.set(true);
        }
    }, "name");
    internalTopologyBuilder.addStateStore(
        Stores.keyValueStoreBuilder(Stores.persistentKeyValueStore(storeName), Serdes.String(), Serdes.String()),
        "proc"
    );

    thread.setState(StreamThread.State.STARTING);
    thread.rebalanceListener().onPartitionsRevoked(Collections.emptySet());

    final Map<TaskId, Set<TopicPartition>> activeTasks = new HashMap<>();
    final List<TopicPartition> assignedPartitions = new ArrayList<>();

    // assign single partition
    assignedPartitions.add(t1p1);
    activeTasks.put(task1, Collections.singleton(t1p1));
    thread.taskManager().handleAssignment(activeTasks, emptyMap());

    final MockConsumer<byte[], byte[]> mockConsumer = (MockConsumer<byte[], byte[]>) thread.mainConsumer();
    mockConsumer.assign(assignedPartitions);
    mockConsumer.updateBeginningOffsets(mkMap(mkEntry(t1p1, 0L)));

    final MockConsumer<byte[], byte[]> restoreConsumer = (MockConsumer<byte[], byte[]>) thread.restoreConsumer();
    restoreConsumer.updateBeginningOffsets(mkMap(mkEntry(storeChangelogTopicPartition, 0L)));

    final MockAdminClient admin = (MockAdminClient) thread.adminClient();
    admin.updateEndOffsets(singletonMap(storeChangelogTopicPartition, 0L));

    thread.rebalanceListener().onPartitionsAssigned(assignedPartitions);

    // the first iteration completes the restoration
    thread.runOnce();
    assertThat(thread.activeTasks().size(), equalTo(1));

    // the second transits to running and unpauses the input
    thread.runOnce();

    // the third actually polls, processes the record, and throws the corruption exception
    addRecord(mockConsumer, 0L);
    shouldThrow.set(true);
    final TaskCorruptedException taskCorruptedException = assertThrows(TaskCorruptedException.class, thread::runOnce);

    // now, we can handle the corruption
    thread.taskManager().handleCorruption(taskCorruptedException.corruptedTasks());

    // again, complete the restoration
    thread.runOnce();

    // transit to running and unpause
    thread.runOnce();

    // process the record
    addRecord(mockConsumer, 0L);
    shouldThrow.set(false);
    assertThat(processed.get(), is(false));
    thread.runOnce();
    assertThat(processed.get(), is(true));

    thread.taskManager().shutdown(true);
}
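The addRecord(mockConsumer, 0L) helper used above is a StreamThreadTest utility that is not reproduced on this page. A hedged stand-in, assuming a record with empty key and value on partition 1 of the test's topic1, could look like:

// Hypothetical stand-in for the test's addRecord helper: feed one record to the MockConsumer.
void addRecord(final MockConsumer<byte[], byte[]> mockConsumer, final long offset) {
    mockConsumer.addRecord(new ConsumerRecord<>(topic1, 1, offset, new byte[0], new byte[0]));
}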