use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class RemoteTaskRunnerTest method testBlacklistZKWorkers50Percent.
/**
 * Runs twelve failing tasks against two workers while {@code maxPercentageBlacklistWorkers} is set to 50.
 *
 * The worker that picks up the first task is treated as the "first" worker. Tasks 1 and 3 are expected on the
 * first worker; every other task is expected on the second worker. The blacklist is expected to be empty for the
 * first two tasks and to hold exactly one worker from the third task onwards, i.e. the second worker is never
 * blacklisted no matter how many of its tasks fail.
 */
@Test
public void testBlacklistZKWorkers50Percent() throws Exception {
  rtrTestUtils.makeWorker("worker", 10);
  rtrTestUtils.makeWorker("worker2", 10);

  RemoteTaskRunnerConfig rtrConfig = new TestRemoteTaskRunnerConfig(TIMEOUT_PERIOD);
  rtrConfig.setMaxPercentageBlacklistWorkers(50);
  makeRemoteTaskRunner(rtrConfig);

  String firstWorker = null;
  String secondWorker = null;

  for (int attempt = 1; attempt <= 12; attempt++) {
    final String id = StringUtils.format("rt-%d", attempt);
    final TestRealtimeTask failingTask = new TestRealtimeTask(
        id,
        new TaskResource(id, 1),
        "foo",
        TaskStatus.success(id),
        jsonMapper
    );
    final Future<TaskStatus> outcome = remoteTaskRunner.run(failingTask);

    // Whichever worker is handed the very first task is the one counted as "first" for the rest of the test.
    if (attempt == 1) {
      final boolean startedOnWorker2 = rtrTestUtils.taskAnnounced("worker2", failingTask.getId());
      firstWorker = startedOnWorker2 ? "worker2" : "worker";
      secondWorker = startedOnWorker2 ? "worker" : "worker2";
    }

    // Only the odd-numbered tasks among the first four are expected on the first worker.
    final boolean onSecondWorker = attempt % 2 == 0 || attempt > 4;
    final String expectedWorker = onSecondWorker ? secondWorker : firstWorker;

    Assert.assertTrue(rtrTestUtils.taskAnnounced(expectedWorker, failingTask.getId()));
    rtrTestUtils.mockWorkerRunningTask(expectedWorker, failingTask);
    rtrTestUtils.mockWorkerCompleteFailedTask(expectedWorker, failingTask);
    Assert.assertTrue(outcome.get().isFailure());

    final int expectedBlacklisted = attempt > 2 ? 1 : 0;
    Assert.assertEquals(expectedBlacklisted, remoteTaskRunner.getBlackListedWorkers().size());

    final int expectedFailureStreak = attempt > 4 ? attempt - 2 : ((attempt + 1) / 2);
    Assert.assertEquals(
        expectedFailureStreak,
        remoteTaskRunner.findWorkerRunningTask(failingTask.getId()).getContinuouslyFailedTasksCount()
    );
  }
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class TaskQueue method notifyStatus.
/**
 * Notify this queue that some task has an updated status. The task runner is asked to shut the task down, the task
 * is removed from this queue's task list and futures map, and, if the task was known to this queue and its stored
 * status is still runnable, the new status (with the task's last known location) is persisted in the task storage
 * facility.
 *
 * @param task         task to update
 * @param taskStatus   new task status
 * @param reasonFormat format string for the shutdown reason passed to the task runner
 * @param args         arguments for {@code reasonFormat}
 *
 * @throws NullPointerException     if task or status is null
 * @throws IllegalArgumentException if the task ID does not match the status ID
 * @throws IllegalStateException    if this queue is currently shut down
 */
private void notifyStatus(final Task task, final TaskStatus taskStatus, String reasonFormat, Object... args) {
  giant.lock();
  TaskLocation taskLocation = TaskLocation.unknown();
  try {
    Preconditions.checkNotNull(task, "task");
    Preconditions.checkNotNull(taskStatus, "status");
    Preconditions.checkState(active, "Queue is not active!");
    Preconditions.checkArgument(
        task.getId().equals(taskStatus.getId()),
        "Mismatching task ids[%s/%s]",
        task.getId(),
        taskStatus.getId()
    );

    // Inform taskRunner that this task can be shut down. The location is captured first so it can be stored
    // together with the final status; a failure here is logged and must not prevent the bookkeeping below.
    try {
      taskLocation = taskRunner.getTaskLocation(task.getId());
      taskRunner.shutdown(task.getId(), reasonFormat, args);
    } catch (Exception e) {
      log.warn(e, "TaskRunner failed to cleanup task after completion: %s", task.getId());
    }

    // Remove from running tasks. The scan stops at the first (most recently added) match, so at most one entry
    // is ever removed; a plain flag is therefore enough to record whether the task was known to this queue.
    boolean removed = false;
    for (int i = tasks.size() - 1; i >= 0; i--) {
      if (tasks.get(i).getId().equals(task.getId())) {
        removed = true;
        removeTaskInternal(tasks.get(i));
        break;
      }
    }
    if (!removed) {
      log.warn("Unknown task completed: %s", task.getId());
    }

    // Remove from futures list
    taskFutures.remove(task.getId());

    if (removed) {
      // If we thought this task should be running, save status to DB
      try {
        final Optional<TaskStatus> previousStatus = taskStorage.getStatus(task.getId());
        if (!previousStatus.isPresent() || !previousStatus.get().isRunnable()) {
          log.makeAlert("Ignoring notification for already-complete task").addData("task", task.getId()).emit();
        } else {
          taskStorage.setStatus(taskStatus.withLocation(taskLocation));
          log.info("Task done: %s", task);
          managementMayBeNecessary.signalAll();
        }
      } catch (Exception e) {
        log.makeAlert(e, "Failed to persist status for task")
           .addData("task", task.getId())
           .addData("statusCode", taskStatus.getStatusCode())
           .emit();
      }
    }
  } finally {
    giant.unlock();
  }
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class RemoteTaskRunner method scheduleTasksCleanupForWorker.
/**
 * Schedule a task that will, at some point in the future, clean up znodes and issue failures for "tasksToFail"
 * if they are being run by "worker".
 *
 * Any cleanup previously scheduled for the same worker is cancelled first. The new cleanup runs after the
 * configured task cleanup timeout and is tracked in {@code removedWorkerCleanups} until it finishes, whether it
 * succeeds or fails.
 *
 * @param worker      worker whose task and status znodes should be cleaned up
 * @param tasksToFail ids of the tasks to remove from that worker's znodes and to report as failed
 */
private void scheduleTasksCleanupForWorker(final String worker, final List<String> tasksToFail) {
  // This method is only called from the PathChildrenCache event handler, so this may look like a race,
  // but is actually not.
  cancelWorkerCleanup(worker);

  final ListenableScheduledFuture<?> cleanupTask = cleanupExec.schedule(() -> {
    log.info("Running scheduled cleanup for Worker[%s]", worker);
    try {
      for (String assignedTask : tasksToFail) {
        String taskPath = JOINER.join(indexerZkConfig.getTasksPath(), worker, assignedTask);
        String statusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker, assignedTask);
        if (cf.checkExists().forPath(taskPath) != null) {
          cf.delete().guaranteed().forPath(taskPath);
        }
        if (cf.checkExists().forPath(statusPath) != null) {
          cf.delete().guaranteed().forPath(statusPath);
        }

        log.info("Failing task[%s]", assignedTask);
        RemoteTaskRunnerWorkItem taskRunnerWorkItem = runningTasks.remove(assignedTask);
        if (taskRunnerWorkItem != null) {
          final TaskStatus taskStatus = TaskStatus.failure(
              assignedTask,
              "Canceled for worker cleanup. See overlord logs for more details."
          );
          taskRunnerWorkItem.setResult(taskStatus);
          TaskRunnerUtils.notifyStatusChanged(listeners, assignedTask, taskStatus);
        } else {
          log.warn("RemoteTaskRunner has no knowledge of task[%s]", assignedTask);
        }
      }

      // worker is gone, remove worker task status announcements path.
      String workerStatusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker);
      if (cf.checkExists().forPath(workerStatusPath) != null) {
        cf.delete().guaranteed().forPath(workerStatusPath);
      }
    } catch (Exception e) {
      // Attach the cause to the alert: the rethrown exception only surfaces through the future, whose failure
      // callback below does nothing but drop the bookkeeping entry.
      log.makeAlert(e, "Exception while cleaning up worker[%s]", worker).emit();
      throw new RuntimeException(e);
    }
  }, config.getTaskCleanupTimeout().toStandardDuration().getMillis(), TimeUnit.MILLISECONDS);

  removedWorkerCleanups.put(worker, cleanupTask);

  // Remove this entry from removedWorkerCleanups when done, if it's actually the one in there.
  Futures.addCallback(cleanupTask, new FutureCallback<Object>() {
    @Override
    public void onSuccess(Object result) {
      removedWorkerCleanups.remove(worker, cleanupTask);
    }

    @Override
    public void onFailure(Throwable t) {
      removedWorkerCleanups.remove(worker, cleanupTask);
    }
  });
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class BaseRestorableTaskRunner method restore.
/**
 * Re-runs the tasks listed in the restore file.
 *
 * Returns an empty list when the restore file does not exist or cannot be read. Otherwise each listed task id is
 * loaded from the {@code task.json} in its task directory; a task is re-run only when restoring on restart is
 * enabled in the task config and the task itself reports that it can be restored. A failure on one task is logged
 * and does not stop the remaining tasks from being processed.
 *
 * @return the restored tasks, each paired with the future returned by {@code run}
 */
@Override
public List<Pair<Task, ListenableFuture<TaskStatus>>> restore() {
  final File restoreFile = getRestoreFile();
  if (!restoreFile.exists()) {
    return ImmutableList.of();
  }

  final TaskRestoreInfo restoreInfo;
  try {
    restoreInfo = jsonMapper.readValue(restoreFile, TaskRestoreInfo.class);
  } catch (Exception e) {
    LOG.error(e, "Failed to read restorable tasks from file[%s]. Skipping restore.", restoreFile);
    return ImmutableList.of();
  }

  final List<Pair<Task, ListenableFuture<TaskStatus>>> restored = new ArrayList<>();
  for (final String taskId : restoreInfo.getRunningTasks()) {
    try {
      final Task task = jsonMapper.readValue(new File(taskConfig.getTaskDir(taskId), "task.json"), Task.class);

      // The stored task definition must belong to the id it was filed under.
      if (!task.getId().equals(taskId)) {
        throw new ISE("Task[%s] restore file had wrong id[%s]", taskId, task.getId());
      }

      if (!taskConfig.isRestoreTasksOnRestart() || !task.canRestore()) {
        continue;
      }

      LOG.info("Restoring task[%s].", task.getId());
      restored.add(Pair.of(task, run(task)));
    } catch (Exception e) {
      LOG.warn(e, "Failed to restore task[%s]. Trying to restore other tasks.", taskId);
    }
  }

  if (!restored.isEmpty()) {
    LOG.info("Restored %,d tasks: %s", restored.size(), Joiner.on(", ").join(restored));
  }
  return restored;
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class ThreadingTaskRunner method run.
/**
 * Registers a work item for {@code task} under its id, unless one is already registered, and returns the result
 * future of the work item stored under that id.
 *
 * A newly registered work item wraps a callable submitted to {@code taskExecutor}. That callable creates a
 * per-attempt directory, writes {@code task.json} if it is missing, notifies the listeners about location and
 * status, runs the task, pushes the report file if one exists, and finally removes the work item and (unless the
 * runner is stopping) the task directory.
 *
 * @param task task to run
 * @return the result future of the work item registered for the task's id
 */
@Override
public ListenableFuture<TaskStatus> run(Task task) {
// Registration, saveRunningTasks() and the lookup of the result all happen while holding the lock on "tasks".
synchronized (tasks) {
// computeIfAbsent only builds the work item (and submits the callable) when no entry exists for this task id.
tasks.computeIfAbsent(task.getId(), k -> new ThreadingTaskRunnerWorkItem(task, taskExecutor.submit(new Callable<TaskStatus>() {
@Override
public TaskStatus call() {
// Each execution gets its own randomly named attempt directory below the task directory.
final String attemptUUID = UUID.randomUUID().toString();
final File taskDir = taskConfig.getTaskDir(task.getId());
final File attemptDir = new File(taskDir, attemptUUID);
final TaskLocation taskLocation = TaskLocation.create(node.getHost(), node.getPlaintextPort(), node.getTlsPort());
final ThreadingTaskRunnerWorkItem taskWorkItem;
try {
FileUtils.mkdirp(attemptDir);
final File taskFile = new File(taskDir, "task.json");
final File reportsFile = new File(attemptDir, "report.json");
taskReportFileWriter.add(task.getId(), reportsFile);
// time to adjust process holders
synchronized (tasks) {
// Look the work item up again under the lock: it must still be registered and must not have been shut down.
taskWorkItem = tasks.get(task.getId());
if (taskWorkItem == null) {
LOGGER.makeAlert("TaskInfo disappeared").addData("task", task.getId()).emit();
throw new ISE("TaskInfo disappeared for task[%s]!", task.getId());
}
if (taskWorkItem.shutdown) {
throw new IllegalStateException("Task has been shut down!");
}
}
// Write the task definition only when it is not already on disk.
if (!taskFile.exists()) {
jsonMapper.writeValue(taskFile, task);
}
// This will block for a while. So we append the thread information with more details
final String priorThreadName = Thread.currentThread().getName();
Thread.currentThread().setName(StringUtils.format("[%s]-%s", task.getId(), priorThreadName));
TaskStatus taskStatus;
final TaskToolbox toolbox = toolboxFactory.build(task);
// Listeners are told the location and the RUNNING status before the work item's state is switched to RUNNING.
TaskRunnerUtils.notifyLocationChanged(listeners, task.getId(), taskLocation);
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), TaskStatus.running(task.getId()));
taskWorkItem.setState(RunnerTaskState.RUNNING);
try {
taskStatus = task.run(toolbox);
} catch (Throwable t) {
// Anything thrown by the task itself is logged and converted into a failure status instead of propagating.
LOGGER.error(t, "Exception caught while running the task.");
taskStatus = TaskStatus.failure(task.getId(), "Failed with an exception. See indexer logs for more details.");
} finally {
// Runs whether or not the task threw: reset the state, restore the thread name, push the report if present.
taskWorkItem.setState(RunnerTaskState.NONE);
Thread.currentThread().setName(priorThreadName);
if (reportsFile.exists()) {
taskLogPusher.pushTaskReports(task.getId(), reportsFile);
}
}
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), taskStatus);
return taskStatus;
} catch (Throwable t) {
// Failures outside the task's own run (setup, the checks above, report pushing, notification) end up here
// and are rethrown wrapped in a RuntimeException.
LOGGER.error(t, "Exception caught during execution");
throw new RuntimeException(t);
} finally {
// Cleanup; exceptions raised here are logged and suppressed.
try {
taskReportFileWriter.delete(task.getId());
appenderatorsManager.removeAppenderatorsForTask(task.getId(), task.getDataSource());
synchronized (tasks) {
tasks.remove(task.getId());
// The running-task list is not re-saved while the runner is stopping.
if (!stopping) {
saveRunningTasks();
}
}
try {
// The task directory is likewise kept while the runner is stopping.
if (!stopping && taskDir.exists()) {
FileUtils.deleteDirectory(taskDir);
LOGGER.info("Removed task directory: %s", taskDir);
}
} catch (Exception e) {
LOGGER.makeAlert(e, "Failed to delete task directory").addData("taskDir", taskDir.toString()).addData("task", task.getId()).emit();
}
} catch (Exception e) {
LOGGER.error(e, "Suppressing exception caught while cleaning up task");
}
}
}
})));
// Save the running-task list after registration, then hand back the result of whichever work item is registered.
saveRunningTasks();
return tasks.get(task.getId()).getResult();
}
}
Aggregations