use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class SingleTaskBackgroundRunner method stop.
/**
 * Lifecycle stop hook for this runner.
 *
 * <p>Sequence of events:
 * <ol>
 *   <li>Marks the runner as stopping and calls {@code shutdown()} on the executor (if one exists).</li>
 *   <li>If a task is running, asks it to stop gracefully. When restore-on-restart is enabled and the
 *       task reports that it can be restored, waits for the task's result (bounded by the configured
 *       graceful shutdown timeout) and forwards that status to the listeners; if the wait fails, an
 *       alert is emitted and a failure status is forwarded instead. When the task is not restorable,
 *       a "canceled" failure status is forwarded without waiting.</li>
 *   <li>Emits the {@code task/interrupt/count} and {@code task/interrupt/elapsed} metrics.</li>
 *   <li>Calls {@code shutdownNow()} on the executor (if one exists).</li>
 * </ol>
 */
@Override
@LifecycleStop
public void stop() {
  stopping = true;

  // First pass: orderly executor shutdown.
  if (executorService != null) {
    try {
      executorService.shutdown();
    } catch (SecurityException ex) {
      log.error(ex, "I can't control my own threads!");
    }
  }

  if (runningItem != null) {
    final Task task = runningItem.getTask();
    final long start = System.currentTimeMillis();
    final String taskId = task.getId();
    boolean failedToStop = false;

    // stopGracefully is always invoked so the task can clean up its resources.
    log.info("Starting graceful shutdown of task[%s].", taskId);
    task.stopGracefully(taskConfig);

    final boolean waitForTask = taskConfig.isRestoreTasksOnRestart() && task.canRestore();
    if (!waitForTask) {
      // The status below is synthesized purely for the listeners' benefit, which is admittedly odd.
      // It is tolerable for now because no listeners are registered in the peon yet. Longer term,
      // task status should come from a single source of truth that is also propagated to the
      // overlord. See https://github.com/apache/druid/issues/11445.
      TaskRunnerUtils.notifyStatusChanged(
          listeners,
          taskId,
          TaskStatus.failure(taskId, "Canceled as task execution process stopped")
      );
    } else {
      try {
        final long timeoutMillis =
            new Interval(DateTimes.utc(start), taskConfig.getGracefulShutdownTimeout()).toDurationMillis();
        final TaskStatus finalStatus = runningItem.getResult().get(timeoutMillis, TimeUnit.MILLISECONDS);

        // The status value itself is irrelevant to a graceful shutdown; it is only passed along.
        log.info(
            "Graceful shutdown of task[%s] finished in %,dms.",
            taskId,
            System.currentTimeMillis() - start
        );
        TaskRunnerUtils.notifyStatusChanged(listeners, taskId, finalStatus);
      } catch (Exception e) {
        log.makeAlert(e, "Graceful task shutdown failed: %s", task.getDataSource())
           .addData("taskId", taskId)
           .addData("dataSource", task.getDataSource())
           .emit();
        log.warn(e, "Graceful shutdown of task[%s] aborted with exception.", taskId);
        failedToStop = true;

        // Same caveat as the non-restorable branch: this status exists only to feed the listeners.
        // See https://github.com/apache/druid/issues/11445.
        TaskRunnerUtils.notifyStatusChanged(
            listeners,
            taskId,
            TaskStatus.failure(taskId, "Failed to stop gracefully with exception. See task logs for more details.")
        );
      }
    }

    final long elapsed = System.currentTimeMillis() - start;
    final ServiceMetricEvent.Builder metricBuilder = ServiceMetricEvent
        .builder()
        .setDimension("task", taskId)
        .setDimension("dataSource", task.getDataSource())
        // "graceful" is always "true"; the dimension is retained for backward compatibility.
        .setDimension("graceful", "true")
        .setDimension("error", String.valueOf(failedToStop));
    emitter.emit(metricBuilder.build("task/interrupt/count", 1L));
    emitter.emit(metricBuilder.build("task/interrupt/elapsed", elapsed));
  }

  // Second pass: interrupt whatever is still running.
  if (executorService != null) {
    try {
      executorService.shutdownNow();
    } catch (SecurityException ex) {
      log.error(ex, "I can't control my own threads!");
    }
  }
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class SingleTaskBackgroundRunner method run.
/**
 * Submits the given task for execution and records it as the single running item.
 *
 * <p>The thread priority is read from the task context under {@code TaskThreadPriority.CONTEXT_KEY};
 * it defaults to 0 when the key is absent, and stays 0 (with an error logged) when the value cannot
 * be parsed as an integer. An executor is built for that priority and the task is submitted to it.
 *
 * @param task the task to run
 * @return the future carrying the task's eventual status
 * @throws ISE if a task has already been handed to this runner
 */
@Override
public ListenableFuture<TaskStatus> run(final Task task) {
  // Only one task may ever be registered with this runner.
  if (runningItem != null) {
    throw new ISE("Already running task[%s]", runningItem.getTask().getId());
  }

  final TaskToolbox toolbox = toolboxFactory.build(task);

  final Object rawPriority = task.getContextValue(TaskThreadPriority.CONTEXT_KEY);
  int priority = 0;
  if (rawPriority != null) {
    try {
      priority = Numbers.parseInt(rawPriority);
    } catch (NumberFormatException e) {
      // Keep the default priority rather than failing the task over a bad context value.
      log.error(e, "Error parsing task priority [%s] for task [%s]", rawPriority, task.getId());
    }
  }

  // Make sure an executor exists for the resolved priority.
  executorService = buildExecutorService(priority);

  final ListenableFuture<TaskStatus> future =
      executorService.submit(new SingleTaskBackgroundRunnerCallable(task, location, toolbox));
  runningItem = new SingleTaskBackgroundRunnerWorkItem(task, location, future);
  return future;
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class TaskQueue method manageInternal.
/**
 * Performs one reconciliation pass between the tasks held by this queue and the task runner.
 *
 * <p>For every queued task that has no tracked future yet, the method either adopts the future the
 * runner already holds for it, or checks {@code isReady} and, if ready, asks the runner to run it.
 * Tasks whose readiness check throws are marked failed; tasks that are simply not ready have their
 * locks released and are retried on a later pass. Tasks that are already tracked but still pending
 * are re-submitted to the runner. Finally, any task the runner knows about that is no longer in the
 * queue is shut down.
 */
@VisibleForTesting
void manageInternal() {
  // Futures the taskRunner can already provide, keyed by task id.
  final Map<String, ListenableFuture<TaskStatus>> runnerTaskFutures = new HashMap<>();
  taskRunner.getKnownTasks().forEach(
      workItem -> runnerTaskFutures.put(workItem.getTaskId(), workItem.getResult())
  );

  // Iterate over a copy because notifyStatus may modify the underlying task list.
  for (final Task task : ImmutableList.copyOf(tasks)) {
    final String taskId = task.getId();

    if (taskFutures.containsKey(taskId)) {
      // Already tracked. If it is still pending, hand it to the taskRunner again to guarantee it
      // will be assigned to run; see https://github.com/apache/druid/pull/6991
      if (isTaskPending(task)) {
        taskRunner.run(task);
      }
      continue;
    }

    final ListenableFuture<TaskStatus> future;
    if (runnerTaskFutures.containsKey(taskId)) {
      future = runnerTaskFutures.get(taskId);
    } else {
      // The task should be running but the runner does not know it yet.
      final boolean ready;
      try {
        ready = task.isReady(taskActionClientFactory.create(task));
      } catch (Exception e) {
        log.warn(e, "Exception thrown during isReady for task: %s", taskId);
        final String errorMessage;
        if (e instanceof MaxAllowedLocksExceededException) {
          errorMessage = e.getMessage();
        } else {
          errorMessage = "Failed while waiting for the task to be ready to run. " + "See overlord logs for more details.";
        }
        notifyStatus(task, TaskStatus.failure(taskId, errorMessage), errorMessage);
        continue;
      }

      if (!ready) {
        // Task.isReady() can internally lock intervals or segments; release them when not ready.
        taskLockbox.unlockAll(task);
        continue;
      }

      log.info("Asking taskRunner to run: %s", taskId);
      future = taskRunner.run(task);
    }

    taskFutures.put(taskId, attachCallbacks(task, future));
  }

  // Shut down anything the runner is tracking that this queue no longer contains.
  final Set<String> knownTaskIds = tasks.stream().map(Task::getId).collect(Collectors.toSet());
  final Set<String> tasksToKill = Sets.difference(runnerTaskFutures.keySet(), knownTaskIds);
  if (tasksToKill.isEmpty()) {
    return;
  }

  log.info("Asking taskRunner to clean up %,d tasks.", tasksToKill.size());
  for (final String staleTaskId : tasksToKill) {
    try {
      taskRunner.shutdown(staleTaskId, "task is not in knownTaskIds[%s]", knownTaskIds);
    } catch (Exception e) {
      // One failed cleanup must not prevent the remaining ones.
      log.warn(e, "TaskRunner failed to clean up task: %s", staleTaskId);
    }
  }
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class HttpRemoteTaskRunner method taskAddedOrUpdated.
/**
 * Processes a task status announcement received from a worker.
 *
 * <p>While holding {@code statusLock}, the method looks up the work item for the announced task
 * (consulting {@code taskStorage} when the task is not in {@code tasks}) and reconciles the
 * announced status code with the work item's current state. That reconciliation only decides two
 * flags: whether the task should be reported as completed and whether the announcing worker should
 * be told to shut the task down. Both actions are carried out after the lock has been released.
 * At the end, all threads waiting on {@code statusLock} are notified.
 *
 * @param announcement the announcement sent by the worker (task id, status, location, type)
 * @param workerHolder holder of the worker that sent the announcement
 */
void taskAddedOrUpdated(final TaskAnnouncement announcement, final WorkerHolder workerHolder) {
final String taskId = announcement.getTaskId();
final Worker worker = workerHolder.getWorker();
log.debug("Worker[%s] wrote [%s] status for task [%s] on [%s]", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId, announcement.getTaskLocation());
HttpRemoteTaskRunnerWorkItem taskItem;
// Decisions made under the lock; acted upon only after the lock is released.
boolean shouldShutdownTask = false;
boolean isTaskCompleted = false;
synchronized (statusLock) {
taskItem = tasks.get(taskId);
if (taskItem == null) {
// Try to find information about it in the TaskStorage
Optional<TaskStatus> knownStatusInStorage = taskStorage.getStatus(taskId);
if (knownStatusInStorage.isPresent()) {
switch(knownStatusInStorage.get().getStatusCode()) {
case RUNNING:
// Storage says the task is running but it is missing from `tasks`: register a RUNNING
// work item for it, owned by the announcing worker and with an unknown location.
taskItem = new HttpRemoteTaskRunnerWorkItem(taskId, worker, TaskLocation.unknown(), null, announcement.getTaskType(), HttpRemoteTaskRunnerWorkItem.State.RUNNING);
tasks.put(taskId, taskItem);
break;
case SUCCESS:
case FAILED:
// Storage already holds a terminal status; no work item is created, so taskItem stays null.
if (!announcement.getTaskStatus().isComplete()) {
log.info("Worker[%s] reported status for completed, known from taskStorage, task[%s]. Ignored.", worker.getHost(), taskId);
}
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s] in taskStorage. Notification[%s] from worker[%s] is ignored.", knownStatusInStorage.get().getStatusCode(), taskId, announcement, worker.getHost()).emit();
}
} else {
log.warn("Worker[%s] reported status[%s] for unknown task[%s]. Ignored.", worker.getHost(), announcement.getStatus(), taskId);
}
}
if (taskItem == null) {
// Still no work item after the storage lookup: if the worker announces the task as not
// complete, flag it so the worker is asked to shut it down.
if (!announcement.getTaskStatus().isComplete()) {
shouldShutdownTask = true;
}
} else {
// Outer switch: status code announced by the worker. Inner switches: state of our work item.
switch(announcement.getTaskStatus().getStatusCode()) {
case RUNNING:
switch(taskItem.getState()) {
case PENDING:
case PENDING_WORKER_ASSIGN:
taskItem.setWorker(worker);
taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
log.info("Task[%s] started RUNNING on worker[%s].", taskId, worker.getHost());
// fall through
case RUNNING:
// Compare by host: the announcement is only honored when it comes from the worker
// recorded on the work item.
if (worker.getHost().equals(taskItem.getWorker().getHost())) {
if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
taskItem.setLocation(announcement.getTaskLocation());
TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
}
} else {
log.warn("Found worker[%s] running task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
shouldShutdownTask = true;
}
break;
case COMPLETE:
// The work item is already COMPLETE but the worker reports RUNNING: flag for shutdown.
log.warn("Worker[%s] reported status for completed task[%s]. Ignored.", worker.getHost(), taskId);
shouldShutdownTask = true;
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
}
break;
case FAILED:
case SUCCESS:
switch(taskItem.getState()) {
case PENDING:
case PENDING_WORKER_ASSIGN:
// A terminal status arrived while the item was still pending: move it to RUNNING first
// so that the RUNNING branch below handles it.
taskItem.setWorker(worker);
taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
log.info("Task[%s] finished on worker[%s].", taskId, worker.getHost());
// fall through
case RUNNING:
if (worker.getHost().equals(taskItem.getWorker().getHost())) {
if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
taskItem.setLocation(announcement.getTaskLocation());
TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
}
// Only a completion reported by the worker recorded on the item is accepted.
isTaskCompleted = true;
} else {
log.warn("Worker[%s] reported completed task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
}
break;
case COMPLETE:
// this can happen when a worker is restarted and reports its list of completed tasks again.
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
}
break;
default:
log.makeAlert("Worker[%s] reported unrecognized state[%s] for task[%s].", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId).emit();
}
}
}
if (isTaskCompleted) {
// taskComplete(..) must be called outside of statusLock, see comments on method.
taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
}
if (shouldShutdownTask) {
// The shutdown request is likewise issued after statusLock has been released.
log.warn("Killing task[%s] on worker[%s].", taskId, worker.getHost());
workerHolder.shutdownTask(taskId);
}
// Wake up every thread waiting on statusLock.
synchronized (statusLock) {
statusLock.notifyAll();
}
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class AppenderatorDriverRealtimeIndexTaskTest method testLateData.
/**
 * Checks that a row timestamped two days in the past is still ingested: both rows fed to the
 * firehose must be counted as processed, none thrown away or unparseable, and the task must finish
 * with SUCCESS once handoff has been simulated for every published segment.
 */
@Test(timeout = 60_000L)
public void testLateData() throws Exception {
  expectPublishedSegments(1);

  final AppenderatorDriverRealtimeIndexTask task = makeRealtimeTask(null);
  final ListenableFuture<TaskStatus> taskResult = runTask(task);

  // The firehose starts off null; poll until it shows up.
  while (task.getFirehose() == null) {
    Thread.sleep(50);
  }
  final TestFirehose testFirehose = (TestFirehose) task.getFirehose();

  testFirehose.addRows(
      ImmutableList.of(
          ImmutableMap.of("t", now.getMillis(), "dim1", "foo", "met1", "1"),
          // This row is dated 2 days ago and is expected to be processed regardless.
          ImmutableMap.of("t", now.minus(new Period("P2D")).getMillis(), "dim2", "bar", "met1", 2.0)
      )
  );

  // Closing the firehose drains the events that are already queued.
  testFirehose.close();

  // Block until the segments have been published.
  final Collection<DataSegment> segments = awaitSegments();

  // Ingestion counters.
  Assert.assertEquals(2, task.getRowIngestionMeters().getProcessed());
  Assert.assertEquals(0, task.getRowIngestionMeters().getThrownAway());
  Assert.assertEquals(0, task.getRowIngestionMeters().getUnparseable());

  // Query the ingested data.
  Assert.assertEquals(2, sumMetric(task, null, "rows").longValue());
  Assert.assertEquals(3, sumMetric(task, null, "met1").longValue());

  awaitHandoffs();

  for (DataSegment segment : segments) {
    final SegmentDescriptor descriptor = new SegmentDescriptor(
        segment.getInterval(),
        segment.getVersion(),
        segment.getShardSpec().getPartitionNum()
    );
    final Pair<Executor, Runnable> callback = handOffCallbacks.get(descriptor);
    Assert.assertNotNull(segment + " missing from handoff callbacks: " + handOffCallbacks, callback);

    // Run the registered callback on its executor to simulate handoff.
    callback.lhs.execute(callback.rhs);
  }
  handOffCallbacks.clear();

  // The task should now run to completion.
  final TaskStatus finalStatus = taskResult.get();
  Assert.assertEquals(TaskState.SUCCESS, finalStatus.getStatusCode());
}
Aggregations