Search in sources:

Example 6 with TaskStatus

Use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

The class SingleTaskBackgroundRunner, method stop.

/**
 * Stops this runner. Requests executor shutdown, gracefully stops the currently running task
 * (if there is one), reports the outcome to listeners and as metrics, and finally interrupts
 * anything still running on the executor.
 */
@Override
@LifecycleStop
public void stop() {
    stopping = true;

    // First pass: ask the executor to shut down.
    if (executorService != null) {
        try {
            executorService.shutdown();
        } catch (SecurityException ex) {
            log.error(ex, "I can't control my own threads!");
        }
    }

    if (runningItem != null) {
        final Task task = runningItem.getTask();
        final long shutdownStartMillis = System.currentTimeMillis();
        boolean failed = false;

        // stopGracefully is always invoked so the task can clean up its resources.
        log.info("Starting graceful shutdown of task[%s].", task.getId());
        task.stopGracefully(taskConfig);

        if (!taskConfig.isRestoreTasksOnRestart() || !task.canRestore()) {
            // The task is not going to be restored, so listeners are told it was canceled.
            // Fabricating a status purely for listeners is odd, but acceptable for now because
            // no listeners are registered in the peon. The long-term fix is to read task status
            // from a single source of truth that is also propagated to the overlord.
            // See https://github.com/apache/druid/issues/11445.
            TaskRunnerUtils.notifyStatusChanged(
                listeners,
                task.getId(),
                TaskStatus.failure(task.getId(), "Canceled as task execution process stopped")
            );
        } else {
            try {
                // Wait for the task to finish, bounded by the configured graceful shutdown timeout.
                final TaskStatus taskStatus = runningItem
                    .getResult()
                    .get(
                        new Interval(
                            DateTimes.utc(shutdownStartMillis),
                            taskConfig.getGracefulShutdownTimeout()
                        ).toDurationMillis(),
                        TimeUnit.MILLISECONDS
                    );
                // The status does not influence the shutdown flow; it is only forwarded to listeners.
                log.info(
                    "Graceful shutdown of task[%s] finished in %,dms.",
                    task.getId(),
                    System.currentTimeMillis() - shutdownStartMillis
                );
                TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), taskStatus);
            } catch (Exception e) {
                log.makeAlert(e, "Graceful task shutdown failed: %s", task.getDataSource())
                   .addData("taskId", task.getId())
                   .addData("dataSource", task.getDataSource())
                   .emit();
                log.warn(e, "Graceful shutdown of task[%s] aborted with exception.", task.getId());
                failed = true;
                // Same caveat as the cancellation case: this status exists only to feed listeners.
                // See https://github.com/apache/druid/issues/11445.
                TaskRunnerUtils.notifyStatusChanged(
                    listeners,
                    task.getId(),
                    TaskStatus.failure(
                        task.getId(),
                        "Failed to stop gracefully with exception. See task logs for more details."
                    )
                );
            }
        }

        final long elapsed = System.currentTimeMillis() - shutdownStartMillis;
        final ServiceMetricEvent.Builder metricBuilder = ServiceMetricEvent
            .builder()
            .setDimension("task", task.getId())
            .setDimension("dataSource", task.getDataSource())
            // "graceful" is always reported as "true", for backward compatibility.
            .setDimension("graceful", "true")
            .setDimension("error", String.valueOf(failed));
        emitter.emit(metricBuilder.build("task/interrupt/count", 1L));
        emitter.emit(metricBuilder.build("task/interrupt/elapsed", elapsed));
    }

    // Second pass: interrupt everything that is still running.
    if (executorService != null) {
        try {
            executorService.shutdownNow();
        } catch (SecurityException ex) {
            log.error(ex, "I can't control my own threads!");
        }
    }
}
Also used : Task(org.apache.druid.indexing.common.task.Task) ServiceMetricEvent(org.apache.druid.java.util.emitter.service.ServiceMetricEvent) TaskStatus(org.apache.druid.indexer.TaskStatus) Interval(org.joda.time.Interval) LifecycleStop(org.apache.druid.java.util.common.lifecycle.LifecycleStop)

Example 7 with TaskStatus

Use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

The class SingleTaskBackgroundRunner, method run.

/**
 * Submits the given task for execution, provided no other task has been handed to this runner.
 *
 * @param task the task to execute
 * @return a future that yields the task's final status
 * @throws ISE if a task has already been submitted to this runner
 */
@Override
public ListenableFuture<TaskStatus> run(final Task task) {
    // This runner handles a single task only; reject any further submission.
    if (runningItem != null) {
        throw new ISE("Already running task[%s]", runningItem.getTask().getId());
    }

    final TaskToolbox toolbox = toolboxFactory.build(task);

    // Resolve the thread priority from the task context; stay at 0 when absent or unparseable.
    final Object taskPriorityObj = task.getContextValue(TaskThreadPriority.CONTEXT_KEY);
    int taskPriority = 0;
    try {
        if (taskPriorityObj != null) {
            taskPriority = Numbers.parseInt(taskPriorityObj);
        }
    } catch (NumberFormatException e) {
        log.error(e, "Error parsing task priority [%s] for task [%s]", taskPriorityObj, task.getId());
    }

    // Build the executor for the resolved priority, then hand the task to it.
    executorService = buildExecutorService(taskPriority);
    final ListenableFuture<TaskStatus> statusFuture = executorService.submit(
        new SingleTaskBackgroundRunnerCallable(task, location, toolbox)
    );
    runningItem = new SingleTaskBackgroundRunnerWorkItem(task, location, statusFuture);
    return statusFuture;
}
Also used : TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) ISE(org.apache.druid.java.util.common.ISE) TaskStatus(org.apache.druid.indexer.TaskStatus)

Example 8 with TaskStatus

Use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

The class TaskQueue, method manageInternal.

/**
 * Performs one reconciliation pass between the queue's task list and the task runner:
 * tasks without a tracked future are attached to an existing runner future or started if ready,
 * tracked tasks that are still pending are re-submitted, and tasks known to the runner but
 * absent from the queue are shut down.
 */
@VisibleForTesting
void manageInternal() {
    // Index the futures the taskRunner already knows about by task id.
    final Map<String, ListenableFuture<TaskStatus>> runnerTaskFutures = new HashMap<>();
    for (final TaskRunnerWorkItem knownItem : taskRunner.getKnownTasks()) {
        runnerTaskFutures.put(knownItem.getTaskId(), knownItem.getResult());
    }

    // Iterate over a snapshot, because notifyStatus may modify the task list.
    for (final Task task : ImmutableList.copyOf(tasks)) {
        if (taskFutures.containsKey(task.getId())) {
            if (isTaskPending(task)) {
                // The task is tracked but still pending: ask the taskRunner to run it again
                // to guarantee it gets assigned. See https://github.com/apache/druid/pull/6991
                taskRunner.run(task);
            }
            continue;
        }

        final ListenableFuture<TaskStatus> runnerTaskFuture;
        if (runnerTaskFutures.containsKey(task.getId())) {
            // The runner already has this task; reuse its future.
            runnerTaskFuture = runnerTaskFutures.get(task.getId());
        } else {
            // The task should be running but is not, so try to start it.
            final boolean ready;
            try {
                ready = task.isReady(taskActionClientFactory.create(task));
            } catch (Exception e) {
                log.warn(e, "Exception thrown during isReady for task: %s", task.getId());
                final String errorMessage = e instanceof MaxAllowedLocksExceededException
                    ? e.getMessage()
                    : "Failed while waiting for the task to be ready to run. See overlord logs for more details.";
                notifyStatus(task, TaskStatus.failure(task.getId(), errorMessage), errorMessage);
                continue;
            }

            if (!ready) {
                // Task.isReady() can internally lock intervals or segments;
                // release them when the task turns out not to be ready.
                taskLockbox.unlockAll(task);
                continue;
            }

            log.info("Asking taskRunner to run: %s", task.getId());
            runnerTaskFuture = taskRunner.run(task);
        }

        taskFutures.put(task.getId(), attachCallbacks(task, runnerTaskFuture));
    }

    // Kill tasks the runner knows about that are no longer in the queue.
    final Set<String> knownTaskIds = tasks.stream().map(Task::getId).collect(Collectors.toSet());
    final Set<String> tasksToKill = Sets.difference(runnerTaskFutures.keySet(), knownTaskIds);
    if (tasksToKill.isEmpty()) {
        return;
    }

    log.info("Asking taskRunner to clean up %,d tasks.", tasksToKill.size());
    for (final String staleTaskId : tasksToKill) {
        try {
            taskRunner.shutdown(staleTaskId, "task is not in knownTaskIds[%s]", knownTaskIds);
        } catch (Exception e) {
            log.warn(e, "TaskRunner failed to clean up task: %s", staleTaskId);
        }
    }
}
Also used : Task(org.apache.druid.indexing.common.task.Task) HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) TaskStatus(org.apache.druid.indexer.TaskStatus) EntryExistsException(org.apache.druid.metadata.EntryExistsException) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 9 with TaskStatus

Use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

The class HttpRemoteTaskRunner, method taskAddedOrUpdated.

/**
 * Handles a task status announcement received from a worker.
 *
 * Under {@code statusLock}, the announcement is reconciled with the locally tracked work item
 * (falling back to task storage when no local item exists) and the decision is recorded in two
 * flags. The resulting actions (completing the task and/or asking the worker to shut the task
 * down) are performed after the lock is released. Finally, all threads waiting on
 * {@code statusLock} are notified.
 *
 * @param announcement the status announcement reported by the worker
 * @param workerHolder holder of the worker that sent the announcement
 */
void taskAddedOrUpdated(final TaskAnnouncement announcement, final WorkerHolder workerHolder) {
    final String taskId = announcement.getTaskId();
    final Worker worker = workerHolder.getWorker();
    log.debug("Worker[%s] wrote [%s] status for task [%s] on [%s]", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId, announcement.getTaskLocation());
    HttpRemoteTaskRunnerWorkItem taskItem;
    // Decisions made while holding statusLock; acted upon only after the lock is released.
    boolean shouldShutdownTask = false;
    boolean isTaskCompleted = false;
    synchronized (statusLock) {
        taskItem = tasks.get(taskId);
        if (taskItem == null) {
            // Try to find information about it in the TaskStorage
            Optional<TaskStatus> knownStatusInStorage = taskStorage.getStatus(taskId);
            if (knownStatusInStorage.isPresent()) {
                switch(knownStatusInStorage.get().getStatusCode()) {
                    case RUNNING:
                        // Storage says the task is running: start tracking it locally in RUNNING state
                        // with an unknown location; the location is reconciled further below.
                        taskItem = new HttpRemoteTaskRunnerWorkItem(taskId, worker, TaskLocation.unknown(), null, announcement.getTaskType(), HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                        tasks.put(taskId, taskItem);
                        break;
                    case SUCCESS:
                    case FAILED:
                        // Storage already has a final status; taskItem stays null, so a non-complete
                        // announcement leads to a shutdown request below.
                        if (!announcement.getTaskStatus().isComplete()) {
                            log.info("Worker[%s] reported status for completed, known from taskStorage, task[%s]. Ignored.", worker.getHost(), taskId);
                        }
                        break;
                    default:
                        log.makeAlert("Found unrecognized state[%s] of task[%s] in taskStorage. Notification[%s] from worker[%s] is ignored.", knownStatusInStorage.get().getStatusCode(), taskId, announcement, worker.getHost()).emit();
                }
            } else {
                log.warn("Worker[%s] reported status[%s] for unknown task[%s]. Ignored.", worker.getHost(), announcement.getStatus(), taskId);
            }
        }
        if (taskItem == null) {
            // Still no work item: if the worker reports the task as not complete, ask it to shut the task down.
            if (!announcement.getTaskStatus().isComplete()) {
                shouldShutdownTask = true;
            }
        } else {
            // Outer switch: status reported by the worker. Inner switches: locally tracked state.
            switch(announcement.getTaskStatus().getStatusCode()) {
                case RUNNING:
                    switch(taskItem.getState()) {
                        case PENDING:
                        case PENDING_WORKER_ASSIGN:
                            taskItem.setWorker(worker);
                            taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                            log.info("Task[%s] started RUNNING on worker[%s].", taskId, worker.getHost());
                        // fall through
                        case RUNNING:
                            // Workers are compared by host. Same host: sync the location if it changed.
                            // Different host: request shutdown of the task on the announcing worker.
                            if (worker.getHost().equals(taskItem.getWorker().getHost())) {
                                if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
                                    log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
                                    taskItem.setLocation(announcement.getTaskLocation());
                                    TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
                                }
                            } else {
                                log.warn("Found worker[%s] running task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
                                shouldShutdownTask = true;
                            }
                            break;
                        case COMPLETE:
                            log.warn("Worker[%s] reported status for completed task[%s]. Ignored.", worker.getHost(), taskId);
                            shouldShutdownTask = true;
                            break;
                        default:
                            log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
                    }
                    break;
                case FAILED:
                case SUCCESS:
                    switch(taskItem.getState()) {
                        case PENDING:
                        case PENDING_WORKER_ASSIGN:
                            // The item is moved to RUNNING here and then handled by the RUNNING case below.
                            // NOTE(review): the transition to COMPLETE is presumably done in taskComplete(..) — confirm.
                            taskItem.setWorker(worker);
                            taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                            log.info("Task[%s] finished on worker[%s].", taskId, worker.getHost());
                        // fall through
                        case RUNNING:
                            // Completion is accepted only from the host the item is assigned to.
                            if (worker.getHost().equals(taskItem.getWorker().getHost())) {
                                if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
                                    log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
                                    taskItem.setLocation(announcement.getTaskLocation());
                                    TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
                                }
                                isTaskCompleted = true;
                            } else {
                                log.warn("Worker[%s] reported completed task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
                            }
                            break;
                        case COMPLETE:
                            // this can happen when a worker is restarted and reports its list of completed tasks again.
                            break;
                        default:
                            log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
                    }
                    break;
                default:
                    log.makeAlert("Worker[%s] reported unrecognized state[%s] for task[%s].", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId).emit();
            }
        }
    }
    if (isTaskCompleted) {
        // taskComplete(..) must be called outside of statusLock, see comments on method.
        taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
    }
    if (shouldShutdownTask) {
        log.warn("Killing task[%s] on worker[%s].", taskId, worker.getHost());
        workerHolder.shutdownTask(taskId);
    }
    // Wake up every thread waiting on statusLock now that the announcement has been processed.
    synchronized (statusLock) {
        statusLock.notifyAll();
    }
}
Also used : Worker(org.apache.druid.indexing.worker.Worker) TaskStatus(org.apache.druid.indexer.TaskStatus)

Example 10 with TaskStatus

Use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

The class AppenderatorDriverRealtimeIndexTaskTest, method testLateData.

/**
 * Verifies that a row timestamped two days in the past is still ingested: both rows must be
 * processed, none thrown away or unparseable, and the task must finish with SUCCESS after handoff.
 */
@Test(timeout = 60_000L)
public void testLateData() throws Exception {
    expectPublishedSegments(1);

    final AppenderatorDriverRealtimeIndexTask task = makeRealtimeTask(null);
    final ListenableFuture<TaskStatus> statusFuture = runTask(task);

    // The firehose starts off null; poll until the task has created it.
    while (task.getFirehose() == null) {
        Thread.sleep(50);
    }
    final TestFirehose firehose = (TestFirehose) task.getFirehose();

    firehose.addRows(
        ImmutableList.of(
            // Row with the current timestamp.
            ImmutableMap.of("t", now.getMillis(), "dim1", "foo", "met1", "1"),
            // Row from 2 days ago; it should still be processed.
            ImmutableMap.of("t", now.minus(new Period("P2D")).getMillis(), "dim2", "bar", "met1", 2.0)
        )
    );

    // Closing the firehose drains out the existing events.
    firehose.close();

    // Block until the segments have been published.
    Collection<DataSegment> publishedSegments = awaitSegments();

    // Ingestion metrics: both rows processed, nothing dropped.
    Assert.assertEquals(2, task.getRowIngestionMeters().getProcessed());
    Assert.assertEquals(0, task.getRowIngestionMeters().getThrownAway());
    Assert.assertEquals(0, task.getRowIngestionMeters().getUnparseable());

    // Query-side checks on the ingested data.
    Assert.assertEquals(2, sumMetric(task, null, "rows").longValue());
    Assert.assertEquals(3, sumMetric(task, null, "met1").longValue());

    awaitHandoffs();

    for (DataSegment segment : publishedSegments) {
        final SegmentDescriptor descriptor = new SegmentDescriptor(
            segment.getInterval(),
            segment.getVersion(),
            segment.getShardSpec().getPartitionNum()
        );
        Pair<Executor, Runnable> handoffCallback = handOffCallbacks.get(descriptor);
        Assert.assertNotNull(
            segment + " missing from handoff callbacks: " + handOffCallbacks,
            handoffCallback
        );

        // Simulate handoff by running the registered callback on its executor.
        final Executor callbackExecutor = handoffCallback.lhs;
        final Runnable callback = handoffCallback.rhs;
        callbackExecutor.execute(callback);
    }
    handOffCallbacks.clear();

    // The task should now run to completion successfully.
    final TaskStatus taskStatus = statusFuture.get();
    Assert.assertEquals(TaskState.SUCCESS, taskStatus.getStatusCode());
}
Also used : Executor(java.util.concurrent.Executor) SegmentDescriptor(org.apache.druid.query.SegmentDescriptor) Period(org.joda.time.Period) TaskStatus(org.apache.druid.indexer.TaskStatus) DataSegment(org.apache.druid.timeline.DataSegment) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Aggregations

TaskStatus (org.apache.druid.indexer.TaskStatus)135 Test (org.junit.Test)103 DataSegment (org.apache.druid.timeline.DataSegment)55 List (java.util.List)50 ImmutableList (com.google.common.collect.ImmutableList)44 ArrayList (java.util.ArrayList)41 TaskToolbox (org.apache.druid.indexing.common.TaskToolbox)40 Task (org.apache.druid.indexing.common.task.Task)39 InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest)37 Map (java.util.Map)34 File (java.io.File)32 IOException (java.io.IOException)26 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)25 ImmutableMap (com.google.common.collect.ImmutableMap)25 SegmentDescriptor (org.apache.druid.query.SegmentDescriptor)25 DataSchema (org.apache.druid.segment.indexing.DataSchema)25 ISE (org.apache.druid.java.util.common.ISE)24 HashMap (java.util.HashMap)23 Executor (java.util.concurrent.Executor)23 Pair (org.apache.druid.java.util.common.Pair)23