Search in sources :

Example 71 with TaskStatus

use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

the class RemoteTaskRunnerTest method testBlacklistZKWorkers50Percent.

/**
 * Runs twelve failing tasks against two workers with maxPercentageBlacklistWorkers(50), i.e. at most one of the
 * two workers may be blacklisted. The worker that picked up the first task is expected to be blacklisted once its
 * second task has failed; the other worker must keep receiving tasks and must never be blacklisted, even after
 * exceeding maxRetriesBeforeBlacklist.
 */
@Test
public void testBlacklistZKWorkers50Percent() throws Exception {
    rtrTestUtils.makeWorker("worker", 10);
    rtrTestUtils.makeWorker("worker2", 10);

    RemoteTaskRunnerConfig runnerConfig = new TestRemoteTaskRunnerConfig(TIMEOUT_PERIOD);
    runnerConfig.setMaxPercentageBlacklistWorkers(50);
    makeRemoteTaskRunner(runnerConfig);

    String blacklistedWorker = null;
    String survivingWorker = null;
    for (int attempt = 1; attempt <= 12; attempt++) {
        final String id = StringUtils.format("rt-%d", attempt);
        final TestRealtimeTask realtimeTask = new TestRealtimeTask(id, new TaskResource(id, 1), "foo", TaskStatus.success(id), jsonMapper);
        final Future<TaskStatus> pendingStatus = remoteTaskRunner.run(realtimeTask);

        if (attempt == 1) {
            // Whichever worker was handed the very first task is the one expected to end up blacklisted.
            final boolean startedOnWorker2 = rtrTestUtils.taskAnnounced("worker2", realtimeTask.getId());
            blacklistedWorker = startedOnWorker2 ? "worker2" : "worker";
            survivingWorker = startedOnWorker2 ? "worker" : "worker2";
        }

        // Tasks 1 and 3 land on the first worker; tasks 2, 4 and everything from 5 onwards land on the other one.
        final String expectedWorker;
        if (attempt % 2 == 0 || attempt > 4) {
            expectedWorker = survivingWorker;
        } else {
            expectedWorker = blacklistedWorker;
        }

        Assert.assertTrue(rtrTestUtils.taskAnnounced(expectedWorker, realtimeTask.getId()));
        rtrTestUtils.mockWorkerRunningTask(expectedWorker, realtimeTask);
        rtrTestUtils.mockWorkerCompleteFailedTask(expectedWorker, realtimeTask);
        Assert.assertTrue(pendingStatus.get().isFailure());

        // Exactly one worker is blacklisted from the third task onwards, none before that.
        final int expectedBlacklisted;
        if (attempt > 2) {
            expectedBlacklisted = 1;
        } else {
            expectedBlacklisted = 0;
        }
        Assert.assertEquals(expectedBlacklisted, remoteTaskRunner.getBlackListedWorkers().size());

        // While tasks alternate (1..4) each worker has failed (attempt + 1) / 2 times; afterwards the surviving
        // worker takes every task, so its count grows by one per attempt.
        final int expectedFailures;
        if (attempt > 4) {
            expectedFailures = attempt - 2;
        } else {
            expectedFailures = (attempt + 1) / 2;
        }
        Assert.assertEquals(expectedFailures, remoteTaskRunner.findWorkerRunningTask(realtimeTask.getId()).getContinuouslyFailedTasksCount());
    }
}
Also used : TestRealtimeTask(org.apache.druid.indexing.common.TestRealtimeTask) TaskResource(org.apache.druid.indexing.common.task.TaskResource) TaskStatus(org.apache.druid.indexer.TaskStatus) RemoteTaskRunnerConfig(org.apache.druid.indexing.overlord.config.RemoteTaskRunnerConfig) Test(org.junit.Test)

Example 72 with TaskStatus

use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

the class TaskQueue method notifyStatus.

/**
 * Handles a status update for a task tracked by this queue. While holding the queue lock, the task runner is asked
 * to shut the task down, the task is removed from the list of tracked tasks and from the futures map, and, if the
 * task was still tracked and its stored status is still runnable, the new status (with the task's last known
 * location) is written to task storage.
 *
 * @param task         task to update
 * @param taskStatus   new task status
 * @param reasonFormat format string for the shutdown reason, passed through to the task runner's shutdown call
 * @param args         arguments for {@code reasonFormat}
 *
 * @throws NullPointerException     if task or status is null
 * @throws IllegalArgumentException if the task ID does not match the status ID
 * @throws IllegalStateException    if this queue is currently shut down
 */
private void notifyStatus(final Task task, final TaskStatus taskStatus, String reasonFormat, Object... args) {
    giant.lock();
    TaskLocation taskLocation = TaskLocation.unknown();
    try {
        Preconditions.checkNotNull(task, "task");
        Preconditions.checkNotNull(taskStatus, "status");
        Preconditions.checkState(active, "Queue is not active!");
        Preconditions.checkArgument(task.getId().equals(taskStatus.getId()), "Mismatching task ids[%s/%s]", task.getId(), taskStatus.getId());
        final String taskId = task.getId();

        // Tell the runner it may shut the task down; capture the location first so it can be stored with the status.
        // Failures here are logged only, so that the bookkeeping below still happens.
        try {
            taskLocation = taskRunner.getTaskLocation(taskId);
            taskRunner.shutdown(taskId, reasonFormat, args);
        } catch (Exception e) {
            log.warn(e, "TaskRunner failed to cleanup task after completion: %s", taskId);
        }

        // Drop the task from the tracked list, scanning from the end and stopping at the first match.
        int removedCount = 0;
        for (int idx = tasks.size() - 1; idx >= 0; idx--) {
            final Task candidate = tasks.get(idx);
            if (!candidate.getId().equals(taskId)) {
                continue;
            }
            removedCount++;
            removeTaskInternal(candidate);
            break;
        }

        // NOTE(review): because the scan above stops at the first match, removedCount never exceeds 1 here, so the
        // "multiple copies" alert cannot currently fire.
        if (removedCount == 0) {
            log.warn("Unknown task completed: %s", taskId);
        } else if (removedCount > 1) {
            log.makeAlert("Removed multiple copies of task").addData("count", removedCount).addData("task", taskId).emit();
        }

        // The future is dropped regardless of whether the task was still tracked.
        taskFutures.remove(taskId);

        if (removedCount > 0) {
            // The queue believed this task was running, so record its new status.
            try {
                final Optional<TaskStatus> storedStatus = taskStorage.getStatus(taskId);
                if (storedStatus.isPresent() && storedStatus.get().isRunnable()) {
                    taskStorage.setStatus(taskStatus.withLocation(taskLocation));
                    log.info("Task done: %s", task);
                    managementMayBeNecessary.signalAll();
                } else {
                    log.makeAlert("Ignoring notification for already-complete task").addData("task", taskId).emit();
                }
            } catch (Exception e) {
                log.makeAlert(e, "Failed to persist status for task").addData("task", taskId).addData("statusCode", taskStatus.getStatusCode()).emit();
            }
        }
    } finally {
        giant.unlock();
    }
}
Also used : TaskStatus(org.apache.druid.indexer.TaskStatus) TaskLocation(org.apache.druid.indexer.TaskLocation) EntryExistsException(org.apache.druid.metadata.EntryExistsException) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException)

Example 73 with TaskStatus

use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

the class RemoteTaskRunner method scheduleTasksCleanupForWorker.

/**
 * Schedule a task that will, once the configured task cleanup timeout has elapsed, clean up znodes and issue
 * failures for "tasksToFail" if they are being run by "worker". Any cleanup previously scheduled for the same
 * worker is cancelled first, and the new cleanup future is tracked in "removedWorkerCleanups" until it completes.
 *
 * @param worker      name of the worker whose task and status znodes should be cleaned up
 * @param tasksToFail IDs of the tasks that were assigned to the worker and should be failed
 */
private void scheduleTasksCleanupForWorker(final String worker, final List<String> tasksToFail) {
    // This method is only called from the PathChildrenCache event handler, so this may look like a race,
    // but is actually not.
    cancelWorkerCleanup(worker);
    final ListenableScheduledFuture<?> cleanupTask = cleanupExec.schedule(() -> {
        log.info("Running scheduled cleanup for Worker[%s]", worker);
        try {
            for (String assignedTask : tasksToFail) {
                String taskPath = JOINER.join(indexerZkConfig.getTasksPath(), worker, assignedTask);
                String statusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker, assignedTask);
                if (cf.checkExists().forPath(taskPath) != null) {
                    cf.delete().guaranteed().forPath(taskPath);
                }
                if (cf.checkExists().forPath(statusPath) != null) {
                    cf.delete().guaranteed().forPath(statusPath);
                }
                log.info("Failing task[%s]", assignedTask);
                RemoteTaskRunnerWorkItem taskRunnerWorkItem = runningTasks.remove(assignedTask);
                if (taskRunnerWorkItem != null) {
                    // The message is a constant, so no format call is needed.
                    final TaskStatus taskStatus = TaskStatus.failure(assignedTask, "Canceled for worker cleanup. See overlord logs for more details.");
                    taskRunnerWorkItem.setResult(taskStatus);
                    TaskRunnerUtils.notifyStatusChanged(listeners, assignedTask, taskStatus);
                } else {
                    log.warn("RemoteTaskRunner has no knowledge of task[%s]", assignedTask);
                }
            }
            // worker is gone, remove worker task status announcements path.
            String workerStatusPath = JOINER.join(indexerZkConfig.getStatusPath(), worker);
            if (cf.checkExists().forPath(workerStatusPath) != null) {
                cf.delete().guaranteed().forPath(workerStatusPath);
            }
        } catch (Exception e) {
            // Attach the cause to the alert: the rethrown exception only fails the scheduled future, and the
            // callback registered below does not report it anywhere.
            log.makeAlert(e, "Exception while cleaning up worker[%s]", worker).emit();
            throw new RuntimeException(e);
        }
    }, config.getTaskCleanupTimeout().toStandardDuration().getMillis(), TimeUnit.MILLISECONDS);
    removedWorkerCleanups.put(worker, cleanupTask);
    // Remove this entry from removedWorkerCleanups when done, if it's actually the one in there.
    Futures.addCallback(cleanupTask, new FutureCallback<Object>() {

        @Override
        public void onSuccess(Object result) {
            removedWorkerCleanups.remove(worker, cleanupTask);
        }

        @Override
        public void onFailure(Throwable t) {
            removedWorkerCleanups.remove(worker, cleanupTask);
        }
    });
}
Also used : TaskStatus(org.apache.druid.indexer.TaskStatus) TimeoutException(java.util.concurrent.TimeoutException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)

Example 74 with TaskStatus

use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

the class BaseRestorableTaskRunner method restore.

/**
 * Re-submits the tasks listed in the restore file. Returns an empty list when the restore file does not exist or
 * cannot be read. Each listed task is loaded from the "task.json" file in its task directory; a task is run again
 * only if restoring on restart is enabled in the task config and the task reports that it can be restored.
 * A failure while restoring one task is logged and does not prevent the remaining tasks from being restored.
 *
 * @return one pair per restored task, holding the task and the future returned by {@code run(task)}
 */
@Override
public List<Pair<Task, ListenableFuture<TaskStatus>>> restore() {
    final File restoreFile = getRestoreFile();
    if (!restoreFile.exists()) {
        return ImmutableList.of();
    }

    final TaskRestoreInfo restoreInfo;
    try {
        restoreInfo = jsonMapper.readValue(restoreFile, TaskRestoreInfo.class);
    } catch (Exception e) {
        LOG.error(e, "Failed to read restorable tasks from file[%s]. Skipping restore.", restoreFile);
        return ImmutableList.of();
    }

    final List<Pair<Task, ListenableFuture<TaskStatus>>> restored = new ArrayList<>();
    for (final String taskId : restoreInfo.getRunningTasks()) {
        try {
            final File taskFile = new File(taskConfig.getTaskDir(taskId), "task.json");
            final Task task = jsonMapper.readValue(taskFile, Task.class);
            final String loadedId = task.getId();
            // The id stored in task.json must match the id the restore file listed.
            if (!loadedId.equals(taskId)) {
                throw new ISE("Task[%s] restore file had wrong id[%s]", taskId, loadedId);
            }
            if (!taskConfig.isRestoreTasksOnRestart() || !task.canRestore()) {
                continue;
            }
            LOG.info("Restoring task[%s].", loadedId);
            restored.add(Pair.of(task, run(task)));
        } catch (Exception e) {
            LOG.warn(e, "Failed to restore task[%s]. Trying to restore other tasks.", taskId);
        }
    }

    if (!restored.isEmpty()) {
        LOG.info("Restored %,d tasks: %s", restored.size(), Joiner.on(", ").join(restored));
    }
    return restored;
}
Also used : Task(org.apache.druid.indexing.common.task.Task) ArrayList(java.util.ArrayList) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ISE(org.apache.druid.java.util.common.ISE) TaskStatus(org.apache.druid.indexer.TaskStatus) File(java.io.File) Pair(org.apache.druid.java.util.common.Pair)

Example 75 with TaskStatus

use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.

the class ThreadingTaskRunner method run.

/**
 * Registers a work item for the given task, unless one already exists for its id, by submitting a callable to
 * {@code taskExecutor}; then saves the list of running tasks and returns the work item's result future.
 * <p>
 * The submitted callable prepares a per-attempt directory, runs the task on the executor thread, notifies the
 * registered listeners about location and status changes, and always cleans up afterwards (report writer entry,
 * appenderators, the work item itself and, unless the runner is stopping, the task directory).
 *
 * @param task task to run
 *
 * @return the result future of the work item registered under the task's id
 */
@Override
public ListenableFuture<TaskStatus> run(Task task) {
    synchronized (tasks) {
        // Only the first call for a given task id submits the callable; later calls reuse the existing work item.
        tasks.computeIfAbsent(task.getId(), k -> new ThreadingTaskRunnerWorkItem(task, taskExecutor.submit(new Callable<TaskStatus>() {

            @Override
            public TaskStatus call() {
                // Every attempt gets its own randomly named directory below the task directory.
                final String attemptUUID = UUID.randomUUID().toString();
                final File taskDir = taskConfig.getTaskDir(task.getId());
                final File attemptDir = new File(taskDir, attemptUUID);
                final TaskLocation taskLocation = TaskLocation.create(node.getHost(), node.getPlaintextPort(), node.getTlsPort());
                final ThreadingTaskRunnerWorkItem taskWorkItem;
                try {
                    FileUtils.mkdirp(attemptDir);
                    final File taskFile = new File(taskDir, "task.json");
                    final File reportsFile = new File(attemptDir, "report.json");
                    taskReportFileWriter.add(task.getId(), reportsFile);
                    // time to adjust process holders
                    synchronized (tasks) {
                        // Look up the work item registered by the enclosing run() call; abort if it is gone or
                        // has already been flagged as shut down.
                        taskWorkItem = tasks.get(task.getId());
                        if (taskWorkItem == null) {
                            LOGGER.makeAlert("TaskInfo disappeared").addData("task", task.getId()).emit();
                            throw new ISE("TaskInfo disappeared for task[%s]!", task.getId());
                        }
                        if (taskWorkItem.shutdown) {
                            throw new IllegalStateException("Task has been shut down!");
                        }
                    }
                    // Write task.json only if it is not already present in the task directory.
                    if (!taskFile.exists()) {
                        jsonMapper.writeValue(taskFile, task);
                    }
                    // This will block for a while. So we append the thread information with more details
                    final String priorThreadName = Thread.currentThread().getName();
                    Thread.currentThread().setName(StringUtils.format("[%s]-%s", task.getId(), priorThreadName));
                    TaskStatus taskStatus;
                    final TaskToolbox toolbox = toolboxFactory.build(task);
                    TaskRunnerUtils.notifyLocationChanged(listeners, task.getId(), taskLocation);
                    TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), TaskStatus.running(task.getId()));
                    taskWorkItem.setState(RunnerTaskState.RUNNING);
                    try {
                        taskStatus = task.run(toolbox);
                    } catch (Throwable t) {
                        // Anything thrown by the task itself is turned into a failure status instead of propagating.
                        LOGGER.error(t, "Exception caught while running the task.");
                        taskStatus = TaskStatus.failure(task.getId(), "Failed with an exception. See indexer logs for more details.");
                    } finally {
                        // Reset the runner state, restore the original thread name and push the report file if the
                        // task produced one.
                        taskWorkItem.setState(RunnerTaskState.NONE);
                        Thread.currentThread().setName(priorThreadName);
                        if (reportsFile.exists()) {
                            taskLogPusher.pushTaskReports(task.getId(), reportsFile);
                        }
                    }
                    TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), taskStatus);
                    return taskStatus;
                } catch (Throwable t) {
                    // Failures outside of task.run() (setup, shutdown check, report push, listener notification)
                    // are logged and rethrown wrapped in a RuntimeException.
                    LOGGER.error(t, "Exception caught during execution");
                    throw new RuntimeException(t);
                } finally {
                    // Cleanup runs on every exit path; exceptions raised here are logged and suppressed.
                    try {
                        taskReportFileWriter.delete(task.getId());
                        appenderatorsManager.removeAppenderatorsForTask(task.getId(), task.getDataSource());
                        synchronized (tasks) {
                            tasks.remove(task.getId());
                            if (!stopping) {
                                saveRunningTasks();
                            }
                        }
                        try {
                            // The task directory is left in place while the runner is stopping.
                            if (!stopping && taskDir.exists()) {
                                FileUtils.deleteDirectory(taskDir);
                                LOGGER.info("Removed task directory: %s", taskDir);
                            }
                        } catch (Exception e) {
                            LOGGER.makeAlert(e, "Failed to delete task directory").addData("taskDir", taskDir.toString()).addData("task", task.getId()).emit();
                        }
                    } catch (Exception e) {
                        LOGGER.error(e, "Suppressing exception caught while cleaning up task");
                    }
                }
            }
        })));
        saveRunningTasks();
        return tasks.get(task.getId()).getResult();
    }
}
Also used : TaskStatus(org.apache.druid.indexer.TaskStatus) TaskLocation(org.apache.druid.indexer.TaskLocation) TimeoutException(java.util.concurrent.TimeoutException) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) ISE(org.apache.druid.java.util.common.ISE) File(java.io.File)

Aggregations

TaskStatus (org.apache.druid.indexer.TaskStatus)135 Test (org.junit.Test)103 DataSegment (org.apache.druid.timeline.DataSegment)55 List (java.util.List)50 ImmutableList (com.google.common.collect.ImmutableList)44 ArrayList (java.util.ArrayList)41 TaskToolbox (org.apache.druid.indexing.common.TaskToolbox)40 Task (org.apache.druid.indexing.common.task.Task)39 InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest)37 Map (java.util.Map)34 File (java.io.File)32 IOException (java.io.IOException)26 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)25 ImmutableMap (com.google.common.collect.ImmutableMap)25 SegmentDescriptor (org.apache.druid.query.SegmentDescriptor)25 DataSchema (org.apache.druid.segment.indexing.DataSchema)25 ISE (org.apache.druid.java.util.common.ISE)24 HashMap (java.util.HashMap)23 Executor (java.util.concurrent.Executor)23 Pair (org.apache.druid.java.util.common.Pair)23