Search in sources :

Example 1 with Worker

use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.

the class HttpRemoteTaskRunner method streamTaskReports.

/**
 * Returns a lazily opened stream of the live reports of a task that is still
 * assigned to a known worker.
 *
 * @param taskId id of the task whose live reports are requested
 * @return a {@link ByteSource} that issues the HTTP GET when opened, or
 *         {@code Optional.absent()} when the task is unknown, already COMPLETE,
 *         its worker is no longer tracked in {@code workers}, or no task
 *         location has been recorded for it yet
 */
@Override
public Optional<ByteSource> streamTaskReports(String taskId) {
    // Read on tasks is safe
    @SuppressWarnings("GuardedBy") HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem = tasks.get(taskId);
    Worker worker = null;
    if (taskRunnerWorkItem != null && taskRunnerWorkItem.getState() != HttpRemoteTaskRunnerWorkItem.State.COMPLETE) {
        worker = taskRunnerWorkItem.getWorker();
    }
    if (worker == null || !workers.containsKey(worker.getHost())) {
        // Worker is not running this task, it might be available in deep storage
        return Optional.absent();
    }

    // Worker is still running this task
    final TaskLocation taskLocation = taskRunnerWorkItem.getLocation();
    if (taskLocation == null || TaskLocation.unknown().equals(taskLocation)) {
        // No location has been announced for this task yet, so there is no peon endpoint to query.
        return Optional.absent();
    }

    final URL url = TaskRunnerUtils.makeTaskLocationURL(taskLocation, "/druid/worker/v1/chat/%s/liveReports", taskId);
    return Optional.of(new ByteSource() {

        @Override
        public InputStream openStream() throws IOException {
            try {
                return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler()).get();
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers further up the stack can still observe it.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            } catch (ExecutionException e) {
                // Unwrap if possible
                Throwables.propagateIfPossible(e.getCause(), IOException.class);
                throw new RuntimeException(e);
            }
        }
    });
}
Also used : InputStream(java.io.InputStream) Request(org.apache.druid.java.util.http.client.Request) IOException(java.io.IOException) TaskLocation(org.apache.druid.indexer.TaskLocation) URL(java.net.URL) InputStreamResponseHandler(org.apache.druid.java.util.http.client.response.InputStreamResponseHandler) Worker(org.apache.druid.indexing.worker.Worker) ByteSource(com.google.common.io.ByteSource) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with Worker

use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.

the class HttpRemoteTaskRunner method scheduleTasksCleanupForWorker.

/**
 * Schedules a delayed job that fails every task still RUNNING on the given
 * worker once {@code config.getTaskCleanupTimeout()} has elapsed, and records
 * the scheduled future in {@code removedWorkerCleanups}.
 *
 * @param workerHostAndPort host string of the worker, compared against
 *                          {@code Worker.getHost()} of each task's worker
 */
private void scheduleTasksCleanupForWorker(final String workerHostAndPort) {
    // NOTE(review): presumably cancels an earlier pending cleanup for the same worker - confirm in cancelWorkerCleanup.
    cancelWorkerCleanup(workerHostAndPort);
    final ListenableScheduledFuture<?> cleanupTask = cleanupExec.schedule(() -> {
        log.info("Running scheduled cleanup for Worker[%s]", workerHostAndPort);
        try {
            // Collect candidates under the lock; fail them only after the lock is released.
            Set<HttpRemoteTaskRunnerWorkItem> tasksToFail = new HashSet<>();
            synchronized (statusLock) {
                for (Map.Entry<String, HttpRemoteTaskRunnerWorkItem> e : tasks.entrySet()) {
                    if (e.getValue().getState() == HttpRemoteTaskRunnerWorkItem.State.RUNNING) {
                        Worker w = e.getValue().getWorker();
                        if (w != null && w.getHost().equals(workerHostAndPort)) {
                            tasksToFail.add(e.getValue());
                        }
                    }
                }
            }
            for (HttpRemoteTaskRunnerWorkItem taskItem : tasksToFail) {
                if (!taskItem.getResult().isDone()) {
                    // Argument order must follow the placeholders: task id first, then worker host.
                    log.warn("Failing task[%s] because worker[%s] disappeared and did not report within cleanup timeout[%s].", taskItem.getTaskId(), workerHostAndPort, config.getTaskCleanupTimeout());
                    // taskComplete(..) must be called outside of statusLock, see comments on method.
                    taskComplete(taskItem, null, TaskStatus.failure(taskItem.getTaskId(), StringUtils.format("The worker that this task was assigned disappeared and " + "did not report cleanup within timeout[%s]. " + "See overlord and middleManager/indexer logs for more details.", config.getTaskCleanupTimeout())));
                }
            }
        } catch (Exception e) {
            // Attach the cause so the emitted alert carries the failure details.
            log.makeAlert(e, "Exception while cleaning up worker[%s]", workerHostAndPort).emit();
            throw new RuntimeException(e);
        }
    }, config.getTaskCleanupTimeout().toStandardDuration().getMillis(), TimeUnit.MILLISECONDS);
    removedWorkerCleanups.put(workerHostAndPort, cleanupTask);
    // Remove this entry from removedWorkerCleanups when done, if it's actually the one in there.
    Futures.addCallback(cleanupTask, new FutureCallback<Object>() {

        @Override
        public void onSuccess(Object result) {
            removedWorkerCleanups.remove(workerHostAndPort, cleanupTask);
        }

        @Override
        public void onFailure(Throwable t) {
            removedWorkerCleanups.remove(workerHostAndPort, cleanupTask);
        }
    });
}
Also used : KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) Worker(org.apache.druid.indexing.worker.Worker) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) HashSet(java.util.HashSet)

Example 3 with Worker

use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.

the class HttpRemoteTaskRunner method taskAddedOrUpdated.

/**
 * Processes a task announcement received from a worker and reconciles it with
 * the work item tracked in {@code tasks}.
 *
 * While holding {@code statusLock} the method looks up (or, when task storage
 * reports the task as RUNNING, creates) the work item and updates its worker,
 * state and location according to the announced status. Two decisions are
 * recorded as flags and acted upon only after the lock is released: completing
 * the task via {@code taskComplete(..)} and asking the announcing worker to
 * shut the task down. Finally all threads waiting on {@code statusLock} are
 * notified.
 *
 * @param announcement the task status/location reported by the worker
 * @param workerHolder holder of the worker that sent the announcement; also
 *                     used to issue the shutdown request
 */
void taskAddedOrUpdated(final TaskAnnouncement announcement, final WorkerHolder workerHolder) {
    final String taskId = announcement.getTaskId();
    final Worker worker = workerHolder.getWorker();
    log.debug("Worker[%s] wrote [%s] status for task [%s] on [%s]", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId, announcement.getTaskLocation());
    HttpRemoteTaskRunnerWorkItem taskItem;
    // Set under the lock, acted upon after the lock is released.
    boolean shouldShutdownTask = false;
    boolean isTaskCompleted = false;
    synchronized (statusLock) {
        taskItem = tasks.get(taskId);
        if (taskItem == null) {
            // Try to find information about it in the TaskStorage
            Optional<TaskStatus> knownStatusInStorage = taskStorage.getStatus(taskId);
            if (knownStatusInStorage.isPresent()) {
                switch(knownStatusInStorage.get().getStatusCode()) {
                    case RUNNING:
                        // Storage says the task is running: start tracking it with an unknown location;
                        // the announcement handling below may then fill in the real location.
                        taskItem = new HttpRemoteTaskRunnerWorkItem(taskId, worker, TaskLocation.unknown(), null, announcement.getTaskType(), HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                        tasks.put(taskId, taskItem);
                        break;
                    case SUCCESS:
                    case FAILED:
                        // Storage already has a terminal status; taskItem stays null.
                        if (!announcement.getTaskStatus().isComplete()) {
                            log.info("Worker[%s] reported status for completed, known from taskStorage, task[%s]. Ignored.", worker.getHost(), taskId);
                        }
                        break;
                    default:
                        log.makeAlert("Found unrecognized state[%s] of task[%s] in taskStorage. Notification[%s] from worker[%s] is ignored.", knownStatusInStorage.get().getStatusCode(), taskId, announcement, worker.getHost()).emit();
                }
            } else {
                log.warn("Worker[%s] reported status[%s] for unknown task[%s]. Ignored.", worker.getHost(), announcement.getStatus(), taskId);
            }
        }
        if (taskItem == null) {
            // Nothing is tracking this task; if the worker reports it as not complete, ask it to stop.
            if (!announcement.getTaskStatus().isComplete()) {
                shouldShutdownTask = true;
            }
        } else {
            // Outer switch: status announced by the worker. Inner switches: state of the tracked work item.
            switch(announcement.getTaskStatus().getStatusCode()) {
                case RUNNING:
                    switch(taskItem.getState()) {
                        case PENDING:
                        case PENDING_WORKER_ASSIGN:
                            taskItem.setWorker(worker);
                            taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                            log.info("Task[%s] started RUNNING on worker[%s].", taskId, worker.getHost());
                        // fall through
                        case RUNNING:
                            // Accept the update only from the worker recorded on the work item.
                            if (worker.getHost().equals(taskItem.getWorker().getHost())) {
                                if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
                                    log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
                                    taskItem.setLocation(announcement.getTaskLocation());
                                    TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
                                }
                            } else {
                                // A different worker claims to run this task: shut it down on the announcing worker.
                                log.warn("Found worker[%s] running task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
                                shouldShutdownTask = true;
                            }
                            break;
                        case COMPLETE:
                            log.warn("Worker[%s] reported status for completed task[%s]. Ignored.", worker.getHost(), taskId);
                            shouldShutdownTask = true;
                            break;
                        default:
                            log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
                    }
                    break;
                case FAILED:
                case SUCCESS:
                    switch(taskItem.getState()) {
                        case PENDING:
                        case PENDING_WORKER_ASSIGN:
                            // The item is moved to RUNNING here and then handled by the RUNNING case below.
                            // NOTE(review): the transition to COMPLETE presumably happens in taskComplete(..) - confirm.
                            taskItem.setWorker(worker);
                            taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
                            log.info("Task[%s] finished on worker[%s].", taskId, worker.getHost());
                        // fall through
                        case RUNNING:
                            if (worker.getHost().equals(taskItem.getWorker().getHost())) {
                                if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
                                    log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
                                    taskItem.setLocation(announcement.getTaskLocation());
                                    TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
                                }
                                // Completion itself is deferred until statusLock has been released.
                                isTaskCompleted = true;
                            } else {
                                log.warn("Worker[%s] reported completed task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
                            }
                            break;
                        case COMPLETE:
                            // this can happen when a worker is restarted and reports its list of completed tasks again.
                            break;
                        default:
                            log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
                    }
                    break;
                default:
                    log.makeAlert("Worker[%s] reported unrecognized state[%s] for task[%s].", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId).emit();
            }
        }
    }
    if (isTaskCompleted) {
        // taskComplete(..) must be called outside of statusLock, see comments on method.
        taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
    }
    if (shouldShutdownTask) {
        log.warn("Killing task[%s] on worker[%s].", taskId, worker.getHost());
        workerHolder.shutdownTask(taskId);
    }
    // Wake up every thread waiting on statusLock so it can re-check task state.
    synchronized (statusLock) {
        statusLock.notifyAll();
    }
}
Also used : Worker(org.apache.druid.indexing.worker.Worker) TaskStatus(org.apache.druid.indexer.TaskStatus)

Example 4 with Worker

use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.

the class HttpRemoteTaskRunner method streamTaskLog.

/**
 * Returns a lazily opened stream of the log of a task that is still assigned
 * to a known worker, starting at the given offset.
 *
 * @param taskId id of the task whose log is requested
 * @param offset value passed through as the {@code offset} query parameter of
 *               the worker's log endpoint
 * @return a {@link ByteSource} that issues the HTTP GET when opened, or
 *         {@code Optional.absent()} when the task is unknown, already COMPLETE,
 *         or its worker is no longer tracked in {@code workers}
 */
@Override
public Optional<ByteSource> streamTaskLog(String taskId, long offset) {
    // Read on tasks is safe
    @SuppressWarnings("GuardedBy") HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem = tasks.get(taskId);
    Worker worker = null;
    if (taskRunnerWorkItem != null && taskRunnerWorkItem.getState() != HttpRemoteTaskRunnerWorkItem.State.COMPLETE) {
        worker = taskRunnerWorkItem.getWorker();
    }
    if (worker == null || !workers.containsKey(worker.getHost())) {
        // Worker is not running this task, it might be available in deep storage
        return Optional.absent();
    }

    // Worker is still running this task
    final URL url = TaskRunnerUtils.makeWorkerURL(worker, "/druid/worker/v1/task/%s/log?offset=%s", taskId, Long.toString(offset));
    return Optional.of(new ByteSource() {

        @Override
        public InputStream openStream() throws IOException {
            try {
                return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler()).get();
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers further up the stack can still observe it.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            } catch (ExecutionException e) {
                // Unwrap if possible
                Throwables.propagateIfPossible(e.getCause(), IOException.class);
                throw new RuntimeException(e);
            }
        }
    });
}
Also used : InputStream(java.io.InputStream) Request(org.apache.druid.java.util.http.client.Request) IOException(java.io.IOException) URL(java.net.URL) InputStreamResponseHandler(org.apache.druid.java.util.http.client.response.InputStreamResponseHandler) Worker(org.apache.druid.indexing.worker.Worker) ByteSource(com.google.common.io.ByteSource) ExecutionException(java.util.concurrent.ExecutionException)

Example 5 with Worker

use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.

the class RemoteTaskRunner method getLazyTaskSlotCount.

/**
 * Adds up the capacity of every worker returned by {@code getLazyWorkers()},
 * grouped by worker category.
 *
 * @return a new mutable map from worker category to the summed capacity of
 *         the lazy workers in that category; empty when there are none
 */
@Override
public Map<String, Long> getLazyTaskSlotCount() {
    final Map<String, Long> slotsByCategory = new HashMap<>();
    for (Worker lazyWorker : getLazyWorkers()) {
        final String category = lazyWorker.getCategory();
        // Widen to long up front so the running total is accumulated as a long.
        final long capacity = lazyWorker.getCapacity();
        slotsByCategory.merge(category, capacity, Long::sum);
    }
    return slotsByCategory;
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) Worker(org.apache.druid.indexing.worker.Worker)

Aggregations

Worker (org.apache.druid.indexing.worker.Worker): 46, Test (org.junit.Test): 32, NoopTask (org.apache.druid.indexing.common.task.NoopTask): 21, ImmutableWorkerInfo (org.apache.druid.indexing.overlord.ImmutableWorkerInfo): 15, ArrayList (java.util.ArrayList): 14, Task (org.apache.druid.indexing.common.task.Task): 13, TaskStorage (org.apache.druid.indexing.overlord.TaskStorage): 12, RemoteTaskRunnerConfig (org.apache.druid.indexing.overlord.config.RemoteTaskRunnerConfig): 11, ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 10, ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService): 10, AtomicReference (java.util.concurrent.atomic.AtomicReference): 10, HttpRemoteTaskRunnerConfig (org.apache.druid.indexing.overlord.config.HttpRemoteTaskRunnerConfig): 10, HttpClient (org.apache.druid.java.util.http.client.HttpClient): 10, IndexerZkConfig (org.apache.druid.server.initialization.IndexerZkConfig): 10, ImmutableList (com.google.common.collect.ImmutableList): 9, List (java.util.List): 9, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 9, CuratorFramework (org.apache.curator.framework.CuratorFramework): 9, DruidNodeDiscoveryProvider (org.apache.druid.discovery.DruidNodeDiscoveryProvider): 9, TaskStatus (org.apache.druid.indexer.TaskStatus): 9