use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.
the class HttpRemoteTaskRunner method streamTaskReports.
/**
 * Returns a lazily-opened source for the live reports of a task that is still assigned to a
 * known worker.
 *
 * @param taskId id of the task whose live reports are requested
 * @return a {@link ByteSource} that issues an HTTP GET against the task's
 *         {@code /druid/worker/v1/chat/{taskId}/liveReports} endpoint each time it is opened, or
 *         {@code Optional.absent()} when the task is unknown, already COMPLETE, its worker is not
 *         in {@code workers}, or no task location has been recorded yet
 */
@Override
public Optional<ByteSource> streamTaskReports(String taskId) {
  // Read on tasks is safe
  @SuppressWarnings("GuardedBy") HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem = tasks.get(taskId);
  Worker worker = null;
  if (taskRunnerWorkItem != null && taskRunnerWorkItem.getState() != HttpRemoteTaskRunnerWorkItem.State.COMPLETE) {
    worker = taskRunnerWorkItem.getWorker();
  }

  if (worker == null || !workers.containsKey(worker.getHost())) {
    // Worker is not running this task, it might be available in deep storage
    return Optional.absent();
  }

  // Worker is still running this task
  final TaskLocation taskLocation = taskRunnerWorkItem.getLocation();
  if (TaskLocation.unknown().equals(taskLocation)) {
    // Work items start out with TaskLocation.unknown() and only get a real location once the
    // worker announces one; until then there is no endpoint to query.
    return Optional.absent();
  }

  final URL url = TaskRunnerUtils.makeTaskLocationURL(taskLocation, "/druid/worker/v1/chat/%s/liveReports", taskId);
  return Optional.of(new ByteSource() {
    @Override
    public InputStream openStream() throws IOException {
      try {
        return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler()).get();
      } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      } catch (ExecutionException e) {
        // Unwrap if possible
        Throwables.propagateIfPossible(e.getCause(), IOException.class);
        throw new RuntimeException(e);
      }
    }
  });
}
use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.
the class HttpRemoteTaskRunner method scheduleTasksCleanupForWorker.
/**
 * Schedules a one-shot cleanup that, after {@code config.getTaskCleanupTimeout()}, fails every
 * task still in RUNNING state whose assigned worker host equals {@code workerHostAndPort}.
 * Any cleanup previously scheduled for the same worker is cancelled first, and the new future is
 * tracked in {@code removedWorkerCleanups} until it finishes.
 *
 * @param workerHostAndPort host identifier of the worker, as returned by {@code Worker.getHost()}
 */
private void scheduleTasksCleanupForWorker(final String workerHostAndPort) {
  cancelWorkerCleanup(workerHostAndPort);

  final ListenableScheduledFuture<?> cleanupTask = cleanupExec.schedule(() -> {
    log.info("Running scheduled cleanup for Worker[%s]", workerHostAndPort);
    try {
      // Collect candidates under the lock, but complete them only after releasing it.
      Set<HttpRemoteTaskRunnerWorkItem> tasksToFail = new HashSet<>();
      synchronized (statusLock) {
        for (Map.Entry<String, HttpRemoteTaskRunnerWorkItem> e : tasks.entrySet()) {
          if (e.getValue().getState() == HttpRemoteTaskRunnerWorkItem.State.RUNNING) {
            Worker w = e.getValue().getWorker();
            if (w != null && w.getHost().equals(workerHostAndPort)) {
              tasksToFail.add(e.getValue());
            }
          }
        }
      }

      for (HttpRemoteTaskRunnerWorkItem taskItem : tasksToFail) {
        if (!taskItem.getResult().isDone()) {
          // Argument order must follow the placeholders: task id first, then worker host.
          log.warn("Failing task[%s] because worker[%s] disappeared and did not report within cleanup timeout[%s].", taskItem.getTaskId(), workerHostAndPort, config.getTaskCleanupTimeout());
          // taskComplete(..) must be called outside of statusLock, see comments on method.
          taskComplete(taskItem, null, TaskStatus.failure(taskItem.getTaskId(), StringUtils.format("The worker that this task was assigned disappeared and " + "did not report cleanup within timeout[%s]. " + "See overlord and middleManager/indexer logs for more details.", config.getTaskCleanupTimeout())));
        }
      }
    } catch (Exception e) {
      // Attach the exception so the alert carries the cause, not just the worker name.
      log.makeAlert(e, "Exception while cleaning up worker[%s]", workerHostAndPort).emit();
      throw new RuntimeException(e);
    }
  }, config.getTaskCleanupTimeout().toStandardDuration().getMillis(), TimeUnit.MILLISECONDS);

  removedWorkerCleanups.put(workerHostAndPort, cleanupTask);

  // Remove this entry from removedWorkerCleanups when done, if it's actually the one in there.
  Futures.addCallback(cleanupTask, new FutureCallback<Object>() {
    @Override
    public void onSuccess(Object result) {
      removedWorkerCleanups.remove(workerHostAndPort, cleanupTask);
    }

    @Override
    public void onFailure(Throwable t) {
      removedWorkerCleanups.remove(workerHostAndPort, cleanupTask);
    }
  });
}
use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.
the class HttpRemoteTaskRunner method taskAddedOrUpdated.
/**
 * Processes a task status announcement received from a worker and reconciles it with the
 * in-memory {@code tasks} map (falling back to {@code taskStorage} for tasks not in the map).
 *
 * Inside {@code statusLock} it only updates work-item state and records two decisions:
 * {@code isTaskCompleted} (the announcement finishes a task owned by this worker) and
 * {@code shouldShutdownTask} (the worker is running something it should not be). The resulting
 * calls to {@code taskComplete(..)} and {@code workerHolder.shutdownTask(..)} are made after the
 * lock is released. Finally, all threads waiting on {@code statusLock} are notified.
 *
 * @param announcement status announcement reported by the worker
 * @param workerHolder holder of the worker that sent the announcement
 */
void taskAddedOrUpdated(final TaskAnnouncement announcement, final WorkerHolder workerHolder) {
final String taskId = announcement.getTaskId();
final Worker worker = workerHolder.getWorker();
log.debug("Worker[%s] wrote [%s] status for task [%s] on [%s]", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId, announcement.getTaskLocation());
HttpRemoteTaskRunnerWorkItem taskItem;
boolean shouldShutdownTask = false;
boolean isTaskCompleted = false;
synchronized (statusLock) {
taskItem = tasks.get(taskId);
if (taskItem == null) {
// Try to find information about it in the TaskStorage
Optional<TaskStatus> knownStatusInStorage = taskStorage.getStatus(taskId);
if (knownStatusInStorage.isPresent()) {
switch(knownStatusInStorage.get().getStatusCode()) {
case RUNNING:
// Storage says the task is running: start tracking it here, attributed to the reporting
// worker, with an unknown location until the announcement handling below sets one.
taskItem = new HttpRemoteTaskRunnerWorkItem(taskId, worker, TaskLocation.unknown(), null, announcement.getTaskType(), HttpRemoteTaskRunnerWorkItem.State.RUNNING);
tasks.put(taskId, taskItem);
break;
case SUCCESS:
case FAILED:
// Storage already has a terminal status; taskItem stays null. Only a non-complete
// announcement is logged here (it is then scheduled for shutdown further down).
if (!announcement.getTaskStatus().isComplete()) {
log.info("Worker[%s] reported status for completed, known from taskStorage, task[%s]. Ignored.", worker.getHost(), taskId);
}
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s] in taskStorage. Notification[%s] from worker[%s] is ignored.", knownStatusInStorage.get().getStatusCode(), taskId, announcement, worker.getHost()).emit();
}
} else {
log.warn("Worker[%s] reported status[%s] for unknown task[%s]. Ignored.", worker.getHost(), announcement.getStatus(), taskId);
}
}
if (taskItem == null) {
// Still untracked after the storage lookup: if the worker claims the task is not complete,
// ask the worker to shut it down.
if (!announcement.getTaskStatus().isComplete()) {
shouldShutdownTask = true;
}
} else {
// Outer switch: status reported by the worker. Inner switches: state of the tracked item.
switch(announcement.getTaskStatus().getStatusCode()) {
case RUNNING:
switch(taskItem.getState()) {
case PENDING:
case PENDING_WORKER_ASSIGN:
taskItem.setWorker(worker);
taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
log.info("Task[%s] started RUNNING on worker[%s].", taskId, worker.getHost());
// fall through
case RUNNING:
if (worker.getHost().equals(taskItem.getWorker().getHost())) {
// Same worker as the one on record: just pick up a changed location, if any.
if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
taskItem.setLocation(announcement.getTaskLocation());
TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
}
} else {
// A different worker is on record for this task: shut down the copy on the reporting worker.
log.warn("Found worker[%s] running task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
shouldShutdownTask = true;
}
break;
case COMPLETE:
log.warn("Worker[%s] reported status for completed task[%s]. Ignored.", worker.getHost(), taskId);
shouldShutdownTask = true;
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
}
break;
case FAILED:
case SUCCESS:
switch(taskItem.getState()) {
case PENDING:
case PENDING_WORKER_ASSIGN:
// The item is first moved to RUNNING on this worker so the RUNNING case below applies;
// presumably taskComplete(..) performs the transition to COMPLETE - confirm there.
taskItem.setWorker(worker);
taskItem.setState(HttpRemoteTaskRunnerWorkItem.State.RUNNING);
log.info("Task[%s] finished on worker[%s].", taskId, worker.getHost());
// fall through
case RUNNING:
if (worker.getHost().equals(taskItem.getWorker().getHost())) {
if (!announcement.getTaskLocation().equals(taskItem.getLocation())) {
log.info("Task[%s] location changed on worker[%s]. new location[%s].", taskId, worker.getHost(), announcement.getTaskLocation());
taskItem.setLocation(announcement.getTaskLocation());
TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
}
// Only the worker on record may complete the task; the actual call happens after the lock.
isTaskCompleted = true;
} else {
log.warn("Worker[%s] reported completed task[%s] which is being run by another worker[%s]. Notification ignored.", worker.getHost(), taskId, taskItem.getWorker().getHost());
}
break;
case COMPLETE:
// this can happen when a worker is restarted and reports its list of completed tasks again.
break;
default:
log.makeAlert("Found unrecognized state[%s] of task[%s]. Notification[%s] from worker[%s] is ignored.", taskItem.getState(), taskId, announcement, worker.getHost()).emit();
}
break;
default:
log.makeAlert("Worker[%s] reported unrecognized state[%s] for task[%s].", worker.getHost(), announcement.getTaskStatus().getStatusCode(), taskId).emit();
}
}
}
if (isTaskCompleted) {
// taskComplete(..) must be called outside of statusLock, see comments on method.
taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
}
if (shouldShutdownTask) {
log.warn("Killing task[%s] on worker[%s].", taskId, worker.getHost());
workerHolder.shutdownTask(taskId);
}
// Wake up any threads waiting on statusLock so they can re-check task/worker state.
synchronized (statusLock) {
statusLock.notifyAll();
}
}
use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.
the class HttpRemoteTaskRunner method streamTaskLog.
/**
 * Returns a lazily-opened source for the log of a task that is still assigned to a known worker.
 *
 * @param taskId id of the task whose log is requested
 * @param offset value passed through as the {@code offset} query parameter of the worker's
 *               {@code /druid/worker/v1/task/{taskId}/log} endpoint
 * @return a {@link ByteSource} that issues an HTTP GET against the worker each time it is opened,
 *         or {@code Optional.absent()} when the task is unknown, already COMPLETE, or its worker
 *         is not in {@code workers}
 */
@Override
public Optional<ByteSource> streamTaskLog(String taskId, long offset) {
  // Read on tasks is safe
  @SuppressWarnings("GuardedBy") HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem = tasks.get(taskId);
  Worker worker = null;
  if (taskRunnerWorkItem != null && taskRunnerWorkItem.getState() != HttpRemoteTaskRunnerWorkItem.State.COMPLETE) {
    worker = taskRunnerWorkItem.getWorker();
  }

  if (worker == null || !workers.containsKey(worker.getHost())) {
    // Worker is not running this task, it might be available in deep storage
    return Optional.absent();
  }

  // Worker is still running this task
  final URL url = TaskRunnerUtils.makeWorkerURL(worker, "/druid/worker/v1/task/%s/log?offset=%s", taskId, Long.toString(offset));
  return Optional.of(new ByteSource() {
    @Override
    public InputStream openStream() throws IOException {
      try {
        return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler()).get();
      } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      } catch (ExecutionException e) {
        // Unwrap if possible
        Throwables.propagateIfPossible(e.getCause(), IOException.class);
        throw new RuntimeException(e);
      }
    }
  });
}
use of org.apache.druid.indexing.worker.Worker in project druid by druid-io.
the class RemoteTaskRunner method getLazyTaskSlotCount.
/**
 * Adds up the capacity of every worker returned by {@code getLazyWorkers()}, grouped by the
 * worker's category.
 *
 * @return a new map from worker category to the summed capacity of the lazy workers in it;
 *         empty when there are no lazy workers
 */
@Override
public Map<String, Long> getLazyTaskSlotCount() {
  final Map<String, Long> lazySlotsByCategory = new HashMap<>();
  for (Worker lazyWorker : getLazyWorkers()) {
    final String category = lazyWorker.getCategory();
    final int capacity = lazyWorker.getCapacity();
    // First worker of a category stores its capacity; later ones are added to the running sum.
    lazySlotsByCategory.merge(category, (long) capacity, Long::sum);
  }
  return lazySlotsByCategory;
}
Aggregations