use of org.apache.druid.indexer.TaskLocation in project druid by druid-io.
the class IndexTaskClient method submitRequest.
/**
 * Sends an HTTP request to the task identified by {@code taskId} and returns the handler's result on success.
 *
 * <p>Every attempt re-reads the task's status and location from {@code taskInfoProvider}, builds a new request,
 * probes the connection and then submits it. An error response is converted into an {@code IAE} when the status
 * code is 400 and into an {@code IOE} for any other status. {@code IOException}s and {@code ChannelException}s are
 * retried after a delay while {@code retry} is true and the retry policy still yields a delay; otherwise they are
 * logged and rethrown.
 *
 * @param taskId             id of the task to contact
 * @param mediaType          media type of {@code content}; nullable if content is empty
 * @param method             HTTP method to use
 * @param encodedPathSuffix  already-encoded path suffix appended after the encoded task id
 * @param encodedQueryString already-encoded query string, or null
 * @param content            request body
 * @param responseHandler    handler applied to the HTTP response
 * @param retry              whether failures should be retried
 * @return the value produced by {@code responseHandler}
 * @throws IOException             on a failed request that is not (or no longer) retried
 * @throws ChannelException        on a channel failure that is not (or no longer) retried
 * @throws NoTaskLocationException when the task's location is unknown
 */
protected <IntermediateType, FinalType> FinalType submitRequest(
    String taskId,
    @Nullable String mediaType,
    HttpMethod method,
    String encodedPathSuffix,
    @Nullable String encodedQueryString,
    byte[] content,
    HttpResponseHandler<IntermediateType, FinalType> responseHandler,
    boolean retry
) throws IOException, ChannelException, NoTaskLocationException {
  final RetryPolicy retryPolicy = retryPolicyFactory.makeRetryPolicy();

  while (true) {
    String path = StringUtils.format("%s/%s/%s", BASE_PATH, StringUtils.urlEncode(taskId), encodedPathSuffix);

    // Give up immediately when the task has no status or is no longer runnable.
    Optional<TaskStatus> status = taskInfoProvider.getTaskStatus(taskId);
    if (!(status.isPresent() && status.get().isRunnable())) {
      throw new TaskNotRunnableException(
          StringUtils.format("Aborting request because task [%s] is not runnable", taskId)
      );
    }

    final TaskLocation location = taskInfoProvider.getTaskLocation(taskId);
    if (location.equals(TaskLocation.unknown())) {
      throw new NoTaskLocationException(StringUtils.format("No TaskLocation available for task [%s]", taskId));
    }

    final Request request = createRequest(taskId, location, path, encodedQueryString, method, mediaType, content);

    // Kept outside the try so the catch block can inspect an error response, if one was received.
    Either<StringFullResponseHolder, FinalType> response = null;
    try {
      // Netty throws some annoying exceptions if a connection can't be opened, which happens relatively frequently
      // for tasks that happen to still be starting up, so test the connection first to keep the logs clean.
      checkConnection(request.getUrl().getHost(), request.getUrl().getPort());

      response = submitRequest(request, responseHandler);
      if (response.isValue()) {
        return response.valueOrThrow();
      }

      // Error response: build a message from the status and, when present, the start of the body.
      final StringFullResponseHolder errorHolder = response.error();
      final HttpResponseStatus httpStatus = errorHolder.getStatus();
      final String body = errorHolder.getContent();

      String message = "Received server error with status [" + httpStatus + "]";
      if (!Strings.isNullOrEmpty(body)) {
        message += "; first 1KB of body: "
                   + StringUtils.chop(StringUtils.nullToEmptyNonDruidDataString(body), 1000);
      }

      if (httpStatus.getCode() == 400) {
        // don't bother retrying if it's a bad request
        throw new IAE(message);
      }
      throw new IOE(message);
    } catch (IOException | ChannelException e) {
      // Since workers are free to move tasks around to different ports, there is a chance that a task may have been
      // moved but our view of its location has not been updated yet from ZK. To detect this case, we send a header
      // identifying our expected recipient in the request; if this doesn't correspond to the worker we messaged, the
      // worker will return an HTTP 404 with its ID in the response header. If we get a mismatching task ID, then
      // we wait for a fixed short period instead of consulting the retry policy, expecting the task's location to
      // eventually be updated.
      String reportedTaskId = null;
      if (response != null
          && !response.isValue()
          && response.error().getStatus().equals(HttpResponseStatus.NOT_FOUND)) {
        final String headerId = StringUtils.urlDecode(
            response.error().getResponse().headers().get(ChatHandlerResource.TASK_ID_HEADER)
        );
        if (headerId != null && !headerId.equals(taskId)) {
          reportedTaskId = headerId;
        }
      }

      final Duration delay;
      if (reportedTaskId == null) {
        delay = retryPolicy.getAndIncrementRetryDelay();
      } else {
        log.warn(
            "Expected worker to have taskId [%s] but has taskId [%s], will retry in [%d]s",
            taskId,
            reportedTaskId,
            TASK_MISMATCH_RETRY_DELAY_SECONDS
        );
        delay = Duration.standardSeconds(TASK_MISMATCH_RETRY_DELAY_SECONDS);
      }

      final String urlForLog = request.getUrl().toString();

      if (!retry) {
        // if retry=false, we probably aren't too concerned if the operation doesn't succeed (i.e. the request was
        // for informational purposes only); log at INFO instead of WARN.
        log.noStackTrace().info(e, "submitRequest failed for [%s]", urlForLog);
        throw e;
      }

      if (delay == null) {
        // When retrying, log the final failure at WARN level, since it is likely to be bad news.
        log.warn(e, "submitRequest failed for [%s]", urlForLog);
        throw e;
      }

      // When retrying, log non-final failures at INFO level.
      final long sleepTime = delay.getMillis();
      log.noStackTrace().info(
          e,
          "submitRequest failed for [%s]; will try again in [%s]",
          urlForLog,
          new Duration(sleepTime).toString()
      );
      try {
        Thread.sleep(sleepTime);
      } catch (InterruptedException e2) {
        // Restore the interrupt flag and surface the original failure with the interruption attached.
        Thread.currentThread().interrupt();
        e.addSuppressed(e2);
        throw new RuntimeException(e);
      }
    } catch (NoTaskLocationException e) {
      log.info(
          "No TaskLocation available for task [%s], this task may not have been assigned to a worker yet "
          + "or may have already completed",
          taskId
      );
      throw e;
    } catch (Exception e) {
      log.warn(e, "Exception while sending request");
      throw e;
    }
  }
}
use of org.apache.druid.indexer.TaskLocation in project druid by druid-io.
the class ForkingTaskRunner method run.
/**
 * Registers a work item for {@code task} (if one is not already present) and returns its result future.
 *
 * <p>The work item wraps a {@code Callable} submitted to {@code exec}. That callable assembles a java command line
 * for the task, hands it to {@code runTaskProcess}, waits for the process to complete, and derives the final
 * {@link TaskStatus} from the status file (exit code 0) or builds a failure status (any other exit code).
 * Ports, the work item entry and the task directory are cleaned up in a {@code finally} block.
 *
 * @param task the task to run
 * @return the result future held by the work item registered under the task's id
 */
@Override
public ListenableFuture<TaskStatus> run(final Task task) {
synchronized (tasks) {
// Only creates (and submits) a new work item when none is registered for this task id.
tasks.computeIfAbsent(task.getId(), k -> new ForkingTaskRunnerWorkItem(task, exec.submit(new Callable<TaskStatus>() {
@Override
public TaskStatus call() {
// Each attempt gets its own sub-directory, named by a random UUID, under the task directory.
final String attemptUUID = UUID.randomUUID().toString();
final File taskDir = taskConfig.getTaskDir(task.getId());
final File attemptDir = new File(taskDir, attemptUUID);
final ProcessHolder processHolder;
final String childHost = node.getHost();
// Ports stay at -1 when the corresponding port type is not enabled on this node.
int childPort = -1;
int tlsChildPort = -1;
if (node.isEnablePlaintextPort()) {
childPort = portFinder.findUnusedPort();
}
if (node.isEnableTlsPort()) {
tlsChildPort = portFinder.findUnusedPort();
}
final TaskLocation taskLocation = TaskLocation.create(childHost, childPort, tlsChildPort);
try {
// Resources registered with the closer are closed in the inner finally block below.
final Closer closer = Closer.create();
try {
FileUtils.mkdirp(attemptDir);
// task.json and log live in the task directory; status.json and report.json are per attempt.
final File taskFile = new File(taskDir, "task.json");
final File statusFile = new File(attemptDir, "status.json");
final File logFile = new File(taskDir, "log");
final File reportsFile = new File(attemptDir, "report.json");
// time to adjust process holders
synchronized (tasks) {
final ForkingTaskRunnerWorkItem taskWorkItem = tasks.get(task.getId());
// Sanity checks: the work item must still exist, must not be shut down, and must not have a process yet.
if (taskWorkItem == null) {
LOGGER.makeAlert("TaskInfo disappeared!").addData("task", task.getId()).emit();
throw new ISE("TaskInfo disappeared for task[%s]!", task.getId());
}
if (taskWorkItem.shutdown) {
throw new IllegalStateException("Task has been shut down!");
}
if (taskWorkItem.processHolder != null) {
LOGGER.makeAlert("TaskInfo already has a processHolder").addData("task", task.getId()).emit();
throw new ISE("TaskInfo already has processHolder for task[%s]!", task.getId());
}
// Build the command line; arguments are appended in order, so later -D options follow earlier ones.
final List<String> command = new ArrayList<>();
final String taskClasspath;
// A non-empty task classpath prefix is placed ahead of the configured classpath.
if (task.getClasspathPrefix() != null && !task.getClasspathPrefix().isEmpty()) {
taskClasspath = Joiner.on(File.pathSeparator).join(task.getClasspathPrefix(), config.getClasspath());
} else {
taskClasspath = config.getClasspath();
}
command.add(config.getJavaCommand());
command.add("-cp");
command.add(taskClasspath);
Iterables.addAll(command, new QuotableWhiteSpaceSplitter(config.getJavaOpts()));
Iterables.addAll(command, config.getJavaOptsArray());
// Override task specific javaOpts
Object taskJavaOpts = task.getContextValue(ForkingTaskRunnerConfig.JAVA_OPTS_PROPERTY);
if (taskJavaOpts != null) {
// NOTE(review): the context value is cast to String; a non-String value would fail here.
Iterables.addAll(command, new QuotableWhiteSpaceSplitter((String) taskJavaOpts));
}
// Forward runner properties whose names start with an allowed prefix, excluding the javaOpts properties.
// NOTE(review): the inner loop has no break, so a property matching several prefixes is added once per match.
for (String propName : props.stringPropertyNames()) {
for (String allowedPrefix : config.getAllowedPrefixes()) {
// See https://github.com/apache/druid/issues/1841
if (propName.startsWith(allowedPrefix) && !ForkingTaskRunnerConfig.JAVA_OPTS_PROPERTY.equals(propName) && !ForkingTaskRunnerConfig.JAVA_OPTS_ARRAY_PROPERTY.equals(propName)) {
command.add(StringUtils.format("-D%s=%s", propName, props.getProperty(propName)));
}
}
}
// Override child JVM specific properties
// (properties starting with CHILD_PROPERTY_PREFIX are passed with that prefix stripped)
for (String propName : props.stringPropertyNames()) {
if (propName.startsWith(CHILD_PROPERTY_PREFIX)) {
command.add(StringUtils.format("-D%s=%s", propName.substring(CHILD_PROPERTY_PREFIX.length()), props.getProperty(propName)));
}
}
// Override task specific properties
// (same prefix stripping, but sourced from the task context)
final Map<String, Object> context = task.getContext();
if (context != null) {
for (String propName : context.keySet()) {
if (propName.startsWith(CHILD_PROPERTY_PREFIX)) {
command.add(StringUtils.format("-D%s=%s", propName.substring(CHILD_PROPERTY_PREFIX.length()), task.getContextValue(propName)));
}
}
}
// Add dataSource, taskId and taskType for metrics or logging
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.DATASOURCE, task.getDataSource()));
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.TASK_ID, task.getId()));
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.TASK_TYPE, task.getType()));
// Host and ports chosen above for the process being launched.
command.add(StringUtils.format("-Ddruid.host=%s", childHost));
command.add(StringUtils.format("-Ddruid.plaintextPort=%d", childPort));
command.add(StringUtils.format("-Ddruid.tlsPort=%d", tlsChildPort));
// Let tasks know where they are running on.
// This information is used in native parallel indexing with shuffle.
command.add(StringUtils.format("-Ddruid.task.executor.service=%s", node.getServiceName()));
command.add(StringUtils.format("-Ddruid.task.executor.host=%s", node.getHost()));
command.add(StringUtils.format("-Ddruid.task.executor.plaintextPort=%d", node.getPlaintextPort()));
command.add(StringUtils.format("-Ddruid.task.executor.enablePlaintextPort=%s", node.isEnablePlaintextPort()));
command.add(StringUtils.format("-Ddruid.task.executor.tlsPort=%d", node.getTlsPort()));
command.add(StringUtils.format("-Ddruid.task.executor.enableTlsPort=%s", node.isEnableTlsPort()));
// These are not enabled per default to allow the user to either set or not set them
// Users are highly suggested to be set in druid.indexer.runner.javaOpts
// See org.apache.druid.concurrent.TaskThreadPriority#getThreadPriorityFromTaskPriority(int)
// for more information
// command.add("-XX:+UseThreadPriorities");
// command.add("-XX:ThreadPriorityPolicy=42");
// Main class followed by its arguments: sub-command words, then the task, status and reports file paths.
command.add("org.apache.druid.cli.Main");
command.add("internal");
command.add("peon");
command.add(taskFile.toString());
command.add(statusFile.toString());
command.add(reportsFile.toString());
String nodeType = task.getNodeType();
if (nodeType != null) {
command.add("--nodeType");
command.add(nodeType);
}
// join queries
if (task.supportsQueries()) {
command.add("--loadBroadcastSegments");
command.add("true");
}
// The task JSON is only written when the file does not exist yet; an existing file is left untouched.
if (!taskFile.exists()) {
jsonMapper.writeValue(taskFile, task);
}
// The command is logged via getMaskedCommand using the configured mask properties.
LOGGER.info("Running command: %s", getMaskedCommand(startupLoggingConfig.getMaskProperties(), command));
taskWorkItem.processHolder = runTaskProcess(command, logFile, taskLocation);
processHolder = taskWorkItem.processHolder;
processHolder.registerWithCloser(closer);
}
// Listener notifications happen after leaving the synchronized (tasks) section above.
TaskRunnerUtils.notifyLocationChanged(listeners, task.getId(), taskLocation);
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), TaskStatus.running(task.getId()));
LOGGER.info("Logging task %s output to: %s", task.getId(), logFile);
final int exitCode = waitForTaskProcessToComplete(task, processHolder, logFile, reportsFile);
final TaskStatus status;
if (exitCode == 0) {
LOGGER.info("Process exited successfully for task: %s", task.getId());
// Process exited successfully
// so the final status is whatever was written to the status file
status = jsonMapper.readValue(statusFile, TaskStatus.class);
} else {
LOGGER.error("Process exited with code[%d] for task: %s", exitCode, task.getId());
// Process exited unsuccessfully
status = TaskStatus.failure(task.getId(), StringUtils.format("Task execution process exited unsuccessfully with code[%s]. " + "See middleManager logs for more details.", exitCode));
}
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), status);
return status;
} catch (Throwable t) {
// Route the failure through the closer before it is closed in the finally block.
throw closer.rethrow(t);
} finally {
closer.close();
}
} catch (Throwable t) {
// Any failure is logged at INFO and rethrown wrapped in a RuntimeException.
LOGGER.info(t, "Exception caught during execution");
throw new RuntimeException(t);
} finally {
// Cleanup runs on success and on failure; exceptions raised here are logged and suppressed.
try {
synchronized (tasks) {
final ForkingTaskRunnerWorkItem taskWorkItem = tasks.remove(task.getId());
if (taskWorkItem != null && taskWorkItem.processHolder != null) {
taskWorkItem.processHolder.shutdown();
}
// Running tasks are not re-saved while the runner is stopping.
if (!stopping) {
saveRunningTasks();
}
}
// Release the ports that were reserved at the start of call().
if (node.isEnablePlaintextPort()) {
portFinder.markPortUnused(childPort);
}
if (node.isEnableTlsPort()) {
portFinder.markPortUnused(tlsChildPort);
}
try {
// The task directory is left in place while the runner is stopping.
if (!stopping && taskDir.exists()) {
FileUtils.deleteDirectory(taskDir);
// Note: this message is logged after the directory has already been deleted.
LOGGER.info("Removing task directory: %s", taskDir);
}
} catch (Exception e) {
LOGGER.makeAlert(e, "Failed to delete task directory").addData("taskDir", taskDir.toString()).addData("task", task.getId()).emit();
}
} catch (Exception e) {
LOGGER.error(e, "Suppressing exception caught while cleaning up task");
}
}
}
})));
// Called on every invocation, whether or not a new work item was created above.
saveRunningTasks();
return tasks.get(task.getId()).getResult();
}
}
use of org.apache.druid.indexer.TaskLocation in project druid by druid-io.
the class HttpRemoteTaskRunner method streamTaskReports.
/**
 * Returns a source for the live reports of a task that is still assigned to a known worker.
 *
 * <p>When the task has no work item, its work item is in state {@code COMPLETE}, it has no worker, or the worker's
 * host is not present in {@code workers}, an absent value is returned. Otherwise the returned {@link ByteSource}
 * issues an HTTP GET against the task's {@code liveReports} endpoint each time {@code openStream()} is called.
 *
 * @param taskId id of the task whose reports are requested
 * @return a source that fetches the live reports on demand, or absent if no known worker is running the task
 */
@Override
public Optional<ByteSource> streamTaskReports(String taskId) {
  // Read on tasks is safe
  @SuppressWarnings("GuardedBy") HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem = tasks.get(taskId);
  Worker worker = null;
  if (taskRunnerWorkItem != null && taskRunnerWorkItem.getState() != HttpRemoteTaskRunnerWorkItem.State.COMPLETE) {
    worker = taskRunnerWorkItem.getWorker();
  }
  if (worker == null || !workers.containsKey(worker.getHost())) {
    // Worker is not running this task, it might be available in deep storage
    return Optional.absent();
  } else {
    // Worker is still running this task
    TaskLocation taskLocation = taskRunnerWorkItem.getLocation();
    final URL url = TaskRunnerUtils.makeTaskLocationURL(taskLocation, "/druid/worker/v1/chat/%s/liveReports", taskId);
    return Optional.of(new ByteSource() {
      @Override
      public InputStream openStream() throws IOException {
        try {
          return httpClient.go(new Request(HttpMethod.GET, url), new InputStreamResponseHandler()).get();
        } catch (InterruptedException e) {
          // Restore the interrupt flag so callers further up the stack can still observe the interruption.
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        } catch (ExecutionException e) {
          // Unwrap if possible
          Throwables.propagateIfPossible(e.getCause(), IOException.class);
          throw new RuntimeException(e);
        }
      }
    });
  }
}
use of org.apache.druid.indexer.TaskLocation in project druid by druid-io.
the class SingleTaskBackgroundRunnerTest method testStopNonRestorableTask.
/**
 * Runs a 10-second {@code NoopTask}, stops the runner once the task has reported RUNNING, and verifies that the
 * last non-RUNNING status delivered to the listener is FAILED with the expected cancellation message.
 */
@Test
public void testStopNonRestorableTask() throws InterruptedException {
  // latch to wait for SingleTaskBackgroundRunnerCallable to be executed before stopping the task
  // We need this latch because TaskRunnerListener is currently racy.
  // See https://github.com/apache/druid/issues/11445 for more details.
  final CountDownLatch taskStarted = new CountDownLatch(1);

  // statusChanged callback can be called by multiple threads.
  final AtomicReference<TaskStatus> nonRunningStatus = new AtomicReference<>();

  final TaskRunnerListener listener = new TaskRunnerListener() {
    @Override
    public String getListenerId() {
      return "testStopNonRestorableTask";
    }

    @Override
    public void locationChanged(String taskId, TaskLocation newLocation) {
      // do nothing
    }

    @Override
    public void statusChanged(String taskId, TaskStatus status) {
      // RUNNING only releases the latch; every other status is recorded for the assertions below.
      if (status.getStatusCode() == TaskState.RUNNING) {
        taskStarted.countDown();
        return;
      }
      nonRunningStatus.set(status);
    }
  };
  runner.registerListener(listener, Execs.directExecutor());

  // The fourth argument (10000) is the 10 sec run time noted in the original test.
  final NoopTask longRunningTask = new NoopTask(null, null, "datasource", 10000, 0, null, null, null);
  runner.run(longRunningTask);

  final boolean startedInTime = taskStarted.await(1, TimeUnit.SECONDS);
  Assert.assertTrue(startedInTime);

  runner.stop();

  Assert.assertEquals(TaskState.FAILED, nonRunningStatus.get().getStatusCode());
  Assert.assertEquals("Canceled as task execution process stopped", nonRunningStatus.get().getErrorMsg());
}
use of org.apache.druid.indexer.TaskLocation in project druid by druid-io.
the class TaskRunnerUtilsTest method testMakeTaskLocationURL.
/**
 * Checks that {@code makeTaskLocationURL} yields an https URL on the location's third constructor argument (8290)
 * and URL-encodes the task id substituted into the path format.
 */
@Test
public void testMakeTaskLocationURL() {
  final TaskLocation location = new TaskLocation("1.2.3.4", 8090, 8290);
  final String pathFormat = "/druid/worker/v1/task/%s/log";
  // The space and ampersand in the task id must come out as %20 and %26.
  final String rawTaskId = "foo bar&";

  final URL actual = TaskRunnerUtils.makeTaskLocationURL(location, pathFormat, rawTaskId);

  final String expected = "https://1.2.3.4:8290/druid/worker/v1/task/foo%20bar%26/log";
  Assert.assertEquals(expected, actual.toString());
}
Aggregations