use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class IndexTaskClient method submitRequest.
/**
 * Sends an HTTP request to the task identified by {@code taskId} and returns the handler's result once a
 * request succeeds.
 *
 * <p>Every attempt first asks {@code taskInfoProvider} for the task's status and location. The request is aborted
 * when the task is not runnable or when its location is unknown. An HTTP 400 response is surfaced as an
 * {@link IAE}; any other error response is surfaced as an {@link IOE}. Failures caught as {@link IOException} or
 * {@link ChannelException} are retried after a delay when {@code retry} is true and the retry policy still
 * supplies a delay; otherwise they are rethrown.
 *
 * @param taskId              ID of the task to contact
 * @param mediaType           media type of {@code content}; nullable if content is empty
 * @param method              HTTP method to use
 * @param encodedPathSuffix   already-encoded path suffix appended after the encoded task ID
 * @param encodedQueryString  already-encoded query string, or null
 * @param content             request body
 * @param responseHandler     handler that turns the HTTP response into the final result
 * @param retry               whether failed attempts should be retried
 *
 * @return the value produced by {@code responseHandler} for a successful response
 *
 * @throws IOException             if the request failed and is not (or no longer) retried
 * @throws ChannelException        if the request failed and is not (or no longer) retried
 * @throws NoTaskLocationException if no location is available for the task
 */
protected <IntermediateType, FinalType> FinalType submitRequest(
    String taskId,
    @Nullable String mediaType,
    HttpMethod method,
    String encodedPathSuffix,
    @Nullable String encodedQueryString,
    byte[] content,
    HttpResponseHandler<IntermediateType, FinalType> responseHandler,
    boolean retry
) throws IOException, ChannelException, NoTaskLocationException {
  final RetryPolicy retryPolicy = retryPolicyFactory.makeRetryPolicy();

  while (true) {
    final String path = StringUtils.format("%s/%s/%s", BASE_PATH, StringUtils.urlEncode(taskId), encodedPathSuffix);

    final Optional<TaskStatus> taskStatus = taskInfoProvider.getTaskStatus(taskId);
    final boolean runnable = taskStatus.isPresent() && taskStatus.get().isRunnable();
    if (!runnable) {
      throw new TaskNotRunnableException(StringUtils.format("Aborting request because task [%s] is not runnable", taskId));
    }

    final TaskLocation taskLocation = taskInfoProvider.getTaskLocation(taskId);
    if (taskLocation.equals(TaskLocation.unknown())) {
      throw new NoTaskLocationException(StringUtils.format("No TaskLocation available for task [%s]", taskId));
    }

    final Request request = createRequest(taskId, taskLocation, path, encodedQueryString, method, mediaType, content);
    Either<StringFullResponseHolder, FinalType> result = null;

    try {
      // Netty throws some annoying exceptions if a connection can't be opened, which happens relatively frequently
      // for tasks that happen to still be starting up, so test the connection first to keep the logs clean.
      checkConnection(request.getUrl().getHost(), request.getUrl().getPort());

      result = submitRequest(request, responseHandler);
      if (result.isValue()) {
        return result.valueOrThrow();
      }

      // Error response: build a message from the status and (a prefix of) the body.
      final StringBuilder message = new StringBuilder();
      final HttpResponseStatus errorStatus = result.error().getStatus();
      final String errorBody = result.error().getContent();
      message.append("Received server error with status [").append(errorStatus).append("]");
      if (!Strings.isNullOrEmpty(errorBody)) {
        message.append("; first 1KB of body: ")
               .append(StringUtils.chop(StringUtils.nullToEmptyNonDruidDataString(errorBody), 1000));
      }

      if (errorStatus.getCode() == 400) {
        // don't bother retrying if it's a bad request
        throw new IAE(message.toString());
      }
      throw new IOE(message.toString());
    } catch (IOException | ChannelException e) {
      // Since workers are free to move tasks around to different ports, there is a chance that a task may have been
      // moved but our view of its location has not been updated yet from ZK. To detect this case, we send a header
      // identifying our expected recipient in the request; if this doesn't correspond to the worker we messaged, the
      // worker will return an HTTP 404 with its ID in the response header. If we get a mismatching task ID, then
      // we will wait for a short period then retry the request indefinitely, expecting the task's location to
      // eventually be updated.
      String mismatchedTaskId = null;
      if (result != null && !result.isValue() && result.error().getStatus().equals(HttpResponseStatus.NOT_FOUND)) {
        final String headerId = StringUtils.urlDecode(result.error().getResponse().headers().get(ChatHandlerResource.TASK_ID_HEADER));
        if (headerId != null && !headerId.equals(taskId)) {
          mismatchedTaskId = headerId;
        }
      }

      final Duration delay;
      if (mismatchedTaskId != null) {
        log.warn("Expected worker to have taskId [%s] but has taskId [%s], will retry in [%d]s", taskId, mismatchedTaskId, TASK_MISMATCH_RETRY_DELAY_SECONDS);
        delay = Duration.standardSeconds(TASK_MISMATCH_RETRY_DELAY_SECONDS);
      } else {
        delay = retryPolicy.getAndIncrementRetryDelay();
      }

      final String urlForLog = request.getUrl().toString();

      if (!retry) {
        // if retry=false, we probably aren't too concerned if the operation doesn't succeed (i.e. the request was
        // for informational purposes only); log at INFO instead of WARN.
        log.noStackTrace().info(e, "submitRequest failed for [%s]", urlForLog);
        throw e;
      }

      if (delay == null) {
        // When retrying, log the final failure at WARN level, since it is likely to be bad news.
        log.warn(e, "submitRequest failed for [%s]", urlForLog);
        throw e;
      }

      try {
        final long sleepTime = delay.getMillis();
        // When retrying, log non-final failures at INFO level.
        log.noStackTrace().info(e, "submitRequest failed for [%s]; will try again in [%s]", urlForLog, new Duration(sleepTime).toString());
        Thread.sleep(sleepTime);
      } catch (InterruptedException interrupted) {
        // Restore the interrupt flag and give up, keeping the interruption attached to the original failure.
        Thread.currentThread().interrupt();
        e.addSuppressed(interrupted);
        throw new RuntimeException(e);
      }
    } catch (NoTaskLocationException e) {
      log.info("No TaskLocation available for task [%s], this task may not have been assigned to a worker yet " + "or may have already completed", taskId);
      throw e;
    } catch (Exception e) {
      log.warn(e, "Exception while sending request");
      throw e;
    }
  }
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class ParallelIndexSupervisorTask method runSinglePhaseParallel.
/**
 * Runs single-phase parallel indexing (best-effort rollup): every sub task spawned by the supervisor task
 * reads its share of the input and produces segments on its own.
 *
 * <p>On success the reported segments are published and, when {@code awaitSegmentAvailabilityTimeoutMillis} is
 * positive, the task waits for them to become available. On failure the runner's stop reason is used as the error
 * message when present, otherwise a generic phase-failure message. The completion report is written in both cases.
 *
 * @param toolbox toolbox used to create the runner, publish segments and write the task report
 *
 * @return the final status of this task
 */
private TaskStatus runSinglePhaseParallel(TaskToolbox toolbox) throws Exception {
  ingestionState = IngestionState.BUILD_SEGMENTS;

  final ParallelIndexTaskRunner<SinglePhaseSubTask, PushedSegmentsReport> runner =
      createRunner(toolbox, this::createSinglePhaseTaskRunner);
  final TaskState finalState = runNextPhase(runner);

  final TaskStatus result;
  if (!finalState.isSuccess()) {
    // there is only success or failure after running....
    Preconditions.checkState(finalState.isFailure(), "Unrecognized state after task is complete[%s]", finalState);
    final String failureReason = runner.getStopReason() != null
                                 ? runner.getStopReason()
                                 : StringUtils.format(TASK_PHASE_FAILURE_MSG, runner.getName());
    result = TaskStatus.failure(getId(), failureReason);
  } else {
    // noinspection ConstantConditions
    publishSegments(toolbox, runner.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(runner.getReports());
    }
    result = TaskStatus.success(getId());
  }

  toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports(result, segmentAvailabilityConfirmationCompleted));
  return result;
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class ParallelIndexSupervisorTask method runHashPartitionMultiPhaseParallel.
/**
 * Runs multi-phase parallel indexing with hash partitioning.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>(optional) a dimension-cardinality phase, run only when numShards is not configured or no input
 *       intervals are given, to determine intervals and/or the number of shards per interval;</li>
 *   <li>partial segment generation;</li>
 *   <li>partial segment merge, followed by publishing the merged segments.</li>
 * </ol>
 * A failure of the cardinality or generation phase returns a failed status immediately. If the cardinality phase
 * produces no reports, the task returns success with an explanatory message.
 *
 * @param toolbox toolbox used to create the phase runners, publish segments and write the task report
 *
 * @return the final status of this task
 *
 * @throws ISE if the configured partitionsSpec is not a {@link HashedPartitionsSpec}
 */
@VisibleForTesting
TaskStatus runHashPartitionMultiPhaseParallel(TaskToolbox toolbox) throws Exception {
  if (!(ingestionSchema.getTuningConfig().getPartitionsSpec() instanceof HashedPartitionsSpec)) {
    // only range and hash partitioning is supported for multiphase parallel ingestion, see runMultiPhaseParallel()
    throw new ISE("forceGuaranteedRollup is set but partitionsSpec [%s] is not a single_dim or hash partition spec.", ingestionSchema.getTuningConfig().getPartitionsSpec());
  }
  final HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) ingestionSchema.getTuningConfig().getPartitionsSpec();

  ParallelIndexIngestionSpec effectiveSpec = ingestionSchema;
  TaskState phaseState;
  final Map<Interval, Integer> intervalToNumShards;

  final boolean needsInputSampling =
      partitionsSpec.getNumShards() == null
      || effectiveSpec.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();

  if (!needsInputSampling) {
    // numShards will be determined in PartialHashSegmentGenerateTask
    intervalToNumShards = null;
  } else {
    // 0. need to determine intervals and numShards by scanning the data
    LOG.info("Needs to determine intervals or numShards, beginning %s phase.", PartialDimensionCardinalityTask.TYPE);
    final ParallelIndexTaskRunner<PartialDimensionCardinalityTask, DimensionCardinalityReport> cardinalityPhase =
        createRunner(toolbox, this::createPartialDimensionCardinalityRunner);

    phaseState = runNextPhase(cardinalityPhase);
    if (phaseState.isFailure()) {
      return TaskStatus.failure(getId(), StringUtils.format(TASK_PHASE_FAILURE_MSG, cardinalityPhase.getName()));
    }

    if (cardinalityPhase.getReports().isEmpty()) {
      final String noRowsMessage = "No valid rows for hash partitioning." + " All rows may have invalid timestamps or have been filtered out.";
      LOG.warn(noRowsMessage);
      return TaskStatus.success(getId(), noRowsMessage);
    }

    if (partitionsSpec.getNumShards() != null) {
      // numShards is fixed by configuration; only the set of intervals comes from the reports.
      intervalToNumShards = CollectionUtils.mapValues(
          mergeCardinalityReports(cardinalityPhase.getReports().values()),
          k -> partitionsSpec.getNumShards()
      );
    } else {
      final int effectiveMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() != null
                                             ? partitionsSpec.getMaxRowsPerSegment()
                                             : PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT;
      LOG.info("effective maxRowsPerSegment is: " + effectiveMaxRowsPerSegment);
      intervalToNumShards = determineNumShardsFromCardinalityReport(
          cardinalityPhase.getReports().values(),
          effectiveMaxRowsPerSegment
      );
    }

    effectiveSpec = rewriteIngestionSpecWithIntervalsIfMissing(effectiveSpec, intervalToNumShards.keySet());
  }

  // 1. Partial segment generation phase
  final ParallelIndexIngestionSpec segmentCreateIngestionSpec = effectiveSpec;
  final ParallelIndexTaskRunner<PartialHashSegmentGenerateTask, GeneratedPartitionsReport> generatePhase =
      createRunner(
          toolbox,
          unused -> createPartialHashSegmentGenerateRunner(toolbox, segmentCreateIngestionSpec, intervalToNumShards)
      );

  phaseState = runNextPhase(generatePhase);
  if (phaseState.isFailure()) {
    return TaskStatus.failure(getId(), StringUtils.format(TASK_PHASE_FAILURE_MSG, generatePhase.getName()));
  }

  // 2. Partial segment merge phase
  // partition (interval, partitionId) -> partition locations
  final Map<Partition, List<PartitionLocation>> partitionToLocations = getPartitionToLocations(generatePhase.getReports());
  final List<PartialSegmentMergeIOConfig> ioConfigs = createGenericMergeIOConfigs(
      ingestionSchema.getTuningConfig().getTotalNumMergeTasks(),
      partitionToLocations
  );
  final ParallelIndexIngestionSpec segmentMergeIngestionSpec = effectiveSpec;
  final ParallelIndexTaskRunner<PartialGenericSegmentMergeTask, PushedSegmentsReport> mergePhase =
      createRunner(
          toolbox,
          mergeToolbox -> createPartialGenericSegmentMergeRunner(mergeToolbox, ioConfigs, segmentMergeIngestionSpec)
      );

  phaseState = runNextPhase(mergePhase);

  final TaskStatus result;
  if (!phaseState.isSuccess()) {
    // there is only success or failure after running....
    Preconditions.checkState(phaseState.isFailure(), "Unrecognized state after task is complete[%s]", phaseState);
    result = TaskStatus.failure(getId(), StringUtils.format(TASK_PHASE_FAILURE_MSG, mergePhase.getName()));
  } else {
    // noinspection ConstantConditions
    publishSegments(toolbox, mergePhase.getReports());
    if (awaitSegmentAvailabilityTimeoutMillis > 0) {
      waitForSegmentAvailability(mergePhase.getReports());
    }
    result = TaskStatus.success(getId());
  }

  toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports(result, segmentAvailabilityConfirmationCompleted));
  return result;
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class ForkingTaskRunner method run.
/**
 * Registers a work item for {@code task} (if none exists yet for its ID) and returns the future of its status.
 *
 * <p>The submitted callable builds a command line that starts {@code org.apache.druid.cli.Main internal peon},
 * launches it through {@code runTaskProcess}, waits for it to complete, and derives the {@link TaskStatus} from the
 * exit code: the status file is read when the exit code is 0, otherwise a failure status is created. Cleanup
 * (removing the work item, shutting down the process holder, releasing ports, deleting the task directory) happens
 * in a {@code finally} block.
 *
 * @param task the task to run
 *
 * @return the result future of the work item registered under the task's ID
 */
@Override
public ListenableFuture<TaskStatus> run(final Task task) {
synchronized (tasks) {
// Only create and submit a new work item when none is registered under this task ID.
tasks.computeIfAbsent(task.getId(), k -> new ForkingTaskRunnerWorkItem(task, exec.submit(new Callable<TaskStatus>() {
@Override
public TaskStatus call() {
// Each attempt gets its own sub-directory, named by a random UUID, under the task directory.
final String attemptUUID = UUID.randomUUID().toString();
final File taskDir = taskConfig.getTaskDir(task.getId());
final File attemptDir = new File(taskDir, attemptUUID);
final ProcessHolder processHolder;
final String childHost = node.getHost();
// Ports stay at -1 unless the corresponding plaintext/TLS port is enabled on this node.
int childPort = -1;
int tlsChildPort = -1;
if (node.isEnablePlaintextPort()) {
childPort = portFinder.findUnusedPort();
}
if (node.isEnableTlsPort()) {
tlsChildPort = portFinder.findUnusedPort();
}
final TaskLocation taskLocation = TaskLocation.create(childHost, childPort, tlsChildPort);
try {
// The process holder is registered with this closer below; the closer is closed in the inner finally block.
final Closer closer = Closer.create();
try {
FileUtils.mkdirp(attemptDir);
final File taskFile = new File(taskDir, "task.json");
final File statusFile = new File(attemptDir, "status.json");
final File logFile = new File(taskDir, "log");
final File reportsFile = new File(attemptDir, "report.json");
// time to adjust process holders
synchronized (tasks) {
// Re-check the work item under the lock: it must still exist, must not be shut down,
// and must not already own a process holder.
final ForkingTaskRunnerWorkItem taskWorkItem = tasks.get(task.getId());
if (taskWorkItem == null) {
LOGGER.makeAlert("TaskInfo disappeared!").addData("task", task.getId()).emit();
throw new ISE("TaskInfo disappeared for task[%s]!", task.getId());
}
if (taskWorkItem.shutdown) {
throw new IllegalStateException("Task has been shut down!");
}
if (taskWorkItem.processHolder != null) {
LOGGER.makeAlert("TaskInfo already has a processHolder").addData("task", task.getId()).emit();
throw new ISE("TaskInfo already has processHolder for task[%s]!", task.getId());
}
// Assemble the child command line: java command, classpath, JVM options, -D properties, then the peon arguments.
final List<String> command = new ArrayList<>();
final String taskClasspath;
// A non-empty task classpath prefix is placed in front of the configured classpath.
if (task.getClasspathPrefix() != null && !task.getClasspathPrefix().isEmpty()) {
taskClasspath = Joiner.on(File.pathSeparator).join(task.getClasspathPrefix(), config.getClasspath());
} else {
taskClasspath = config.getClasspath();
}
command.add(config.getJavaCommand());
command.add("-cp");
command.add(taskClasspath);
Iterables.addAll(command, new QuotableWhiteSpaceSplitter(config.getJavaOpts()));
Iterables.addAll(command, config.getJavaOptsArray());
// Override task specific javaOpts
Object taskJavaOpts = task.getContextValue(ForkingTaskRunnerConfig.JAVA_OPTS_PROPERTY);
if (taskJavaOpts != null) {
Iterables.addAll(command, new QuotableWhiteSpaceSplitter((String) taskJavaOpts));
}
// Forward properties whose names start with an allowed prefix, except the javaOpts properties themselves.
for (String propName : props.stringPropertyNames()) {
for (String allowedPrefix : config.getAllowedPrefixes()) {
// See https://github.com/apache/druid/issues/1841
if (propName.startsWith(allowedPrefix) && !ForkingTaskRunnerConfig.JAVA_OPTS_PROPERTY.equals(propName) && !ForkingTaskRunnerConfig.JAVA_OPTS_ARRAY_PROPERTY.equals(propName)) {
command.add(StringUtils.format("-D%s=%s", propName, props.getProperty(propName)));
}
}
}
// Override child JVM specific properties
for (String propName : props.stringPropertyNames()) {
if (propName.startsWith(CHILD_PROPERTY_PREFIX)) {
// The prefix is stripped from the property name before it is passed to the child.
command.add(StringUtils.format("-D%s=%s", propName.substring(CHILD_PROPERTY_PREFIX.length()), props.getProperty(propName)));
}
}
// Override task specific properties
final Map<String, Object> context = task.getContext();
if (context != null) {
for (String propName : context.keySet()) {
if (propName.startsWith(CHILD_PROPERTY_PREFIX)) {
command.add(StringUtils.format("-D%s=%s", propName.substring(CHILD_PROPERTY_PREFIX.length()), task.getContextValue(propName)));
}
}
}
// Add dataSource, taskId and taskType for metrics or logging
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.DATASOURCE, task.getDataSource()));
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.TASK_ID, task.getId()));
command.add(StringUtils.format("-D%s%s=%s", MonitorsConfig.METRIC_DIMENSION_PREFIX, DruidMetrics.TASK_TYPE, task.getType()));
// Host and ports chosen above for the child process.
command.add(StringUtils.format("-Ddruid.host=%s", childHost));
command.add(StringUtils.format("-Ddruid.plaintextPort=%d", childPort));
command.add(StringUtils.format("-Ddruid.tlsPort=%d", tlsChildPort));
// Let tasks know where they are running on.
// This information is used in native parallel indexing with shuffle.
command.add(StringUtils.format("-Ddruid.task.executor.service=%s", node.getServiceName()));
command.add(StringUtils.format("-Ddruid.task.executor.host=%s", node.getHost()));
command.add(StringUtils.format("-Ddruid.task.executor.plaintextPort=%d", node.getPlaintextPort()));
command.add(StringUtils.format("-Ddruid.task.executor.enablePlaintextPort=%s", node.isEnablePlaintextPort()));
command.add(StringUtils.format("-Ddruid.task.executor.tlsPort=%d", node.getTlsPort()));
command.add(StringUtils.format("-Ddruid.task.executor.enableTlsPort=%s", node.isEnableTlsPort()));
// These are not enabled per default to allow the user to either set or not set them
// Users are highly suggested to be set in druid.indexer.runner.javaOpts
// See org.apache.druid.concurrent.TaskThreadPriority#getThreadPriorityFromTaskPriority(int)
// for more information
// command.add("-XX:+UseThreadPriorities");
// command.add("-XX:ThreadPriorityPolicy=42");
// Main class and its arguments: the task, status and reports file paths.
command.add("org.apache.druid.cli.Main");
command.add("internal");
command.add("peon");
command.add(taskFile.toString());
command.add(statusFile.toString());
command.add(reportsFile.toString());
String nodeType = task.getNodeType();
if (nodeType != null) {
command.add("--nodeType");
command.add(nodeType);
}
// join queries
if (task.supportsQueries()) {
command.add("--loadBroadcastSegments");
command.add("true");
}
// The task definition is written to task.json only if that file does not exist yet.
if (!taskFile.exists()) {
jsonMapper.writeValue(taskFile, task);
}
LOGGER.info("Running command: %s", getMaskedCommand(startupLoggingConfig.getMaskProperties(), command));
// Start the process while still holding the lock, record its holder on the work item, and register it with the closer.
taskWorkItem.processHolder = runTaskProcess(command, logFile, taskLocation);
processHolder = taskWorkItem.processHolder;
processHolder.registerWithCloser(closer);
}
// Outside the lock: tell listeners where the task runs and that it is now running.
TaskRunnerUtils.notifyLocationChanged(listeners, task.getId(), taskLocation);
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), TaskStatus.running(task.getId()));
LOGGER.info("Logging task %s output to: %s", task.getId(), logFile);
final int exitCode = waitForTaskProcessToComplete(task, processHolder, logFile, reportsFile);
final TaskStatus status;
if (exitCode == 0) {
LOGGER.info("Process exited successfully for task: %s", task.getId());
// Process exited successfully
status = jsonMapper.readValue(statusFile, TaskStatus.class);
} else {
LOGGER.error("Process exited with code[%d] for task: %s", exitCode, task.getId());
// Process exited unsuccessfully
status = TaskStatus.failure(task.getId(), StringUtils.format("Task execution process exited unsuccessfully with code[%s]. " + "See middleManager logs for more details.", exitCode));
}
TaskRunnerUtils.notifyStatusChanged(listeners, task.getId(), status);
return status;
} catch (Throwable t) {
throw closer.rethrow(t);
} finally {
closer.close();
}
} catch (Throwable t) {
// Any failure is logged (at INFO) and rethrown wrapped in a RuntimeException.
LOGGER.info(t, "Exception caught during execution");
throw new RuntimeException(t);
} finally {
// Cleanup always runs; exceptions thrown here are logged and suppressed by the outer catch below.
try {
synchronized (tasks) {
final ForkingTaskRunnerWorkItem taskWorkItem = tasks.remove(task.getId());
if (taskWorkItem != null && taskWorkItem.processHolder != null) {
taskWorkItem.processHolder.shutdown();
}
// saveRunningTasks() is skipped when the 'stopping' flag is set.
if (!stopping) {
saveRunningTasks();
}
}
// Release only the ports that were actually allocated above.
if (node.isEnablePlaintextPort()) {
portFinder.markPortUnused(childPort);
}
if (node.isEnableTlsPort()) {
portFinder.markPortUnused(tlsChildPort);
}
try {
// The task directory is kept when the 'stopping' flag is set.
if (!stopping && taskDir.exists()) {
FileUtils.deleteDirectory(taskDir);
LOGGER.info("Removing task directory: %s", taskDir);
}
} catch (Exception e) {
LOGGER.makeAlert(e, "Failed to delete task directory").addData("taskDir", taskDir.toString()).addData("task", task.getId()).emit();
}
} catch (Exception e) {
LOGGER.error(e, "Suppressing exception caught while cleaning up task");
}
}
}
})));
// Called on every invocation, whether or not a new work item was created above.
saveRunningTasks();
return tasks.get(task.getId()).getResult();
}
}
use of org.apache.druid.indexer.TaskStatus in project druid by druid-io.
the class PartialSegmentMergeTask method runTask.
/**
 * Fetches the partial segment files assigned to this task, merges them, pushes the merged segments and reports
 * the pushed segments to the supervisor task.
 *
 * @param toolbox toolbox providing the task action client, supervisor task client factory and persist directory
 *
 * @return a success status for this task
 *
 * @throws ISE if one of the supervisor task's locks is revoked, or if two locks cover the same interval
 */
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
  // Group partitionLocations by interval and partitionId
  final Map<Interval, Int2ObjectMap<List<PartitionLocation>>> bucketsByInterval = new HashMap<>();
  for (PartitionLocation partitionLocation : ioConfig.getPartitionLocations()) {
    final Int2ObjectMap<List<PartitionLocation>> buckets =
        bucketsByInterval.computeIfAbsent(partitionLocation.getInterval(), k -> new Int2ObjectOpenHashMap<>());
    buckets.computeIfAbsent(partitionLocation.getBucketId(), k -> new ArrayList<>()).add(partitionLocation);
  }

  // Look up the locks held on behalf of the supervisor task and derive one version per interval.
  final List<TaskLock> supervisorLocks = toolbox.getTaskActionClient().submit(
      new SurrogateAction<>(supervisorTaskId, new LockListAction())
  );
  final Map<Interval, String> versionByInterval = Maps.newHashMapWithExpectedSize(supervisorLocks.size());
  for (TaskLock taskLock : supervisorLocks) {
    if (taskLock.isRevoked()) {
      throw new ISE("Lock[%s] is revoked", taskLock);
    }
    final String previousVersion = versionByInterval.put(taskLock.getInterval(), taskLock.getVersion());
    if (previousVersion != null) {
      throw new ISE("Unexpected state: Two versions([%s], [%s]) for the same interval[%s]", taskLock.getVersion(), previousVersion, taskLock.getInterval());
    }
  }

  // Fetch the partial segment files and log how long that took.
  final Stopwatch fetchTimer = Stopwatch.createStarted();
  final Map<Interval, Int2ObjectMap<List<File>>> unzippedFilesByInterval = fetchSegmentFiles(toolbox, bucketsByInterval);
  final long fetchSeconds = fetchTimer.elapsed(TimeUnit.SECONDS);
  fetchTimer.stop();
  LOG.info("Fetch took [%s] seconds", fetchSeconds);

  final ParallelIndexSupervisorTaskClient supervisorClient = toolbox.getSupervisorTaskClientFactory().build(
      new ClientBasedTaskInfoProvider(toolbox.getIndexingServiceClient()),
      getId(),
      // always use a single http thread
      1,
      getTuningConfig().getChatHandlerTimeout(),
      getTuningConfig().getChatHandlerNumRetries()
  );

  // Start from an empty persist directory.
  final File persistDir = toolbox.getPersistDir();
  org.apache.commons.io.FileUtils.deleteQuietly(persistDir);
  FileUtils.mkdirp(persistDir);

  final Set<DataSegment> mergedSegments = mergeAndPushSegments(
      toolbox,
      getDataSchema(),
      getTuningConfig(),
      persistDir,
      versionByInterval,
      unzippedFilesByInterval
  );

  supervisorClient.report(
      supervisorTaskId,
      new PushedSegmentsReport(getId(), Collections.emptySet(), mergedSegments, ImmutableMap.of())
  );
  return TaskStatus.success(getId());
}
Aggregations