Use of org.apache.druid.indexer.TaskStatusPlus in project druid by druid-io.
The class CompactSegments, method run:
@Override
public DruidCoordinatorRuntimeParams run(DruidCoordinatorRuntimeParams params) {
  LOG.info("Compact segments");
  final CoordinatorCompactionConfig dynamicConfig = params.getCoordinatorCompactionConfig();
  final CoordinatorStats stats = new CoordinatorStats();
  List<DataSourceCompactionConfig> compactionConfigList = dynamicConfig.getCompactionConfigs();
  if (dynamicConfig.getMaxCompactionTaskSlots() > 0) {
    Map<String, VersionedIntervalTimeline<String, DataSegment>> dataSources = params.getUsedSegmentsTimelinesPerDataSource();
    if (compactionConfigList != null && !compactionConfigList.isEmpty()) {
      Map<String, DataSourceCompactionConfig> compactionConfigs = compactionConfigList.stream().collect(Collectors.toMap(DataSourceCompactionConfig::getDataSource, Function.identity()));
      final List<TaskStatusPlus> compactionTasks = filterNonCompactionTasks(indexingServiceClient.getActiveTasks());
      // dataSource -> list of intervals for which compaction will be skipped in this run
      final Map<String, List<Interval>> intervalsToSkipCompaction = new HashMap<>();
      int numEstimatedNonCompleteCompactionTasks = 0;
      for (TaskStatusPlus status : compactionTasks) {
        final TaskPayloadResponse response = indexingServiceClient.getTaskPayload(status.getId());
        if (response == null) {
          throw new ISE("Got a null payload from overlord for task[%s]", status.getId());
        }
        if (COMPACTION_TASK_TYPE.equals(response.getPayload().getType())) {
          final ClientCompactionTaskQuery compactionTaskQuery = (ClientCompactionTaskQuery) response.getPayload();
          DataSourceCompactionConfig dataSourceCompactionConfig = compactionConfigs.get(status.getDataSource());
          if (dataSourceCompactionConfig != null && dataSourceCompactionConfig.getGranularitySpec() != null) {
            Granularity configuredSegmentGranularity = dataSourceCompactionConfig.getGranularitySpec().getSegmentGranularity();
            if (configuredSegmentGranularity != null && compactionTaskQuery.getGranularitySpec() != null && !configuredSegmentGranularity.equals(compactionTaskQuery.getGranularitySpec().getSegmentGranularity())) {
              // Cancel the active compaction task if segmentGranularity has changed; the interval
              // will need to be re-compacted.
              LOG.info("Canceled task[%s] as task segmentGranularity is [%s] but compaction config " + "segmentGranularity is [%s]", status.getId(), compactionTaskQuery.getGranularitySpec().getSegmentGranularity(), configuredSegmentGranularity);
              indexingServiceClient.cancelTask(status.getId());
              continue;
            }
          }
          // Skip this interval, as the current active compaction task is still valid
          final Interval interval = compactionTaskQuery.getIoConfig().getInputSpec().getInterval();
          intervalsToSkipCompaction.computeIfAbsent(status.getDataSource(), k -> new ArrayList<>()).add(interval);
          // Since we keep the current active compaction task running, count the task slots it uses
          numEstimatedNonCompleteCompactionTasks += findMaxNumTaskSlotsUsedByOneCompactionTask(compactionTaskQuery.getTuningConfig());
        } else {
          throw new ISE("task[%s] is not a compactionTask", status.getId());
        }
      }
      // Skip all the intervals locked by higher-priority tasks for each datasource.
      // This must be done after the invalid compaction tasks are cancelled
      // in the loop above, so that their intervals are not considered locked.
      getLockedIntervalsToSkip(compactionConfigList).forEach((dataSource, intervals) -> intervalsToSkipCompaction.computeIfAbsent(dataSource, ds -> new ArrayList<>()).addAll(intervals));
      final CompactionSegmentIterator iterator = policy.reset(compactionConfigs, dataSources, intervalsToSkipCompaction);
      int totalCapacity;
      if (dynamicConfig.isUseAutoScaleSlots()) {
        try {
          totalCapacity = indexingServiceClient.getTotalWorkerCapacityWithAutoScale();
        } catch (Exception e) {
          LOG.warn("Failed to get total worker capacity with auto scale slots. Falling back to current capacity count");
          totalCapacity = indexingServiceClient.getTotalWorkerCapacity();
        }
      } else {
        totalCapacity = indexingServiceClient.getTotalWorkerCapacity();
      }
      final int compactionTaskCapacity = (int) Math.min(totalCapacity * dynamicConfig.getCompactionTaskSlotRatio(), dynamicConfig.getMaxCompactionTaskSlots());
      final int numAvailableCompactionTaskSlots;
      if (numEstimatedNonCompleteCompactionTasks > 0) {
        numAvailableCompactionTaskSlots = Math.max(0, compactionTaskCapacity - numEstimatedNonCompleteCompactionTasks);
      } else {
        // compactionTaskCapacity might be 0 if totalWorkerCapacity is low.
        // This guarantees that at least one slot is available if
        // compaction is enabled and numEstimatedNonCompleteCompactionTasks is 0.
        numAvailableCompactionTaskSlots = Math.max(1, compactionTaskCapacity);
      }
      LOG.info("Found [%d] available task slots for compaction out of [%d] max compaction task capacity", numAvailableCompactionTaskSlots, compactionTaskCapacity);
      stats.addToGlobalStat(AVAILABLE_COMPACTION_TASK_SLOT, numAvailableCompactionTaskSlots);
      stats.addToGlobalStat(MAX_COMPACTION_TASK_SLOT, compactionTaskCapacity);
      final Map<String, AutoCompactionSnapshot.Builder> currentRunAutoCompactionSnapshotBuilders = new HashMap<>();
      if (numAvailableCompactionTaskSlots > 0) {
        stats.accumulate(doRun(compactionConfigs, currentRunAutoCompactionSnapshotBuilders, numAvailableCompactionTaskSlots, iterator));
      } else {
        stats.accumulate(makeStats(currentRunAutoCompactionSnapshotBuilders, 0, iterator));
      }
    } else {
      LOG.info("compactionConfig is empty. Skip.");
      autoCompactionSnapshotPerDataSource.set(new HashMap<>());
    }
  } else {
    LOG.info("maxCompactionTaskSlots was set to 0. Skip compaction");
    autoCompactionSnapshotPerDataSource.set(new HashMap<>());
  }
  return params.buildFromExisting().withCoordinatorStats(stats).build();
}
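The slot arithmetic at the end of run() is worth isolating. Below is a minimal standalone sketch of the same computation; the class and method names are illustrative only and are not part of Druid.

public class CompactionSlotMath {

  static int availableCompactionTaskSlots(
      int totalWorkerCapacity,
      double compactionTaskSlotRatio,
      int maxCompactionTaskSlots,
      int estimatedNonCompleteCompactionTasks
  ) {
    // Capacity is the slot ratio applied to the cluster, capped by the configured maximum.
    final int compactionTaskCapacity =
        (int) Math.min(totalWorkerCapacity * compactionTaskSlotRatio, maxCompactionTaskSlots);
    if (estimatedNonCompleteCompactionTasks > 0) {
      // Subtract slots already consumed by still-running compaction tasks.
      return Math.max(0, compactionTaskCapacity - estimatedNonCompleteCompactionTasks);
    } else {
      // Guarantee at least one slot when compaction is enabled and nothing is running,
      // even if the ratio rounds the capacity down to 0.
      return Math.max(1, compactionTaskCapacity);
    }
  }

  public static void main(String[] args) {
    // 10 workers, 10% slot ratio -> capacity 1; nothing running -> 1 slot available.
    System.out.println(availableCompactionTaskSlots(10, 0.1, 100, 0)); // 1
    // 100 workers, 10% ratio, capped at 5, 3 tasks still running -> 2 slots left.
    System.out.println(availableCompactionTaskSlots(100, 0.1, 5, 3)); // 2
  }
}

Note the asymmetry between the two branches: only when no compaction tasks are outstanding does the coordinator force a minimum of one slot.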
Use of org.apache.druid.indexer.TaskStatusPlus in project druid by druid-io.
The class HttpIndexingServiceClient, method getActiveTasks:
@Override
public List<TaskStatusPlus> getActiveTasks() {
  // Must retrieve waiting, then pending, then running, so that if tasks move from one state to the next
  // between calls, we still catch them. (Tasks always go waiting -> pending -> running.)
  //
  // Consider switching to the new-style /druid/indexer/v1/tasks API in the future.
  final List<TaskStatusPlus> tasks = new ArrayList<>();
  final Set<String> taskIdsSeen = new HashSet<>();
  final Iterable<TaskStatusPlus> activeTasks = Iterables.concat(getTasks("waitingTasks"), getTasks("pendingTasks"), getTasks("runningTasks"));
  for (TaskStatusPlus task : activeTasks) {
    // Dedupe by task ID, since a task may move between states while we iterate (it shows up in both
    // the pending and running lists, for example, and we see it twice.)
    if (taskIdsSeen.add(task.getId())) {
      tasks.add(task);
    }
  }
  return tasks;
}
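The dedupe hinges on Set.add, which returns false when the element is already present. A minimal standalone illustration of the same pattern, with plain strings standing in for TaskStatusPlus (all names and data below are made up):

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DedupeDemo {
  public static void main(String[] args) {
    // Simulate the three state lists; "t2" moved from waiting to pending between calls.
    List<String> waiting = List.of("t1", "t2");
    List<String> pending = List.of("t2", "t3");
    List<String> running = List.of("t4");

    final List<String> tasks = new ArrayList<>();
    final Set<String> seen = new HashSet<>();
    for (List<String> batch : List.of(waiting, pending, running)) {
      for (String id : batch) {
        if (seen.add(id)) { // Set.add returns false for duplicates
          tasks.add(id);
        }
      }
    }
    System.out.println(tasks); // [t1, t2, t3, t4]
  }
}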
Use of org.apache.druid.indexer.TaskStatusPlus in project druid by druid-io.
The class TaskMonitor, method start:
public void start(long taskStatusCheckingPeriod) {
  synchronized (startStopLock) {
    running = true;
    log.info("Starting taskMonitor");
    // NOTE: This polling can be improved to event-driven pushing by registering a TaskRunnerListener with the
    // TaskRunner. That listener should be able to forward the events reported to the TaskRunner to this TaskMonitor.
    taskStatusChecker.scheduleAtFixedRate(() -> {
      try {
        final Iterator<Entry<String, MonitorEntry>> iterator = runningTasks.entrySet().iterator();
        while (iterator.hasNext()) {
          final Entry<String, MonitorEntry> entry = iterator.next();
          final String specId = entry.getKey();
          final MonitorEntry monitorEntry = entry.getValue();
          final String taskId = monitorEntry.runningTask.getId();
          final TaskStatusResponse taskStatusResponse = indexingServiceClient.getTaskStatus(taskId);
          final TaskStatusPlus taskStatus = taskStatusResponse.getStatus();
          if (taskStatus != null) {
            switch (Preconditions.checkNotNull(taskStatus.getStatusCode(), "taskState")) {
              case SUCCESS:
                // Succeeded tasks must have sent a report
                if (!reportsMap.containsKey(taskId)) {
                  throw new ISE("Missing reports from task[%s]!", taskId);
                }
                incrementNumSucceededTasks();
                // Remove the current entry after updating taskHistories, to make sure that the task history
                // exists in either runningTasks or taskHistories.
                monitorEntry.setLastStatus(taskStatus);
                iterator.remove();
                break;
              case FAILED:
                // We don't need reports from failed tasks
                reportsMap.remove(taskId);
                incrementNumFailedTasks();
                log.warn("task[%s] failed!", taskId);
                if (monitorEntry.numTries() < maxRetry) {
                  log.info("We still have more chances[%d/%d] to process the spec[%s].", monitorEntry.numTries(), maxRetry, monitorEntry.spec.getId());
                  retry(specId, monitorEntry, taskStatus);
                } else {
                  log.error("spec[%s] failed after [%d] tries", monitorEntry.spec.getId(), monitorEntry.numTries());
                  // Remove the current entry after updating taskHistories, to make sure that the task history
                  // exists in either runningTasks or taskHistories.
                  monitorEntry.setLastStatus(taskStatus);
                  iterator.remove();
                }
                break;
              case RUNNING:
                monitorEntry.updateStatus(taskStatus);
                break;
              default:
                throw new ISE("Unknown taskStatus[%s] for task[%s]", taskStatus.getStatusCode(), taskId);
            }
          }
        }
      } catch (Throwable t) {
        // Only log the error here so that task monitoring continues; otherwise the task that
        // created this monitor would keep waiting endlessly, assuming the monitored tasks
        // are still running.
        log.error(t, "Error while monitoring");
      }
    }, taskStatusCheckingPeriod, taskStatusCheckingPeriod, TimeUnit.MILLISECONDS);
  }
}
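One detail worth calling out: the polling runnable catches Throwable itself. With ScheduledExecutorService.scheduleAtFixedRate, an exception that escapes the task silently cancels all subsequent executions, which would stall the monitor. A minimal standalone sketch of the same pattern:

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class PollingSketch {
  public static void main(String[] args) {
    ScheduledExecutorService checker = Executors.newSingleThreadScheduledExecutor();
    long periodMillis = 1000L;
    checker.scheduleAtFixedRate(() -> {
      try {
        // Poll task statuses here, as TaskMonitor does.
        System.out.println("checking task statuses");
      } catch (Throwable t) {
        // Catch and log: if the exception escaped, scheduleAtFixedRate would
        // silently suppress all future runs of this runnable.
        t.printStackTrace();
      }
    }, periodMillis, periodMillis, TimeUnit.MILLISECONDS);
  }
}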
Use of org.apache.druid.indexer.TaskStatusPlus in project druid by druid-io.
The class ParallelIndexSupervisorTaskKillTest, method testSubTaskFail:
@Test(timeout = 5000L)
public void testSubTaskFail() throws Exception {
  final ParallelIndexSupervisorTask task = newTask(Intervals.of("2017/2018"), new ParallelIndexIOConfig(null, new TestInputSource(Pair.of(new TestInput(10L, TaskState.FAILED), 1), Pair.of(new TestInput(Integer.MAX_VALUE, TaskState.FAILED), 3)), new NoopInputFormat(), false, null));
  final TaskActionClient actionClient = createActionClient(task);
  final TaskToolbox toolbox = createTaskToolbox(task, actionClient);
  prepareTaskForLocking(task);
  Assert.assertTrue(task.isReady(actionClient));
  final TaskStatus taskStatus = task.run(toolbox);
  Assert.assertEquals("Failed in phase[segment generation]. See task logs for details.", taskStatus.getErrorMsg());
  Assert.assertEquals(TaskState.FAILED, taskStatus.getStatusCode());
  final SinglePhaseParallelIndexTaskRunner runner = (SinglePhaseParallelIndexTaskRunner) task.getCurrentRunner();
  Assert.assertTrue(runner.getRunningTaskIds().isEmpty());
  final List<SubTaskSpec<SinglePhaseSubTask>> completeSubTaskSpecs = runner.getCompleteSubTaskSpecs();
  Assert.assertEquals(1, completeSubTaskSpecs.size());
  final TaskHistory<SinglePhaseSubTask> history = runner.getCompleteSubTaskSpecAttemptHistory(completeSubTaskSpecs.get(0).getId());
  Assert.assertNotNull(history);
  Assert.assertEquals(3, history.getAttemptHistory().size());
  for (TaskStatusPlus status : history.getAttemptHistory()) {
    Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
  }
  Assert.assertEquals(3, runner.getTaskMonitor().getNumCanceledTasks());
}
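The assertion of 3 attempts follows from TaskMonitor's retry check shown earlier: attempts continue while numTries() < maxRetry, so an always-failing spec accumulates exactly maxRetry entries in its history. A minimal sketch of that accounting, assuming the test's effective maxRetry is 3 (inferred from the assertions, not shown in the snippet):

int maxRetry = 3; // assumed value for illustration
int attempts = 0;
boolean everyAttemptFails = true; // matches this test's input source
while (true) {
  attempts++; // run one sub-task attempt
  if (!everyAttemptFails) break;
  if (attempts >= maxRetry) break; // no retries left; entry moves to the history
}
System.out.println(attempts); // 3 == history.getAttemptHistory().size()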
Use of org.apache.druid.indexer.TaskStatusPlus in project druid by druid-io.
The class ParallelIndexSupervisorTaskResourceTest, method testAPIs:
@Test(timeout = 20000L)
public void testAPIs() throws Exception {
  task = newTask(Intervals.of("2017/2018"), new ParallelIndexIOConfig(null, new TestInputSource(IntStream.range(0, NUM_SUB_TASKS).boxed().collect(Collectors.toList())), new NoopInputFormat(), false, null));
  getIndexingServiceClient().runTask(task.getId(), task);
  Thread.sleep(1000);
  final SinglePhaseParallelIndexTaskRunner runner = (SinglePhaseParallelIndexTaskRunner) task.getCurrentRunner();
  Assert.assertNotNull("runner is null", runner);
  // test getMode
  Response response = task.getMode(newRequest());
  Assert.assertEquals(200, response.getStatus());
  Assert.assertEquals("parallel", response.getEntity());
  // test expectedNumSucceededTasks
  response = task.getProgress(newRequest());
  Assert.assertEquals(200, response.getStatus());
  Assert.assertEquals(NUM_SUB_TASKS, ((ParallelIndexingPhaseProgress) response.getEntity()).getEstimatedExpectedSucceeded());
  // We use polling to check the state of taskMonitor in this test.
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getRunning) < NUM_SUB_TASKS) {
    Thread.sleep(100);
  }
  int succeededTasks = 0;
  int failedTasks = 0;
  checkState(succeededTasks, failedTasks, buildStateMap());
  // numRunningTasks and numSucceededTasks after some successful subTasks
  succeededTasks += 2;
  for (int i = 0; i < succeededTasks; i++) {
    runningTasks.get(0).setState(TaskState.SUCCESS);
  }
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getSucceeded) < succeededTasks) {
    Thread.sleep(100);
  }
  checkState(succeededTasks, failedTasks, buildStateMap());
  // numRunningTasks and numSucceededTasks after some failed subTasks
  failedTasks += 3;
  for (int i = 0; i < failedTasks; i++) {
    runningTasks.get(0).setState(TaskState.FAILED);
  }
  // Wait for new tasks to be started
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getFailed) < failedTasks || runningTasks.size() < NUM_SUB_TASKS - succeededTasks) {
    Thread.sleep(100);
  }
  checkState(succeededTasks, failedTasks, buildStateMap());
  // Make sure only one subTask is running
  succeededTasks += 7;
  for (int i = 0; i < 7; i++) {
    runningTasks.get(0).setState(TaskState.SUCCESS);
  }
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getSucceeded) < succeededTasks) {
    Thread.sleep(100);
  }
  checkState(succeededTasks, failedTasks, buildStateMap());
  Assert.assertEquals(1, runningSpecs.size());
  final String lastRunningSpecId = runningSpecs.keySet().iterator().next();
  final List<TaskStatusPlus> taskHistory = taskHistories.get(lastRunningSpecId);
  // This should be a failed task history because new tasks appear later in runningTasks.
  Assert.assertEquals(1, taskHistory.size());
  // Test one more failure
  runningTasks.get(0).setState(TaskState.FAILED);
  failedTasks++;
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getFailed) < failedTasks) {
    Thread.sleep(100);
  }
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getRunning) < 1) {
    Thread.sleep(100);
  }
  checkState(succeededTasks, failedTasks, buildStateMap());
  Assert.assertEquals(2, taskHistory.size());
  runningTasks.get(0).setState(TaskState.SUCCESS);
  succeededTasks++;
  while (getNumSubTasks(ParallelIndexingPhaseProgress::getSucceeded) < succeededTasks) {
    Thread.sleep(100);
  }
  Assert.assertEquals(TaskState.SUCCESS, getIndexingServiceClient().waitToFinish(task, 1000, TimeUnit.MILLISECONDS).getStatusCode());
}
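The polling loops in this test are bounded only by the JUnit timeout attribute. A reusable variant with an explicit deadline could look like the following; the Await helper is hypothetical, not part of the Druid test utilities.

import java.util.function.BooleanSupplier;

public final class Await {
  // Poll every 100 ms until the condition holds or the deadline passes.
  public static boolean until(BooleanSupplier condition, long timeoutMillis) throws InterruptedException {
    final long deadline = System.currentTimeMillis() + timeoutMillis;
    while (!condition.getAsBoolean()) {
      if (System.currentTimeMillis() >= deadline) {
        return false;
      }
      Thread.sleep(100);
    }
    return true;
  }
}

A loop such as "while (getNumSubTasks(...) < NUM_SUB_TASKS) Thread.sleep(100);" could then become an assertion like Assert.assertTrue(Await.until(() -> getNumSubTasks(ParallelIndexingPhaseProgress::getRunning) >= NUM_SUB_TASKS, 20_000)), which fails with a clear signal instead of hitting the test-wide timeout.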