the class ADTaskManager method setHCDetectorTaskDone.
* Set state for HC detector level task when all entities done.
* The state could be FINISHED, FAILED or STOPPED.
* 1. If the input task state is FINISHED, the number of FINISHED entity tasks is checked. If
* there is no FINISHED entity task, the HC detector level task is set as FAILED; otherwise
* it is set as FINISHED.
* 2. If the input task state is not FINISHED, the HC detector level task is set to that same state.
* @param adTask AD task
* @param state AD task state
* @param listener action listener
// Writes the final state of an HC detector-level task through updateADHCDetectorTask and
// answers the caller with an AnomalyDetectorJobResponse carrying the updated task id.
// NOTE(review): this view of the method is condensed -- closing braces and several argument
// expressions are missing -- so the comments below describe only what is visible here.
public void setHCDetectorTaskDone(ADTask adTask, ADTaskState state, ActionListener<AnomalyDetectorJobResponse> listener) {
String detectorId = adTask.getDetectorId();
// The task to update is the parent task when an entity task is passed in, otherwise the task itself.
String taskId = adTask.isEntityTask() ? adTask.getParentTaskId() : adTask.getTaskId();
// Used below as the key for counting finished entity tasks.
String detectorTaskId = adTask.getDetectorLevelTaskId();
// Listener handed to every updateADHCDetectorTask call below.
// NOTE(review): the success callback is truncated in this view (only a log format string and
// its last argument remain) -- confirm its full body against the complete file.
ActionListener<UpdateResponse> wrappedListener = ActionListener.wrap(response -> {"Historical HC detector done with state: {}. Remove from cache, detector id:{}",, detectorId);
}, e -> {
// Will reset task state when get detector with task or maintain tasks in hourly cron.
// A LimitExceededException whose message contains HC_DETECTOR_TASK_IS_UPDATING is logged at
// warn level only; every other failure is logged as an error with its stack trace.
if (e instanceof LimitExceededException && e.getMessage().contains(HC_DETECTOR_TASK_IS_UPDATING)) {
logger.warn("HC task is updating, skip this update for task: " + taskId);
} else {
logger.error("Failed to update task: " + taskId, e);
// wait for 2 seconds to acquire updating HC detector task semaphore
long timeoutInMillis = 2000;
if (state == ADTaskState.FINISHED) {
// A requested FINISHED state is not written directly: the finished entity tasks of this
// detector-level task are counted first and the result decides the final state.
this.countEntityTasksByState(detectorTaskId, ImmutableList.of(ADTaskState.FINISHED), ActionListener.wrap(r -> {"number of finished entity tasks: {}, for detector {}", r, adTask.getDetectorId());
// Set task as FAILED if no finished entity task; otherwise set as FINISHED
ADTaskState hcDetectorTaskState = r == 0 ? ADTaskState.FAILED : ADTaskState.FINISHED;
// execute in AD batch task thread pool in case waiting for semaphore waste any shared OpenSearch thread pool
threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
// Task progress is written as 1.0 together with the state and the execution end time.
// NOTE(review): the values for STATE_FIELD and EXECUTION_END_TIME_FIELD are elided in this view.
updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD,, TASK_PROGRESS_FIELD, 1.0, EXECUTION_END_TIME_FIELD,, timeoutInMillis, wrappedListener);
}, e -> {
// Failure path of the count query: the error message from getErrorMessage is stored in ERROR_FIELD.
logger.error("Failed to get finished entity tasks", e);
String errorMessage = getErrorMessage(e);
threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD, // set as FAILED if fail to get finished entity tasks., TASK_PROGRESS_FIELD, 1.0, ERROR_FIELD, errorMessage, EXECUTION_END_TIME_FIELD,, timeoutInMillis, wrappedListener);
} else {
// Input state other than FINISHED: the task's own error is stored in ERROR_FIELD and no
// TASK_PROGRESS_FIELD is written.
// NOTE(review): the STATE_FIELD value is elided in this view; presumably the input state -- confirm.
threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD,, ERROR_FIELD, adTask.getError(), EXECUTION_END_TIME_FIELD,, timeoutInMillis, wrappedListener);
// Answers the caller with the updated task id and RestStatus.OK.
// NOTE(review): the updates above are handed to an executor, so this response appears to be
// sent without waiting for them -- brace structure is not visible here, confirm.
listener.onResponse(new AnomalyDetectorJobResponse(taskId, 0, 0, 0, RestStatus.OK));
the class ADTaskManager method updateLatestRealtimeTaskOnCoordinatingNode.
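* Update the latest realtime task and the realtime task cache on the realtime detector's coordinating node.
* @param detectorId detector id
* @param state new state; when not null it overrides the state calculated from the RCF total updates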
* @param rcfTotalUpdates rcf total updates
* @param detectorIntervalInMinutes detector interval in minutes
* @param error error
* @param listener action listener
// Derives init progress and state for a realtime task, asks the task cache whether anything
// changed, and otherwise sends the changed fields to updateLatestADTask; the cache is
// refreshed once that update succeeds.
// NOTE(review): this view of the method is condensed -- closing braces, the early-return body and
// the right-hand side of two assignments are missing -- so comments cover only what is visible.
public void updateLatestRealtimeTaskOnCoordinatingNode(String detectorId, String state, Long rcfTotalUpdates, Long detectorIntervalInMinutes, String error, ActionListener<UpdateResponse> listener) {
Float initProgress = null;
String newState = null;
// calculate init progress and task state with RCF total updates
if (detectorIntervalInMinutes != null && rcfTotalUpdates != null) {
// NOTE(review): the value assigned here is elided in this view.
newState =;
if (rcfTotalUpdates < NUM_MIN_SAMPLES) {
// The cast to float happens before the division, so the ratio keeps its fractional part.
initProgress = (float) rcfTotalUpdates / NUM_MIN_SAMPLES;
} else {
// NOTE(review): the value assigned here is elided in this view.
newState =;
initProgress = 1.0f;
// Check if new state is not null and override state calculated with rcf total updates
if (state != null) {
newState = state;
// A null error is normalized to the empty string before the change check.
error = Optional.ofNullable(error).orElse("");
if (!adTaskCacheManager.isRealtimeTaskChanged(detectorId, newState, initProgress, error)) {
// If task not changed, no need to update, just return
// NOTE(review): the statements of this early-return branch are not visible in this view.
// The local node id is always written as the coordinating node of the task.
Map<String, Object> updatedFields = new HashMap<>();
updatedFields.put(COORDINATING_NODE_FIELD, clusterService.localNode().getId());
// initProgress is only assigned in the branch guarded by the null checks on rcfTotalUpdates and
// detectorIntervalInMinutes above, which is what makes unboxing them below safe.
if (initProgress != null) {
updatedFields.put(INIT_PROGRESS_FIELD, initProgress);
// Remaining samples are clamped at 0 and multiplied by the detector interval.
updatedFields.put(ESTIMATED_MINUTES_LEFT_FIELD, Math.max(0, NUM_MIN_SAMPLES - rcfTotalUpdates) * detectorIntervalInMinutes);
if (newState != null) {
updatedFields.put(STATE_FIELD, newState);
// NOTE(review): error is replaced with a non-null value above; if that assignment is
// unconditional this check is always true -- confirm and consider removing it.
if (error != null) {
updatedFields.put(ERROR_FIELD, error);
Float finalInitProgress = initProgress;
// Variable used in lambda expression should be final or effectively final
String finalError = error;
String finalNewState = newState;
updateLatestADTask(detectorId, ADTaskType.REALTIME_TASK_TYPES, updatedFields, ActionListener.wrap(r -> {
// The cache is refreshed only in the success callback of the task update.
logger.debug("Updated latest realtime AD task successfully for detector {}", detectorId);
adTaskCacheManager.updateRealtimeTaskCache(detectorId, finalNewState, finalInitProgress, finalError);
}, e -> {
// NOTE(review): only the error log is visible in either callback; how listener is completed
// is not shown in this view -- confirm against the full file.
logger.error("Failed to update realtime task for detector " + detectorId, e);
the class ForwardADTaskTransportAction method doExecute.
// Handles a forwarded AD task request by dispatching on request.getAdTaskAction().
// NOTE(review): this view of the method is condensed -- most case labels, break statements and
// closing braces are missing -- so each branch below is identified by its "Received ..." debug
// message and comments cover only what is visible.
protected void doExecute(Task task, ForwardADTaskRequest request, ActionListener<AnomalyDetectorJobResponse> listener) {
ADTaskAction adTaskAction = request.getAdTaskAction();
AnomalyDetector detector = request.getDetector();
DetectionDateRange detectionDateRange = request.getDetectionDateRange();
String detectorId = detector.getDetectorId();
ADTask adTask = request.getAdTask();
User user = request.getUser();
Integer availableTaskSlots = request.getAvailableTaskSLots();
// String form of the task's entity; used as the key for the running-entity cache calls below.
String entityValue = adTaskManager.convertEntityToString(adTask);
switch(adTaskAction) {
// APPLY_FOR_TASK_SLOTS branch: check task slots, passing START as the follow-up action.
logger.debug("Received APPLY_FOR_TASK_SLOTS action for detector {}", detectorId);
adTaskManager.checkTaskSlots(adTask, detector, detectionDateRange, user, ADTaskAction.START, transportService, listener);
// CHECK_AVAILABLE_TASK_SLOTS branch: same check, passing SCALE_ENTITY_TASK_SLOTS as the follow-up action.
logger.debug("Received CHECK_AVAILABLE_TASK_SLOTS action for detector {}", detectorId);
adTaskManager.checkTaskSlots(adTask, detector, detectionDateRange, user, ADTaskAction.SCALE_ENTITY_TASK_SLOTS, transportService, listener);
case START:
// Start historical analysis for detector
logger.debug("Received START action for detector {}", detectorId);
// On success the available task slots from the request are recorded for the detector;
// a failure is forwarded to the caller's listener unchanged.
adTaskManager.startDetector(detector, detectionDateRange, user, transportService, ActionListener.wrap(r -> {
adTaskCacheManager.setDetectorTaskSlots(detector.getDetectorId(), availableTaskSlots);
}, e -> listener.onFailure(e)));
logger.debug("Received NEXT_ENTITY action for detector {}, task {}", detectorId, adTask.getTaskId());
// Run next entity for HC detector historical analysis.
if (detector.isMultientityDetector()) {
// AD task could be HC detector level task or entity task
adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
if (!adTaskCacheManager.hasEntity(detectorId)) {
// No entity left in the cache: the detector's task slots are set to 0 and the
// detector-level task is finalized.
adTaskCacheManager.setDetectorTaskSlots(detectorId, 0);"Historical HC detector done, will remove from cache, detector id:{}", detectorId);
// NOTE(review): listener is answered here and then also passed to setHCDetectorTaskDone,
// which calls listener.onResponse itself -- the listener may be completed twice; confirm.
listener.onResponse(new AnomalyDetectorJobResponse(detectorId, 0, 0, 0, RestStatus.OK));
// FAILED is chosen only for a non-entity task that carries an error; otherwise FINISHED.
ADTaskState state = !adTask.isEntityTask() && adTask.getError() != null ? ADTaskState.FAILED : ADTaskState.FINISHED;
adTaskManager.setHCDetectorTaskDone(adTask, state, listener);
} else {
logger.debug("Run next entity for detector " + detectorId);
adTaskManager.runNextEntityForHCADHistorical(adTask, transportService, listener);
// Writes the current HC detector progress and the task error (empty string when null)
// to the parent task.
// NOTE(review): the STATE_FIELD value is elided in this view.
adTaskManager.updateADHCDetectorTask(detectorId, adTask.getParentTaskId(), ImmutableMap.of(STATE_FIELD,, TASK_PROGRESS_FIELD, adTaskManager.hcDetectorProgress(detectorId), ERROR_FIELD, adTask.getError() != null ? adTask.getError() : ""));
} else {
logger.warn("Can only handle HC entity task for NEXT_ENTITY action, taskId:{} , taskType:{}", adTask.getTaskId(), adTask.getTaskType());
listener.onFailure(new IllegalArgumentException("Unsupported task"));
logger.debug("Received PUSH_BACK_ENTITY action for detector {}, task {}", detectorId, adTask.getTaskId());
// Push back entity to pending entities queue and run next entity.
if (adTask.isEntityTask()) {
// AD task must be entity level task.
adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
if (adTaskManager.isRetryableError(adTask.getError()) && !adTaskCacheManager.exceedRetryLimit(adTask.getDetectorId(), adTask.getTaskId())) {
// If retryable exception happens when run entity task, will push back entity to the end
// of pending entities queue, then we can retry it later.
adTaskCacheManager.pushBackEntity(adTask.getTaskId(), adTask.getDetectorId(), entityValue);
} else {
// If exception is not retryable or exceeds retry limit, will remove this entity.
adTaskCacheManager.removeEntity(adTask.getDetectorId(), entityValue);
logger.warn("Entity task failed, task id: {}, entity: {}", adTask.getTaskId(), adTask.getEntity().toString());
if (!adTaskCacheManager.hasEntity(detectorId)) {
// No entity left in the cache: the slots are set to 0 and FINISHED is requested;
// setHCDetectorTaskDone decides the state actually written.
adTaskCacheManager.setDetectorTaskSlots(detectorId, 0);
adTaskManager.setHCDetectorTaskDone(adTask, ADTaskState.FINISHED, listener);
} else {
logger.debug("scale task slots for PUSH_BACK_ENTITY, detector {} task {}", detectorId, adTask.getTaskId());
// Scale down by one slot; the next entity is started here only when exactly one slot remains.
int taskSlots = adTaskCacheManager.scaleDownHCDetectorTaskSlots(detectorId, 1);
if (taskSlots == 1) {
logger.debug("After scale down, only 1 task slot reserved for detector {}, run next entity", detectorId);
adTaskManager.runNextEntityForHCADHistorical(adTask, transportService, listener);
// NOTE(review): listener is also passed to runNextEntityForHCADHistorical just above; whether
// this ACCEPTED response can follow a completion made there is not visible here -- confirm.
listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.ACCEPTED));
} else {
logger.warn("Can only push back entity task");
listener.onFailure(new IllegalArgumentException("Can only push back entity task"));
logger.debug("Received SCALE_ENTITY_TASK_LANE action for detector {}", detectorId);
// Check current available task slots and scale entity task lane.
if (availableTaskSlots != null && availableTaskSlots > 0) {
// The number of new slots is the smaller of the slots offered by the request and
// detectorTaskSlotScaleDelta for the detector.
int newSlots = Math.min(availableTaskSlots, adTaskManager.detectorTaskSlotScaleDelta(detectorId));
if (newSlots > 0) {
adTaskCacheManager.setAllowedRunningEntities(detectorId, newSlots);
adTaskCacheManager.scaleUpDetectorTaskSlots(detectorId, newSlots);
listener.onResponse(new AnomalyDetectorJobResponse(detector.getDetectorId(), 0, 0, 0, RestStatus.OK));
case CANCEL:
logger.debug("Received CANCEL action for detector {}", detectorId);
// on worker node.
if (detector.isMultientityDetector()) {
adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
// The detector-level task is marked STOPPED when no entity remains in the cache or when the
// request carries a non-entity task.
if (!adTaskCacheManager.hasEntity(detectorId) || !adTask.isEntityTask()) {
adTaskManager.setHCDetectorTaskDone(adTask, ADTaskState.STOPPED, listener);
// NOTE(review): setHCDetectorTaskDone calls listener.onResponse itself; if the response below
// also runs on that path the listener is completed twice -- brace structure not visible, confirm.
listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.OK));
} else {
listener.onFailure(new IllegalArgumentException("Only support cancel HC now"));
logger.debug("Received CLEAN_STALE_RUNNING_ENTITIES action for detector {}", detectorId);
// Clean stale running entities of HC detector. For example, some worker node crashed or failed to send
// entity task done message to coordinating node, then coordinating node can't remove running entity
// from cache. We will check task profile when get task. If some entities exist in coordinating cache but
// doesn't exist in worker node's cache, we will clean up these stale running entities on coordinating node.
List<String> staleRunningEntities = request.getStaleRunningEntities();
// NOTE(review): staleRunningEntities is dereferenced below without a null check; verify that
// the request always carries a list for this action.
logger.debug("Clean stale running entities of task {}, staleRunningEntities: {}", adTask.getTaskId(), Arrays.toString(staleRunningEntities.toArray(new String[0])));
// NOTE(review): the same listener is passed for every entity and is also answered after the
// loop; verify removeStaleRunningEntity cannot complete it more than once.
for (String entity : staleRunningEntities) {
adTaskManager.removeStaleRunningEntity(adTask, entity, transportService, listener);
listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.OK));
boolean historicalTask = adTask.isHistoricalTask();
logger.debug("Received CLEAN_CACHE action for detector {}, taskId: {}, historical: {}", detectorId, adTask.getTaskId(), historicalTask);
// NOTE(review): the statements of both branches below are not visible in this view; only
// their explanatory comments remain.
if (historicalTask) {
// Don't clear task cache if still has running entity. CLEAN_STALE_RUNNING_ENTITIES will clean
// stale running entity.
} else {
// If hash ring changed like new node added when scale out, the realtime job coordinating node may
// change, then we should clean up cache on old coordinating node.
listener.onResponse(new AnomalyDetectorJobResponse(detector.getDetectorId(), 0, 0, 0, RestStatus.OK));
// Fallback: the action is rejected with BAD_REQUEST and the action name in the message.
listener.onFailure(new OpenSearchStatusException("Unsupported AD task action " + adTaskAction, RestStatus.BAD_REQUEST));