use of org.opensearch.ad.model.ADTaskState in project anomaly-detection by opensearch-project.
the class ADTaskManager method stopLatestRealtimeTask.
/**
 * Stop the latest realtime task of a detector by moving it to the given state.
 *
 * The latest detector level realtime task is looked up first. If there is no such
 * task, or it is already done, the listener fails with an "already stopped" exception
 * (carrying {@code RestStatus.OK}). Otherwise the task's state, and its error message when
 * an error is supplied, are updated.
 *
 * When the task has a coordinating node and a transport service is available, the update
 * is handed to {@code cleanDetectorCache} instead of being executed directly.
 *
 * @param detectorId detector id
 * @param state task state to store on the task
 * @param error error to record on the task; may be null. If not null, the listener is
 *              failed with this error once the task update succeeds
 * @param transportService transport service; may be null
 * @param listener action listener
 */
public void stopLatestRealtimeTask(String detectorId, ADTaskState state, Exception error, TransportService transportService, ActionListener<AnomalyDetectorJobResponse> listener) {
    getAndExecuteOnLatestDetectorLevelTask(detectorId, REALTIME_TASK_TYPES, (adTask) -> {
        // Nothing to stop: no realtime task found, or the latest one is already done.
        if (!adTask.isPresent() || adTask.get().isDone()) {
            listener.onFailure(new OpenSearchStatusException("Anomaly detector job is already stopped: " + detectorId, RestStatus.OK));
            return;
        }
        ADTask latestTask = adTask.get();

        Map<String, Object> updatedFields = new HashMap<>();
        updatedFields.put(ADTask.STATE_FIELD, state.name());
        if (error != null) {
            updatedFields.put(ADTask.ERROR_FIELD, error.getMessage());
        }

        // Deferred task update; it runs either directly or through cleanDetectorCache below.
        AnomalyDetectorFunction function = () -> updateADTask(latestTask.getTaskId(), updatedFields, ActionListener.wrap(r -> {
            if (error != null) {
                // The task document was updated, but the caller still needs to see the original error.
                listener.onFailure(error);
                return;
            }
            listener.onResponse(new AnomalyDetectorJobResponse(detectorId, 0, 0, 0, RestStatus.OK));
        }, listener::onFailure));

        boolean canCleanCache = latestTask.getCoordinatingNode() != null && transportService != null;
        if (canCleanCache) {
            cleanDetectorCache(latestTask, transportService, function, listener);
        } else {
            function.execute();
        }
    }, null, false, listener);
}
use of org.opensearch.ad.model.ADTaskState in project anomaly-detection by opensearch-project.
the class ADTaskManager method setHCDetectorTaskDone.
/**
 * Set state for HC detector level task when all entities done.
 *
 * The state could be FINISHED,FAILED or STOPPED.
 * 1. If input task state is FINISHED, will check FINISHED entity task count. If
 *    there is no FINISHED entity task, will set HC detector level task as FAILED; otherwise
 *    set as FINISHED. If counting entity tasks fails, the detector level task is set as FAILED
 *    with the count error message.
 * 2. If input task state is not FINISHED, will set HC detector level task's state as the same
 *    and store the task's error (an empty string when the task has no error).
 *
 * The task update is submitted to the AD batch task thread pool. The listener is answered
 * with an OK response right after the work is scheduled; it does not wait for the update
 * to complete. Whether the update succeeds or fails, the detector's historical task cache
 * is removed afterwards.
 *
 * @param adTask AD task, either the HC detector level task or one of its entity tasks
 * @param state AD task state
 * @param listener action listener
 */
public void setHCDetectorTaskDone(ADTask adTask, ADTaskState state, ActionListener<AnomalyDetectorJobResponse> listener) {
    String detectorId = adTask.getDetectorId();
    // Always update the detector level task: for an entity task that is its parent.
    String taskId = adTask.isEntityTask() ? adTask.getParentTaskId() : adTask.getTaskId();
    String detectorTaskId = adTask.getDetectorLevelTaskId();
    ActionListener<UpdateResponse> wrappedListener = ActionListener.wrap(response -> {
        logger.info("Historical HC detector done with state: {}. Remove from cache, detector id:{}", state.name(), detectorId);
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    }, e -> {
        // Will reset task state when get detector with task or maintain tasks in hourly cron.
        if (e instanceof LimitExceededException && e.getMessage().contains(HC_DETECTOR_TASK_IS_UPDATING)) {
            logger.warn("HC task is updating, skip this update for task: " + taskId);
        } else {
            logger.error("Failed to update task: " + taskId, e);
        }
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    });
    // wait for 2 seconds to acquire updating HC detector task semaphore
    long timeoutInMillis = 2000;
    if (state == ADTaskState.FINISHED) {
        this.countEntityTasksByState(detectorTaskId, ImmutableList.of(ADTaskState.FINISHED), ActionListener.wrap(r -> {
            logger.info("number of finished entity tasks: {}, for detector {}", r, adTask.getDetectorId());
            // Set task as FAILED if no finished entity task; otherwise set as FINISHED
            ADTaskState hcDetectorTaskState = r == 0 ? ADTaskState.FAILED : ADTaskState.FINISHED;
            // execute in AD batch task thread pool in case waiting for semaphore waste any shared OpenSearch thread pool
            threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
                updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD, hcDetectorTaskState.name(), TASK_PROGRESS_FIELD, 1.0, EXECUTION_END_TIME_FIELD, Instant.now().toEpochMilli()), timeoutInMillis, wrappedListener);
            });
        }, e -> {
            logger.error("Failed to get finished entity tasks", e);
            String errorMessage = getErrorMessage(e);
            threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
                // set as FAILED if fail to get finished entity tasks.
                updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD, ADTaskState.FAILED.name(), TASK_PROGRESS_FIELD, 1.0, ERROR_FIELD, errorMessage, EXECUTION_END_TIME_FIELD, Instant.now().toEpochMilli()), timeoutInMillis, wrappedListener);
            });
        }));
    } else {
        threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
            // ImmutableMap rejects null values, so a task without an error must not pass null here;
            // otherwise the runnable dies with a NullPointerException before the update is sent.
            String error = adTask.getError() != null ? adTask.getError() : "";
            updateADHCDetectorTask(detectorId, taskId, ImmutableMap.of(STATE_FIELD, state.name(), ERROR_FIELD, error, EXECUTION_END_TIME_FIELD, Instant.now().toEpochMilli()), timeoutInMillis, wrappedListener);
        });
    }
    // Respond immediately; the state update above runs asynchronously.
    listener.onResponse(new AnomalyDetectorJobResponse(taskId, 0, 0, 0, RestStatus.OK));
}
use of org.opensearch.ad.model.ADTaskState in project anomaly-detection by opensearch-project.
the class ForwardADTaskTransportAction method doExecute.
/**
 * Handle a forwarded AD task request by dispatching on its {@link ADTaskAction}.
 *
 * Each action branch is responsible for answering the listener. Helpers that are
 * given the listener (for example {@code adTaskManager.setHCDetectorTaskDone}) answer it
 * themselves, so a branch must not respond again after delegating to them.
 *
 * @param task transport task
 * @param request forwarded AD task request carrying the action, detector and AD task
 * @param listener action listener
 */
@Override
protected void doExecute(Task task, ForwardADTaskRequest request, ActionListener<AnomalyDetectorJobResponse> listener) {
    ADTaskAction adTaskAction = request.getAdTaskAction();
    AnomalyDetector detector = request.getDetector();
    DetectionDateRange detectionDateRange = request.getDetectionDateRange();
    String detectorId = detector.getDetectorId();
    ADTask adTask = request.getAdTask();
    User user = request.getUser();
    Integer availableTaskSlots = request.getAvailableTaskSLots();
    String entityValue = adTaskManager.convertEntityToString(adTask);
    switch(adTaskAction) {
        case APPLY_FOR_TASK_SLOTS:
            logger.debug("Received APPLY_FOR_TASK_SLOTS action for detector {}", detectorId);
            adTaskManager.checkTaskSlots(adTask, detector, detectionDateRange, user, ADTaskAction.START, transportService, listener);
            break;
        case CHECK_AVAILABLE_TASK_SLOTS:
            logger.debug("Received CHECK_AVAILABLE_TASK_SLOTS action for detector {}", detectorId);
            adTaskManager.checkTaskSlots(adTask, detector, detectionDateRange, user, ADTaskAction.SCALE_ENTITY_TASK_SLOTS, transportService, listener);
            break;
        case START:
            // Start historical analysis for detector
            logger.debug("Received START action for detector {}", detectorId);
            adTaskManager.startDetector(detector, detectionDateRange, user, transportService, ActionListener.wrap(r -> {
                adTaskCacheManager.setDetectorTaskSlots(detector.getDetectorId(), availableTaskSlots);
                listener.onResponse(r);
            }, e -> listener.onFailure(e)));
            break;
        case NEXT_ENTITY:
            logger.debug("Received NEXT_ENTITY action for detector {}, task {}", detectorId, adTask.getTaskId());
            // Run next entity for HC detector historical analysis.
            if (detector.isMultientityDetector()) {
                // AD task could be HC detector level task or entity task
                adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
                if (!adTaskCacheManager.hasEntity(detectorId)) {
                    adTaskCacheManager.setDetectorTaskSlots(detectorId, 0);
                    logger.info("Historical HC detector done, will remove from cache, detector id:{}", detectorId);
                    listener.onResponse(new AnomalyDetectorJobResponse(detectorId, 0, 0, 0, RestStatus.OK));
                    ADTaskState state = !adTask.isEntityTask() && adTask.getError() != null ? ADTaskState.FAILED : ADTaskState.FINISHED;
                    // The caller has already been answered above. setHCDetectorTaskDone answers
                    // whatever listener it receives, so give it one that only logs; passing the
                    // original listener would respond to the same request twice.
                    ActionListener<AnomalyDetectorJobResponse> alreadyResponded = ActionListener.wrap(
                        r -> logger.debug("HC detector level task set as done, detector id:{}", detectorId),
                        e -> logger.warn("Failed to set HC detector level task as done, detector id: " + detectorId, e)
                    );
                    adTaskManager.setHCDetectorTaskDone(adTask, state, alreadyResponded);
                } else {
                    logger.debug("Run next entity for detector " + detectorId);
                    adTaskManager.runNextEntityForHCADHistorical(adTask, transportService, listener);
                    adTaskManager.updateADHCDetectorTask(detectorId, adTask.getParentTaskId(), ImmutableMap.of(STATE_FIELD, ADTaskState.RUNNING.name(), TASK_PROGRESS_FIELD, adTaskManager.hcDetectorProgress(detectorId), ERROR_FIELD, adTask.getError() != null ? adTask.getError() : ""));
                }
            } else {
                logger.warn("Can only handle HC entity task for NEXT_ENTITY action, taskId:{} , taskType:{}", adTask.getTaskId(), adTask.getTaskType());
                listener.onFailure(new IllegalArgumentException("Unsupported task"));
            }
            break;
        case PUSH_BACK_ENTITY:
            logger.debug("Received PUSH_BACK_ENTITY action for detector {}, task {}", detectorId, adTask.getTaskId());
            // Push back entity to pending entities queue and run next entity.
            if (adTask.isEntityTask()) {
                // AD task must be entity level task.
                adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
                if (adTaskManager.isRetryableError(adTask.getError()) && !adTaskCacheManager.exceedRetryLimit(adTask.getDetectorId(), adTask.getTaskId())) {
                    // If retryable exception happens when run entity task, will push back entity to the end
                    // of pending entities queue, then we can retry it later.
                    adTaskCacheManager.pushBackEntity(adTask.getTaskId(), adTask.getDetectorId(), entityValue);
                } else {
                    // If exception is not retryable or exceeds retry limit, will remove this entity.
                    adTaskCacheManager.removeEntity(adTask.getDetectorId(), entityValue);
                    logger.warn("Entity task failed, task id: {}, entity: {}", adTask.getTaskId(), adTask.getEntity().toString());
                }
                if (!adTaskCacheManager.hasEntity(detectorId)) {
                    adTaskCacheManager.setDetectorTaskSlots(detectorId, 0);
                    // setHCDetectorTaskDone answers the listener.
                    adTaskManager.setHCDetectorTaskDone(adTask, ADTaskState.FINISHED, listener);
                } else {
                    logger.debug("scale task slots for PUSH_BACK_ENTITY, detector {} task {}", detectorId, adTask.getTaskId());
                    int taskSlots = adTaskCacheManager.scaleDownHCDetectorTaskSlots(detectorId, 1);
                    if (taskSlots == 1) {
                        logger.debug("After scale down, only 1 task slot reserved for detector {}, run next entity", detectorId);
                        // NOTE(review): the listener is passed here and answered again just below;
                        // confirm runNextEntityForHCADHistorical never responds on this path.
                        adTaskManager.runNextEntityForHCADHistorical(adTask, transportService, listener);
                    }
                    listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.ACCEPTED));
                }
            } else {
                logger.warn("Can only push back entity task");
                listener.onFailure(new IllegalArgumentException("Can only push back entity task"));
            }
            break;
        case SCALE_ENTITY_TASK_SLOTS:
            logger.debug("Received SCALE_ENTITY_TASK_LANE action for detector {}", detectorId);
            // Check current available task slots and scale entity task lane.
            if (availableTaskSlots != null && availableTaskSlots > 0) {
                int newSlots = Math.min(availableTaskSlots, adTaskManager.detectorTaskSlotScaleDelta(detectorId));
                if (newSlots > 0) {
                    adTaskCacheManager.setAllowedRunningEntities(detectorId, newSlots);
                    adTaskCacheManager.scaleUpDetectorTaskSlots(detectorId, newSlots);
                }
            }
            listener.onResponse(new AnomalyDetectorJobResponse(detector.getDetectorId(), 0, 0, 0, RestStatus.OK));
            break;
        case CANCEL:
            logger.debug("Received CANCEL action for detector {}", detectorId);
            // on worker node.
            if (detector.isMultientityDetector()) {
                adTaskCacheManager.clearPendingEntities(detectorId);
                adTaskCacheManager.removeRunningEntity(detectorId, entityValue);
                if (!adTaskCacheManager.hasEntity(detectorId) || !adTask.isEntityTask()) {
                    // setHCDetectorTaskDone answers the listener, so do not respond again afterwards.
                    adTaskManager.setHCDetectorTaskDone(adTask, ADTaskState.STOPPED, listener);
                } else {
                    listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.OK));
                }
            } else {
                listener.onFailure(new IllegalArgumentException("Only support cancel HC now"));
            }
            break;
        case CLEAN_STALE_RUNNING_ENTITIES:
            logger.debug("Received CLEAN_STALE_RUNNING_ENTITIES action for detector {}", detectorId);
            // Clean stale running entities of HC detector. For example, some worker node crashed or failed to send
            // entity task done message to coordinating node, then coordinating node can't remove running entity
            // from cache. We will check task profile when get task. If some entities exist in coordinating cache but
            // doesn't exist in worker node's cache, we will clean up these stale running entities on coordinating node.
            List<String> staleRunningEntities = request.getStaleRunningEntities();
            logger.debug("Clean stale running entities of task {}, staleRunningEntities: {}", adTask.getTaskId(), Arrays.toString(staleRunningEntities.toArray(new String[0])));
            for (String entity : staleRunningEntities) {
                // NOTE(review): the same listener is handed to every call and answered again after
                // the loop; confirm removeStaleRunningEntity does not respond on it.
                adTaskManager.removeStaleRunningEntity(adTask, entity, transportService, listener);
            }
            listener.onResponse(new AnomalyDetectorJobResponse(adTask.getTaskId(), 0, 0, 0, RestStatus.OK));
            break;
        case CLEAN_CACHE:
            boolean historicalTask = adTask.isHistoricalTask();
            logger.debug("Received CLEAN_CACHE action for detector {}, taskId: {}, historical: {}", detectorId, adTask.getTaskId(), historicalTask);
            if (historicalTask) {
                // Don't clear task cache if still has running entity. CLEAN_STALE_RUNNING_ENTITIES will clean
                // stale running entity.
                adTaskCacheManager.removeHistoricalTaskCacheIfNoRunningEntity(detectorId);
            } else {
                adTaskCacheManager.removeRealtimeTaskCache(detectorId);
                // If hash ring changed like new node added when scale out, the realtime job coordinating node may
                // change, then we should clean up cache on old coordinating node.
                stateManager.clear(detectorId);
                featureManager.clear(detectorId);
            }
            listener.onResponse(new AnomalyDetectorJobResponse(detector.getDetectorId(), 0, 0, 0, RestStatus.OK));
            break;
        default:
            listener.onFailure(new OpenSearchStatusException("Unsupported AD task action " + adTaskAction, RestStatus.BAD_REQUEST));
            break;
    }
}
use of org.opensearch.ad.model.ADTaskState in project anomaly-detection by opensearch-project.
the class ADTaskManager method countEntityTasksByState.
/**
 * Count entity tasks that belong to a detector level task, optionally restricted to
 * a set of task states.
 *
 * Runs a hits-free search (size 0, total hits tracked) against the detection state index
 * and reports the total hit count to the listener.
 *
 * @param detectorTaskId detector level task id, matched against the parent task id field
 * @param taskStates task states to count; when null or empty, tasks in any state are counted
 * @param listener action listener receiving the number of matching entity tasks
 */
public void countEntityTasksByState(String detectorTaskId, List<ADTaskState> taskStates, ActionListener<Long> listener) {
    BoolQueryBuilder entityTaskQuery = new BoolQueryBuilder();
    entityTaskQuery.filter(new TermQueryBuilder(PARENT_TASK_ID_FIELD, detectorTaskId));
    boolean filterByState = taskStates != null && !taskStates.isEmpty();
    if (filterByState) {
        List<String> stateNames = taskStates.stream().map(ADTaskState::name).collect(Collectors.toList());
        entityTaskQuery.filter(new TermsQueryBuilder(STATE_FIELD, stateNames));
    }

    // Only the total count matters, so fetch no documents but track the exact hit total.
    SearchSourceBuilder countSource = new SearchSourceBuilder().query(entityTaskQuery).size(0).trackTotalHits(true);
    SearchRequest countRequest = new SearchRequest().source(countSource).indices(DETECTION_STATE_INDEX);

    client.search(countRequest, ActionListener.wrap(searchResponse -> {
        TotalHits matched = searchResponse.getHits().getTotalHits();
        listener.onResponse(matched.value);
    }, listener::onFailure));
}
use of org.opensearch.ad.model.ADTaskState in project anomaly-detection by opensearch-project.
the class AnomalyDetectorJobTransportActionTests method testCleanOldTaskDocs.
// TODO: fix this flaky test case
// Creates one old task doc per terminal state, starts a new historical job for the same
// detector, then checks that only the allowed number of old task docs plus the latest
// task remain.
@Ignore
public void testCleanOldTaskDocs() throws InterruptedException, IOException {
    AnomalyDetector detector = TestHelpers.randomDetector(ImmutableList.of(maxValueFeature()), testIndex, detectionIntervalInMinutes, timeField);
    String detectorId = createDetector(detector);
    createDetectionStateIndex();

    // Seed the state index with one finished-type task per terminal state.
    List<ADTaskState> terminalStates = ImmutableList.of(ADTaskState.FAILED, ADTaskState.FINISHED, ADTaskState.STOPPED);
    for (ADTaskState terminalState : terminalStates) {
        ADTask oldTask = randomADTask(randomAlphaOfLength(5), detector, detectorId, dateRange, terminalState);
        createADTask(oldTask);
    }
    long seededDocs = countDocs(CommonName.DETECTION_STATE_INDEX);
    assertEquals(terminalStates.size(), seededDocs);

    AnomalyDetectorJobRequest startRequest = new AnomalyDetectorJobRequest(detectorId, dateRange, true, randomLong(), randomLong(), START_JOB);
    AtomicReference<AnomalyDetectorJobResponse> startResponse = new AtomicReference<>();
    CountDownLatch finished = new CountDownLatch(1);
    Thread.sleep(2000);
    client().execute(AnomalyDetectorJobAction.INSTANCE, startRequest, ActionListener.wrap(r -> {
        finished.countDown();
        startResponse.set(r);
    }, e -> finished.countDown()));
    finished.await();

    // Give the asynchronous cleanup of old task docs time to run before counting.
    Thread.sleep(10000);
    long remainingDocs = countDetectorDocs(detectorId);
    // we have one latest task, so total count should add 1
    assertEquals(maxOldAdTaskDocsPerDetector + 1, remainingDocs);
}
Aggregations