use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class CheckpointReadWorkerTests method testHostException.
/**
 * Checks that a hosting failure is recorded against the failing detector only.
 *
 * Two requests that belong to two different detectors are submitted in one batch. The mocked
 * checkpoint read answers with one item per request, and the entity cache is stubbed to throw
 * {@link LimitExceededException} for the second detector.
 */
@SuppressWarnings("unchecked")
public void testHostException() throws IOException {
    String secondDetectorId = "456";
    Entity secondEntity = Entity.createSingleAttributeEntity(categoryField, "value4");
    EntityFeatureRequest secondRequest = new EntityFeatureRequest(
        Integer.MAX_VALUE,
        secondDetectorId,
        RequestPriority.MEDIUM,
        secondEntity,
        new double[] { 0 },
        0
    );
    AnomalyDetector secondDetector = TestHelpers
        .randomAnomalyDetectorUsingCategoryFields(secondDetectorId, Arrays.asList(categoryField));

    // Each detector id resolves to its own detector instance.
    doAnswer(invocation -> {
        ActionListener<Optional<AnomalyDetector>> callback = invocation.getArgument(1);
        callback.onResponse(Optional.of(secondDetector));
        return null;
    }).when(nodeStateManager).getAnomalyDetector(eq(secondDetectorId), any(ActionListener.class));
    doAnswer(invocation -> {
        ActionListener<Optional<AnomalyDetector>> callback = invocation.getArgument(1);
        callback.onResponse(Optional.of(detector));
        return null;
    }).when(nodeStateManager).getAnomalyDetector(eq(detectorId), any(ActionListener.class));

    // The batch read answers with one item for each of the two requested models.
    doAnswer(invocation -> {
        GetResult firstHit = new GetResult(
            CommonName.CHECKPOINT_INDEX_NAME, "_doc", entity.getModelId(detectorId).get(), 1, 1, 0, true, null, null, null);
        GetResult secondHit = new GetResult(
            CommonName.CHECKPOINT_INDEX_NAME, "_doc", secondEntity.getModelId(secondDetectorId).get(), 1, 1, 0, true, null, null, null);
        MultiGetItemResponse[] hits = {
            new MultiGetItemResponse(new GetResponse(firstHit), null),
            new MultiGetItemResponse(new GetResponse(secondHit), null)
        };
        ActionListener<MultiGetResponse> callback = invocation.getArgument(1);
        callback.onResponse(new MultiGetResponse(hits));
        return null;
    }).when(checkpoint).batchRead(any(), any());

    // Hosting is made to fail for the second detector only.
    doThrow(LimitExceededException.class).when(entityCache).hostIfPossible(eq(secondDetector), any());

    List<EntityFeatureRequest> batch = new ArrayList<>(Arrays.asList(request, secondRequest));
    worker.putAll(batch);

    verify(coldstartQueue, never()).put(any());
    verify(entityCache, times(2)).hostIfPossible(any(), any());
    // The exception is stored for the failing detector and not for the other one.
    verify(nodeStateManager, times(1)).setException(eq(secondDetectorId), any(LimitExceededException.class));
    verify(nodeStateManager, never()).setException(eq(detectorId), any(LimitExceededException.class));
}
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class ADTaskManager method checkTaskSlots.
/**
 * Checks how many task slots are free before historical analysis starts or a task lane is scaled.
 * The check runs on the lead node, which gathers the task slot stats of all data nodes and
 * derives the number of available slots from them. Only one check may hold the
 * {@code checkingTaskSlot} semaphore at a time; a request that cannot acquire it is rejected
 * with {@code RestStatus.FORBIDDEN}.
 *
 * @param adTask AD task
 * @param detector detector
 * @param detectionDateRange detection date range
 * @param user user
 * @param afterCheckAction target task action to run after task slot checking
 * @param transportService transport service
 * @param listener action listener
 */
public void checkTaskSlots(ADTask adTask, AnomalyDetector detector, DetectionDateRange detectionDateRange, User user, ADTaskAction afterCheckAction, TransportService transportService, ActionListener<AnomalyDetectorJobResponse> listener) {
    final String detectorId = detector.getDetectorId();
    logger.debug("Start checking task slots for detector: {}, task action: {}", detectorId, afterCheckAction);

    boolean acquired = checkingTaskSlot.tryAcquire();
    if (!acquired) {
        logger.info("Can't acquire checking task slot semaphore for detector {}", detectorId);
        listener.onFailure(new OpenSearchStatusException("Too many historical analysis requests in short time. Please retry later.", RestStatus.FORBIDDEN));
        return;
    }

    // Whatever the outcome, give the semaphore back once the caller's listener has been notified.
    ActionListener<AnomalyDetectorJobResponse> releasingListener = ActionListener.runAfter(listener, () -> {
        checkingTaskSlot.release(1);
        logger.debug("Release checking task slot semaphore on lead node for detector {}", detectorId);
    });

    hashRing.getNodesWithSameLocalAdVersion(nodes -> {
        int slotCapacity = nodes.length * maxAdBatchTaskPerNode;
        ADStatsRequest statsRequest = new ADStatsRequest(nodes);
        statsRequest.addAll(ImmutableSet.of(AD_USED_BATCH_TASK_SLOT_COUNT.getName(), AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName()));
        client.execute(ADStatsNodesAction.INSTANCE, statsRequest, ActionListener.wrap(statsResponse -> {
            // Entity tasks running on worker nodes.
            int usedSlots = 0;
            // Task slots assigned on coordinating nodes.
            int assignedSlots = 0;
            for (ADStatsNodeResponse nodeStats : statsResponse.getNodes()) {
                usedSlots += (int) nodeStats.getStatsMap().get(AD_USED_BATCH_TASK_SLOT_COUNT.getName());
                assignedSlots += (int) nodeStats.getStatsMap().get(AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName());
            }
            logger.info("Current total used task slots is {}, total detector assigned task slots is {} when start historical analysis for detector {}", usedSlots, assignedSlots, detectorId);

            // Normally assigned >= used. When a coordinating node has left, the slots cached on it
            // are not reported, so assigned may be smaller than used; take the larger of the two.
            int occupiedSlots = Math.max(usedSlots, assignedSlots);
            if (occupiedSlots >= slotCapacity) {
                releasingListener.onFailure(new OpenSearchStatusException("No available task slot", RestStatus.BAD_REQUEST));
                return;
            }

            int freeSlots = slotCapacity - occupiedSlots;
            logger.info("Current available task slots is {} for historical analysis of detector {}", freeSlots, detectorId);

            int slotsToForward;
            if (ADTaskAction.SCALE_ENTITY_TASK_SLOTS == afterCheckAction) {
                // Scaling forwards every free slot.
                slotsToForward = freeSlots;
            } else if (detector.isMultientityDetector()) {
                // Top entities are deliberately not checked here: for a multi-category HC detector
                // on 1.8 billion docs that check took more than 20 seconds and timed out, while
                // skipping it returns in about 200ms. The price is that more slots than needed may
                // be approved (e.g. 4 approved while the index only has 2 entities); that is fine
                // because task slots are auto tuned once the historical analysis task starts.
                slotsToForward = Math.min(maxRunningEntitiesPerDetector, freeSlots);
            } else {
                slotsToForward = 1;
            }
            forwardToCoordinatingNode(adTask, detector, detectionDateRange, user, afterCheckAction, transportService, releasingListener, slotsToForward);
        }, statsFailure -> {
            logger.error("Failed to get node's task stats for detector " + detectorId, statsFailure);
            releasingListener.onFailure(statsFailure);
        }));
    }, releasingListener);
}
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class ADTaskManager method resetHistoricalDetectorTaskState.
/**
 * Runs {@code function} after making sure the latest running historical task is not stale.
 *
 * If there is no running task, or the first task's last update time has not expired, the
 * function runs immediately. Otherwise the task profile is fetched: a task found to be stopped
 * has its state reset (after re-stopping the detector when entity tasks are still reported but
 * the profile carries no task id), while a task that is still running lets the function run and
 * then has stale running entities of an HC detector forwarded for cleanup.
 *
 * @param runningHistoricalTasks running historical tasks; only the first one is inspected
 * @param function function to run
 * @param transportService transport service
 * @param listener action listener
 * @param <T> response type of the listener
 */
private <T> void resetHistoricalDetectorTaskState(List<ADTask> runningHistoricalTasks, AnomalyDetectorFunction function, TransportService transportService, ActionListener<T> listener) {
    // Nothing to reset: no running task, or the first task was updated recently enough.
    if (isNullOrEmpty(runningHistoricalTasks) || !lastUpdateTimeOfHistoricalTaskExpired(runningHistoricalTasks.get(0))) {
        function.execute();
        return;
    }

    ADTask adTask = runningHistoricalTasks.get(0);
    String taskId = adTask.getTaskId();
    AnomalyDetector detector = adTask.getDetector();

    getADTaskProfile(adTask, ActionListener.wrap(taskProfile -> {
        if (!isTaskStopped(taskId, detector, taskProfile)) {
            function.execute();
            // The task is still running: look for stale running entities of an HC detector.
            if (!ADTaskType.HISTORICAL_HC_DETECTOR.name().equals(adTask.getTaskType())) {
                return;
            }
            if (isNullOrEmpty(taskProfile.getRunningEntities()) || !hcBatchTaskExpired(taskProfile.getLatestHCTaskRunTime())) {
                return;
            }
            List<String> staleCandidates = new ArrayList<>(taskProfile.getRunningEntities());
            List<String> entitiesOnWorkers = new ArrayList<>();
            if (taskProfile.getEntityTaskProfiles() != null && !taskProfile.getEntityTaskProfiles().isEmpty()) {
                taskProfile
                    .getEntityTaskProfiles()
                    .forEach(entityProfile -> entitiesOnWorkers.add(convertEntityToString(entityProfile.getEntity(), detector)));
            }
            // Entities the coordinating node lists as running but no worker reports are stale.
            if (staleCandidates.size() > entitiesOnWorkers.size()) {
                staleCandidates.removeAll(entitiesOnWorkers);
                forwardStaleRunningEntitiesToCoordinatingNode(
                    adTask,
                    ADTaskAction.CLEAN_STALE_RUNNING_ENTITIES,
                    transportService,
                    staleCandidates,
                    ActionListener.wrap(
                        forwarded -> logger.debug("Forwarded task to clean stale running entity, task id {}", taskId),
                        forwardFailure -> logger.error("Failed to forward clean stale running entity for task " + taskId, forwardFailure)
                    )
                );
            }
            return;
        }

        logger.debug("Reset task state as stopped, task id: {}", adTask.getTaskId());
        // No task id in the profile of an HC detector while entity task profiles are still
        // reported means the coordinating node doesn't have the HC detector cache.
        boolean coordinatingCacheMissing = taskProfile.getTaskId() == null
            && detector.isMultientityDetector()
            && !isNullOrEmpty(taskProfile.getEntityTaskProfiles());
        if (!coordinatingCacheMissing) {
            resetTaskStateAsStopped(adTask, function, transportService, listener);
            return;
        }
        // If the coordinating node restarted, its HC detector cache is gone while worker nodes
        // still run entity tasks; stop those entity tasks to clean up resources earlier.
        stopHistoricalAnalysis(adTask.getDetectorId(), Optional.of(adTask), null, ActionListener.wrap(stopped -> {
            logger.debug("Restop detector successfully");
            resetTaskStateAsStopped(adTask, function, transportService, listener);
        }, stopFailure -> {
            logger.error("Failed to restop detector ", stopFailure);
            listener.onFailure(stopFailure);
        }));
    }, profileFailure -> {
        logger.error("Failed to get AD task profile for task " + adTask.getTaskId(), profileFailure);
        function.execute();
    }));
}
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class ADTaskManager method getADTaskProfile.
/**
 * Builds the profile of a detector level task from the per-node task profiles.
 *
 * A task profile request is sent to the eligible data nodes. If the response reports failures,
 * the first one is passed to the listener. Otherwise the detector level fields are taken from
 * every node profile that carries a node id, and the entity task profiles of all nodes are
 * collected into one list.
 *
 * @param adDetectorLevelTask detector level task
 * @param listener action listener
 */
private void getADTaskProfile(ADTask adDetectorLevelTask, ActionListener<ADTaskProfile> listener) {
    String detectorId = adDetectorLevelTask.getDetectorId();
    hashRing.getAllEligibleDataNodesWithKnownAdVersion(dataNodes -> {
        ADTaskProfileRequest profileRequest = new ADTaskProfileRequest(detectorId, dataNodes);
        client.execute(ADTaskProfileAction.INSTANCE, profileRequest, ActionListener.wrap(response -> {
            if (response.hasFailures()) {
                listener.onFailure(response.failures().get(0));
                return;
            }
            List<ADEntityTaskProfile> collectedEntityProfiles = new ArrayList<>();
            ADTaskProfile mergedProfile = new ADTaskProfile(adDetectorLevelTask);
            for (ADTaskProfileNodeResponse nodeResponse : response.getNodes()) {
                ADTaskProfile nodeProfile = nodeResponse.getAdTaskProfile();
                if (nodeProfile == null) {
                    continue;
                }
                if (nodeProfile.getNodeId() != null) {
                    // HC detector: task profile from coordinating node
                    // Single entity detector: task profile from worker node
                    copyDetectorLevelFields(nodeProfile, mergedProfile);
                }
                if (nodeProfile.getEntityTaskProfiles() != null) {
                    collectedEntityProfiles.addAll(nodeProfile.getEntityTaskProfiles());
                }
            }
            if (!collectedEntityProfiles.isEmpty()) {
                mergedProfile.setEntityTaskProfiles(collectedEntityProfiles);
            }
            listener.onResponse(mergedProfile);
        }, profileFailure -> {
            logger.error("Failed to get task profile for task " + adDetectorLevelTask.getTaskId(), profileFailure);
            listener.onFailure(profileFailure);
        }));
    }, listener);
}

/**
 * Copies the detector level fields of a node's task profile into the merged profile.
 *
 * @param source task profile returned by a node
 * @param target detector level task profile being assembled
 */
private static void copyDetectorLevelFields(ADTaskProfile source, ADTaskProfile target) {
    target.setTaskId(source.getTaskId());
    target.setShingleSize(source.getShingleSize());
    target.setRcfTotalUpdates(source.getRcfTotalUpdates());
    target.setThresholdModelTrained(source.getThresholdModelTrained());
    target.setThresholdModelTrainingDataSize(source.getThresholdModelTrainingDataSize());
    target.setModelSizeInBytes(source.getModelSizeInBytes());
    target.setNodeId(source.getNodeId());
    target.setTotalEntitiesCount(source.getTotalEntitiesCount());
    target.setDetectorTaskSlots(source.getDetectorTaskSlots());
    target.setPendingEntitiesCount(source.getPendingEntitiesCount());
    target.setRunningEntitiesCount(source.getRunningEntitiesCount());
    target.setRunningEntities(source.getRunningEntities());
    target.setAdTaskType(source.getAdTaskType());
}
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class ADTaskManager method setHCDetectorTaskDone.
/**
 * Set state for HC detector level task when all entities done.
 *
 * The state could be FINISHED,FAILED or STOPPED.
 * 1. If input task state is FINISHED, will check FINISHED entity task count. If
 *  there is no FINISHED entity task, will set HC detector level task as FAILED; otherwise
 *  set as FINISHED. If counting the finished entity tasks fails, the task is set as FAILED.
 * 2. If input task state is not FINISHED, will set HC detector level task's state as the same.
 *
 * The task update itself runs asynchronously in the AD batch task thread pool; the given
 * listener is answered right away with an OK response and does not wait for the update.
 *
 * @param adTask AD task
 * @param state AD task state
 * @param listener action listener
 */
public void setHCDetectorTaskDone(ADTask adTask, ADTaskState state, ActionListener<AnomalyDetectorJobResponse> listener) {
    String detectorId = adTask.getDetectorId();
    String taskId = adTask.isEntityTask() ? adTask.getParentTaskId() : adTask.getTaskId();
    String detectorTaskId = adTask.getDetectorLevelTaskId();
    ActionListener<UpdateResponse> wrappedListener = ActionListener.wrap(response -> {
        logger.info("Historical HC detector done with state: {}. Remove from cache, detector id:{}", state.name(), detectorId);
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    }, e -> {
        // Will reset task state when get detector with task or maintain tasks in hourly cron.
        // Guard against a null message so the cache cleanup below is never skipped.
        if (e instanceof LimitExceededException && e.getMessage() != null && e.getMessage().contains(HC_DETECTOR_TASK_IS_UPDATING)) {
            logger.warn("HC task is updating, skip this update for task: " + taskId);
        } else {
            logger.error("Failed to update task: " + taskId, e);
        }
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    });
    // wait for 2 seconds to acquire updating HC detector task semaphore
    long timeoutInMillis = 2000;
    if (state == ADTaskState.FINISHED) {
        this.countEntityTasksByState(detectorTaskId, ImmutableList.of(ADTaskState.FINISHED), ActionListener.wrap(r -> {
            logger.info("number of finished entity tasks: {}, for detector {}", r, adTask.getDetectorId());
            // Set task as FAILED if no finished entity task; otherwise set as FINISHED
            ADTaskState hcDetectorTaskState = r == 0 ? ADTaskState.FAILED : ADTaskState.FINISHED;
            // execute in AD batch task thread pool in case waiting for semaphore waste any shared OpenSearch thread pool
            threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
                updateADHCDetectorTask(detectorId, taskId, hcDetectorTaskDoneFields(hcDetectorTaskState, null, true), timeoutInMillis, wrappedListener);
            });
        }, e -> {
            logger.error("Failed to get finished entity tasks", e);
            String errorMessage = getErrorMessage(e);
            threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
                // set as FAILED if fail to get finished entity tasks.
                updateADHCDetectorTask(detectorId, taskId, hcDetectorTaskDoneFields(ADTaskState.FAILED, errorMessage, true), timeoutInMillis, wrappedListener);
            });
        }));
    } else {
        threadPool.executor(AD_BATCH_TASK_THREAD_POOL_NAME).execute(() -> {
            updateADHCDetectorTask(detectorId, taskId, hcDetectorTaskDoneFields(state, adTask.getError(), false), timeoutInMillis, wrappedListener);
        });
    }
    listener.onResponse(new AnomalyDetectorJobResponse(taskId, 0, 0, 0, RestStatus.OK));
}

/**
 * Builds the fields written to the HC detector level task when it is done.
 *
 * ImmutableMap rejects null values, so the error field is only added when an error is present;
 * passing a null error straight to {@code ImmutableMap.of} would throw inside the executor and
 * leave the task un-updated.
 *
 * @param taskState state to store in the task
 * @param error error message, may be null
 * @param markProgressComplete whether to set the task progress to 1.0
 * @return fields to update, stamped with the current time as execution end time
 */
private ImmutableMap<String, Object> hcDetectorTaskDoneFields(ADTaskState taskState, String error, boolean markProgressComplete) {
    ImmutableMap.Builder<String, Object> fields = ImmutableMap.builder();
    fields.put(STATE_FIELD, taskState.name());
    if (markProgressComplete) {
        fields.put(TASK_PROGRESS_FIELD, 1.0);
    }
    if (error != null) {
        fields.put(ERROR_FIELD, error);
    }
    fields.put(EXECUTION_END_TIME_FIELD, Instant.now().toEpochMilli());
    return fields.build();
}
Aggregations