Use of org.opensearch.transport.TransportService in project anomaly-detection by opensearch-project.
The class ADTaskManager, method cleanDetectorCache.
protected void cleanDetectorCache(ADTask adTask, TransportService transportService, AnomalyDetectorFunction function) {
    String detectorId = adTask.getDetectorId();
    String taskId = adTask.getTaskId();
    cleanDetectorCache(adTask, transportService, function, ActionListener.wrap(r -> {
        logger.debug("Successfully cleaned cache for detector {}, task {}", detectorId, taskId);
    }, e -> {
        logger.error("Failed to clean cache for detector " + detectorId + ", task " + taskId, e);
    }));
}
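The overload above is a fire-and-forget wrapper: it delegates to the listener-based overload with a listener that only logs. Below is a minimal plain-JDK sketch of that delegation pattern; CacheCleaner and its methods are hypothetical stand-ins, not the OpenSearch ActionListener API.

import java.util.function.Consumer;
import java.util.logging.Logger;

// Hypothetical, plain-JDK model of the delegation pattern used by cleanDetectorCache.
class CacheCleaner {
    private static final Logger logger = Logger.getLogger(CacheCleaner.class.getName());

    // Callback-based variant: the caller decides how to handle success and failure.
    void cleanCache(String detectorId, String taskId, Runnable onSuccess, Consumer<Exception> onFailure) {
        try {
            // ... remove detector/task state from the local cache here ...
            onSuccess.run();
        } catch (Exception e) {
            onFailure.accept(e);
        }
    }

    // Fire-and-forget variant: delegates with handlers that only log, mirroring ActionListener.wrap(...) above.
    void cleanCache(String detectorId, String taskId) {
        cleanCache(
            detectorId,
            taskId,
            () -> logger.fine("Successfully cleaned cache for detector " + detectorId + ", task " + taskId),
            e -> logger.severe("Failed to clean cache for detector " + detectorId + ", task " + taskId + ": " + e)
        );
    }
}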
Use of org.opensearch.transport.TransportService in project anomaly-detection by opensearch-project.
The class ADTaskManager, method checkTaskSlots.
/**
 * Check available task slots before starting historical analysis and scaling the task lane.
 * This check runs on the lead node, which gathers detector task slots from all data nodes
 * and calculates how many task slots are available.
 *
 * @param adTask AD task
 * @param detector detector
 * @param detectionDateRange detection date range
 * @param user user
 * @param afterCheckAction target task action to run after the task slot check
 * @param transportService transport service
 * @param listener action listener
 */
public void checkTaskSlots(
    ADTask adTask,
    AnomalyDetector detector,
    DetectionDateRange detectionDateRange,
    User user,
    ADTaskAction afterCheckAction,
    TransportService transportService,
    ActionListener<AnomalyDetectorJobResponse> listener
) {
    String detectorId = detector.getDetectorId();
    logger.debug("Start checking task slots for detector: {}, task action: {}", detectorId, afterCheckAction);
    if (!checkingTaskSlot.tryAcquire()) {
        logger.info("Can't acquire checking task slot semaphore for detector {}", detectorId);
        listener.onFailure(
            new OpenSearchStatusException("Too many historical analysis requests in short time. Please retry later.", RestStatus.FORBIDDEN)
        );
        return;
    }
    ActionListener<AnomalyDetectorJobResponse> wrappedActionListener = ActionListener.runAfter(listener, () -> {
        checkingTaskSlot.release(1);
        logger.debug("Release checking task slot semaphore on lead node for detector {}", detectorId);
    });
    hashRing.getNodesWithSameLocalAdVersion(nodes -> {
        int maxAdTaskSlots = nodes.length * maxAdBatchTaskPerNode;
        ADStatsRequest adStatsRequest = new ADStatsRequest(nodes);
        adStatsRequest.addAll(ImmutableSet.of(AD_USED_BATCH_TASK_SLOT_COUNT.getName(), AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName()));
        client.execute(ADStatsNodesAction.INSTANCE, adStatsRequest, ActionListener.wrap(adStatsResponse -> {
            // Total entity tasks running on worker nodes
            int totalUsedTaskSlots = 0;
            // Total assigned task slots on coordinating nodes
            int totalAssignedTaskSlots = 0;
            for (ADStatsNodeResponse response : adStatsResponse.getNodes()) {
                totalUsedTaskSlots += (int) response.getStatsMap().get(AD_USED_BATCH_TASK_SLOT_COUNT.getName());
                totalAssignedTaskSlots += (int) response.getStatsMap().get(AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName());
            }
            logger.info(
                "Current total used task slots is {}, total detector assigned task slots is {} when start historical analysis for detector {}",
                totalUsedTaskSlots,
                totalAssignedTaskSlots,
                detectorId
            );
            // In happy case, totalAssignedTaskSlots >= totalUsedTaskSlots. If some coordinating node left, then we can't
            // get detector task slots cached on it, so it's possible that totalAssignedTaskSlots < totalUsedTaskSlots.
            int currentUsedTaskSlots = Math.max(totalUsedTaskSlots, totalAssignedTaskSlots);
            if (currentUsedTaskSlots >= maxAdTaskSlots) {
                wrappedActionListener.onFailure(new OpenSearchStatusException("No available task slot", RestStatus.BAD_REQUEST));
                return;
            }
            int availableAdTaskSlots = maxAdTaskSlots - currentUsedTaskSlots;
            logger.info("Current available task slots is {} for historical analysis of detector {}", availableAdTaskSlots, detectorId);
            if (ADTaskAction.SCALE_ENTITY_TASK_SLOTS == afterCheckAction) {
                forwardToCoordinatingNode(
                    adTask,
                    detector,
                    detectionDateRange,
                    user,
                    afterCheckAction,
                    transportService,
                    wrappedActionListener,
                    availableAdTaskSlots
                );
                return;
            }
            // It takes long time to check top entities especially for multi-category HC. Tested with
            // 1.8 billion docs for multi-category HC, it took more than 20 seconds and caused timeout.
            // By removing top entity check, it took about 200ms to return. So just remove it to make
            // sure REST API can return quickly.
            // We may assign more task slots. For example, cluster has 4 data nodes, each node can run 2
            // batch tasks, so the available task slot number is 8. If max running entities per HC is 4,
            // then we will assign 4 tasks slots to this HC detector (4 is less than 8). The data index
            // only has 2 entities. So we assign 2 more task slots than actual need. But it's ok as we
            // will auto tune task slot when historical analysis task starts.
            int approvedTaskSlots = detector.isMultientityDetector() ? Math.min(maxRunningEntitiesPerDetector, availableAdTaskSlots) : 1;
            forwardToCoordinatingNode(
                adTask,
                detector,
                detectionDateRange,
                user,
                afterCheckAction,
                transportService,
                wrappedActionListener,
                approvedTaskSlots
            );
        }, exception -> {
            logger.error("Failed to get node's task stats for detector " + detectorId, exception);
            wrappedActionListener.onFailure(exception);
        }));
    }, wrappedActionListener);
}
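To make the slot arithmetic above concrete, here is a small self-contained Java sketch of the same calculation; TaskSlotMath and its parameter names are illustrative, not part of ADTaskManager. With the numbers from the comment above (4 data nodes, 2 batch tasks per node, nothing running, HC detector capped at 4 running entities) it approves min(4, 8) = 4 slots.

// Illustrative model of the task-slot arithmetic performed in checkTaskSlots (hypothetical names).
final class TaskSlotMath {

    static int approvedTaskSlots(
        int dataNodes,
        int maxAdBatchTaskPerNode,
        int totalUsedTaskSlots,        // entity tasks running on worker nodes
        int totalAssignedTaskSlots,    // slots assigned on coordinating nodes
        boolean multiEntityDetector,
        int maxRunningEntitiesPerDetector
    ) {
        int maxAdTaskSlots = dataNodes * maxAdBatchTaskPerNode;
        // A departed coordinating node can make assigned < used, so take the max as "in use".
        int currentUsedTaskSlots = Math.max(totalUsedTaskSlots, totalAssignedTaskSlots);
        if (currentUsedTaskSlots >= maxAdTaskSlots) {
            return 0; // no available task slot
        }
        int availableAdTaskSlots = maxAdTaskSlots - currentUsedTaskSlots;
        // Single-entity detectors need exactly one slot; HC detectors are capped by the setting.
        return multiEntityDetector ? Math.min(maxRunningEntitiesPerDetector, availableAdTaskSlots) : 1;
    }

    public static void main(String[] args) {
        // 4 data nodes, 2 batch tasks per node, nothing running yet, HC detector capped at 4 entities.
        System.out.println(approvedTaskSlots(4, 2, 0, 0, true, 4)); // prints 4
    }
}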
Use of org.opensearch.transport.TransportService in project anomaly-detection by opensearch-project.
The class ADTaskManager, method resetHistoricalDetectorTaskState.
private <T> void resetHistoricalDetectorTaskState(
    List<ADTask> runningHistoricalTasks,
    AnomalyDetectorFunction function,
    TransportService transportService,
    ActionListener<T> listener
) {
    if (isNullOrEmpty(runningHistoricalTasks)) {
        function.execute();
        return;
    }
    ADTask adTask = runningHistoricalTasks.get(0);
    // If the historical task's last update time has not expired, the task is still being actively
    // updated and no reset is needed. Otherwise check the task profile and reset stale tasks, so we
    // don't return a wrong (stale running) state when get historical task with get detector API.
    if (!lastUpdateTimeOfHistoricalTaskExpired(adTask)) {
        function.execute();
        return;
    }
    String taskId = adTask.getTaskId();
    AnomalyDetector detector = adTask.getDetector();
    getADTaskProfile(adTask, ActionListener.wrap(taskProfile -> {
        boolean taskStopped = isTaskStopped(taskId, detector, taskProfile);
        if (taskStopped) {
            logger.debug("Reset task state as stopped, task id: {}", adTask.getTaskId());
            if (taskProfile.getTaskId() == null // This means coordinating node doesn't have HC detector cache
                && detector.isMultientityDetector()
                && !isNullOrEmpty(taskProfile.getEntityTaskProfiles())) {
                // If coordinating node restarted, HC detector cache on it will be gone. But worker node still
                // runs entity tasks, we'd better stop these entity tasks to clean up resource earlier.
                stopHistoricalAnalysis(adTask.getDetectorId(), Optional.of(adTask), null, ActionListener.wrap(r -> {
                    logger.debug("Restop detector successfully");
                    resetTaskStateAsStopped(adTask, function, transportService, listener);
                }, e -> {
                    logger.error("Failed to restop detector ", e);
                    listener.onFailure(e);
                }));
            } else {
                resetTaskStateAsStopped(adTask, function, transportService, listener);
            }
        } else {
            function.execute();
            // If still running, check if there is any stale running entities and clean them
            if (ADTaskType.HISTORICAL_HC_DETECTOR.name().equals(adTask.getTaskType())) {
                // Check if any running entity is no longer running on a worker node. If so, remove it from the
                // coordinating node cache and poll next entity from pending entity queue and run it.
                if (!isNullOrEmpty(taskProfile.getRunningEntities()) && hcBatchTaskExpired(taskProfile.getLatestHCTaskRunTime())) {
                    List<String> runningTasksInCoordinatingNodeCache = new ArrayList<>(taskProfile.getRunningEntities());
                    List<String> runningTasksOnWorkerNode = new ArrayList<>();
                    if (taskProfile.getEntityTaskProfiles() != null && taskProfile.getEntityTaskProfiles().size() > 0) {
                        taskProfile
                            .getEntityTaskProfiles()
                            .forEach(entryTask -> runningTasksOnWorkerNode.add(convertEntityToString(entryTask.getEntity(), detector)));
                    }
                    if (runningTasksInCoordinatingNodeCache.size() > runningTasksOnWorkerNode.size()) {
                        runningTasksInCoordinatingNodeCache.removeAll(runningTasksOnWorkerNode);
                        forwardStaleRunningEntitiesToCoordinatingNode(
                            adTask,
                            ADTaskAction.CLEAN_STALE_RUNNING_ENTITIES,
                            transportService,
                            runningTasksInCoordinatingNodeCache,
                            ActionListener.wrap(
                                res -> logger.debug("Forwarded task to clean stale running entity, task id {}", taskId),
                                ex -> logger.error("Failed to forward clean stale running entity for task " + taskId, ex)
                            )
                        );
                    }
                }
            }
        }
    }, e -> {
        logger.error("Failed to get AD task profile for task " + adTask.getTaskId(), e);
        function.execute();
    }));
}
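The stale-entity cleanup above reduces to a list difference: entities the coordinating node caches as running minus entities actually reported by worker nodes. A minimal plain-JDK sketch of that step (StaleEntityCheck and the entity names are hypothetical):

import java.util.ArrayList;
import java.util.List;

// Illustrative model of the stale running-entity detection (hypothetical names).
final class StaleEntityCheck {

    // Entities cached as "running" on the coordinating node but not reported by any worker node.
    static List<String> findStaleRunningEntities(List<String> coordinatingNodeCache, List<String> workerNodeEntities) {
        List<String> stale = new ArrayList<>(coordinatingNodeCache);
        stale.removeAll(workerNodeEntities);
        return stale;
    }

    public static void main(String[] args) {
        List<String> cached = List.of("entityA", "entityB", "entityC");
        List<String> onWorkers = List.of("entityA");
        // entityB and entityC would be forwarded with ADTaskAction.CLEAN_STALE_RUNNING_ENTITIES.
        System.out.println(findStaleRunningEntities(cached, onWorkers)); // [entityB, entityC]
    }
}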
Use of org.opensearch.transport.TransportService in project anomaly-detection by opensearch-project.
The class ADTaskManager, method initRealtimeTaskCacheAndCleanupStaleCache.
/**
 * Init the realtime task cache and clean up the realtime task cache on the old coordinating node.
 * Realtime AD depends on the job scheduler to choose a node (the job coordinating node) to run the
 * AD job. Nodes that hold a primary or replica shard of the AD job index are candidates to run the
 * AD job. The job scheduler builds a hash ring on these candidate nodes and chooses one to run the
 * AD job. If an AD job index shard relocates, for example when a new node joins the cluster, the
 * job scheduler rebuilds the hash ring and may choose a different node to run the AD job. So we
 * need to init the realtime task cache on the new AD job coordinating node and clean up the cache
 * on the old coordinating node.
 *
 * If the realtime task cache is initialized for the first time on this node, the listener returns
 * true; otherwise it returns false.
 *
 * @param detectorId detector id
 * @param detector anomaly detector
 * @param transportService transport service
 * @param listener listener
 */
public void initRealtimeTaskCacheAndCleanupStaleCache(
    String detectorId,
    AnomalyDetector detector,
    TransportService transportService,
    ActionListener<Boolean> listener
) {
    try {
        if (adTaskCacheManager.getRealtimeTaskCache(detectorId) != null) {
            listener.onResponse(false);
            return;
        }
        getAndExecuteOnLatestDetectorLevelTask(detectorId, REALTIME_TASK_TYPES, (adTaskOptional) -> {
            if (!adTaskOptional.isPresent()) {
                logger.debug("Can't find realtime task for detector {}, init realtime task cache directly", detectorId);
                AnomalyDetectorFunction function = () -> createNewADTask(
                    detector,
                    null,
                    detector.getUser(),
                    clusterService.localNode().getId(),
                    ActionListener.wrap(r -> {
                        logger.info("Recreate realtime task successfully for detector {}", detectorId);
                        adTaskCacheManager.initRealtimeTaskCache(detectorId, detector.getDetectorIntervalInMilliseconds());
                        listener.onResponse(true);
                    }, e -> {
                        logger.error("Failed to recreate realtime task for detector " + detectorId, e);
                        listener.onFailure(e);
                    })
                );
                recreateRealtimeTask(function, listener);
                return;
            }
            ADTask adTask = adTaskOptional.get();
            String localNodeId = clusterService.localNode().getId();
            String oldCoordinatingNode = adTask.getCoordinatingNode();
            if (oldCoordinatingNode != null && !localNodeId.equals(oldCoordinatingNode)) {
                logger.warn("AD realtime job coordinating node changed from {} to this node {} for detector {}", oldCoordinatingNode, localNodeId, detectorId);
                cleanDetectorCache(adTask, transportService, () -> {
                    logger.info("Realtime task cache cleaned on old coordinating node {} for detector {}", oldCoordinatingNode, detectorId);
                    adTaskCacheManager.initRealtimeTaskCache(detectorId, detector.getDetectorIntervalInMilliseconds());
                    listener.onResponse(true);
                }, listener);
            } else {
                logger.info("Init realtime task cache for detector {}", detectorId);
                adTaskCacheManager.initRealtimeTaskCache(detectorId, detector.getDetectorIntervalInMilliseconds());
                listener.onResponse(true);
            }
        }, transportService, false, listener);
    } catch (Exception e) {
        logger.error("Failed to init realtime task cache for " + detectorId, e);
        listener.onFailure(e);
    }
}
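A minimal, synchronous sketch of the decision above: if the latest realtime task was coordinated by a different node, clean that node's cache first, then initialize the local cache; otherwise initialize directly. The real method does the cleanup asynchronously through cleanDetectorCache's listener; RealtimeCacheInit and its parameters here are hypothetical.

// Illustrative, synchronous model of the coordinating-node check in
// initRealtimeTaskCacheAndCleanupStaleCache (all names are hypothetical).
final class RealtimeCacheInit {

    static void initCache(String localNodeId, String oldCoordinatingNode, Runnable cleanOldNodeCache, Runnable initLocalCache) {
        if (oldCoordinatingNode != null && !localNodeId.equals(oldCoordinatingNode)) {
            // The job scheduler moved the AD job to this node: clean the stale cache
            // on the old coordinating node before initializing the local cache.
            cleanOldNodeCache.run();
        }
        initLocalCache.run();
    }

    public static void main(String[] args) {
        initCache(
            "node-2",
            "node-1",
            () -> System.out.println("clean realtime task cache for old coordinating node node-1"),
            () -> System.out.println("init realtime task cache on this node")
        );
    }
}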
Use of org.opensearch.transport.TransportService in project anomaly-detection by opensearch-project.
The class ADTaskManager, method maintainRunningHistoricalTask.
private void maintainRunningHistoricalTask(ConcurrentLinkedQueue<ADTask> taskQueue, TransportService transportService) {
    ADTask adTask = taskQueue.poll();
    if (adTask == null) {
        return;
    }
    threadPool.schedule(() -> {
        resetHistoricalDetectorTaskState(ImmutableList.of(adTask), () -> {
            logger.debug("Finished maintaining running historical task {}", adTask.getTaskId());
            maintainRunningHistoricalTask(taskQueue, transportService);
        }, transportService, ActionListener.wrap(r -> {
            logger.debug("Reset historical task state done for task {}, detector {}", adTask.getTaskId(), adTask.getDetectorId());
        }, e -> {
            logger.error("Failed to reset historical task state for task " + adTask.getTaskId(), e);
        }));
    }, TimeValue.timeValueSeconds(DEFAULT_MAINTAIN_INTERVAL_IN_SECONDS), AD_BATCH_TASK_THREAD_POOL_NAME);
}
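The maintenance loop above polls one task, handles it, and only then schedules itself again, so at most one historical task is maintained at a time with a fixed delay between tasks. A minimal plain-JDK sketch of that pattern using ScheduledExecutorService (QueueMaintainer and the delay value are illustrative; the real code uses the AD batch task thread pool and DEFAULT_MAINTAIN_INTERVAL_IN_SECONDS):

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Illustrative model of maintainRunningHistoricalTask's "poll one, then schedule the next" pattern.
final class QueueMaintainer {
    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    private final long delaySeconds;

    QueueMaintainer(long delaySeconds) {
        this.delaySeconds = delaySeconds;
    }

    void maintain(Queue<String> taskQueue) {
        String taskId = taskQueue.poll();
        if (taskId == null) {
            scheduler.shutdown(); // queue drained, stop scheduling
            return;
        }
        scheduler.schedule(() -> {
            System.out.println("maintaining historical task " + taskId);
            // Only after the current task is handled do we move on to the next one,
            // so at most one maintenance job runs at a time per queue.
            maintain(taskQueue);
        }, delaySeconds, TimeUnit.SECONDS);
    }

    public static void main(String[] args) {
        Queue<String> queue = new ConcurrentLinkedQueue<>();
        queue.add("task-1");
        queue.add("task-2");
        new QueueMaintainer(1).maintain(queue);
    }
}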