Use of org.opensearch.ad.transport.ADStatsRequest in the anomaly-detection project by opensearch-project.
The class ADTaskManager, method checkTaskSlots.
/**
 * Check available task slots before starting historical analysis and scaling the task lane.
 * This check is done on the lead node, which gathers the detector task slots of all
 * data nodes and calculates how many task slots are available.
 *
 * @param adTask AD task
 * @param detector detector
 * @param detectionDateRange detection date range
 * @param user user
 * @param afterCheckAction target task action to run after task slot checking
 * @param transportService transport service
 * @param listener action listener
 */
public void checkTaskSlots(
    ADTask adTask,
    AnomalyDetector detector,
    DetectionDateRange detectionDateRange,
    User user,
    ADTaskAction afterCheckAction,
    TransportService transportService,
    ActionListener<AnomalyDetectorJobResponse> listener
) {
    String detectorId = detector.getDetectorId();
    logger.debug("Start checking task slots for detector: {}, task action: {}", detectorId, afterCheckAction);
    if (!checkingTaskSlot.tryAcquire()) {
        logger.info("Can't acquire checking task slot semaphore for detector {}", detectorId);
        listener.onFailure(
            new OpenSearchStatusException("Too many historical analysis requests in short time. Please retry later.", RestStatus.FORBIDDEN)
        );
        return;
    }
    ActionListener<AnomalyDetectorJobResponse> wrappedActionListener = ActionListener.runAfter(listener, () -> {
        checkingTaskSlot.release(1);
        logger.debug("Release checking task slot semaphore on lead node for detector {}", detectorId);
    });
    hashRing.getNodesWithSameLocalAdVersion(nodes -> {
        int maxAdTaskSlots = nodes.length * maxAdBatchTaskPerNode;
        ADStatsRequest adStatsRequest = new ADStatsRequest(nodes);
        adStatsRequest.addAll(
            ImmutableSet.of(AD_USED_BATCH_TASK_SLOT_COUNT.getName(), AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName())
        );
        client.execute(ADStatsNodesAction.INSTANCE, adStatsRequest, ActionListener.wrap(adStatsResponse -> {
            // Total entity tasks running on worker nodes
            int totalUsedTaskSlots = 0;
            // Total assigned task slots on coordinating nodes
            int totalAssignedTaskSlots = 0;
            for (ADStatsNodeResponse response : adStatsResponse.getNodes()) {
                totalUsedTaskSlots += (int) response.getStatsMap().get(AD_USED_BATCH_TASK_SLOT_COUNT.getName());
                totalAssignedTaskSlots += (int) response.getStatsMap().get(AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName());
            }
            logger.info(
                "Current total used task slots is {}, total detector assigned task slots is {} when start historical analysis for detector {}",
                totalUsedTaskSlots,
                totalAssignedTaskSlots,
                detectorId
            );
            // In the happy case, totalAssignedTaskSlots >= totalUsedTaskSlots. If some coordinating node left, then we can't
            // get the detector task slots cached on it, so it's possible that totalAssignedTaskSlots < totalUsedTaskSlots.
            int currentUsedTaskSlots = Math.max(totalUsedTaskSlots, totalAssignedTaskSlots);
            if (currentUsedTaskSlots >= maxAdTaskSlots) {
                wrappedActionListener.onFailure(new OpenSearchStatusException("No available task slot", RestStatus.BAD_REQUEST));
                return;
            }
            int availableAdTaskSlots = maxAdTaskSlots - currentUsedTaskSlots;
            logger.info("Current available task slots is {} for historical analysis of detector {}", availableAdTaskSlots, detectorId);
            if (ADTaskAction.SCALE_ENTITY_TASK_SLOTS == afterCheckAction) {
                forwardToCoordinatingNode(
                    adTask, detector, detectionDateRange, user, afterCheckAction, transportService, wrappedActionListener, availableAdTaskSlots
                );
                return;
            }
            // It takes a long time to check top entities, especially for multi-category HC. Tested with
            // 1.8 billion docs for multi-category HC, it took more than 20 seconds and caused a timeout.
            // After removing the top entity check, it took about 200ms to return. So just remove it to make
            // sure the REST API can return quickly.
            // We may assign more task slots than needed. For example, the cluster has 4 data nodes and each node can run 2
            // batch tasks, so the available task slot number is 8. If max running entities per HC detector is 4,
            // then we will assign 4 task slots to this HC detector (4 is less than 8). If the data index
            // only has 2 entities, we assign 2 more task slots than actually needed. But that's ok, as we
            // will auto-tune the task slots when the historical analysis task starts.
            int approvedTaskSlots = detector.isMultientityDetector() ? Math.min(maxRunningEntitiesPerDetector, availableAdTaskSlots) : 1;
            forwardToCoordinatingNode(
                adTask, detector, detectionDateRange, user, afterCheckAction, transportService, wrappedActionListener, approvedTaskSlots
            );
        }, exception -> {
            logger.error("Failed to get node's task stats for detector " + detectorId, exception);
            wrappedActionListener.onFailure(exception);
        }));
    }, wrappedActionListener);
}
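Stripped of the task-slot bookkeeping, the pattern above is: build an ADStatsRequest for the nodes returned by the hash ring, add the stat names to collect, execute ADStatsNodesAction, and read each node's stats map from the response. Below is a minimal sketch of just that pattern, assuming the same client and hashRing fields as ADTaskManager; the helper name sumStatAcrossNodes and the Number-based summing are illustrative additions, not part of the plugin.

private void sumStatAcrossNodes(String statName, ActionListener<Long> listener) {
    // Sketch only: collect one AD stat from all nodes running the same local AD version
    // and report the cluster-wide sum to the listener.
    hashRing.getNodesWithSameLocalAdVersion(nodes -> {
        ADStatsRequest adStatsRequest = new ADStatsRequest(nodes);
        adStatsRequest.addAll(ImmutableSet.of(statName));
        client.execute(ADStatsNodesAction.INSTANCE, adStatsRequest, ActionListener.wrap(adStatsResponse -> {
            long total = 0;
            for (ADStatsNodeResponse nodeResponse : adStatsResponse.getNodes()) {
                // Each node reports its stats as a map keyed by stat name; counters may be Integer or Long.
                total += ((Number) nodeResponse.getStatsMap().get(statName)).longValue();
            }
            listener.onResponse(total);
        }, listener::onFailure));
    }, listener);
}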
Use of org.opensearch.ad.transport.ADStatsRequest in the anomaly-detection project by opensearch-project.
The class ADBatchTaskRunner, method dispatchTask.
private void dispatchTask(ADTask adTask, ActionListener<DiscoveryNode> listener) {
    hashRing.getNodesWithSameLocalAdVersion(dataNodes -> {
        ADStatsRequest adStatsRequest = new ADStatsRequest(dataNodes);
        adStatsRequest.addAll(ImmutableSet.of(AD_EXECUTING_BATCH_TASK_COUNT.getName(), JVM_HEAP_USAGE.getName()));
        client.execute(ADStatsNodesAction.INSTANCE, adStatsRequest, ActionListener.wrap(adStatsResponse -> {
            List<ADStatsNodeResponse> candidateNodeResponse = adStatsResponse
                .getNodes()
                .stream()
                .filter(stat -> (long) stat.getStatsMap().get(JVM_HEAP_USAGE.getName()) < DEFAULT_JVM_HEAP_USAGE_THRESHOLD)
                .collect(Collectors.toList());
            if (candidateNodeResponse.size() == 0) {
                StringBuilder errorMessageBuilder = new StringBuilder("All nodes' memory usage exceeds limitation ")
                    .append(DEFAULT_JVM_HEAP_USAGE_THRESHOLD)
                    .append("%. ")
                    .append(NO_ELIGIBLE_NODE_TO_RUN_DETECTOR)
                    .append(adTask.getDetectorId());
                String errorMessage = errorMessageBuilder.toString();
                logger.warn(errorMessage + ", task id " + adTask.getTaskId() + ", " + adTask.getTaskType());
                listener.onFailure(new LimitExceededException(adTask.getDetectorId(), errorMessage));
                return;
            }
            candidateNodeResponse = candidateNodeResponse
                .stream()
                .filter(stat -> (Long) stat.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()) < maxAdBatchTaskPerNode)
                .collect(Collectors.toList());
            if (candidateNodeResponse.size() == 0) {
                StringBuilder errorMessageBuilder = new StringBuilder("All nodes' executing batch tasks exceeds limitation ")
                    .append(NO_ELIGIBLE_NODE_TO_RUN_DETECTOR)
                    .append(adTask.getDetectorId());
                String errorMessage = errorMessageBuilder.toString();
                logger.warn(errorMessage + ", task id " + adTask.getTaskId() + ", " + adTask.getTaskType());
                listener.onFailure(new LimitExceededException(adTask.getDetectorId(), errorMessage));
                return;
            }
            Optional<ADStatsNodeResponse> targetNode = candidateNodeResponse
                .stream()
                .sorted((ADStatsNodeResponse r1, ADStatsNodeResponse r2) -> {
                    int result = ((Long) r1.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()))
                        .compareTo((Long) r2.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()));
                    if (result == 0) {
                        // If multiple nodes run the same number of batch tasks, break the tie by JVM heap usage.
                        return ((Long) r1.getStatsMap().get(JVM_HEAP_USAGE.getName()))
                            .compareTo((Long) r2.getStatsMap().get(JVM_HEAP_USAGE.getName()));
                    }
                    return result;
                })
                .findFirst();
            listener.onResponse(targetNode.get().getNode());
        }, exception -> {
            logger.error("Failed to get node's task stats", exception);
            listener.onFailure(exception);
        }));
    }, listener);
}
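The selection at the end of dispatchTask reads as a two-key ordering: fewest executing batch tasks first, with JVM heap usage as the tie-breaker. A minimal sketch of that ordering extracted into a standalone helper follows; the name leastLoadedNode is hypothetical, while the stat keys are the ones requested above.

private Optional<ADStatsNodeResponse> leastLoadedNode(List<ADStatsNodeResponse> candidates) {
    // Order candidates by executing batch task count, then by JVM heap usage, and take the smallest.
    Comparator<ADStatsNodeResponse> byTaskCount = Comparator
        .comparingLong(r -> (long) r.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()));
    Comparator<ADStatsNodeResponse> byHeapUsage = Comparator
        .comparingLong(r -> (long) r.getStatsMap().get(JVM_HEAP_USAGE.getName()));
    return candidates.stream().min(byTaskCount.thenComparing(byHeapUsage));
}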
Use of org.opensearch.ad.transport.ADStatsRequest in the anomaly-detection project by opensearch-project.
The class RestStatsAnomalyDetectorAction, method getRequest.
/**
 * Creates an ADStatsRequest from a RestRequest.
 *
 * @param request RestRequest
 * @return ADStatsRequest request containing the stats to be retrieved
 */
private ADStatsRequest getRequest(RestRequest request) {
    // parse the nodes the user wants to query the stats for
    String nodesIdsStr = request.param("nodeId");
    Set<String> validStats = adStats.getStats().keySet();
    ADStatsRequest adStatsRequest = null;
    if (!Strings.isEmpty(nodesIdsStr)) {
        String[] nodeIdsArr = nodesIdsStr.split(",");
        adStatsRequest = new ADStatsRequest(nodeIdsArr);
    } else {
        DiscoveryNode[] dataNodes = nodeFilter.getEligibleDataNodes();
        adStatsRequest = new ADStatsRequest(dataNodes);
    }
    adStatsRequest.timeout(request.param("timeout"));
    // parse the stats the user wants to see
    HashSet<String> statsSet = null;
    String statsStr = request.param("stat");
    if (!Strings.isEmpty(statsStr)) {
        statsSet = new HashSet<>(Arrays.asList(statsStr.split(",")));
    }
    if (statsSet == null) {
        // retrieve all stats if none are specified
        adStatsRequest.addAll(validStats);
    } else if (statsSet.size() == 1 && statsSet.contains(ADStatsRequest.ALL_STATS_KEY)) {
        adStatsRequest.addAll(validStats);
    } else if (statsSet.contains(ADStatsRequest.ALL_STATS_KEY)) {
        throw new IllegalArgumentException(
            "Request " + request.path() + " contains " + ADStatsRequest.ALL_STATS_KEY + " and individual stats"
        );
    } else {
        Set<String> invalidStats = new TreeSet<>();
        for (String stat : statsSet) {
            if (validStats.contains(stat)) {
                adStatsRequest.addStat(stat);
            } else {
                invalidStats.add(stat);
            }
        }
        if (!invalidStats.isEmpty()) {
            throw new IllegalArgumentException(unrecognized(request, invalidStats, adStatsRequest.getStatsToBeRetrieved(), "stat"));
        }
    }
    return adStatsRequest;
}
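For comparison, the same kind of request can be assembled directly, mirroring the two branches getRequest takes on the nodeId parameter. This is a sketch, not handler code: the node IDs and the stat name below are placeholders, and adStats/nodeFilter are the same fields the handler uses above.

// Stats from two specific nodes (hypothetical node IDs), one named stat, with an explicit timeout.
ADStatsRequest requestForNodes = new ADStatsRequest(new String[] { "node-1", "node-2" });
requestForNodes.addStat("ad_execute_request_count"); // placeholder; must be one of adStats.getStats().keySet()
requestForNodes.timeout("10s");

// Stats from all eligible data nodes, requesting every registered stat, as the handler does when
// the "stat" parameter is absent or equals ADStatsRequest.ALL_STATS_KEY.
ADStatsRequest requestForAll = new ADStatsRequest(nodeFilter.getEligibleDataNodes());
requestForAll.addAll(adStats.getStats().keySet());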