use of org.opensearch.ad.transport.ADStatsNodeResponse in project anomaly-detection by opensearch-project.
the class ADTaskManagerTests method setupTaskSlots.
/**
 * Stubs {@code client.execute(...)} so that every call answers its third argument (the
 * listener) with a stats response made of two node entries: one for {@code node1} and one
 * for {@code node2}, each carrying the given used / detector-assigned batch task slot counts.
 *
 * @param node1UsedTaskSlots     value reported for node1 under AD_USED_BATCH_TASK_SLOT_COUNT
 * @param node1AssignedTaskSLots value reported for node1 under AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT
 * @param node2UsedTaskSlots     value reported for node2 under AD_USED_BATCH_TASK_SLOT_COUNT
 * @param node2AssignedTaskSLots value reported for node2 under AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT
 */
private void setupTaskSlots(int node1UsedTaskSlots, int node1AssignedTaskSLots, int node2UsedTaskSlots, int node2AssignedTaskSLots) {
    doAnswer(invocation -> {
        ActionListener<ADStatsNodesResponse> statsListener = invocation.getArgument(2);

        // The cluster name is irrelevant to the stubbed stats, so a random one is used.
        ClusterName clusterName = new ClusterName(randomAlphaOfLength(5));

        ADStatsNodeResponse firstNodeStats = new ADStatsNodeResponse(
            node1,
            ImmutableMap.of(
                InternalStatNames.AD_USED_BATCH_TASK_SLOT_COUNT.getName(), node1UsedTaskSlots,
                InternalStatNames.AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName(), node1AssignedTaskSLots
            )
        );
        ADStatsNodeResponse secondNodeStats = new ADStatsNodeResponse(
            node2,
            ImmutableMap.of(
                InternalStatNames.AD_USED_BATCH_TASK_SLOT_COUNT.getName(), node2UsedTaskSlots,
                InternalStatNames.AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName(), node2AssignedTaskSLots
            )
        );

        // No node failures are simulated: the failure list is empty.
        statsListener.onResponse(
            new ADStatsNodesResponse(clusterName, ImmutableList.of(firstNodeStats, secondNodeStats), ImmutableList.of())
        );
        return null;
    }).when(client).execute(any(), any(), any());
}
use of org.opensearch.ad.transport.ADStatsNodeResponse in project anomaly-detection by opensearch-project.
the class ADStatsResponseTests method testToXContent.
/**
 * Renders an {@code ADStatsResponse} holding one cluster-level stat and an empty node-level
 * response, then checks that the stat can be read back from the parsed output.
 */
@Test
public void testToXContent() throws IOException {
    // Single cluster-level stat that is looked up in the rendered content below.
    Map<String, Object> clusterStats = new HashMap<>();
    clusterStats.put("test_stat", 1);

    ADStatsResponse statsResponse = new ADStatsResponse();
    statsResponse.setClusterStats(clusterStats);
    // Node-level part: default cluster name, no node responses, no failures.
    statsResponse
        .setADStatsNodesResponse(new ADStatsNodesResponse(ClusterName.DEFAULT, Collections.emptyList(), Collections.emptyList()));

    XContentBuilder jsonBuilder = XContentFactory.jsonBuilder();
    statsResponse.toXContent(jsonBuilder);

    XContentParser contentParser = createParser(jsonBuilder);
    assertEquals(1, contentParser.map().get("test_stat"));
}
use of org.opensearch.ad.transport.ADStatsNodeResponse in project anomaly-detection by opensearch-project.
the class ADStatsResponseTests method testGetAndSetADStatsNodesResponse.
/**
 * Checks that the node-level response handed to {@code setADStatsNodesResponse} is the one
 * returned by {@code getADStatsNodesResponse}.
 */
@Test
public void testGetAndSetADStatsNodesResponse() {
    // Node-level response with the default cluster name, no node responses and no failures.
    ADStatsNodesResponse expectedNodesResponse = new ADStatsNodesResponse(
        ClusterName.DEFAULT,
        Collections.emptyList(),
        Collections.emptyList()
    );

    ADStatsResponse statsResponse = new ADStatsResponse();
    statsResponse.setADStatsNodesResponse(expectedNodesResponse);

    assertEquals(expectedNodesResponse, statsResponse.getADStatsNodesResponse());
}
use of org.opensearch.ad.transport.ADStatsNodeResponse in project anomaly-detection by opensearch-project.
the class ADTaskManager method checkTaskSlots.
/**
 * Checks how many batch task slots are free before historical analysis is started or the
 * task lane is scaled, then forwards the request together with the slot count it may use.
 * The check runs on the lead node, which collects the task slot stats of all data nodes.
 *
 * <p>Only callers that obtain a permit from the {@code checkingTaskSlot} semaphore proceed;
 * others are rejected immediately with a FORBIDDEN status. The permit is released after the
 * caller's listener has been notified.
 *
 * @param adTask AD task
 * @param detector detector
 * @param detectionDateRange detection date range
 * @param user user
 * @param afterCheckAction target task action to run after task slot checking
 * @param transportService transport service
 * @param listener action listener
 */
public void checkTaskSlots(ADTask adTask, AnomalyDetector detector, DetectionDateRange detectionDateRange, User user, ADTaskAction afterCheckAction, TransportService transportService, ActionListener<AnomalyDetectorJobResponse> listener) {
    final String detectorId = detector.getDetectorId();
    logger.debug("Start checking task slots for detector: {}, task action: {}", detectorId, afterCheckAction);

    boolean permitAcquired = checkingTaskSlot.tryAcquire();
    if (!permitAcquired) {
        logger.info("Can't acquire checking task slot semaphore for detector {}", detectorId);
        listener.onFailure(new OpenSearchStatusException("Too many historical analysis requests in short time. Please retry later.", RestStatus.FORBIDDEN));
        return;
    }

    // From here on every outcome is reported through this listener so the permit is handed
    // back once the caller has been notified.
    final ActionListener<AnomalyDetectorJobResponse> releasingListener = ActionListener.runAfter(listener, () -> {
        checkingTaskSlot.release(1);
        logger.debug("Release checking task slot semaphore on lead node for detector {}", detectorId);
    });

    hashRing.getNodesWithSameLocalAdVersion(sameVersionNodes -> {
        int maxAdTaskSlots = sameVersionNodes.length * maxAdBatchTaskPerNode;

        ADStatsRequest slotStatsRequest = new ADStatsRequest(sameVersionNodes);
        slotStatsRequest.addAll(ImmutableSet.of(AD_USED_BATCH_TASK_SLOT_COUNT.getName(), AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName()));

        client.execute(ADStatsNodesAction.INSTANCE, slotStatsRequest, ActionListener.wrap(slotStatsResponse -> {
            // usedSlots: entity tasks running on worker nodes.
            // assignedSlots: task slots assigned on coordinating nodes.
            int usedSlots = 0;
            int assignedSlots = 0;
            for (ADStatsNodeResponse nodeStats : slotStatsResponse.getNodes()) {
                usedSlots += (Integer) nodeStats.getStatsMap().get(AD_USED_BATCH_TASK_SLOT_COUNT.getName());
                assignedSlots += (Integer) nodeStats.getStatsMap().get(AD_DETECTOR_ASSIGNED_BATCH_TASK_SLOT_COUNT.getName());
            }
            logger.info("Current total used task slots is {}, total detector assigned task slots is {} when start historical " + "analysis for detector {}", usedSlots, assignedSlots, detectorId);

            // Normally assigned >= used. When a coordinating node has left, the detector task
            // slots cached on it are not visible any more, so assigned can drop below used;
            // take the larger of the two as the number of occupied slots.
            int occupiedSlots = Math.max(usedSlots, assignedSlots);
            if (occupiedSlots >= maxAdTaskSlots) {
                releasingListener.onFailure(new OpenSearchStatusException("No available task slot", RestStatus.BAD_REQUEST));
                return;
            }

            int freeSlots = maxAdTaskSlots - occupiedSlots;
            logger.info("Current available task slots is {} for historical analysis of detector {}", freeSlots, detectorId);

            final int slotsToForward;
            if (ADTaskAction.SCALE_ENTITY_TASK_SLOTS == afterCheckAction) {
                // Scaling is told about every free slot.
                slotsToForward = freeSlots;
            } else if (detector.isMultientityDetector()) {
                // The top-entity check was removed here on purpose: for multi-category HC it
                // is slow (tested with 1.8 billion docs it took more than 20 seconds and
                // timed out, versus about 200ms without it), and the REST API must return
                // quickly. As a result more slots than needed may be approved. Example:
                // 4 data nodes with 2 batch tasks each give 8 free slots; with max running
                // entities per HC of 4, this detector gets 4 slots even if the data index
                // has only 2 entities. That is acceptable because task slots are auto tuned
                // when the historical analysis task starts.
                slotsToForward = Math.min(maxRunningEntitiesPerDetector, freeSlots);
            } else {
                slotsToForward = 1;
            }
            forwardToCoordinatingNode(adTask, detector, detectionDateRange, user, afterCheckAction, transportService, releasingListener, slotsToForward);
        }, statsFailure -> {
            logger.error("Failed to get node's task stats for detector " + detectorId, statsFailure);
            releasingListener.onFailure(statsFailure);
        }));
    }, releasingListener);
}
use of org.opensearch.ad.transport.ADStatsNodeResponse in project anomaly-detection by opensearch-project.
the class ADBatchTaskRunner method dispatchTask.
/**
 * Chooses the node that should run the given batch task and reports it to {@code listener}.
 *
 * <p>The executing-batch-task count and JVM heap usage stats are requested for the nodes
 * supplied by {@code hashRing.getNodesWithSameLocalAdVersion}. Nodes are then filtered in
 * two steps:
 * <ol>
 *   <li>keep nodes whose JVM heap usage is below {@code DEFAULT_JVM_HEAP_USAGE_THRESHOLD};</li>
 *   <li>of those, keep nodes whose executing batch task count is below
 *       {@code maxAdBatchTaskPerNode}.</li>
 * </ol>
 * If either step leaves no candidate, the listener fails with a
 * {@code LimitExceededException}. Otherwise the listener receives the node with the fewest
 * executing batch tasks; ties are broken by the lower JVM heap usage.
 *
 * @param adTask   task to dispatch; its detector id, task id and task type are used in the
 *                 failure message and log line
 * @param listener receives the chosen node, or the failure
 */
private void dispatchTask(ADTask adTask, ActionListener<DiscoveryNode> listener) {
    hashRing.getNodesWithSameLocalAdVersion(dataNodes -> {
        ADStatsRequest adStatsRequest = new ADStatsRequest(dataNodes);
        adStatsRequest.addAll(ImmutableSet.of(AD_EXECUTING_BATCH_TASK_COUNT.getName(), JVM_HEAP_USAGE.getName()));
        client.execute(ADStatsNodesAction.INSTANCE, adStatsRequest, ActionListener.wrap(adStatsResponse -> {
            // Step 1: drop nodes whose heap usage has reached the threshold.
            List<ADStatsNodeResponse> candidateNodeResponse = adStatsResponse
                .getNodes()
                .stream()
                .filter(stat -> (long) stat.getStatsMap().get(JVM_HEAP_USAGE.getName()) < DEFAULT_JVM_HEAP_USAGE_THRESHOLD)
                .collect(Collectors.toList());
            if (candidateNodeResponse.isEmpty()) {
                String errorMessage = "All nodes' memory usage exceeds limitation " + DEFAULT_JVM_HEAP_USAGE_THRESHOLD + "%. " + NO_ELIGIBLE_NODE_TO_RUN_DETECTOR + adTask.getDetectorId();
                logger.warn(errorMessage + ", task id " + adTask.getTaskId() + ", " + adTask.getTaskType());
                listener.onFailure(new LimitExceededException(adTask.getDetectorId(), errorMessage));
                return;
            }

            // Step 2: drop nodes that already run the maximum number of batch tasks.
            candidateNodeResponse = candidateNodeResponse
                .stream()
                .filter(stat -> (Long) stat.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()) < maxAdBatchTaskPerNode)
                .collect(Collectors.toList());
            if (candidateNodeResponse.isEmpty()) {
                String errorMessage = "All nodes' executing batch tasks exceeds limitation " + NO_ELIGIBLE_NODE_TO_RUN_DETECTOR + adTask.getDetectorId();
                logger.warn(errorMessage + ", task id " + adTask.getTaskId() + ", " + adTask.getTaskType());
                listener.onFailure(new LimitExceededException(adTask.getDetectorId(), errorMessage));
                return;
            }

            // Pick the least loaded candidate in one pass instead of sorting the whole list.
            // min() keeps the earlier element on ties, which matches what a stable sort
            // followed by findFirst() would return.
            Optional<ADStatsNodeResponse> targetNode = candidateNodeResponse.stream().min((ADStatsNodeResponse r1, ADStatsNodeResponse r2) -> {
                int result = ((Long) r1.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()))
                    .compareTo((Long) r2.getStatsMap().get(AD_EXECUTING_BATCH_TASK_COUNT.getName()));
                if (result == 0) {
                    // Same task count: prefer the node with the lower JVM heap usage.
                    return ((Long) r1.getStatsMap().get(JVM_HEAP_USAGE.getName()))
                        .compareTo((Long) r2.getStatsMap().get(JVM_HEAP_USAGE.getName()));
                }
                return result;
            });
            // The candidate list is non-empty at this point, so the Optional is always present.
            listener.onResponse(targetNode.get().getNode());
        }, exception -> {
            logger.error("Failed to get node's task stats", exception);
            listener.onFailure(exception);
        }));
    }, listener);
}
Aggregations