Use of org.opensearch.ad.common.exception.LimitExceededException in project anomaly-detection by opensearch-project.
Class AnomalyResultTransportAction, method doExecute.
/**
 * Handles one anomaly-result request for a detector.
 *
 * <p>Steps performed here:
 * <ol>
 * <li>Stash the current thread context for the duration of this call.</li>
 * <li>Wrap the caller's listener so that every completion removes the detector id from
 * {@code hcDetectors}, and every failure updates the failure stats unless the exception is
 * an {@code AnomalyDetectionException} that opted out of stats.</li>
 * <li>If the AD plugin is disabled, raise an {@code EndRunException} that is excluded from
 * stats; it reaches the wrapped listener through the outer catch.</li>
 * <li>Count the request, then fail with a {@code LimitExceededException} when the circuit
 * breaker is open.</li>
 * <li>Otherwise fetch the detector and continue in {@code onGetDetector}.</li>
 * </ol>
 *
 * <p>Exception model. Every exception thrown by AD is a subclass of AnomalyDetectionException.
 * ClientException is a subclass of AnomalyDetectionException, and every exception visible to
 * the client is under ClientVisible. Two classes directly extend ClientException:
 * <ul>
 * <li>InternalFailure: root cause unknown, possibly transient. The detector can keep running.</li>
 * <li>EndRunException: failures that might impact the customer. Its endNow() method tells the
 * client whether to terminate the detector immediately:
 * <ul>
 * <li>endNow() == true: unrecoverable issue; terminate the detector run immediately.</li>
 * <li>endNow() == false: maybe unrecoverable but worth retrying; wait for a few more
 * requests before terminating the detector run.</li>
 * </ul>
 * </li>
 * </ul>
 *
 * <p>AD may be unable to produce an anomaly grade yet still find a feature vector (for example
 * when the shingle is not ready). In that case AD puts NaN as the anomaly grade and returns the
 * feature vector. If AD cannot even find a feature vector, it throws EndRunException when there
 * is an issue; otherwise it returns an empty response (all numeric fields are Double.NaN and the
 * feature array is empty, so that customers can write painless scripts against it).
 *
 * <p>Known causes of EndRunException with endNow returning false:
 * <ul>
 * <li>training data for cold start not available</li>
 * <li>cold start cannot succeed</li>
 * <li>unknown prediction error</li>
 * <li>memory circuit breaker tripped</li>
 * <li>invalid search query</li>
 * </ul>
 *
 * <p>Known causes of EndRunException with endNow returning true:
 * <ul>
 * <li>a model partition's memory size reached its limit</li>
 * <li>the models' total memory size reached its limit</li>
 * <li>trouble querying feature data because the index does not exist or all features have
 * been disabled</li>
 * <li>anomaly detector is not available</li>
 * <li>AD plugin is disabled</li>
 * <li>training data is invalid due to serious internal bug(s)</li>
 * </ul>
 *
 * <p>Known causes of InternalFailure:
 * <ul>
 * <li>threshold model node is not available</li>
 * <li>cluster read/write is blocked</li>
 * <li>cold start has not finished</li>
 * <li>failure to get all of the RCF model nodes' responses</li>
 * <li>failure to get the threshold model node's response</li>
 * <li>RCF/threshold model node failing to get a checkpoint to restore the model before timeout</li>
 * <li>detection is throttled because the previous detection query is still running</li>
 * </ul>
 *
 * @param task          the transport task (not used directly in this method)
 * @param actionRequest the raw request, converted via {@code AnomalyResultRequest.fromActionRequest}
 * @param listener      receives the result or the failure
 */
@Override
protected void doExecute(Task task, ActionRequest actionRequest, ActionListener<AnomalyResultResponse> listener) {
    try (ThreadContext.StoredContext stashed = client.threadPool().getThreadContext().stashContext()) {
        AnomalyResultRequest request = AnomalyResultRequest.fromActionRequest(actionRequest);
        String adID = request.getAdID();
        ActionListener<AnomalyResultResponse> delegate = listener;

        // Re-point the parameter at the wrapper so that the outer catch below also reports
        // through it once the wrapper exists.
        listener = ActionListener.wrap(response -> {
            hcDetectors.remove(adID);
            delegate.onResponse(response);
        }, failure -> {
            // AD exceptions may opt out of the failure stats; anything else is always counted.
            boolean excludedFromStats = failure instanceof AnomalyDetectionException
                && !((AnomalyDetectionException) failure).isCountedInStats();
            if (!excludedFromStats) {
                adStats.getStat(StatNames.AD_EXECUTE_FAIL_COUNT.getName()).increment();
                if (hcDetectors.contains(adID)) {
                    adStats.getStat(StatNames.AD_HC_EXECUTE_FAIL_COUNT.getName()).increment();
                }
            }
            hcDetectors.remove(adID);
            delegate.onFailure(failure);
        });

        if (!EnabledSetting.isADPluginEnabled()) {
            throw new EndRunException(adID, CommonErrorMessages.DISABLED_ERR_MSG, true).countedInStats(false);
        }

        adStats.getStat(StatNames.AD_EXECUTE_REQUEST_COUNT.getName()).increment();

        if (adCircuitBreakerService.isOpen()) {
            listener.onFailure(new LimitExceededException(adID, CommonErrorMessages.MEMORY_CIRCUIT_BROKEN_ERR_MSG, false));
        } else {
            try {
                stateManager.getAnomalyDetector(adID, onGetDetector(listener, adID, request));
            } catch (Exception ex) {
                handleExecuteException(ex, listener, adID);
            }
        }
    } catch (Exception e) {
        LOG.error(e);
        listener.onFailure(e);
    }
}
Use of org.opensearch.ad.common.exception.LimitExceededException in project anomaly-detection by opensearch-project.
Class AnomalyResultTransportAction, method coldStartIfNoModel.
/**
 * Inspects a failure reported by the RCF or threshold models and, when the failure is a
 * {@code ResourceNotFoundException} (no model), triggers a cold start.
 *
 * <p>Outcomes:
 * <ul>
 * <li>No failure recorded: returns {@code null}.</li>
 * <li>Failure is not a {@code ResourceNotFoundException} (for example a
 * LimitExceededException): the failure is returned unchanged and no cold start happens.</li>
 * <li>A previous cold-start exception is stored for this detector and it is an
 * {@code EndRunException} with {@code isEndNow()} true: that exception is returned and no
 * new cold start is triggered.</li>
 * <li>Otherwise a cold start is triggered; the stored previous exception is returned if there
 * was one, else a new {@code InternalFailure} carrying {@code NO_MODEL_ERR_MSG}.</li>
 * </ul>
 *
 * <p>Exceptions that may be returned, per the original notes on this method:
 * <ol>
 * <li>From cold start:
 * <ol>
 * <li>InternalFailure due to an OpenSearchTimeoutException thrown by putModelCheckpoint
 * during cold start</li>
 * <li>EndRunException with endNow equal to false: training data not available, cold start
 * cannot succeed, or invalid training data</li>
 * <li>EndRunException with endNow equal to true: invalid search query</li>
 * </ol>
 * </li>
 * <li>LimitExceededException from one of the RCF model nodes when the total size of the models
 * is more than X% of heap memory</li>
 * <li>InternalFailure wrapping an OpenSearchTimeoutException caused by an RCF/threshold model
 * node failing to get a checkpoint to restore the model before timeout</li>
 * </ol>
 *
 * @param failure  holder that may contain an exception thrown earlier
 * @param detector detector object
 * @return the exception the caller should report, or {@code null} if {@code failure} is empty
 * @throws Exception declared by the signature; this method itself only returns exceptions, so
 *                   anything thrown comes from the calls it makes
 */
private Exception coldStartIfNoModel(AtomicReference<Exception> failure, AnomalyDetector detector) throws Exception {
    Exception reported = failure.get();
    // Covers both "nothing failed" (null is never an instance of anything) and failures
    // unrelated to a missing model, e.g. LimitExceededException: hand them back untouched.
    if (!(reported instanceof ResourceNotFoundException)) {
        return reported;
    }

    // Fetching the exception left behind by the previous cold start also clears it.
    String adID = detector.getDetectorId();
    final Optional<Exception> previousException = stateManager.fetchExceptionAndClear(adID);
    if (previousException.isPresent()) {
        Exception prior = previousException.get();
        LOG.error("Previous exception of {}: {}", () -> adID, () -> prior);
        boolean mustStopNow = prior instanceof EndRunException && ((EndRunException) prior).isEndNow();
        if (mustStopNow) {
            return prior;
        }
    }

    LOG.info("Trigger cold start for {}", detector.getDetectorId());
    coldStart(detector);

    Exception noModel = new InternalFailure(adID, NO_MODEL_ERR_MSG);
    return previousException.orElse(noModel);
}
Use of org.opensearch.ad.common.exception.LimitExceededException in project anomaly-detection by opensearch-project.
Class EntityResultTransportAction, method doExecute.
/**
 * Handles an entity-result request.
 *
 * <p>When the circuit breaker is open, a memory-release task is submitted to the AD thread pool
 * and the request fails with a {@code LimitExceededException}. Otherwise any exception stored
 * for the detector is fetched (and cleared): an {@code EndRunException} with
 * {@code isEndNow()} true fails the request immediately; any other stored exception is attached
 * to the listener through {@code ExceptionUtil.wrapListener}. Processing then continues in
 * {@code onGetDetector}.
 *
 * @param task     the transport task (not used directly in this method)
 * @param request  the entity result request
 * @param listener receives the acknowledgement or the failure
 */
@Override
protected void doExecute(Task task, EntityResultRequest request, ActionListener<AcknowledgedResponse> listener) {
    if (adCircuitBreakerService.isOpen()) {
        // Free memory on the AD pool rather than on the calling thread.
        threadPool
            .executor(AnomalyDetectorPlugin.AD_THREAD_POOL_NAME)
            .execute(() -> cache.get().releaseMemoryForOpenCircuitBreaker());
        listener.onFailure(
            new LimitExceededException(request.getDetectorId(), CommonErrorMessages.MEMORY_CIRCUIT_BROKEN_ERR_MSG, false)
        );
        return;
    }

    try {
        String detectorId = request.getDetectorId();
        Optional<Exception> previousException = stateManager.fetchExceptionAndClear(detectorId);

        if (previousException.isPresent()) {
            Exception prior = previousException.get();
            LOG.error("Previous exception of {}: {}", detectorId, prior);

            // An end-now failure short-circuits the request entirely.
            if (prior instanceof EndRunException && ((EndRunException) prior).isEndNow()) {
                listener.onFailure(prior);
                return;
            }

            listener = ExceptionUtil.wrapListener(listener, prior, detectorId);
        }

        stateManager.getAnomalyDetector(detectorId, onGetDetector(listener, detectorId, request, previousException));
    } catch (Exception exception) {
        LOG.error("fail to get entity's anomaly grade", exception);
        listener.onFailure(exception);
    }
}
Use of org.opensearch.ad.common.exception.LimitExceededException in project anomaly-detection by opensearch-project.
Class RCFResultTransportAction, method doExecute.
/**
 * Serves an RCF result request.
 *
 * <p>The request fails up front when the circuit breaker is open
 * ({@code LimitExceededException}) or when the sender's address cannot be resolved to a node
 * in the hash ring ({@code ConnectException}). Otherwise the model manager is asked for the
 * result, which is converted into an {@code RCFResultResponse} that also carries the AD
 * version looked up for the remote node.
 *
 * @param task     the transport task (not used directly in this method)
 * @param request  the RCF result request
 * @param listener receives the RCF response or the failure
 */
@Override
protected void doExecute(Task task, RCFResultRequest request, ActionListener<RCFResultResponse> listener) {
    if (adCircuitBreakerService.isOpen()) {
        listener.onFailure(new LimitExceededException(request.getAdID(), CommonErrorMessages.MEMORY_CIRCUIT_BROKEN_ERR_MSG));
        return;
    }

    Optional<DiscoveryNode> remoteNode = hashRing.getNodeByAddress(request.remoteAddress());
    if (!remoteNode.isPresent()) {
        listener.onFailure(new ConnectException("Can't find remote node by address"));
        return;
    }

    // Resolved before the try block, so a failure here is not routed to the listener.
    Version remoteAdVersion = hashRing.getAdVersion(remoteNode.get().getId());

    try {
        LOG.info("Serve rcf request for {}", request.getModelID());
        manager.getTRcfResult(
            request.getAdID(),
            request.getModelID(),
            request.getFeatures(),
            ActionListener.wrap(
                scored -> listener.onResponse(
                    new RCFResultResponse(
                        scored.getRcfScore(),
                        scored.getConfidence(),
                        scored.getForestSize(),
                        scored.getRelevantAttribution(),
                        scored.getTotalUpdates(),
                        scored.getGrade(),
                        remoteAdVersion,
                        scored.getRelativeIndex(),
                        scored.getPastValues(),
                        scored.getExpectedValuesList(),
                        scored.getLikelihoodOfValues(),
                        scored.getThreshold()
                    )
                ),
                cause -> {
                    LOG.warn(cause);
                    listener.onFailure(cause);
                }
            )
        );
    } catch (Exception e) {
        LOG.error(e);
        listener.onFailure(e);
    }
}
Use of org.opensearch.ad.common.exception.LimitExceededException in project anomaly-detection by opensearch-project.
Class ADTaskManager, method setHCDetectorTaskDone.
/**
 * Records the final state of an HC detector-level task once all entity tasks are done.
 *
 * <p>The resulting state can be FINISHED, FAILED or STOPPED:
 * <ol>
 * <li>If {@code state} is FINISHED, the number of FINISHED entity tasks is counted first. With
 * zero finished entity tasks the detector-level task is set to FAILED; otherwise it is set to
 * FINISHED. If the count itself fails, the task is set to FAILED with the error message.</li>
 * <li>If {@code state} is anything else, the detector-level task takes that state together
 * with the error from {@code adTask}.</li>
 * </ol>
 *
 * <p>The task update is submitted to the AD batch task thread pool, and the historical task
 * cache for the detector is removed when the update completes, successfully or not. The
 * supplied {@code listener} is answered with an OK response as soon as the work has been
 * scheduled; it does not wait for the update.
 *
 * @param adTask   AD task; for an entity task, its parent task id is the one updated
 * @param state    AD task state
 * @param listener action listener
 */
public void setHCDetectorTaskDone(ADTask adTask, ADTaskState state, ActionListener<AnomalyDetectorJobResponse> listener) {
    String detectorId = adTask.getDetectorId();
    final String taskId;
    if (adTask.isEntityTask()) {
        taskId = adTask.getParentTaskId();
    } else {
        taskId = adTask.getTaskId();
    }
    String detectorTaskId = adTask.getDetectorLevelTaskId();

    ActionListener<UpdateResponse> onTaskUpdated = ActionListener.wrap(response -> {
        logger.info("Historical HC detector done with state: {}. Remove from cache, detector id:{}", state.name(), detectorId);
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    }, e -> {
        // Will reset task state when get detector with task or maintain tasks in hourly cron.
        boolean alreadyUpdating = e instanceof LimitExceededException && e.getMessage().contains(HC_DETECTOR_TASK_IS_UPDATING);
        if (alreadyUpdating) {
            logger.warn("HC task is updating, skip this update for task: " + taskId);
        } else {
            logger.error("Failed to update task: " + taskId, e);
        }
        adTaskCacheManager.removeHistoricalTaskCache(detectorId);
    });

    // wait for 2 seconds to acquire updating HC detector task semaphore
    long semaphoreWaitMillis = 2000;

    if (state != ADTaskState.FINISHED) {
        // Non-FINISHED states are propagated as-is, along with the task's error.
        threadPool
            .executor(AD_BATCH_TASK_THREAD_POOL_NAME)
            .execute(
                () -> updateADHCDetectorTask(
                    detectorId,
                    taskId,
                    ImmutableMap.of(
                        STATE_FIELD,
                        state.name(),
                        ERROR_FIELD,
                        adTask.getError(),
                        EXECUTION_END_TIME_FIELD,
                        Instant.now().toEpochMilli()
                    ),
                    semaphoreWaitMillis,
                    onTaskUpdated
                )
            );
    } else {
        this.countEntityTasksByState(detectorTaskId, ImmutableList.of(ADTaskState.FINISHED), ActionListener.wrap(finishedCount -> {
            logger.info("number of finished entity tasks: {}, for detector {}", finishedCount, adTask.getDetectorId());
            // Set task as FAILED if no finished entity task; otherwise set as FINISHED
            ADTaskState hcDetectorTaskState = finishedCount == 0 ? ADTaskState.FAILED : ADTaskState.FINISHED;
            // execute in AD batch task thread pool in case waiting for semaphore waste any shared OpenSearch thread pool
            threadPool
                .executor(AD_BATCH_TASK_THREAD_POOL_NAME)
                .execute(
                    () -> updateADHCDetectorTask(
                        detectorId,
                        taskId,
                        ImmutableMap.of(
                            STATE_FIELD,
                            hcDetectorTaskState.name(),
                            TASK_PROGRESS_FIELD,
                            1.0,
                            EXECUTION_END_TIME_FIELD,
                            Instant.now().toEpochMilli()
                        ),
                        semaphoreWaitMillis,
                        onTaskUpdated
                    )
                );
        }, countFailure -> {
            logger.error("Failed to get finished entity tasks", countFailure);
            String errorMessage = getErrorMessage(countFailure);
            // set as FAILED if fail to get finished entity tasks.
            threadPool
                .executor(AD_BATCH_TASK_THREAD_POOL_NAME)
                .execute(
                    () -> updateADHCDetectorTask(
                        detectorId,
                        taskId,
                        ImmutableMap.of(
                            STATE_FIELD,
                            ADTaskState.FAILED.name(),
                            TASK_PROGRESS_FIELD,
                            1.0,
                            ERROR_FIELD,
                            errorMessage,
                            EXECUTION_END_TIME_FIELD,
                            Instant.now().toEpochMilli()
                        ),
                        semaphoreWaitMillis,
                        onTaskUpdated
                    )
                );
        }));
    }

    // Answer the caller right away; the update above completes asynchronously.
    listener.onResponse(new AnomalyDetectorJobResponse(taskId, 0, 0, 0, RestStatus.OK));
}
Aggregations