use of org.opensearch.ad.common.exception.AnomalyDetectionException in project anomaly-detection by opensearch-project.
the class EntityColdStarter method coldStart.
/**
 * Training model for an entity.
 * @param modelId model id corresponding to the entity
 * @param entity the entity's information
 * @param detectorId the detector id corresponding to the entity
 * @param modelState model state associated with the entity
 * @param detector the anomaly detector configuration
 * @param listener callback to invoke after cold start finishes
 */
private void coldStart(
    String modelId,
    Entity entity,
    String detectorId,
    ModelState<EntityModel> modelState,
    AnomalyDetector detector,
    ActionListener<Void> listener
) {
    logger.debug("Trigger cold start for {}", modelId);

    if (lastThrottledColdStartTime.plus(Duration.ofMinutes(coolDownMinutes)).isAfter(clock.instant())) {
        listener.onResponse(null);
        return;
    }

    boolean earlyExit = true;
    try {
        DoorKeeper doorKeeper = doorKeepers.computeIfAbsent(detectorId, id -> {
            // reset every 60 intervals
            return new DoorKeeper(
                AnomalyDetectorSettings.DOOR_KEEPER_FOR_COLD_STARTER_MAX_INSERTION,
                AnomalyDetectorSettings.DOOR_KEEPER_FAULSE_POSITIVE_RATE,
                detector.getDetectionIntervalDuration().multipliedBy(AnomalyDetectorSettings.DOOR_KEEPER_MAINTENANCE_FREQ),
                clock
            );
        });

        // won't retry cold start within 60 intervals for an entity
        if (doorKeeper.mightContain(modelId)) {
            return;
        }

        doorKeeper.put(modelId);

        ActionListener<Optional<List<double[][]>>> coldStartCallBack = ActionListener.wrap(trainingData -> {
            try {
                if (trainingData.isPresent()) {
                    List<double[][]> dataPoints = trainingData.get();
                    combineTrainSamples(dataPoints, modelId, modelState);
                    Queue<double[]> samples = modelState.getModel().getSamples();
                    // only train models if we have enough samples
                    if (samples.size() >= numMinSamples) {
                        // trainModelFromDataSegments saves the trained model itself; it is called from
                        // multiple places, so saving inside it keeps callers from forgetting to do so
                        trainModelFromDataSegments(samples, entity, modelState, detector.getShingleSize());
                        logger.info("Succeeded in training entity: {}", modelId);
                    } else {
                        // not enough samples yet; save what we have to the checkpoint
                        checkpointWriteQueue.write(modelState, true, RequestPriority.MEDIUM);
                        logger.info("Not enough data to train entity: {}, currently we have {}", modelId, samples.size());
                    }
                } else {
                    logger.info("Cannot get training data for {}", modelId);
                }
                listener.onResponse(null);
            } catch (Exception e) {
                listener.onFailure(e);
            }
        }, exception -> {
            try {
                logger.error(new ParameterizedMessage("Error while cold start {}", modelId), exception);
                Throwable cause = Throwables.getRootCause(exception);
                if (ExceptionUtil.isOverloaded(cause)) {
                    logger.error("too many requests");
                    lastThrottledColdStartTime = Instant.now();
                } else if (cause instanceof AnomalyDetectionException || exception instanceof AnomalyDetectionException) {
                    // e.g., cannot find the anomaly detector
                    nodeStateManager.setException(detectorId, exception);
                } else {
                    nodeStateManager.setException(detectorId, new AnomalyDetectionException(detectorId, cause));
                }
                listener.onFailure(exception);
            } catch (Exception e) {
                listener.onFailure(e);
            }
        });

        threadPool
            .executor(AnomalyDetectorPlugin.AD_THREAD_POOL_NAME)
            .execute(
                () -> getEntityColdStartData(
                    detectorId,
                    entity,
                    new ThreadedActionListener<>(logger, threadPool, AnomalyDetectorPlugin.AD_THREAD_POOL_NAME, coldStartCallBack, false)
                )
            );
        earlyExit = false;
    } finally {
        if (earlyExit) {
            listener.onResponse(null);
        }
    }
}
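DoorKeeper itself is not shown on this page. As a rough sketch of the gating pattern coldStart relies on (skip entities whose cold start was already attempted, and forget everything after a maintenance window), here is a minimal, hypothetical equivalent assuming a Guava BloomFilter underneath; the class and member names below are illustrative, not the project's actual DoorKeeper API.

import java.nio.charset.StandardCharsets;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

// Hypothetical, simplified stand-in for DoorKeeper: remembers which model ids
// already attempted cold start and resets itself after a maintenance window.
class ColdStartGate {
    private final long expectedInsertions;
    private final double falsePositiveRate;
    private final Duration resetAfter;
    private final Clock clock;
    private BloomFilter<CharSequence> seen;
    private Instant lastReset;

    ColdStartGate(long expectedInsertions, double falsePositiveRate, Duration resetAfter, Clock clock) {
        this.expectedInsertions = expectedInsertions;
        this.falsePositiveRate = falsePositiveRate;
        this.resetAfter = resetAfter;
        this.clock = clock;
        this.lastReset = clock.instant();
        this.seen = newFilter();
    }

    private BloomFilter<CharSequence> newFilter() {
        return BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), expectedInsertions, falsePositiveRate);
    }

    // true if this model id has (probably) already been gated in the current window
    boolean shouldSkip(String modelId) {
        maybeReset();
        return seen.mightContain(modelId);
    }

    // remember that cold start was attempted for this model id
    void record(String modelId) {
        maybeReset();
        seen.put(modelId);
    }

    // drop all remembered ids once the reset window has elapsed
    private void maybeReset() {
        if (lastReset.plus(resetAfter).isBefore(clock.instant())) {
            seen = newFilter();
            lastReset = clock.instant();
        }
    }
}

With such a gate, the shouldSkip/record pair plays the same role as the doorKeeper.mightContain/doorKeeper.put calls above: the Bloom filter keeps memory bounded at the price of occasional false positives, which here only means an entity may wait one extra window before retrying cold start.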
use of org.opensearch.ad.common.exception.AnomalyDetectionException in project anomaly-detection by opensearch-project.
the class PriorityCache method maintenance.
/**
 * Maintain the active entity cache and door keepers.
 *
 * inActiveEntities is a Guava LRU cache. The data structure itself evicts
 * items once they have been inactive for 3 days or its maximum size
 * (1 million entries) is reached.
 */
@Override
public void maintenance() {
    try {
        // clean up memory if we have allocated more than we should
        tryClearUpMemory();

        activeEnities.entrySet().stream().forEach(cacheBufferEntry -> {
            String detectorId = cacheBufferEntry.getKey();
            CacheBuffer cacheBuffer = cacheBufferEntry.getValue();
            // remove expired cache buffer
            if (cacheBuffer.expired(modelTtl)) {
                activeEnities.remove(detectorId);
                cacheBuffer.clear();
            } else {
                List<ModelState<EntityModel>> removedStates = cacheBuffer.maintenance();
                for (ModelState<EntityModel> state : removedStates) {
                    addIntoInactiveCache(state);
                }
            }
        });

        maintainInactiveCache();

        doorKeepers.entrySet().stream().forEach(doorKeeperEntry -> {
            String detectorId = doorKeeperEntry.getKey();
            DoorKeeper doorKeeper = doorKeeperEntry.getValue();
            // each doorKeeper has its own state ttl
            if (doorKeeper.expired(null)) {
                doorKeepers.remove(detectorId);
            } else {
                doorKeeper.maintenance();
            }
        });
    } catch (Exception e) {
        // will be thrown to ES's transport broadcast handler
        throw new AnomalyDetectionException("Fail to maintain cache", e);
    }
}
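The javadoc above describes inActiveEntities as a Guava LRU cache that evicts entries after 3 days of inactivity or once it holds 1 million entries. A minimal sketch of building such a cache with Guava's CacheBuilder follows; the variable names and value type are illustrative, not the project's actual field definitions.

import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

public class InactiveEntityCacheSketch {
    public static void main(String[] args) {
        // evict after 3 days without access, or once the cache exceeds 1 million entries
        Cache<String, Long> inActiveEntities = CacheBuilder
            .newBuilder()
            .expireAfterAccess(3, TimeUnit.DAYS)
            .maximumSize(1_000_000)
            .build();

        inActiveEntities.put("detectorId-entityModelId", System.currentTimeMillis());

        // getIfPresent counts as an access and pushes back the 3-day expiry for this key
        Long lastSeen = inActiveEntities.getIfPresent("detectorId-entityModelId");
        System.out.println(lastSeen);
    }
}

This matches the javadoc's note that the data structure itself handles eviction; maintenance() only has to move removed states into it via addIntoInactiveCache.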
use of org.opensearch.ad.common.exception.AnomalyDetectionException in project anomaly-detection by opensearch-project.
the class MultiEntityResultHandlerTests method testIndexWriteBlock.
@Test
public void testIndexWriteBlock() throws InterruptedException {
    setWriteBlockAdResultIndex(true);

    CountDownLatch verified = new CountDownLatch(1);
    handler.flush(request, ActionListener.wrap(response -> {
        assertTrue("Should not reach here", false);
        verified.countDown();
    }, exception -> {
        assertTrue(exception instanceof AnomalyDetectionException);
        assertTrue(
            "actual: " + exception.getMessage(),
            exception.getMessage().contains(MultiEntityResultHandler.CANNOT_SAVE_RESULT_ERR_MSG)
        );
        verified.countDown();
    }));

    assertTrue(verified.await(100, TimeUnit.SECONDS));
}
use of org.opensearch.ad.common.exception.AnomalyDetectionException in project anomaly-detection by opensearch-project.
the class AnomalyResultTransportAction method doExecute.
/**
 * All exceptions thrown by AD are subclasses of AnomalyDetectionException.
 * ClientException is a subclass of AnomalyDetectionException, and all exceptions
 * visible to the client fall under ClientException. Two classes directly extend
 * ClientException:
 * - InternalFailure for "root cause unknown failure. Maybe transient." We can keep
 *   the detector running.
 * - EndRunException for "failures that might impact the customer." The method endNow()
 *   indicates whether the client should immediately terminate running the detector.
 *   + endNow() returns true for an "unrecoverable issue." We want to terminate the
 *     detector run immediately.
 *   + endNow() returns false for an "issue that may be unrecoverable but is worth
 *     retrying a few more times." We want to wait for a few more requests before
 *     terminating the detector run.
 *
 * AD may not be able to compute an anomaly grade but can still find a feature vector.
 * Consider the case when the shingle is not ready: AD puts NaN as the anomaly grade
 * and returns the feature vector. If AD cannot even find a feature vector, it throws
 * EndRunException if there is an issue; otherwise it returns an empty response (all
 * numeric fields are Double.NaN and the feature array is empty) so that customers can
 * still write painless scripts against the result.
 *
 * Known causes of EndRunException with endNow returning false:
 * + training data for cold start not available
 * + cold start cannot succeed
 * + unknown prediction error
 * + memory circuit breaker tripped
 * + invalid search query
 *
 * Known causes of EndRunException with endNow returning true:
 * + a model partition's memory size reached its limit
 * + the models' total memory size reached its limit
 * + trouble querying feature data because
 *   * the index does not exist
 *   * all features have been disabled
 * + the anomaly detector is not available
 * + the AD plugin is disabled
 * + the training data is invalid due to serious internal bug(s)
 *
 * Known causes of InternalFailure:
 * + the threshold model node is not available
 * + cluster read/write is blocked
 * + cold start has not finished yet
 * + failure to get all of the RCF model nodes' responses
 * + failure to get the threshold model node's response
 * + an RCF/threshold model node failing to restore its model from a checkpoint before timeout
 * + detection is throttled because the previous detection query is still running
 */
@Override
protected void doExecute(Task task, ActionRequest actionRequest, ActionListener<AnomalyResultResponse> listener) {
    try (ThreadContext.StoredContext context = client.threadPool().getThreadContext().stashContext()) {
        AnomalyResultRequest request = AnomalyResultRequest.fromActionRequest(actionRequest);
        String adID = request.getAdID();

        ActionListener<AnomalyResultResponse> original = listener;
        listener = ActionListener.wrap(r -> {
            hcDetectors.remove(adID);
            original.onResponse(r);
        }, e -> {
            // count the failure unless it is an AnomalyDetectionException that opts out of stats
            if (!(e instanceof AnomalyDetectionException) || ((AnomalyDetectionException) e).isCountedInStats()) {
                adStats.getStat(StatNames.AD_EXECUTE_FAIL_COUNT.getName()).increment();
                if (hcDetectors.contains(adID)) {
                    adStats.getStat(StatNames.AD_HC_EXECUTE_FAIL_COUNT.getName()).increment();
                }
            }
            hcDetectors.remove(adID);
            original.onFailure(e);
        });

        if (!EnabledSetting.isADPluginEnabled()) {
            throw new EndRunException(adID, CommonErrorMessages.DISABLED_ERR_MSG, true).countedInStats(false);
        }

        adStats.getStat(StatNames.AD_EXECUTE_REQUEST_COUNT.getName()).increment();

        if (adCircuitBreakerService.isOpen()) {
            listener.onFailure(new LimitExceededException(adID, CommonErrorMessages.MEMORY_CIRCUIT_BROKEN_ERR_MSG, false));
            return;
        }

        try {
            stateManager.getAnomalyDetector(adID, onGetDetector(listener, adID, request));
        } catch (Exception ex) {
            handleExecuteException(ex, listener, adID);
        }
    } catch (Exception e) {
        LOG.error(e);
        listener.onFailure(e);
    }
}
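As a rough illustration of the exception contract described in the javadoc above, the hypothetical helper below shows how a caller might branch on the hierarchy; the class name, method name, and log strings are placeholders, not code from the project.

import org.opensearch.ad.common.exception.AnomalyDetectionException;
import org.opensearch.ad.common.exception.EndRunException;
import org.opensearch.ad.common.exception.InternalFailure;

// Hypothetical triage of AD failures following the documented hierarchy:
// EndRunException (endNow true/false) and InternalFailure under ClientException.
final class AdFailureTriage {
    static void triage(String detectorId, Exception e) {
        if (e instanceof EndRunException) {
            EndRunException endRun = (EndRunException) e;
            if (endRun.isEndNow()) {
                // unrecoverable issue: stop running the detector immediately
                System.out.println("stopping detector " + detectorId);
            } else {
                // possibly unrecoverable but worth retrying: allow a few more runs first
                System.out.println("will retry detector " + detectorId);
            }
        } else if (e instanceof InternalFailure) {
            // root cause unknown, maybe transient: keep the detector running
            System.out.println("transient failure for detector " + detectorId);
        } else if (e instanceof AnomalyDetectionException) {
            // other AD-specific failure
            System.out.println("AD failure for detector " + detectorId + ": " + e.getMessage());
        } else {
            // non-AD exception, e.g. a transport-layer error
            System.out.println("unexpected failure for detector " + detectorId);
        }
    }
}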
use of org.opensearch.ad.common.exception.AnomalyDetectionException in project anomaly-detection by opensearch-project.
the class AnomalyResultTransportAction method coldStartIfNoCheckPoint.
/**
 * Check whether a checkpoint exists for the detector. If not, and the previous
 * run did not fail with an EndRunException whose endNow is true, trigger cold start.
 * @param detector detector object
 * @return the previous cold start exception, if any
 */
private Optional<Exception> coldStartIfNoCheckPoint(AnomalyDetector detector) {
    String detectorId = detector.getDetectorId();

    Optional<Exception> previousException = stateManager.fetchExceptionAndClear(detectorId);
    if (previousException.isPresent()) {
        Exception exception = previousException.get();
        LOG.error(new ParameterizedMessage("Previous exception of {}:", detectorId), exception);
        if (exception instanceof EndRunException && ((EndRunException) exception).isEndNow()) {
            return previousException;
        }
    }

    stateManager.getDetectorCheckpoint(detectorId, ActionListener.wrap(checkpointExists -> {
        if (!checkpointExists) {
            LOG.info("Trigger cold start for {}", detectorId);
            coldStart(detector);
        }
    }, exception -> {
        Throwable cause = ExceptionsHelper.unwrapCause(exception);
        if (cause instanceof IndexNotFoundException) {
            LOG.info("Trigger cold start for {}", detectorId);
            coldStart(detector);
        } else {
            String errorMsg = String.format(Locale.ROOT, "Fail to get checkpoint state for %s", detectorId);
            LOG.error(errorMsg, exception);
            stateManager.setException(detectorId, new AnomalyDetectionException(errorMsg, exception));
        }
    }));

    return previousException;
}