use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class SearchFeatureDao method getColdStartSamplesForPeriods.
public void getColdStartSamplesForPeriods(
    AnomalyDetector detector,
    List<Entry<Long, Long>> ranges,
    Entity entity,
    boolean includesEmptyBucket,
    ActionListener<List<Optional<double[]>>> listener
) throws IOException {
SearchRequest request = createColdStartFeatureSearchRequest(detector, ranges, entity);
client.search(request, ActionListener.wrap(response -> {
Aggregations aggs = response.getAggregations();
if (aggs == null) {
listener.onResponse(Collections.emptyList());
return;
}
long docCountThreshold = includesEmptyBucket ? -1 : 0;
// Extract buckets and order by from_as_string. The default order is currently ascending, but it is safer not to assume it.
// Example responses from date range bucket aggregation:
// "aggregations":{"date_range":{"buckets":[{"key":"1598865166000-1598865226000","from":1.598865166E12,"
// from_as_string":"1598865166000","to":1.598865226E12,"to_as_string":"1598865226000","doc_count":3,
// "deny_max":{"value":154.0}},{"key":"1598869006000-1598869066000","from":1.598869006E12,
// "from_as_string":"1598869006000","to":1.598869066E12,"to_as_string":"1598869066000","doc_count":3,
// "deny_max":{"value":141.0}},
// We don't want to use the default 0 for sum/count aggregations as it might cause false positives during scoring.
// Terms aggregation only returns non-zero count values. If we feed in a lot of 0s during cold start,
// we will raise alarms very easily.
listener.onResponse(
    aggs.asList().stream()
        .filter(InternalDateRange.class::isInstance)
        .flatMap(agg -> ((InternalDateRange) agg).getBuckets().stream())
        .filter(bucket -> bucket.getFrom() != null && bucket.getFrom() instanceof ZonedDateTime)
        .filter(bucket -> bucket.getDocCount() > docCountThreshold)
        .sorted(Comparator.comparing((Bucket bucket) -> (ZonedDateTime) bucket.getFrom()))
        .map(bucket -> parseBucket(bucket, detector.getEnabledFeatureIds()))
        .collect(Collectors.toList()));
}, listener::onFailure));
}
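For context, here is a minimal sketch of how a caller might build the sample ranges and invoke getColdStartSamplesForPeriods. The helper name, the contiguous-range construction, and the parameter values are assumptions for illustration, not code from the repository; java.util, java.util.AbstractMap, and org.opensearch.action.ActionListener imports are assumed.
// Hypothetical caller sketch: build numSamples contiguous intervals ending at endMillis
// and hand them to getColdStartSamplesForPeriods. Not the plugin's actual cold-start code.
void fetchColdStartSamples(SearchFeatureDao searchFeatureDao, AnomalyDetector detector, Entity entity,
        long endMillis, long intervalMillis, int numSamples) throws IOException {
    List<Entry<Long, Long>> ranges = new ArrayList<>();
    for (int i = numSamples; i > 0; i--) {
        long from = endMillis - i * intervalMillis;
        ranges.add(new AbstractMap.SimpleImmutableEntry<>(from, from + intervalMillis));
    }
    // includesEmptyBucket = false drops buckets whose doc_count is 0, per the docCountThreshold logic above
    searchFeatureDao.getColdStartSamplesForPeriods(detector, ranges, entity, false, ActionListener.wrap(samples -> {
        long present = samples.stream().filter(Optional::isPresent).count();
        System.out.println("Retrieved " + present + " of " + samples.size() + " sample periods");
    }, e -> System.err.println("Cold start sample query failed: " + e)));
}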
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class SearchFeatureDao method getHighestCountEntities.
/**
* Get the list of entities with the highest doc counts, in descending order, within the specified time range
* @param detector detector config
* @param startTime start time of time range
* @param endTime end time of time range
* @param maxEntitiesSize max top entities
* @param minimumDocCount minimum doc count for top entities
* @param pageSize page size when querying a multi-category HC detector's top entities
* @param listener listener to return back the entities
*/
public void getHighestCountEntities(
    AnomalyDetector detector,
    long startTime,
    long endTime,
    int maxEntitiesSize,
    int minimumDocCount,
    int pageSize,
    ActionListener<List<Entity>> listener
) {
if (!detector.isMultientityDetector()) {
listener.onResponse(null);
return;
}
RangeQueryBuilder rangeQuery = new RangeQueryBuilder(detector.getTimeField()).from(startTime).to(endTime).format("epoch_millis").includeLower(true).includeUpper(false);
BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery().filter(rangeQuery).filter(detector.getFilterQuery());
AggregationBuilder bucketAggs = null;
if (detector.getCategoryField().size() == 1) {
bucketAggs = AggregationBuilders.terms(AGG_NAME_TOP).size(maxEntitiesSize).field(detector.getCategoryField().get(0));
} else {
/*
* We don't have an efficient solution for terms aggregation on multiple fields.
* Terms aggregation does not support collecting terms from multiple fields in the same document.
* We have to work around the limitation by using a script to retrieve terms from multiple fields.
* The workaround disables the global ordinals optimization and is therefore markedly slower,
* because the script consumes extra memory and has to iterate through all of the documents
* at least once to create runtime fields.
*
* We evaluated composite and terms aggregation using a generated data set with one
* million entities. Each entity has two documents. Composite aggregation finishes
* around 40 seconds. Terms aggregation performs differently on different clusters.
* On a 3 data node cluster, terms aggregation does not finish running within 2 hours
* on a 5 primary shard index. On a 15 data node cluster, terms aggregation needs 217 seconds
* on a 15 primary shard index. On a 30 data node cluster, terms aggregation needs 47 seconds
* on a 30 primary shard index.
*
* Here we work around the problem using composite aggregation. Composite aggregation cannot
* give top entities without collecting all aggregated results. Paginated results are returned
* in the natural order of composite keys. This is fine for the Preview API, which needs the
* top entities to make sure there is enough data for training and for showing results. We
* can paginate entities and filter out entities that do not have enough docs (e.g., 256 docs).
* As long as we have collected the desired number of entities (e.g., 5 entities), we can stop
* pagination.
*
* Example composite query:
* {
* "size": 0,
* "query": {
* "bool": {
* "filter": [{
* "range": {
* "@timestamp": {
* "from": 1626118340000,
* "to": 1626294912000,
* "include_lower": true,
* "include_upper": false,
* "format": "epoch_millis",
* "boost": 1.0
* }
* }
* }, {
* "match_all": {
* "boost": 1.0
* }
* }],
* "adjust_pure_negative": true,
* "boost": 1.0
* }
* },
* "track_total_hits": -1,
* "aggregations": {
* "top_agg": {
* "composite": {
* "size": 1,
* "sources": [{
* "service": {
* "terms": {
* "field": "service",
* "missing_bucket": false,
* "order": "asc"
* }
* }
* }, {
* "host": {
* "terms": {
* "field": "host",
* "missing_bucket": false,
* "order": "asc"
* }
* }
* }]
* },
* "aggregations": {
* "bucketSort": {
* "bucket_sort": {
* "sort": [{
* "_count": {
* "order": "desc"
* }
* }],
* "from": 0,
* "size": 5,
* "gap_policy": "SKIP"
* }
* }
* }
* }
* }
* }
*
*/
bucketAggs = AggregationBuilders
    .composite(
        AGG_NAME_TOP,
        detector.getCategoryField().stream().map(f -> new TermsValuesSourceBuilder(f).field(f)).collect(Collectors.toList())
    )
    .size(pageSize)
    .subAggregation(
        PipelineAggregatorBuilders
            .bucketSort("bucketSort", Arrays.asList(new FieldSortBuilder("_count").order(SortOrder.DESC)))
            .size(maxEntitiesSize)
    );
}
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().query(boolQueryBuilder).aggregation(bucketAggs).trackTotalHits(false).size(0);
SearchRequest searchRequest = new SearchRequest().indices(detector.getIndices().toArray(new String[0])).source(searchSourceBuilder);
// TODO: tune timeout for historical analysis based on performance test result
client.search(searchRequest, new TopEntitiesListener(listener, detector, searchSourceBuilder,
    clock.millis() + previewTimeoutInMilliseconds, maxEntitiesSize, minimumDocCount));
}
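As a usage illustration, the sketch below calls getHighestCountEntities for a one-hour preview window. The wrapper method, the window length, and the maxEntitiesSize/minimumDocCount/pageSize values are assumptions, not values taken from the plugin.
// Hypothetical preview-style caller; constants are illustrative only.
void previewTopEntities(SearchFeatureDao searchFeatureDao, AnomalyDetector detector, Clock clock) {
    long endMillis = clock.millis();
    long startMillis = endMillis - Duration.ofHours(1).toMillis();
    // maxEntitiesSize = 5, minimumDocCount = 256, pageSize = 1000 are example values
    searchFeatureDao.getHighestCountEntities(detector, startMillis, endMillis, 5, 256, 1000, ActionListener.wrap(entities -> {
        if (entities == null) {
            return; // detectors without category fields short-circuit with a null response
        }
        entities.forEach(entity -> System.out.println("top entity: " + entity));
    }, e -> System.err.println("top entity query failed: " + e)));
}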
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class EntityProfileRunner method validateEntity.
/**
* Verify whether the input entity exists, to guard against typos.
*
* If a user deletes the entity after the job starts, we will not be able to
* find this entity in the index. In that case, we will not return a profile
* for this entity even if its model is running on some data node. The entity's model
* will be deleted by another entity or by maintenance due to long inactivity.
*
* @param entity Entity accessor
* @param categoryFields category fields defined for a detector
* @param detectorId Detector Id
* @param profilesToCollect Profiles to collect for the input entity
* @param detector Detector config accessor
* @param listener Callback to send responses.
*/
private void validateEntity(
    Entity entity,
    List<String> categoryFields,
    String detectorId,
    Set<EntityProfileName> profilesToCollect,
    AnomalyDetector detector,
    ActionListener<EntityProfile> listener
) {
Map<String, String> attributes = entity.getAttributes();
if (attributes == null || attributes.size() != categoryFields.size()) {
listener.onFailure(new IllegalArgumentException(EMPTY_ENTITY_ATTRIBUTES));
return;
}
for (String field : categoryFields) {
if (false == attributes.containsKey(field)) {
listener.onFailure(new IllegalArgumentException("Cannot find " + field));
return;
}
}
BoolQueryBuilder internalFilterQuery = QueryBuilders.boolQuery().filter(detector.getFilterQuery());
for (TermQueryBuilder term : entity.getTermQueryBuilders()) {
internalFilterQuery.filter(term);
}
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder().query(internalFilterQuery).size(1);
SearchRequest searchRequest = new SearchRequest(detector.getIndices().toArray(new String[0]), searchSourceBuilder).preference(Preference.LOCAL.toString());
client.search(searchRequest, ActionListener.wrap(searchResponse -> {
try {
if (searchResponse.getHits().getHits().length == 0) {
listener.onFailure(new IllegalArgumentException(NO_ENTITY));
return;
}
prepareEntityProfile(listener, detectorId, entity, profilesToCollect, detector, categoryFields.get(0));
} catch (Exception e) {
listener.onFailure(new IllegalArgumentException(NO_ENTITY));
return;
}
}, e -> listener.onFailure(new IllegalArgumentException(NO_ENTITY))));
}
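To make the filter construction above concrete, here is a minimal sketch of what the per-attribute filtering amounts to: one term query per category attribute layered on top of the detector's filter query. The helper is hypothetical and assumes Entity.getTermQueryBuilders yields one TermQueryBuilder per attribute key/value pair; it is not the Entity class's actual implementation.
// Illustrative only: a conceptually equivalent filter built directly from the entity's attributes.
static BoolQueryBuilder entityFilter(AnomalyDetector detector, Map<String, String> attributes) {
    BoolQueryBuilder filter = QueryBuilders.boolQuery().filter(detector.getFilterQuery());
    // one term query per category field, mirroring entity.getTermQueryBuilders()
    attributes.forEach((field, value) -> filter.filter(new TermQueryBuilder(field, value)));
    return filter;
}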
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class EntityProfileRunner method prepareEntityProfile.
private void prepareEntityProfile(
    ActionListener<EntityProfile> listener,
    String detectorId,
    Entity entityValue,
    Set<EntityProfileName> profilesToCollect,
    AnomalyDetector detector,
    String categoryField
) {
EntityProfileRequest request = new EntityProfileRequest(detectorId, entityValue, profilesToCollect);
client.execute(EntityProfileAction.INSTANCE, request, ActionListener.wrap(r -> getJob(detectorId, entityValue, profilesToCollect, detector, r, listener), listener::onFailure));
}
use of org.opensearch.ad.model.Entity in project anomaly-detection by opensearch-project.
the class ADTaskManagerTests method testGetLocalADTaskProfilesByDetectorId.
public void testGetLocalADTaskProfilesByDetectorId() {
doReturn(node1).when(clusterService).localNode();
when(adTaskCacheManager.isHCTaskRunning(anyString())).thenReturn(true);
when(adTaskCacheManager.isHCTaskCoordinatingNode(anyString())).thenReturn(true);
List<String> tasksOfDetector = ImmutableList.of(randomAlphaOfLength(5));
when(adTaskCacheManager.getTasksOfDetector(anyString())).thenReturn(tasksOfDetector);
Deque<Map.Entry<Long, Optional<double[]>>> shingle = new LinkedBlockingDeque<>();
when(adTaskCacheManager.getShingle(anyString())).thenReturn(shingle);
ThresholdedRandomCutForest trcf = mock(ThresholdedRandomCutForest.class);
when(adTaskCacheManager.getTRcfModel(anyString())).thenReturn(trcf);
RandomCutForest rcf = mock(RandomCutForest.class);
when(trcf.getForest()).thenReturn(rcf);
when(rcf.getTotalUpdates()).thenReturn(randomLongBetween(100, 1000));
when(adTaskCacheManager.isThresholdModelTrained(anyString())).thenReturn(true);
when(adTaskCacheManager.getThresholdModelTrainingDataSize(anyString())).thenReturn(randomIntBetween(100, 1000));
when(adTaskCacheManager.getModelSize(anyString())).thenReturn(randomLongBetween(100, 1000));
Entity entity = createSingleAttributeEntity(randomAlphaOfLength(5), randomAlphaOfLength(5));
when(adTaskCacheManager.getEntity(anyString())).thenReturn(entity);
String detectorId = randomAlphaOfLength(5);
ExecutorService executeService = mock(ExecutorService.class);
when(threadPool.executor(anyString())).thenReturn(executeService);
doAnswer(invocation -> {
Runnable runnable = invocation.getArgument(0);
runnable.run();
return null;
}).when(executeService).execute(any());
ADTaskProfile taskProfile = adTaskManager.getLocalADTaskProfilesByDetectorId(detectorId);
assertEquals(1, taskProfile.getEntityTaskProfiles().size());
verify(adTaskCacheManager, times(1)).cleanExpiredHCBatchTaskRunStates();
}