Search in sources:

Example 1 with SignificanceHeuristic

use of org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic in project elasticsearch by elastic.

the class SignificantTermsTests method createTestAggregatorBuilder.

@Override
protected SignificantTermsAggregationBuilder createTestAggregatorBuilder() {
    String name = randomAsciiOfLengthBetween(3, 20);
    SignificantTermsAggregationBuilder factory = new SignificantTermsAggregationBuilder(name, null);
    String field = randomAsciiOfLengthBetween(3, 20);
    int randomFieldBranch = randomInt(2);
    switch(randomFieldBranch) {
        case 0:
            factory.field(field);
            break;
        case 1:
            factory.field(field);
            factory.script(new Script("_value + 1"));
            break;
        case 2:
            factory.script(new Script("doc[" + field + "] + 1"));
            break;
        default:
            fail();
    }
    if (randomBoolean()) {
        factory.missing("MISSING");
    }
    if (randomBoolean()) {
        factory.bucketCountThresholds().setRequiredSize(randomIntBetween(1, Integer.MAX_VALUE));
    }
    if (randomBoolean()) {
        factory.bucketCountThresholds().setShardSize(randomIntBetween(1, Integer.MAX_VALUE));
    }
    if (randomBoolean()) {
        int minDocCount = randomInt(4);
        switch(minDocCount) {
            case 0:
                break;
            case 1:
            case 2:
            case 3:
            case 4:
                minDocCount = randomIntBetween(0, Integer.MAX_VALUE);
                break;
        }
        factory.bucketCountThresholds().setMinDocCount(minDocCount);
    }
    if (randomBoolean()) {
        int shardMinDocCount = randomInt(4);
        switch(shardMinDocCount) {
            case 0:
                break;
            case 1:
            case 2:
            case 3:
            case 4:
                shardMinDocCount = randomIntBetween(0, Integer.MAX_VALUE);
                break;
            default:
                fail();
        }
        factory.bucketCountThresholds().setShardMinDocCount(shardMinDocCount);
    }
    if (randomBoolean()) {
        factory.executionHint(randomFrom(executionHints));
    }
    if (randomBoolean()) {
        factory.format("###.##");
    }
    if (randomBoolean()) {
        IncludeExclude incExc = null;
        switch(randomInt(5)) {
            case 0:
                incExc = new IncludeExclude(new RegExp("foobar"), null);
                break;
            case 1:
                incExc = new IncludeExclude(null, new RegExp("foobaz"));
                break;
            case 2:
                incExc = new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz"));
                break;
            case 3:
                SortedSet<BytesRef> includeValues = new TreeSet<>();
                int numIncs = randomIntBetween(1, 20);
                for (int i = 0; i < numIncs; i++) {
                    includeValues.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
                }
                SortedSet<BytesRef> excludeValues = null;
                incExc = new IncludeExclude(includeValues, excludeValues);
                break;
            case 4:
                SortedSet<BytesRef> includeValues2 = null;
                SortedSet<BytesRef> excludeValues2 = new TreeSet<>();
                int numExcs2 = randomIntBetween(1, 20);
                for (int i = 0; i < numExcs2; i++) {
                    excludeValues2.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
                }
                incExc = new IncludeExclude(includeValues2, excludeValues2);
                break;
            case 5:
                SortedSet<BytesRef> includeValues3 = new TreeSet<>();
                int numIncs3 = randomIntBetween(1, 20);
                for (int i = 0; i < numIncs3; i++) {
                    includeValues3.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
                }
                SortedSet<BytesRef> excludeValues3 = new TreeSet<>();
                int numExcs3 = randomIntBetween(1, 20);
                for (int i = 0; i < numExcs3; i++) {
                    excludeValues3.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
                }
                incExc = new IncludeExclude(includeValues3, excludeValues3);
                break;
            default:
                fail();
        }
        factory.includeExclude(incExc);
    }
    if (randomBoolean()) {
        SignificanceHeuristic significanceHeuristic = null;
        switch(randomInt(5)) {
            case 0:
                significanceHeuristic = new PercentageScore();
                break;
            case 1:
                significanceHeuristic = new ChiSquare(randomBoolean(), randomBoolean());
                break;
            case 2:
                significanceHeuristic = new GND(randomBoolean());
                break;
            case 3:
                significanceHeuristic = new MutualInformation(randomBoolean(), randomBoolean());
                break;
            case 4:
                significanceHeuristic = new ScriptHeuristic(new Script("foo"));
                break;
            case 5:
                significanceHeuristic = new JLHScore();
                break;
            default:
                fail();
        }
        factory.significanceHeuristic(significanceHeuristic);
    }
    if (randomBoolean()) {
        factory.backgroundFilter(QueryBuilders.termsQuery("foo", "bar"));
    }
    return factory;
}
Also used : Script(org.elasticsearch.script.Script) JLHScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore) ChiSquare(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare) RegExp(org.apache.lucene.util.automaton.RegExp) IncludeExclude(org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude) PercentageScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore) TreeSet(java.util.TreeSet) ScriptHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ScriptHeuristic) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) MutualInformation(org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation) SignificantTermsAggregationBuilder(org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregationBuilder) GND(org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with SignificanceHeuristic

use of org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic in project elasticsearch by elastic.

the class SignificanceHeuristicTests method getRandomSignificanceheuristic.

SignificanceHeuristic getRandomSignificanceheuristic() {
    List<SignificanceHeuristic> heuristics = new ArrayList<>();
    heuristics.add(new JLHScore());
    heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));
    heuristics.add(new GND(randomBoolean()));
    heuristics.add(new ChiSquare(randomBoolean(), randomBoolean()));
    return heuristics.get(randomInt(3));
}
Also used : JLHScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore) ChiSquare(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare) ArrayList(java.util.ArrayList) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) MutualInformation(org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation) GND(org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND)

Example 3 with SignificanceHeuristic

use of org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic in project elasticsearch by elastic.

the class SignificanceHeuristicTests method createInternalAggregations.

// Create aggregations as they might come from three different shards and return as list.
private List<InternalAggregation> createInternalAggregations() {
    SignificanceHeuristic significanceHeuristic = getRandomSignificanceheuristic();
    TestAggFactory<?, ?> factory = randomBoolean() ? new StringTestAggFactory() : new LongTestAggFactory();
    List<InternalAggregation> aggs = new ArrayList<>();
    aggs.add(factory.createAggregation(significanceHeuristic, 4, 10, 1, (f, i) -> f.createBucket(4, 4, 5, 10, 0)));
    aggs.add(factory.createAggregation(significanceHeuristic, 4, 10, 1, (f, i) -> f.createBucket(4, 4, 5, 10, 1)));
    aggs.add(factory.createAggregation(significanceHeuristic, 8, 10, 2, (f, i) -> f.createBucket(4, 4, 5, 10, i)));
    return aggs;
}
Also used : InternalAggregation(org.elasticsearch.search.aggregations.InternalAggregation) BiFunction(java.util.function.BiFunction) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) DocValueFormat(org.elasticsearch.search.DocValueFormat) JsonXContent(org.elasticsearch.common.xcontent.json.JsonXContent) Collections.singletonList(java.util.Collections.singletonList) InputStreamStreamInput(org.elasticsearch.common.io.stream.InputStreamStreamInput) Settings(org.elasticsearch.common.settings.Settings) ByteArrayInputStream(java.io.ByteArrayInputStream) TestSearchContext(org.elasticsearch.test.TestSearchContext) SignificanceHeuristicParser(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser) MutualInformation(org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation) BytesRef(org.apache.lucene.util.BytesRef) Collections.emptyList(java.util.Collections.emptyList) Matchers.lessThanOrEqualTo(org.hamcrest.Matchers.lessThanOrEqualTo) ParseFieldRegistry(org.elasticsearch.common.xcontent.ParseFieldRegistry) Collectors(java.util.stream.Collectors) QueryParseContext(org.elasticsearch.index.query.QueryParseContext) StandardCharsets(java.nio.charset.StandardCharsets) InternalAggregation(org.elasticsearch.search.aggregations.InternalAggregation) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) List(java.util.List) Version(org.elasticsearch.Version) ChiSquare(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.containsString(org.hamcrest.Matchers.containsString) AggregationBuilders.significantTerms(org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms) IntStream(java.util.stream.IntStream) XContentFactory(org.elasticsearch.common.xcontent.XContentFactory) SearchShardTarget(org.elasticsearch.search.SearchShardTarget) 
ByteArrayOutputStream(java.io.ByteArrayOutputStream) ParsingException(org.elasticsearch.common.ParsingException) PercentageScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore) Index(org.elasticsearch.index.Index) OutputStreamStreamOutput(org.elasticsearch.common.io.stream.OutputStreamStreamOutput) ArrayList(java.util.ArrayList) NamedWriteableRegistry(org.elasticsearch.common.io.stream.NamedWriteableRegistry) InternalAggregations(org.elasticsearch.search.aggregations.InternalAggregations) Matchers.lessThan(org.hamcrest.Matchers.lessThan) ESTestCase(org.elasticsearch.test.ESTestCase) VersionUtils.randomVersion(org.elasticsearch.test.VersionUtils.randomVersion) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) SearchModule(org.elasticsearch.search.SearchModule) Collections.emptyMap(java.util.Collections.emptyMap) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) PipelineAggregator(org.elasticsearch.search.aggregations.pipeline.PipelineAggregator) IOException(java.io.IOException) XContentParser(org.elasticsearch.common.xcontent.XContentParser) NamedWriteableAwareStreamInput(org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput) GND(org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND) StreamInput(org.elasticsearch.common.io.stream.StreamInput) JLHScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore) ArrayList(java.util.ArrayList) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic)

Example 4 with SignificanceHeuristic

use of org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic in project elasticsearch by elastic.

the class SearchModule method registerSignificanceHeuristic.

private void registerSignificanceHeuristic(SearchExtensionSpec<SignificanceHeuristic, SignificanceHeuristicParser> heuristic) {
    significanceHeuristicParserRegistry.register(heuristic.getParser(), heuristic.getName());
    namedWriteables.add(new NamedWriteableRegistry.Entry(SignificanceHeuristic.class, heuristic.getName().getPreferredName(), heuristic.getReader()));
}
Also used : NamedWriteableRegistry(org.elasticsearch.common.io.stream.NamedWriteableRegistry) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) Entry(org.elasticsearch.common.io.stream.NamedWriteableRegistry.Entry)

Example 5 with SignificanceHeuristic

use of org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic in project elasticsearch by elastic.

the class InternalSignificantTerms method doReduce.

@Override
public InternalAggregation doReduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
    long globalSubsetSize = 0;
    long globalSupersetSize = 0;
    // top-level Aggregations from each shard
    for (InternalAggregation aggregation : aggregations) {
        @SuppressWarnings("unchecked") InternalSignificantTerms<A, B> terms = (InternalSignificantTerms<A, B>) aggregation;
        globalSubsetSize += terms.getSubsetSize();
        globalSupersetSize += terms.getSupersetSize();
    }
    Map<String, List<B>> buckets = new HashMap<>();
    for (InternalAggregation aggregation : aggregations) {
        @SuppressWarnings("unchecked") InternalSignificantTerms<A, B> terms = (InternalSignificantTerms<A, B>) aggregation;
        for (B bucket : terms.getBucketsInternal()) {
            List<B> existingBuckets = buckets.get(bucket.getKeyAsString());
            if (existingBuckets == null) {
                existingBuckets = new ArrayList<>(aggregations.size());
                buckets.put(bucket.getKeyAsString(), existingBuckets);
            }
            // Adjust the buckets with the global stats representing the
            // total size of the pots from which the stats are drawn
            existingBuckets.add(bucket.newBucket(bucket.getSubsetDf(), globalSubsetSize, bucket.getSupersetDf(), globalSupersetSize, bucket.aggregations));
        }
    }
    SignificanceHeuristic heuristic = getSignificanceHeuristic().rewrite(reduceContext);
    final int size = reduceContext.isFinalReduce() == false ? buckets.size() : Math.min(requiredSize, buckets.size());
    BucketSignificancePriorityQueue<B> ordered = new BucketSignificancePriorityQueue<>(size);
    for (Map.Entry<String, List<B>> entry : buckets.entrySet()) {
        List<B> sameTermBuckets = entry.getValue();
        final B b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext);
        b.updateScore(heuristic);
        if (((b.score > 0) && (b.subsetDf >= minDocCount)) || reduceContext.isFinalReduce() == false) {
            ordered.insertWithOverflow(b);
        }
    }
    B[] list = createBucketsArray(ordered.size());
    for (int i = ordered.size() - 1; i >= 0; i--) {
        list[i] = ordered.pop();
    }
    return create(globalSubsetSize, globalSupersetSize, Arrays.asList(list));
}
Also used : HashMap(java.util.HashMap) InternalAggregation(org.elasticsearch.search.aggregations.InternalAggregation) Collections.unmodifiableList(java.util.Collections.unmodifiableList) ArrayList(java.util.ArrayList) List(java.util.List) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

SignificanceHeuristic (org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic): 6; MutualInformation (org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation): 4; ArrayList (java.util.ArrayList): 3; ChiSquare (org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare): 3; GND (org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND): 3; JLHScore (org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore): 3; List (java.util.List): 2; BytesRef (org.apache.lucene.util.BytesRef): 2; NamedWriteableRegistry (org.elasticsearch.common.io.stream.NamedWriteableRegistry): 2; PercentageScore (org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore): 2; ByteArrayInputStream (java.io.ByteArrayInputStream): 1; ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1; IOException (java.io.IOException): 1; StandardCharsets (java.nio.charset.StandardCharsets): 1; Collections.emptyList (java.util.Collections.emptyList): 1; Collections.emptyMap (java.util.Collections.emptyMap): 1; Collections.singletonList (java.util.Collections.singletonList): 1; Collections.unmodifiableList (java.util.Collections.unmodifiableList): 1; HashMap (java.util.HashMap): 1; Map (java.util.Map): 1