Search in sources :

Example 1 with TermStates

Use of org.apache.lucene.index.TermStates in project neo4j by neo4j.

In the class StatsCollector, method computeTermStatistics.

/**
 * Combines the statistics of {@code term} across all prepared searches.
 *
 * <p>Each searcher is asked for its own {@link TermStatistics}; searchers that return
 * {@code null} are skipped. The document frequencies and total term frequencies of the
 * remaining statistics are summed.
 *
 * @param term the term to look up in every searcher
 * @return the summed statistics, or an empty {@link Optional} when the summed document
 *         frequency is zero
 * @throws UncheckedIOException wrapping any {@link IOException} raised while building the
 *         term states or reading the statistics
 */
private Optional<TermStatistics> computeTermStatistics(Term term) {
    List<TermStatistics> perSearcher = new ArrayList<>(searches.size());
    for (PreparedSearch prepared : searches) {
        Neo4jIndexSearcher indexSearcher = prepared.searcher();
        TermStatistics found;
        try {
            TermStates states = TermStates.build(indexSearcher.getTopReaderContext(), term, true);
            found = indexSearcher.termStatistics(term, states);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        if (found != null) {
            perSearcher.add(found);
        }
    }

    long summedDocFreq = perSearcher.stream().mapToLong(TermStatistics::docFreq).sum();
    long summedTotalTermFreq = perSearcher.stream().mapToLong(TermStatistics::totalTermFreq).sum();
    if (summedDocFreq == 0) {
        return Optional.empty();
    }

    // A non-zero sum implies at least one entry, so index 0 is safe here.
    BytesRef termBytes = perSearcher.get(0).term();
    return Optional.of(new TermStatistics(termBytes, summedDocFreq, summedTotalTermFreq));
}
Also used : TermStates(org.apache.lucene.index.TermStates) ArrayList(java.util.ArrayList) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) Neo4jIndexSearcher(org.neo4j.kernel.api.impl.index.partition.Neo4jIndexSearcher) TermStatistics(org.apache.lucene.search.TermStatistics) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with TermStates

Use of org.apache.lucene.index.TermStates in project crate by crate.

In the class BlendedTermQuery, method adjustDF.

/**
 * Builds a copy of {@code ctx} whose per-leaf term states are re-registered with a new
 * document frequency.
 *
 * <p>The total term frequency registered alongside it is {@code -1} when the original
 * total term frequency is negative, otherwise the larger of the original value and
 * {@code newDocFreq} so that it stays consistent with (i.e. >=) the doc freq. The new values
 * are passed to {@code register} for the first leaf that has a term state; every later
 * leaf with a term state is registered with zeros.
 *
 * @param readerContext the reader context {@code ctx} was built for (asserted)
 * @param ctx           the term states to copy
 * @param newDocFreq    the document frequency to register
 * @return a new {@link TermStates} for {@code readerContext}
 * @throws IOException declared by the signature
 */
private static TermStates adjustDF(IndexReaderContext readerContext, TermStates ctx, int newDocFreq) throws IOException {
    assert ctx.wasBuiltFor(readerContext);

    final long originalTtf = ctx.totalTermFreq();
    // Keep ttf consistent with the doc freq (ttf >= df) unless it is unavailable (< 0).
    long pendingTtf = originalTtf < 0 ? -1 : Math.max(originalTtf, newDocFreq);
    int pendingDocFreq = newDocFreq;

    final List<LeafReaderContext> leaves = readerContext.leaves();
    final TermStates adjusted = new TermStates(readerContext);
    if (leaves == null) {
        return adjusted;
    }

    final int leafCount = leaves.size();
    for (int ord = 0; ord < leafCount; ord++) {
        final TermState leafState = ctx.get(leaves.get(ord));
        if (leafState != null) {
            adjusted.register(leafState, ord, pendingDocFreq, pendingTtf);
            // Only the first registered leaf carries the statistics.
            pendingDocFreq = 0;
            pendingTtf = 0;
        }
    }
    return adjusted;
}
Also used : TermStates(org.apache.lucene.index.TermStates) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermState(org.apache.lucene.index.TermState)

Example 3 with TermStates

Use of org.apache.lucene.index.TermStates in project crate by crate.

In the class BlendedTermQuery, method blend.

/**
 * Blends the statistics of the given term states so that they share comparable document
 * frequencies and a common total term frequency.
 *
 * <p>The entries of {@code contexts} are replaced in place with adjusted copies produced by
 * {@code adjustDF} and {@code adjustTTF}. Nothing is changed when there is at most one
 * context or when no context has a non-zero document frequency.
 *
 * @param contexts the term states to blend; index {@code i} is paired with {@code terms[i]}
 *                 when looking up the field's sum of total term frequencies
 * @param maxDoc   upper bound for the blended document frequency; lowered to the minimum
 *                 field sumTTF when that is smaller
 * @param reader   the reader supplying field statistics and the reader context
 * @throws IOException if reading field statistics or adjusting the term states fails
 */
protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader) throws IOException {
    if (contexts.length <= 1) {
        // nothing to blend with zero or one context
        return;
    }
    int max = 0;
    long minSumTTF = Long.MAX_VALUE;
    for (int i = 0; i < contexts.length; i++) {
        TermStates ctx = contexts[i];
        int df = ctx.docFreq();
        // we use the max here since it's the only "true" estimation we can make here
        // at least max(df) documents have that term. Sum or Averages don't seem
        // to have a significant meaning here.
        // TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
        // df = df1 + df2 - (df1 * df2 / maxDoc)?
        max = Math.max(df, max);
        // -1 is a sticky sentinel: once any context reports a totalTermFreq of -1,
        // minSumTTF stays at -1 for the rest of the loop
        if (minSumTTF != -1 && ctx.totalTermFreq() != -1) {
            // we need to find out the minimum sumTTF to adjust the statistics
            // otherwise the statistics don't match
            minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
        } else {
            minSumTTF = -1;
        }
    }
    // cap maxDoc by the smallest field sumTTF when that is available
    if (minSumTTF != -1 && maxDoc > minSumTTF) {
        maxDoc = (int) minSumTTF;
    }
    if (max == 0) {
        // we are done: no context has a non-zero doc freq, so the term doesn't exist at all
        return;
    }
    // start accumulating at 0, or stay at -1 when term frequencies are unavailable
    long sumTTF = minSumTTF == -1 ? -1 : 0;
    // tieBreak holds indices into contexts; it is sorted below so the contexts array
    // itself keeps its original order
    final int[] tieBreak = new int[contexts.length];
    for (int i = 0; i < tieBreak.length; ++i) {
        tieBreak[i] = i;
    }
    new InPlaceMergeSorter() {

        @Override
        protected void swap(int i, int j) {
            final int tmp = tieBreak[i];
            tieBreak[i] = tieBreak[j];
            tieBreak[j] = tmp;
        }

        @Override
        protected int compare(int i, int j) {
            // arguments are reversed (j before i) so the sort is by descending doc freq
            return Integer.compare(contexts[tieBreak[j]].docFreq(), contexts[tieBreak[i]].docFreq());
        }
    }.sort(0, tieBreak.length);
    int prev = contexts[tieBreak[0]].docFreq();
    int actualDf = Math.min(maxDoc, max);
    assert actualDf >= 0 : "DF must be >= 0";
    // walk the contexts from highest to lowest doc freq; each time the original doc freq
    // drops, the assigned doc freq is bumped by one (still capped by maxDoc below),
    // that acts as a tie breaker
    for (int i : tieBreak) {
        TermStates ctx = contexts[i];
        if (ctx.docFreq() == 0) {
            // sorted descending, so every remaining context also has a doc freq of 0
            break;
        }
        final int current = ctx.docFreq();
        if (prev > current) {
            actualDf++;
        }
        contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
        prev = current;
        // ctx now refers to the adjusted copy, so its totalTermFreq is the adjusted one
        if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) {
            sumTTF += ctx.totalTermFreq();
        } else {
            // omit once TF is omitted anywhere!
            sumTTF = -1;
        }
    }
    sumTTF = Math.min(sumTTF, minSumTTF);
    for (int i = 0; i < contexts.length; i++) {
        int df = contexts[i].docFreq();
        if (df == 0) {
            continue;
        }
        // the blended sumTTF can't be greater than the sumTTF on the field
        final long fixedTTF = sumTTF == -1 ? -1 : sumTTF;
        contexts[i] = adjustTTF(reader.getContext(), contexts[i], fixedTTF);
    }
}
Also used : TermStates(org.apache.lucene.index.TermStates) InPlaceMergeSorter(org.apache.lucene.util.InPlaceMergeSorter)

Example 4 with TermStates

Use of org.apache.lucene.index.TermStates in project crate by crate.

In the class BlendedTermQuery, method adjustTTF.

/**
 * Builds a copy of {@code termContext} whose per-leaf term states are re-registered with
 * the given total term frequency while keeping the existing document frequency.
 *
 * <p>When both {@code sumTTF} and the context's own total term frequency are {@code -1},
 * the original instance is returned unchanged. Otherwise the doc freq and {@code sumTTF}
 * are passed to {@code register} for the first leaf that has a term state; every later
 * leaf with a term state is registered with zeros.
 *
 * @param readerContext the reader context {@code termContext} was built for (asserted)
 * @param termContext   the term states to copy
 * @param sumTTF        the total term frequency to register
 * @return {@code termContext} itself or a new {@link TermStates} for {@code readerContext}
 * @throws IOException declared by the signature
 */
private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termContext, long sumTTF) throws IOException {
    assert termContext.wasBuiltFor(readerContext);

    final boolean nothingToAdjust = sumTTF == -1 && termContext.totalTermFreq() == -1;
    if (nothingToAdjust) {
        return termContext;
    }

    final TermStates adjusted = new TermStates(readerContext);
    final List<LeafReaderContext> leaves = readerContext.leaves();
    int pendingDocFreq = termContext.docFreq();
    long pendingTtf = sumTTF;
    if (leaves == null) {
        return adjusted;
    }

    final int leafCount = leaves.size();
    for (int ord = 0; ord < leafCount; ord++) {
        final TermState leafState = termContext.get(leaves.get(ord));
        if (leafState != null) {
            adjusted.register(leafState, ord, pendingDocFreq, pendingTtf);
            // Only the first registered leaf carries the statistics.
            pendingDocFreq = 0;
            pendingTtf = 0;
        }
    }
    return adjusted;
}
Also used : TermStates(org.apache.lucene.index.TermStates) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermState(org.apache.lucene.index.TermState)

Example 5 with TermStates

Use of org.apache.lucene.index.TermStates in project crate by crate.

In the class BlendedTermQuery, method rewrite.

/**
 * Rewrites this query into the query returned by {@code topLevelQuery}, using blended
 * term statistics.
 *
 * <p>If the superclass rewrite yields a different query, that query is returned as is.
 * Otherwise term states are built for every term against the reader's context, their
 * document frequencies are recorded before blending, and the states are passed through
 * {@code blend}.
 *
 * @param reader the reader to rewrite against
 * @return the superclass rewrite result when it differs from this query, otherwise the
 *         result of {@code topLevelQuery}
 * @throws IOException if building the term states or blending fails
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
    final Query fromSuper = super.rewrite(reader);
    if (fromSuper != this) {
        return fromSuper;
    }

    final IndexReaderContext topContext = reader.getContext();
    final TermStates[] states = new TermStates[terms.length];
    final int[] originalDocFreqs = new int[states.length];
    for (int idx = 0; idx < terms.length; idx++) {
        final TermStates built = TermStates.build(topContext, terms[idx], true);
        states[idx] = built;
        // Recorded before blend() replaces the entries of the states array.
        originalDocFreqs[idx] = built.docFreq();
    }

    final int maxDoc = reader.maxDoc();
    blend(states, maxDoc, reader);
    return topLevelQuery(terms, states, originalDocFreqs, maxDoc);
}
Also used : Query(org.apache.lucene.search.Query) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) TermStates(org.apache.lucene.index.TermStates) IndexReaderContext(org.apache.lucene.index.IndexReaderContext)

Aggregations

TermStates (org.apache.lucene.index.TermStates)5 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)2 TermState (org.apache.lucene.index.TermState)2 IOException (java.io.IOException)1 UncheckedIOException (java.io.UncheckedIOException)1 ArrayList (java.util.ArrayList)1 IndexReaderContext (org.apache.lucene.index.IndexReaderContext)1 BooleanQuery (org.apache.lucene.search.BooleanQuery)1 BoostQuery (org.apache.lucene.search.BoostQuery)1 DisjunctionMaxQuery (org.apache.lucene.search.DisjunctionMaxQuery)1 Query (org.apache.lucene.search.Query)1 TermQuery (org.apache.lucene.search.TermQuery)1 TermStatistics (org.apache.lucene.search.TermStatistics)1 BytesRef (org.apache.lucene.util.BytesRef)1 InPlaceMergeSorter (org.apache.lucene.util.InPlaceMergeSorter)1 Neo4jIndexSearcher (org.neo4j.kernel.api.impl.index.partition.Neo4jIndexSearcher)1