Example 11 with TermContext

use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.

the class DfsPhase method execute.

@Override
public void execute(SearchContext context) {
    final ObjectHashSet<Term> termsSet = new ObjectHashSet<>();
    try {
        context.searcher().createNormalizedWeight(context.query(), true).extractTerms(new DelegateSet(termsSet));
        for (RescoreSearchContext rescoreContext : context.rescore()) {
            rescoreContext.rescorer().extractTerms(context, rescoreContext, new DelegateSet(termsSet));
        }
        Term[] terms = termsSet.toArray(Term.class);
        TermStatistics[] termStatistics = new TermStatistics[terms.length];
        IndexReaderContext indexReaderContext = context.searcher().getTopReaderContext();
        for (int i = 0; i < terms.length; i++) {
            if (context.isCancelled()) {
                throw new TaskCancelledException("cancelled");
            }
            // LUCENE 4 UPGRADE: cache TermContext?
            TermContext termContext = TermContext.build(indexReaderContext, terms[i]);
            termStatistics[i] = context.searcher().termStatistics(terms[i], termContext);
        }
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
        for (Term term : terms) {
            assert term.field() != null : "field is null";
            if (!fieldStatistics.containsKey(term.field())) {
                final CollectionStatistics collectionStatistics = context.searcher().collectionStatistics(term.field());
                fieldStatistics.put(term.field(), collectionStatistics);
                if (context.isCancelled()) {
                    throw new TaskCancelledException("cancelled");
                }
            }
        }
        context.dfsResult().termsStatistics(terms, termStatistics).fieldStatistics(fieldStatistics).maxDoc(context.searcher().getIndexReader().maxDoc());
    } catch (Exception e) {
        throw new DfsPhaseExecutionException(context, "Exception during dfs phase", e);
    } finally {
        // don't hold on to terms
        termsSet.clear();
    }
}
Also used : RescoreSearchContext(org.elasticsearch.search.rescore.RescoreSearchContext) Term(org.apache.lucene.index.Term) TermStatistics(org.apache.lucene.search.TermStatistics) IndexReaderContext(org.apache.lucene.index.IndexReaderContext) TermContext(org.apache.lucene.index.TermContext) SearchContextException(org.elasticsearch.search.SearchContextException) TaskCancelledException(org.elasticsearch.tasks.TaskCancelledException) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) ObjectHashSet(com.carrotsearch.hppc.ObjectHashSet)
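
The pattern worth lifting out of this method: a TermContext is built once per term against the top-level IndexReaderContext, then handed back to the searcher to resolve TermStatistics. A minimal standalone sketch (Lucene 6.x-era API; the helper class and method names are ours, not Elasticsearch's):

import java.io.IOException;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;

class TermStatsSketch {

    // Build a TermContext per term against the top-level reader context,
    // then resolve its TermStatistics through the searcher, as DfsPhase does above.
    static TermStatistics[] statsFor(IndexSearcher searcher, Term[] terms) throws IOException {
        IndexReaderContext top = searcher.getTopReaderContext();
        TermStatistics[] stats = new TermStatistics[terms.length];
        for (int i = 0; i < terms.length; i++) {
            TermContext ctx = TermContext.build(top, terms[i]);
            stats[i] = searcher.termStatistics(terms[i], ctx);
        }
        return stats;
    }
}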

Example 12 with TermContext

use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.

the class BlendedTermQuery method adjustTTF.

private TermContext adjustTTF(IndexReaderContext readerContext, TermContext termContext, long sumTTF) {
    assert termContext.wasBuiltFor(readerContext);
    if (sumTTF == -1 && termContext.totalTermFreq() == -1) {
        return termContext;
    }
    TermContext newTermContext = new TermContext(readerContext);
    List<LeafReaderContext> leaves = readerContext.leaves();
    final int len;
    if (leaves == null) {
        len = 1;
    } else {
        len = leaves.size();
    }
    int df = termContext.docFreq();
    long ttf = sumTTF;
    for (int i = 0; i < len; i++) {
        TermState termState = termContext.get(i);
        if (termState == null) {
            continue;
        }
        newTermContext.register(termState, i, df, ttf);
        df = 0;
        ttf = 0;
    }
    return newTermContext;
}
Also used : LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermState(org.apache.lucene.index.TermState) TermContext(org.apache.lucene.index.TermContext)
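
The zeroing of df and ttf after the first register() call is the crux: TermContext sums the docFreq/totalTermFreq arguments across all registered leaves, so the adjusted totals must be contributed exactly once. A standalone sketch of the same copy-with-new-totals idea (helper names are ours; assumes a top-level reader context with a non-null leaves list):

import java.util.List;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;

class TermContextTotalsSketch {

    // Copy the per-leaf TermStates of src into a fresh TermContext, attributing
    // the given df/ttf totals to the first leaf that actually holds the term.
    static TermContext copyWithTotals(IndexReaderContext top, TermContext src, int df, long ttf) {
        TermContext dst = new TermContext(top);
        List<LeafReaderContext> leaves = top.leaves();
        for (int ord = 0; ord < leaves.size(); ord++) {
            TermState state = src.get(ord);
            if (state == null) {
                continue; // term absent from this segment
            }
            dst.register(state, ord, df, ttf);
            df = 0;  // totals are summed across register() calls,
            ttf = 0; // so contribute them only once
        }
        return dst;
    }
}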

Example 13 with TermContext

use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.

the class BlendedTermQuery method blend.

protected void blend(final TermContext[] contexts, int maxDoc, IndexReader reader) throws IOException {
    if (contexts.length <= 1) {
        return;
    }
    int max = 0;
    long minSumTTF = Long.MAX_VALUE;
    for (int i = 0; i < contexts.length; i++) {
        TermContext ctx = contexts[i];
        int df = ctx.docFreq();
        // we use the max here since it's the only "true" estimation we can make here
        // at least max(df) documents have that term. Sum or Averages don't seem
        // to have a significant meaning here.
        // TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
        //   df = df1 + df2 - (df1 * df2 / maxDoc)?
        max = Math.max(df, max);
        if (minSumTTF != -1 && ctx.totalTermFreq() != -1) {
            // we need to find out the minimum sumTTF to adjust the statistics
            // otherwise the statistics don't match
            minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
        } else {
            minSumTTF = -1;
        }
    }
    if (minSumTTF != -1 && maxDoc > minSumTTF) {
        maxDoc = (int) minSumTTF;
    }
    if (max == 0) {
        // we are done, the term doesn't exist at all
        return;
    }
    long sumTTF = minSumTTF == -1 ? -1 : 0;
    final int[] tieBreak = new int[contexts.length];
    for (int i = 0; i < tieBreak.length; ++i) {
        tieBreak[i] = i;
    }
    new InPlaceMergeSorter() {

        @Override
        protected void swap(int i, int j) {
            final int tmp = tieBreak[i];
            tieBreak[i] = tieBreak[j];
            tieBreak[j] = tmp;
        }

        @Override
        protected int compare(int i, int j) {
            return Integer.compare(contexts[tieBreak[j]].docFreq(), contexts[tieBreak[i]].docFreq());
        }
    }.sort(0, tieBreak.length);
    int prev = contexts[tieBreak[0]].docFreq();
    int actualDf = Math.min(maxDoc, max);
    assert actualDf >= 0 : "DF must be >= 0";
    // bias a little towards the more frequent fields; this acts as a tie breaker
    for (int i : tieBreak) {
        TermContext ctx = contexts[i];
        if (ctx.docFreq() == 0) {
            break;
        }
        final int current = ctx.docFreq();
        if (prev > current) {
            actualDf++;
        }
        contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
        prev = current;
        if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) {
            sumTTF += ctx.totalTermFreq();
        } else {
            // omit once TF is omitted anywhere!
            sumTTF = -1;
        }
    }
    sumTTF = Math.min(sumTTF, minSumTTF);
    for (int i = 0; i < contexts.length; i++) {
        int df = contexts[i].docFreq();
        if (df == 0) {
            continue;
        }
        // the blended sumTTF can't be greater than the sumTTF on the field
        final long fixedTTF = sumTTF == -1 ? -1 : sumTTF;
        contexts[i] = adjustTTF(reader.getContext(), contexts[i], fixedTTF);
    }
}
Also used : InPlaceMergeSorter(org.apache.lucene.util.InPlaceMergeSorter) TermContext(org.apache.lucene.index.TermContext)
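
The anonymous InPlaceMergeSorter above is the standard Lucene idiom for sorting a permutation array against values stored elsewhere. Isolated as a sketch (helper names are ours): order an index array so it points at values in descending order, without moving the values themselves.

import org.apache.lucene.util.InPlaceMergeSorter;

class OrderSketch {

    // Return the indices of values ordered by descending value; values itself
    // is left untouched, just as the tieBreak array above leaves contexts in place.
    static int[] descendingOrder(final int[] values) {
        final int[] order = new int[values.length];
        for (int i = 0; i < order.length; i++) {
            order[i] = i;
        }
        new InPlaceMergeSorter() {
            @Override
            protected void swap(int i, int j) {
                int tmp = order[i];
                order[i] = order[j];
                order[j] = tmp;
            }

            @Override
            protected int compare(int i, int j) {
                // arguments reversed relative to ascending order => descending
                return Integer.compare(values[order[j]], values[order[i]]);
            }
        }.sort(0, order.length);
        return order;
    }
}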

Example 14 with TermContext

use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.

the class AllTermQuery method createWeight.

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    if (needsScores == false) {
        return new TermQuery(term).createWeight(searcher, needsScores);
    }
    final TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
    final CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
    final TermStatistics termStats = searcher.termStatistics(term, termStates);
    final Similarity similarity = searcher.getSimilarity(needsScores);
    final SimWeight stats = similarity.computeWeight(collectionStats, termStats);
    return new Weight(this) {

        @Override
        public float getValueForNormalization() throws IOException {
            return stats.getValueForNormalization();
        }

        @Override
        public void normalize(float norm, float topLevelBoost) {
            stats.normalize(norm, topLevelBoost);
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            terms.add(term);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            AllTermScorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    float score = scorer.score();
                    float freq = scorer.freq();
                    SimScorer docScorer = similarity.simScorer(stats, context);
                    Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
                    Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
                    Explanation payloadBoostExplanation = Explanation.match(scorer.payloadBoost(), "payloadBoost=" + scorer.payloadBoost());
                    return Explanation.match(score, "weight(" + getQuery() + " in " + doc + ") [" + similarity.getClass().getSimpleName() + "], product of:", termScoreExplanation, payloadBoostExplanation);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public AllTermScorer scorer(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(term.field());
            if (terms == null) {
                return null;
            }
            final TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                return null;
            }
            final TermState state = termStates.get(context.ord);
            if (state == null) {
                // Term does not exist in this segment
                return null;
            }
            termsEnum.seekExact(term.bytes(), state);
            PostingsEnum docs = termsEnum.postings(null, PostingsEnum.PAYLOADS);
            assert docs != null;
            return new AllTermScorer(this, docs, similarity.simScorer(stats, context));
        }
    };
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Set(java.util.Set) Similarity(org.apache.lucene.search.similarities.Similarity) Explanation(org.apache.lucene.search.Explanation) SimWeight(org.apache.lucene.search.similarities.Similarity.SimWeight) SimScorer(org.apache.lucene.search.similarities.Similarity.SimScorer) Terms(org.apache.lucene.index.Terms) TermStatistics(org.apache.lucene.search.TermStatistics) TermContext(org.apache.lucene.index.TermContext) Weight(org.apache.lucene.search.Weight) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermState(org.apache.lucene.index.TermState) PostingsEnum(org.apache.lucene.index.PostingsEnum)
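
The scorer() body shows the canonical per-segment use of a cached TermContext: the TermState for the leaf is looked up by context.ord and passed to seekExact, which skips the term dictionary lookup the enum would otherwise perform. A reduced sketch of just that step (helper names are ours):

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;

class SeekSketch {

    // Resolve a postings enum on one leaf using a TermState cached in a
    // top-level TermContext, mirroring AllTermQuery's scorer() above.
    static PostingsEnum postingsFor(LeafReaderContext leaf, Term term, TermContext termStates) throws IOException {
        TermState state = termStates.get(leaf.ord);
        if (state == null) {
            return null; // term does not exist in this segment
        }
        Terms terms = leaf.reader().terms(term.field());
        if (terms == null) {
            return null;
        }
        TermsEnum termsEnum = terms.iterator();
        termsEnum.seekExact(term.bytes(), state);
        return termsEnum.postings(null, PostingsEnum.FREQS);
    }
}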

Example 15 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

the class TermAutomatonQuery method createWeight.

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    IndexReaderContext context = searcher.getTopReaderContext();
    Map<Integer, TermContext> termStates = new HashMap<>();
    for (Map.Entry<BytesRef, Integer> ent : termToID.entrySet()) {
        if (ent.getKey() != null) {
            termStates.put(ent.getValue(), TermContext.build(context, new Term(field, ent.getKey())));
        }
    }
    return new TermAutomatonWeight(det, searcher, termStates, boost);
}
Also used : HashMap(java.util.HashMap) Map(java.util.Map) Term(org.apache.lucene.index.Term) IndexReaderContext(org.apache.lucene.index.IndexReaderContext) TermContext(org.apache.lucene.index.TermContext) BytesRef(org.apache.lucene.util.BytesRef)
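
The same build-once pattern generalizes to any multi-term query: pre-compute one TermContext per distinct term at weight-creation time so the per-leaf scorers never re-walk the term dictionary. A hedged sketch (field argument and helper names are ours; TermAutomatonQuery keys its map by automaton state ID instead):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.BytesRef;

class TermStatesSketch {

    // Build one TermContext per distinct term value of a field, keyed by the
    // raw term bytes, for later per-leaf TermState lookups.
    static Map<BytesRef, TermContext> buildStates(IndexReaderContext top, String field,
                                                  Iterable<BytesRef> termValues) throws IOException {
        Map<BytesRef, TermContext> states = new HashMap<>();
        for (BytesRef bytes : termValues) {
            states.put(bytes, TermContext.build(top, new Term(field, bytes)));
        }
        return states;
    }
}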

Aggregations

TermContext (org.apache.lucene.index.TermContext): 21
Term (org.apache.lucene.index.Term): 10
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 8
IndexReaderContext (org.apache.lucene.index.IndexReaderContext): 6
TermState (org.apache.lucene.index.TermState): 6
TermsEnum (org.apache.lucene.index.TermsEnum): 6
BytesRef (org.apache.lucene.util.BytesRef): 6
Terms (org.apache.lucene.index.Terms): 5
TermQuery (org.apache.lucene.search.TermQuery): 5
ArrayList (java.util.ArrayList): 4
HashMap (java.util.HashMap): 4
Query (org.apache.lucene.search.Query): 4
IOException (java.io.IOException): 3
PostingsEnum (org.apache.lucene.index.PostingsEnum): 3
CollectionStatistics (org.apache.lucene.search.CollectionStatistics): 3
TermStatistics (org.apache.lucene.search.TermStatistics): 3
List (java.util.List): 2
Set (java.util.Set): 2
Fields (org.apache.lucene.index.Fields): 2
BooleanQuery (org.apache.lucene.search.BooleanQuery): 2