Search in sources:

Example 6 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

From class TopTermsRewrite, method rewrite().

@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
    final int maxSize = Math.min(size, getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
    collectTerms(reader, query, new TermCollector() {

        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();

        private TermsEnum termsEnum;

        private BoostAttribute boostAtt;

        private ScoreTerm st;

        @Override
        public void setNextEnum(TermsEnum termsEnum) {
            this.termsEnum = termsEnum;
            assert compareToLastTerm(null);
            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert:
        private BytesRefBuilder lastTerm;

        private boolean compareToLastTerm(BytesRef t) {
            if (lastTerm == null && t != null) {
                lastTerm = new BytesRefBuilder();
                lastTerm.append(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // terms in order
            assert compareToLastTerm(bytes);
            // ignore uncompetitive hits
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes.get(), st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes.get());
                    // reset the termstate! 
                    st.termState.clear();
                } else {
                    st = new ScoreTerm(new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes.get());
                }
            }
            return true;
        }
    });
    final B b = getTopLevelBuilder();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes.toBytesRef());
        // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
        // but truncate such boosts to 0.0f when building the query:
        // add to query
        addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState);
    }
    return build(b);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) HashMap(java.util.HashMap) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PriorityQueue(java.util.PriorityQueue) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) TermState(org.apache.lucene.index.TermState) BytesRef(org.apache.lucene.util.BytesRef)

Example 7 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

From class TestSimilarityBase, method testLengthEncodingBackwardCompatibility().

public void testLengthEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = RandomPicks.randomFrom(random(), sims);
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major }) {
        for (int length : new int[] { 1, 2, 4 }) {
            // these length values are encoded accurately on both cases
            Directory dir = newDirectory();
            // set the version on the directory
            new SegmentInfos(indexCreatedVersionMajor).commit(dir);
            IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity));
            Document doc = new Document();
            String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
            doc.add(new TextField("foo", value, Store.NO));
            w.addDocument(doc);
            IndexReader reader = DirectoryReader.open(w);
            IndexSearcher searcher = newSearcher(reader);
            searcher.setSimilarity(similarity);
            Term term = new Term("foo", "b");
            TermContext context = TermContext.build(reader.getContext(), term);
            SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
            SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
            float docLength = simScorer.getLengthValue(0);
            assertEquals(length, (int) docLength);
            w.close();
            reader.close();
            dir.close();
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) Query(org.apache.lucene.search.Query) RandomPicks(com.carrotsearch.randomizedtesting.generators.RandomPicks) FieldType(org.apache.lucene.document.FieldType) Term(org.apache.lucene.index.Term) SimWeight(org.apache.lucene.search.similarities.Similarity.SimWeight) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) Store(org.apache.lucene.document.Field.Store) TermStatistics(org.apache.lucene.search.TermStatistics) TopDocs(org.apache.lucene.search.TopDocs) Explanation(org.apache.lucene.search.Explanation) BytesRef(org.apache.lucene.util.BytesRef) DirectoryReader(org.apache.lucene.index.DirectoryReader) IOException(java.io.IOException) TermContext(org.apache.lucene.index.TermContext) Collectors(java.util.stream.Collectors) Version(org.apache.lucene.util.Version) SegmentInfos(org.apache.lucene.index.SegmentInfos) List(java.util.List) FieldInvertState(org.apache.lucene.index.FieldInvertState) IndexWriter(org.apache.lucene.index.IndexWriter) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) TermQuery(org.apache.lucene.search.TermQuery) Field(org.apache.lucene.document.Field) LuceneTestCase(org.apache.lucene.util.LuceneTestCase) TextField(org.apache.lucene.document.TextField) IndexOptions(org.apache.lucene.index.IndexOptions) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexReader(org.apache.lucene.index.IndexReader) IndexSearcher(org.apache.lucene.search.IndexSearcher) IndexSearcher(org.apache.lucene.search.IndexSearcher) SegmentInfos(org.apache.lucene.index.SegmentInfos) SimWeight(org.apache.lucene.search.similarities.Similarity.SimWeight) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) TermContext(org.apache.lucene.index.TermContext) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) 
IndexReader(org.apache.lucene.index.IndexReader) TextField(org.apache.lucene.document.TextField) Directory(org.apache.lucene.store.Directory)

Example 8 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

From class CommonTermsQuery, method collectTermContext().

public void collectTermContext(IndexReader reader, List<LeafReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms) throws IOException {
    TermsEnum termsEnum = null;
    for (LeafReaderContext context : leaves) {
        final Fields fields = context.reader().fields();
        for (int i = 0; i < queryTerms.length; i++) {
            Term term = queryTerms[i];
            TermContext termContext = contextArray[i];
            final Terms terms = fields.terms(term.field());
            if (terms == null) {
                // field does not exist
                continue;
            }
            termsEnum = terms.iterator();
            assert termsEnum != null;
            if (termsEnum == TermsEnum.EMPTY)
                continue;
            if (termsEnum.seekExact(term.bytes())) {
                if (termContext == null) {
                    contextArray[i] = new TermContext(reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                } else {
                    termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                }
            }
        }
    }
}
Also used : Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Term(org.apache.lucene.index.Term) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 9 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

From class CommonTermsQuery, method buildQuery().

protected Query buildQuery(final int maxDoc, final TermContext[] contextArray, final Term[] queryTerms) {
    List<Query> lowFreqQueries = new ArrayList<>();
    List<Query> highFreqQueries = new ArrayList<>();
    for (int i = 0; i < queryTerms.length; i++) {
        TermContext termContext = contextArray[i];
        if (termContext == null) {
            lowFreqQueries.add(newTermQuery(queryTerms[i], null));
        } else {
            if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency) || (termContext.docFreq() > (int) Math.ceil(maxTermFrequency * (float) maxDoc))) {
                highFreqQueries.add(newTermQuery(queryTerms[i], termContext));
            } else {
                lowFreqQueries.add(newTermQuery(queryTerms[i], termContext));
            }
        }
    }
    final int numLowFreqClauses = lowFreqQueries.size();
    final int numHighFreqClauses = highFreqQueries.size();
    Occur lowFreqOccur = this.lowFreqOccur;
    Occur highFreqOccur = this.highFreqOccur;
    int lowFreqMinShouldMatch = 0;
    int highFreqMinShouldMatch = 0;
    if (lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
        lowFreqMinShouldMatch = calcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
    }
    if (highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
        highFreqMinShouldMatch = calcHighFreqMinimumNumberShouldMatch(numHighFreqClauses);
    }
    if (lowFreqQueries.isEmpty()) {
        /*
       * if lowFreq is empty we rewrite the high freq terms in a conjunction to
       * prevent slow queries.
       */
        if (highFreqMinShouldMatch == 0 && highFreqOccur != Occur.MUST) {
            highFreqOccur = Occur.MUST;
        }
    }
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    if (lowFreqQueries.isEmpty() == false) {
        BooleanQuery.Builder lowFreq = new BooleanQuery.Builder();
        for (Query query : lowFreqQueries) {
            lowFreq.add(query, lowFreqOccur);
        }
        lowFreq.setMinimumNumberShouldMatch(lowFreqMinShouldMatch);
        Query lowFreqQuery = lowFreq.build();
        builder.add(new BoostQuery(lowFreqQuery, lowFreqBoost), Occur.MUST);
    }
    if (highFreqQueries.isEmpty() == false) {
        BooleanQuery.Builder highFreq = new BooleanQuery.Builder();
        for (Query query : highFreqQueries) {
            highFreq.add(query, highFreqOccur);
        }
        highFreq.setMinimumNumberShouldMatch(highFreqMinShouldMatch);
        Query highFreqQuery = highFreq.build();
        builder.add(new BoostQuery(highFreqQuery, highFreqBoost), Occur.SHOULD);
    }
    return builder.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) ArrayList(java.util.ArrayList) Occur(org.apache.lucene.search.BooleanClause.Occur) BoostQuery(org.apache.lucene.search.BoostQuery) TermContext(org.apache.lucene.index.TermContext)

Example 10 with TermContext

use of org.apache.lucene.index.TermContext in project lucene-solr by apache.

From class FuzzyLikeThisQuery, method newTermQuery().

private Query newTermQuery(IndexReader reader, Term term) throws IOException {
    if (ignoreTF) {
        return new ConstantScoreQuery(new TermQuery(term));
    } else {
        // we build an artificial TermContext that will give an overall df and ttf
        // equal to 1
        TermContext context = new TermContext(reader.getContext());
        for (LeafReaderContext leafContext : reader.leaves()) {
            Terms terms = leafContext.reader().terms(term.field());
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(term.bytes())) {
                    // we want the total df and ttf to be 1
                    int freq = 1 - context.docFreq();
                    context.register(termsEnum.termState(), leafContext.ord, freq, freq);
                }
            }
        }
        return new TermQuery(term, context);
    }
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Terms(org.apache.lucene.index.Terms) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) FuzzyTermsEnum(org.apache.lucene.search.FuzzyTermsEnum)

Aggregations

TermContext (org.apache.lucene.index.TermContext)21 Term (org.apache.lucene.index.Term)10 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)8 IndexReaderContext (org.apache.lucene.index.IndexReaderContext)6 TermState (org.apache.lucene.index.TermState)6 TermsEnum (org.apache.lucene.index.TermsEnum)6 BytesRef (org.apache.lucene.util.BytesRef)6 Terms (org.apache.lucene.index.Terms)5 TermQuery (org.apache.lucene.search.TermQuery)5 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 Query (org.apache.lucene.search.Query)4 IOException (java.io.IOException)3 PostingsEnum (org.apache.lucene.index.PostingsEnum)3 CollectionStatistics (org.apache.lucene.search.CollectionStatistics)3 TermStatistics (org.apache.lucene.search.TermStatistics)3 List (java.util.List)2 Set (java.util.Set)2 Fields (org.apache.lucene.index.Fields)2 BooleanQuery (org.apache.lucene.search.BooleanQuery)2