Search in sources :

Example 76 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

The following snippet shows the rewrite method of the TopTermsRewrite class.

// Rewrites a MultiTermQuery by visiting every matching term (segment by segment) and
// keeping only the top `maxSize` terms, ranked by boost, in a priority queue; the
// surviving terms are then sorted and assembled into the final top-level query.
@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
    final int maxSize = Math.min(size, getMaxSize());
    // Min-queue: the head is the weakest entry still competitive, so it can be
    // peeked to reject uncompetitive candidates and polled when the queue overflows.
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
    collectTerms(reader, query, new TermCollector() {

        // Feedback channel to the producing TermsEnum (e.g. FuzzyTermsEnum): publishes
        // the boost below which candidates cannot make the queue, so the enum can skip work.
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms currently held in stQueue, keyed by term bytes, so stats from later
        // segments are merged into the existing entry instead of duplicating it.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();

        private TermsEnum termsEnum;

        private BoostAttribute boostAtt;

        // Reusable spare entry: filled in and offered to the queue, then replaced
        // (either by a fresh instance or by a recycled evicted entry).
        private ScoreTerm st;

        @Override
        public void setNextEnum(TermsEnum termsEnum) {
            this.termsEnum = termsEnum;
            // A new enum restarts term order; null resets the order-checking state below.
            assert compareToLastTerm(null);
            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert only: remembers the previously collected term so we can verify the
        // enum delivers terms in strictly increasing order within a segment.
        private BytesRefBuilder lastTerm;

        // Always returns true so it can be called inside an assert; t == null resets the check.
        private boolean compareToLastTerm(BytesRef t) {
            if (lastTerm == null && t != null) {
                lastTerm = new BytesRefBuilder();
                lastTerm.append(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // terms in order
            assert compareToLastTerm(bytes);
            // ignore uncompetitive hits: when the queue is full, a candidate must beat
            // the head on boost (ties broken by preferring the smaller term bytes)
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes.get(), st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue: the evicted (weakest) entry becomes
                // the new spare after clearing its accumulated state
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes.get());
                    // reset the termstate!
                    st.termState.clear();
                } else {
                    st = new ScoreTerm(new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes.get());
                }
            }
            return true;
        }
    });
    final B b = getTopLevelBuilder();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    // Emit clauses in term order (not queue order) for deterministic query structure.
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes.toBytesRef());
        // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
        // but truncate such boosts to 0.0f when building the query:
        // add to query
        addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState);
    }
    return build(b);
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) HashMap(java.util.HashMap) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PriorityQueue(java.util.PriorityQueue) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) TermState(org.apache.lucene.index.TermState) BytesRef(org.apache.lucene.util.BytesRef)

Example 77 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

The following snippet shows the extract method of the TFIDFFeatureExtractor class.

/**
 * Computes a TF-IDF score for the document over the query tokens:
 * sum over tokens of tf(termFreq) * idf(docFreq)^2, scaled by the coord factor
 * (fraction of query tokens found in the document).
 *
 * @param doc     the document being scored (unused directly; stats come from {@code terms})
 * @param terms   term vector for the document's field
 * @param context reranker context providing the index searcher, field, and query tokens
 * @return the TF-IDF feature value; tokens whose stats cannot be read contribute 0
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    float score = 0.0f;
    // Per-query-token term frequency within this document.
    Map<String, Long> countMap = new HashMap<>();
    // Per-query-token document frequency across the index.
    Map<String, Integer> docFreqs = new HashMap<>();
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long numDocs = reader.numDocs();
    for (String queryToken : context.getQueryTokens()) {
        try {
            docFreqs.put(queryToken, reader.docFreq(new Term(context.getField(), queryToken)));
        } catch (IOException e) {
            // Fall back to df=0 so the token contributes nothing; keep the cause in the log.
            LOG.error("Error trying to read document frequency", e);
            docFreqs.put(queryToken, 0);
        }
    }
    try {
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            if (context.getQueryTokens().contains(termString)) {
                countMap.put(termString, termsEnum.totalTermFreq());
            }
        }
    } catch (IOException e) {
        // Best effort: score with whatever frequencies were collected before the failure.
        LOG.error("Error while accessing term vector", e);
    }
    TFIDFSimilarity similarity = new ClassicSimilarity();
    // coord rewards documents that match more of the query tokens.
    float coord = similarity.coord(countMap.size(), context.getQueryTokens().size());
    for (String token : context.getQueryTokens()) {
        long termFreq = countMap.getOrDefault(token, 0L);
        long docFreq = docFreqs.getOrDefault(token, 0);
        float tf = similarity.tf(termFreq);
        float idf = similarity.idf(docFreq, numDocs);
        score += tf * idf * idf;
    }
    score *= coord;
    return score;
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) HashMap(java.util.HashMap) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) TFIDFSimilarity(org.apache.lucene.search.similarities.TFIDFSimilarity)

Example 78 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

The following snippet shows the extract method of the TermFrequencyFeatureExtractor class.

/**
 * Sums the total term frequencies, within this document, of every query token
 * that appears in the document's term vector.
 *
 * @param doc     the document being scored (frequencies are read from {@code terms})
 * @param terms   term vector for the document's field
 * @param context reranker context providing the query tokens
 * @return the summed term frequency; 0 if the term vector cannot be read
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    TermsEnum termsEnum = null;
    try {
        termsEnum = terms.iterator();
    } catch (IOException e) {
        // No iterable term vector for this document: the feature is defined as 0.
        LOG.warn("No terms enum found", e);
        return 0.0f;
    }
    Map<String, Long> termFreqMap = new HashMap<>();
    // Set gives O(1) membership tests while walking the (potentially large) term vector.
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());
    try {
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            if (queryTokens.contains(termString)) {
                termFreqMap.put(termString, termsEnum.totalTermFreq());
            }
        }
    } catch (IOException e) {
        // Best effort: sum whatever frequencies were collected before the failure.
        LOG.warn("Error retrieving total term freq", e);
    }
    float score = 0.0f;
    for (String queryToken : queryTokens) {
        // Tokens absent from the document contribute 0.
        score += termFreqMap.getOrDefault(queryToken, 0L);
    }
    return score;
}
Also used : HashMap(java.util.HashMap) IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum) HashSet(java.util.HashSet)

Example 79 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

The following snippet shows the extract method of the DocSizeFeatureExtractor class.

/**
 * Returns the document length: the sum of total term frequencies over all terms
 * in the field. Uses the precomputed aggregate when available, otherwise walks
 * the terms and accumulates; returns 0 if the stats cannot be read.
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    try {
        float length = (float) terms.getSumTotalTermFreq();
        if (length != -1) {
            return length;
        }
        // Aggregate not stored for this field; accumulate it ourselves.
        float total = 0.0f;
        TermsEnum it = terms.iterator();
        while (it.next() != null) {
            total += it.totalTermFreq();
        }
        return total;
    } catch (IOException e) {
        // Unreadable stats: the feature is defined as 0.
        return 0.0f;
    }
}
Also used : IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 80 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

The following snippet shows the fromLuceneTermVector method of the FeatureVector class.

/**
 * Builds a FeatureVector from a Lucene term vector, weighting each surviving term
 * by its total term frequency. Terms shorter than 2 characters, stopwords, and
 * terms that are not plain lowercase alphanumerics are skipped. On any failure
 * the partially built vector is returned (best effort).
 */
public static FeatureVector fromLuceneTermVector(Terms terms, Rm3Stopper stopper) {
    FeatureVector vector = new FeatureVector();
    try {
        TermsEnum it = terms.iterator();
        for (BytesRef bytes = it.next(); bytes != null; bytes = it.next()) {
            String term = bytes.utf8ToString();
            // Filter: minimum length, then stopword, then character-class check.
            boolean keep = term.length() >= 2
                    && !stopper.isStopWord(term)
                    && term.matches("[a-z0-9]+");
            if (keep) {
                int freq = (int) it.totalTermFreq();
                vector.addFeatureWeight(term, (float) freq);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return vector;
    }
    return vector;
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10