Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache: the class TopTermsRewrite, method rewrite.
@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
  final int maxSize = Math.min(size, getMaxSize());
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
  collectTerms(reader, query, new TermCollector() {

    private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();
    private TermsEnum termsEnum;
    private BoostAttribute boostAtt;
    private ScoreTerm st;

    @Override
    public void setNextEnum(TermsEnum termsEnum) {
      this.termsEnum = termsEnum;
      assert compareToLastTerm(null);
      // lazy init the initial ScoreTerm because comparator is not known on ctor:
      if (st == null)
        st = new ScoreTerm(new TermContext(topReaderContext));
      boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
    }

    // for assert:
    private BytesRefBuilder lastTerm;

    private boolean compareToLastTerm(BytesRef t) {
      if (lastTerm == null && t != null) {
        lastTerm = new BytesRefBuilder();
        lastTerm.append(t);
      } else if (t == null) {
        lastTerm = null;
      } else {
        assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
        lastTerm.copyBytes(t);
      }
      return true;
    }

    @Override
    public boolean collect(BytesRef bytes) throws IOException {
      final float boost = boostAtt.getBoost();
      // terms in order
      assert compareToLastTerm(bytes);
      // ignore uncompetitive hits
      if (stQueue.size() == maxSize) {
        final ScoreTerm t = stQueue.peek();
        if (boost < t.boost)
          return true;
        if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
          return true;
      }
      ScoreTerm t = visitedTerms.get(bytes);
      final TermState state = termsEnum.termState();
      assert state != null;
      if (t != null) {
        // if the term is already in the PQ, only update docFreq of term in PQ
        assert t.boost == boost : "boost should be equal in all segment TermsEnums";
        t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
      } else {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.bytes.copyBytes(bytes);
        st.boost = boost;
        visitedTerms.put(st.bytes.get(), st);
        assert st.termState.docFreq() == 0;
        st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
        stQueue.offer(st);
        // possibly drop entries from queue
        if (stQueue.size() > maxSize) {
          st = stQueue.poll();
          visitedTerms.remove(st.bytes.get());
          // reset the termstate!
          st.termState.clear();
        } else {
          st = new ScoreTerm(new TermContext(topReaderContext));
        }
        assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (stQueue.size() == maxSize) {
          t = stQueue.peek();
          maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
          maxBoostAtt.setCompetitiveTerm(t.bytes.get());
        }
      }
      return true;
    }
  });
  final B b = getTopLevelBuilder();
  final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
  ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
  for (final ScoreTerm st : scoreTerms) {
    final Term term = new Term(query.field, st.bytes.toBytesRef());
    // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
    // but truncate such boosts to 0.0f when building the query:
    addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
  }
  return build(b);
}
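For context, this rewrite is what runs when a MultiTermQuery such as FuzzyQuery is configured with a top-terms rewrite method: each segment's TermsEnum feeds the collector above, and only the best-boosted terms survive the priority queue. A minimal sketch of triggering it in the Lucene 6/7-era API this snippet uses; the field name "body", the term "lucene", and the open IndexReader are illustrative assumptions:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

// field "body" and term "lucene" are made up for illustration
FuzzyQuery fuzzy = new FuzzyQuery(new Term("body", "lucene"), 2);
// keep only the 50 highest-boosted matching terms via the rewrite above
fuzzy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50));
Query rewritten = fuzzy.rewrite(reader); // reader: an open IndexReader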
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini: the class TFIDFFeatureExtractor, method extract.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  float score = 0.0f;
  Map<String, Long> countMap = new HashMap<>();
  Map<String, Integer> docFreqs = new HashMap<>();
  IndexReader reader = context.getIndexSearcher().getIndexReader();
  long numDocs = reader.numDocs();
  for (String queryToken : context.getQueryTokens()) {
    try {
      docFreqs.put(queryToken, reader.docFreq(new Term(context.getField(), queryToken)));
    } catch (IOException e) {
      LOG.error("Error trying to read document frequency");
      docFreqs.put(queryToken, 0);
    }
  }
  try {
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {
      String termString = termsEnum.term().utf8ToString();
      if (context.getQueryTokens().contains(termString)) {
        countMap.put(termString, termsEnum.totalTermFreq());
      }
    }
  } catch (IOException e) {
    LOG.error("Error while accessing term vector");
  }
  TFIDFSimilarity similarity = new ClassicSimilarity();
  // how many of our query tokens were found in the document
  float coord = similarity.coord(countMap.size(), context.getQueryTokens().size());
  for (String token : context.getQueryTokens()) {
    long termFreq = countMap.containsKey(token) ? countMap.get(token) : 0;
    long docFreq = docFreqs.containsKey(token) ? docFreqs.get(token) : 0;
    float tf = similarity.tf(termFreq);
    float idf = similarity.idf(docFreq, numDocs);
    score += tf * idf * idf;
  }
  score *= coord;
  return score;
}
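The score computed here is coord × Σ tf(t) · idf(t)² over the query tokens, mirroring Lucene's classic TF-IDF weighting (idf enters squared because it contributes to both the query weight and the term weight). The loop above scans every term in the document's term vector; when only a handful of query tokens matter, TermsEnum.seekExact can jump straight to each one instead. A sketch of that alternative, assuming termVector is a single document's term vector (the helper name is made up):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

static Map<String, Long> queryTokenFreqs(Terms termVector, Iterable<String> queryTokens) throws IOException {
  Map<String, Long> freqs = new HashMap<>();
  TermsEnum te = termVector.iterator();
  for (String token : queryTokens) {
    // seekExact positions the enum on the token if the document contains it
    if (te.seekExact(new BytesRef(token))) {
      // for a single-document term vector, totalTermFreq() is the in-document frequency
      freqs.put(token, te.totalTermFreq());
    }
  }
  return freqs;
}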
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini: the class TermFrequencyFeatureExtractor, method extract.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  TermsEnum termsEnum = null;
  try {
    termsEnum = terms.iterator();
  } catch (IOException e) {
    LOG.warn("No terms enum found");
    return 0.0f;
  }
  Map<String, Long> termFreqMap = new HashMap<>();
  Set<String> queryTokens = new HashSet<>(context.getQueryTokens());
  try {
    while (termsEnum.next() != null) {
      String termString = termsEnum.term().utf8ToString();
      if (queryTokens.contains(termString)) {
        termFreqMap.put(termString, termsEnum.totalTermFreq());
      }
    }
  } catch (IOException e) {
    LOG.warn("Error retrieving total term freq");
  }
  float score = 0.0f;
  for (String queryToken : queryTokens) {
    // query tokens absent from the document contribute nothing
    if (termFreqMap.containsKey(queryToken)) {
      score += termFreqMap.get(queryToken);
    }
  }
  return score;
}
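Like the TF-IDF extractor above, this one expects terms to be a document's term vector, which exists only for fields indexed with term vectors enabled. A hedged usage sketch; docId, the field name "contents", and the surrounding extractor/doc/context objects are assumptions:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;

// getTermVector returns null if the field was not indexed with term vectors
Terms termVector = reader.getTermVector(docId, "contents");
float score = (termVector == null)
    ? 0.0f
    : extractor.extract(doc, termVector, context); // extractor: the feature extractor above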
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini: the class DocSizeFeatureExtractor, method extract.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  float score;
  try {
    score = (float) terms.getSumTotalTermFreq();
    if (score == -1) {
      // the sum is not stored; fall back to iterating over the terms
      TermsEnum termsEnum = terms.iterator();
      score = 0.0f;
      while (termsEnum.next() != null) {
        score += termsEnum.totalTermFreq();
      }
    }
  } catch (IOException e) {
    score = 0.0f;
  }
  return score;
}
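On a term vector's Terms, getSumTotalTermFreq() is simply the document's length in tokens; it returns -1 when the codec did not record the statistic, which is what the fallback loop handles by summing per-term frequencies itself. Note that the same accessor on a field-level Terms returns the corpus-wide token count, not one document's length. A sketch contrasting the two scopes; the field name "contents" and docId are assumptions, and MultiFields is the Lucene 6.x-era entry point matching this snippet:

import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;

// field-level Terms: token count across the whole index for "contents"
Terms fieldTerms = MultiFields.getTerms(reader, "contents");
long tokensInField = fieldTerms.getSumTotalTermFreq();

// document-level Terms (a term vector): this one document's length in tokens
Terms docTerms = reader.getTermVector(docId, "contents");
long tokensInDoc = docTerms.getSumTotalTermFreq(); // the feature computed above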
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini: the class FeatureVector, method fromLuceneTermVector.
public static FeatureVector fromLuceneTermVector(Terms terms, Rm3Stopper stopper) {
  FeatureVector f = new FeatureVector();
  try {
    TermsEnum termsEnum = terms.iterator();
    BytesRef text = null;
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();
      if (term.length() < 2)
        continue;
      if (stopper.isStopWord(term))
        continue;
      if (!term.matches("[a-z0-9]+"))
        continue;
      int freq = (int) termsEnum.totalTermFreq();
      f.addFeatureWeight(term, (float) freq);
    }
  } catch (Exception e) {
    e.printStackTrace();
    // return the (possibly partially filled) feature vector on error
    return f;
  }
  return f;
}
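Note the filters assume lowercased, alphanumeric-only tokens of length two or more, i.e. an analyzer that lowercases at index time. A short usage sketch of building a document's feature vector from its term vector; docId, the field name "contents", and the Rm3Stopper instance are assumptions:

import org.apache.lucene.index.Terms;

Terms termVector = reader.getTermVector(docId, "contents"); // null if no term vectors
if (termVector != null) {
  FeatureVector docVector = FeatureVector.fromLuceneTermVector(termVector, stopper);
}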