Use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.
The class DataReader, method buildPayloadFromTermVector.
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        AttributeType attributeType = attr.getType();
        // payloads are only built for TEXT fields
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // go through the document's terms
        while ((termsEnum.next()) != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition() must be called before startOffset()/endOffset()
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String analyzedTermStr = termsEnum.term().utf8ToString();
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
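A caveat worth noting: getTermVector only returns positions and offsets if the field was indexed with them enabled. A minimal indexing-side sketch, assuming a hypothetical field name and value (the FieldType setters are standard Lucene API):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

// Store term vectors with positions and offsets so that nextPosition(),
// startOffset(), and endOffset() above return real values.
FieldType withTermVectors = new FieldType(TextField.TYPE_STORED);
withTermVectors.setStoreTermVectors(true);
withTermVectors.setStoreTermVectorPositions(true);
withTermVectors.setStoreTermVectorOffsets(true);
withTermVectors.freeze();

Document doc = new Document();
doc.add(new Field("content", "the text whose spans we want back", withTermVectors));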
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.
The class SumMatchingTf, method extract.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    try {
        List<String> queryTokens = context.getQueryTokens();
        TermsEnum termsEnum = terms.iterator();
        int sum = 0;
        BytesRef text = null;
        // sum the in-document frequency of every term that also appears in the query
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            if (queryTokens.contains(term)) {
                // on a one-document term vector, totalTermFreq() is the tf in that doc
                sum += (int) termsEnum.totalTermFreq();
            }
        }
        return sum;
    } catch (IOException e) {
        return 0;
    }
}
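One design note: queryTokens is a List, so the contains call inside the loop is linear in the query length. A minimal variant of the same loop under that observation, using a set for O(1) membership tests (the static method form and name are illustrative):

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Same logic as extract() above, with the query tokens copied into a
// HashSet so each membership test is O(1) instead of a list scan.
static int sumMatchingTf(Terms terms, List<String> queryTokens) throws IOException {
    Set<String> querySet = new HashSet<>(queryTokens);
    TermsEnum termsEnum = terms.iterator();
    int sum = 0;
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        if (querySet.contains(text.utf8ToString())) {
            sum += (int) termsEnum.totalTermFreq();
        }
    }
    return sum;
}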
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.
The class MatchingTermCount, method extract.
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    try {
        List<String> queryTokens = context.getQueryTokens();
        TermsEnum termsEnum = terms.iterator();
        int matching = 0;
        BytesRef text = null;
        // count how many distinct document terms also appear in the query
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            if (queryTokens.contains(term)) {
                matching++;
            }
        }
        return matching;
    } catch (IOException e) {
        return 0;
    }
}
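Since only the query terms matter here, one could also seek each query token directly instead of scanning the whole enum, which is cheaper when the document has many more distinct terms than the query. A sketch, assuming the term vector's TermsEnum supports seeking (it does for the standard codecs); the method name is illustrative:

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Seek each distinct query token directly; the HashSet deduplicates the
// query so repeated tokens are not counted twice, matching the scan above.
static int countMatchingTerms(Terms terms, List<String> queryTokens) throws IOException {
    TermsEnum te = terms.iterator();
    int matching = 0;
    for (String token : new HashSet<>(queryTokens)) {
        if (te.seekExact(new BytesRef(token))) {
            matching++;
        }
    }
    return matching;
}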
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.
The class AvgICTFFeatureExtractor, method getSumICTF.
// Computes the sum of ICTF over the matching query terms; on error, returns 0.0f, counting all tf = 0
private float getSumICTF(Terms terms, List<String> queryTokens) {
    float sumICTF = 0.0f;
    float docSize = 0.0f;
    List<Long> termFreqs = new ArrayList<>();
    try {
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            docSize += termsEnum.totalTermFreq();
            if (queryTokens.contains(termString) && termsEnum.totalTermFreq() > 0) {
                termFreqs.add(termsEnum.totalTermFreq());
            }
        }
    } catch (IOException e) {
        LOG.warn("Error retrieving term frequencies");
        return 0.0f;
    }
    // ICTF of a term t in document D: log(|D| / tf(t, D))
    for (Long termFreq : termFreqs) {
        sumICTF += Math.log(docSize / termFreq);
    }
    return sumICTF;
}
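Each matching term contributes log(docSize / tf) to the sum; for example, in a 100-token document a query term occurring 4 times contributes ln(100/4) ≈ 3.22. The class name suggests the feature value is this sum averaged over the query length; a hypothetical wrapper under that assumption (not shown in this excerpt):

// Assumption based on the class name: AvgICTF = sumICTF / |query tokens|.
float avgICTF = queryTokens.isEmpty()
    ? 0.0f
    : getSumICTF(terms, queryTokens) / queryTokens.size();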
Use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.
The class BM25FeatureExtractor, method extract.
/**
 * We implement this according to the Lucene specification. The formula used:
 *   sum over query terms qi of
 *     IDF(qi) * tf(qi,D) * (k1 + 1) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL))
 * The IDF and avgFL computations are described above.
 * @param doc the document being scored
 * @param terms the document's term vector
 * @param context the reranker context holding the query tokens and searcher
 * @return the BM25 score of the document for the query
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());
    TermsEnum termsEnum = null;
    try {
        termsEnum = terms.iterator();
    } catch (IOException e) {
        LOG.warn("Error computing BM25, unable to retrieve terms enum");
        return 0.0f;
    }
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // compute the document length by iterating over the term vector
    long docSize = 0L;
    // NOTE: df cannot be retrieved from the term vector alone; the term vector
    // behaves as if the index contained only this one document, so document
    // frequencies must come from the full index reader
    Map<String, Integer> docFreqMap = null;
    try {
        docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
        LOG.warn("Unable to retrieve document frequencies.");
        docFreqMap = new HashMap<>();
    }
    Map<String, Long> termFreqMap = new HashMap<>();
    try {
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            docSize += termsEnum.totalTermFreq();
            if (queryTokens.contains(termString)) {
                termFreqMap.put(termString, termsEnum.totalTermFreq());
            }
        }
    } catch (IOException e) {
        LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }
    float score = 0.0f;
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    // iterate over the query tokens and accumulate the per-term BM25 contributions
    for (String token : queryTokens) {
        long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
        double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
        double numerator = (this.k1 + 1) * termFreq;
        double docLengthFactor = this.b * (docSize / avgFL);
        double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
        score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }
    return score;
}
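The helpers computeIDF and computeAvgFL are not shown in this excerpt. A sketch of what they plausibly look like, following the standard Lucene BM25Similarity formulas the javadoc references (the signatures are assumptions, inferred from the call sites above):

// Assumed helper: Lucene's BM25 idf, log(1 + (N - df + 0.5) / (df + 0.5)).
private double computeIDF(long docFreq, long numDocs) {
    return Math.log(1 + (numDocs - docFreq + 0.5d) / (docFreq + 0.5d));
}

// Assumed helper: average field length = total tokens in the field / number of docs.
private double computeAvgFL(long sumTotalTermFreq, long maxDocs) {
    return (double) sumTotalTermFreq / maxDocs;
}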