Search in sources :

Example 86 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.

the class DataReader method buildPayloadFromTermVector.

/**
 * Builds payload spans for every TEXT attribute of a document from its stored
 * Lucene term vectors: one {@link Span} per token occurrence, carrying the
 * analyzed term, the original text slice, the character offsets, and the token
 * position.
 *
 * @param fields field values of the document, ordered according to inputSchema
 * @param docID  Lucene-internal document id used to fetch term vectors
 * @return spans for all TEXT attributes; empty if no attribute has a term vector
 * @throws IOException if reading term vectors or postings from the index fails
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        AttributeType attributeType = attr.getType();
        // Only TEXT attributes contribute to the payload; skip everything else.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        // A field indexed without term vectors yields null; nothing to build for it.
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        // Reused across terms: postings(...) can recycle the previous enum instance.
        PostingsEnum termPostings = null;
        // go through document terms
        while ((termsEnum.next()) != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            // A per-document term vector holds exactly one doc; NO_MORE_DOCS means no postings.
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition needs to be called first — Lucene requires it before
                // startOffset()/endOffset() are valid for this occurrence.
                int tokenPosition = termPostings.nextPosition();
                // NOTE(review): assumes offsets were stored with the term vector
                // (otherwise these return -1) — confirm indexing configuration.
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String analyzedTermStr = termsEnum.term().utf8ToString();
                // Original (pre-analysis) surface form, sliced from the raw field text.
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
Also used : Attribute(edu.uci.ics.texera.api.schema.Attribute) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) Span(edu.uci.ics.texera.api.span.Span) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 87 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

the class SumMatchingTf method extract.

/**
 * Sums the within-document frequencies of all terms that also occur in the
 * query. Frequencies come from the per-document term vector, so
 * {@code totalTermFreq()} is the term's frequency in this single document.
 * Returns 0 if the term vector cannot be iterated.
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    int tfSum = 0;
    try {
        List<String> queryTokens = context.getQueryTokens();
        TermsEnum it = terms.iterator();
        for (BytesRef bytes = it.next(); bytes != null; bytes = it.next()) {
            if (queryTokens.contains(bytes.utf8ToString())) {
                tfSum += (int) it.totalTermFreq();
            }
        }
    } catch (IOException e) {
        // Best-effort feature: an unreadable term vector contributes nothing.
        return 0;
    }
    return tfSum;
}
Also used : IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 88 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

the class MatchingTermCount method extract.

/**
 * Counts how many distinct terms in the document's term vector also appear in
 * the query token list. Returns 0 if the term vector cannot be iterated.
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    int overlap = 0;
    try {
        List<String> queryTokens = context.getQueryTokens();
        TermsEnum it = terms.iterator();
        for (BytesRef bytes = it.next(); bytes != null; bytes = it.next()) {
            if (queryTokens.contains(bytes.utf8ToString())) {
                overlap++;
            }
        }
    } catch (IOException e) {
        // Best-effort feature: an unreadable term vector yields no matches.
        return 0;
    }
    return overlap;
}
Also used : IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 89 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

the class AvgICTFFeatureExtractor method getSumICTF.

/**
 * Computes the sum of inverse collection term frequencies (ICTF) over the query
 * tokens found in the document's term vector: sum over matching terms t of
 * log(|D| / tf(t)), where |D| is the document length in tokens.
 * On an IOException while iterating the term vector, logs a warning and
 * returns 0 (counting all tf as 0).
 *
 * @param terms       per-document term vector (frequencies are per-document here)
 * @param queryTokens analyzed query tokens to match against document terms
 * @return the ICTF sum, or 0.0f when the term vector cannot be read
 */
private float getSumICTF(Terms terms, List<String> queryTokens) {
    float sumICTF = 0.0f;
    // Document length in tokens, accumulated from each term's frequency.
    float docSize = 0.0f;
    List<Long> termFreqs = new ArrayList<>();
    try {
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            // Hoisted: the original called totalTermFreq() up to three times per term.
            long termFreq = termsEnum.totalTermFreq();
            docSize += termFreq;
            // Cheap numeric check first; contains() is a linear scan of the token list.
            if (termFreq > 0 && queryTokens.contains(termString)) {
                termFreqs.add(termFreq);
            }
        }
    } catch (IOException e) {
        LOG.warn("Error retrieving term frequencies");
        return 0.0f;
    }
    for (Long termFreq : termFreqs) {
        sumICTF += Math.log(docSize / termFreq);
    }
    return sumICTF;
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 90 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project Anserini by castorini.

the class BM25FeatureExtractor method extract.

/**
 * We will implement this according to the Lucene specification
 * the formula used:
 * sum ( IDF(qi) * (tf(qi,D) * (k1+1)) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL)) )
 * where tf(qi,D) is the frequency of query term qi in document D and |D| is the
 * document length in tokens.
 * IDF and avgFL computation are described above.
 * @param doc the document being scored (frequencies are read from the term vector, not from doc)
 * @param terms per-document term vector for the scored field
 * @param context reranker context supplying query tokens, field name, and index searcher
 * @return the BM25 score, or 0.0f when the terms enum cannot be obtained
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    // Set gives O(1) membership tests in the scoring loop below.
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());
    TermsEnum termsEnum = null;
    try {
        termsEnum = terms.iterator();
    } catch (IOException e) {
        LOG.warn("Error computing BM25, unable to retrieve terms enum");
        return 0.0f;
    }
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    // Collection-wide total term count for the field, used for avgFL below.
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // Compute by iterating
    long docSize = 0L;
    // NOTE df cannot be retrieved just from the term vector,
    // the term vector here is only a partial term vector that treats this as if we only have 1 document in the index
    Map<String, Integer> docFreqMap = null;
    try {
        docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
        // Fall back to an empty map: unknown terms are scored with df = 0 below.
        LOG.warn("Unable to retrieve document frequencies.");
        docFreqMap = new HashMap<>();
    }
    Map<String, Long> termFreqMap = new HashMap<>();
    try {
        // One pass over the term vector: accumulate document length and
        // per-query-term within-document frequencies.
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            docSize += termsEnum.totalTermFreq();
            if (queryTokens.contains(termString)) {
                termFreqMap.put(termString, termsEnum.totalTermFreq());
            }
        }
    } catch (IOException e) {
        // Partial/empty maps are acceptable: missing terms score as tf = 0.
        LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }
    float score = 0.0f;
    // Iterate over the query tokens
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    for (String token : queryTokens) {
        long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
        double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
        // BM25 core: (k1+1)*tf / (tf + k1*(1 - b + b*|D|/avgFL)), weighted by IDF.
        double numerator = (this.k1 + 1) * termFreq;
        double docLengthFactor = this.b * (docSize / avgFL);
        double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
        score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }
    return score;
}
Also used : HashMap(java.util.HashMap) IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum) IndexReader(org.apache.lucene.index.IndexReader) HashSet(java.util.HashSet)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10