Search in sources :

Example 81 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project pyramid by cheng-li.

In the class ESIndex, method getTermStats:

/**
 * Collects per-term statistics (tf, df, tf-idf) for one field of one document,
 * using the document's term vector.
 * <p>
 * NOTE: the document frequency (df) comes from a single shard only, so it may
 * undercount on a multi-shard index.
 *
 * @param field the field whose term vector is fetched
 * @param id    the document id
 * @return term statistics for the given doc/field; empty set when the field
 *         has no terms (term vector absent)
 * @throws IOException if reading the term vector fails
 */
public Set<TermStat> getTermStats(String field, String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id).setOffsets(false).setPositions(false).setFieldStatistics(false).setTermStatistics(true).setSelectedFields(field).execute().actionGet();
    Terms terms = response.getFields().terms(field);
    Set<TermStat> set = new HashSet<>();
    // if the field is empty, terms==null
    if (terms == null) {
        return set;
    }
    // Hoisted out of the loop: idf() is a pure function of its arguments,
    // so one similarity instance serves every term.
    DefaultSimilarity defaultSimilarity = new DefaultSimilarity();
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        int tf = iterator.docsAndPositions(null, null).freq();
        int df = iterator.docFreq();
        // tf here is the raw term frequency, NOT sqrt(tf) as in Lucene's tf();
        // idf is Lucene's default: log(numDocs/(docFreq+1)) + 1
        float tfidf = tf * defaultSimilarity.idf(df, this.numDocs);
        TermStat termStat = new TermStat(term);
        termStat.setTf(tf).setDf(df).setTfidf(tfidf);
        set.add(termStat);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgramInfos for " + id + " = " + stopWatch);
    }
    return set;
}
Also used : Terms(org.apache.lucene.index.Terms) DefaultSimilarity(org.apache.lucene.search.similarities.DefaultSimilarity) TermVectorResponse(org.elasticsearch.action.termvector.TermVectorResponse) StopWatch(org.apache.commons.lang3.time.StopWatch) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 82 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project pyramid by cheng-li.

In the class ESIndex, method getTermVectorWithException:

/**
 * Builds a position -> term map for one field of one document from its term
 * vector, so the original token sequence can be reconstructed by position.
 *
 * @param field the field whose term vector is fetched
 * @param id    the document id
 * @return map from token position to term; empty map when the field has no terms
 * @throws IOException if reading the term vector fails
 */
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id).setOffsets(false).setPositions(true).setFieldStatistics(false).setTermStatistics(false).setSelectedFields(field).execute().actionGet();
    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    // if the field is empty, terms==null
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        // Create the positions enum ONCE per term and reuse it for both
        // freq() and nextPosition(). The original called docsAndPositions()
        // twice, wastefully building a second enum it never used for positions.
        DocsAndPositionsEnum docsAndPositionsEnum = iterator.docsAndPositions(null, null);
        int tf = docsAndPositionsEnum.freq();
        for (int j = 0; j < tf; j++) {
            int pos = docsAndPositionsEnum.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
Also used : DocsAndPositionsEnum(org.apache.lucene.index.DocsAndPositionsEnum) Terms(org.apache.lucene.index.Terms) TermVectorResponse(org.elasticsearch.action.termvector.TermVectorResponse) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 83 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project cogcomp-nlp by CogComp.

In the class TermIterator, method run:

/**
 * Iterates over every term of the wrapped {@code terms} and reports each one
 * to {@code hasTerm} with a running ordinal, its string form, and its
 * document frequency. I/O failures are printed and swallowed.
 */
public void run() {
    try {
        TermsEnum termsEnum = terms.iterator();
        int ordinal = 0;
        // Advance until the enum is exhausted; term() is valid after each next().
        while (termsEnum.next() != null) {
            String termString = termsEnum.term().utf8ToString();
            int docFrequency = termsEnum.docFreq();
            hasTerm(ordinal, termString, docFrequency);
            ordinal++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 84 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project tika by apache.

In the class LuceneTokenCounter, method count:

void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        //if there were no terms
        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    while (bytesRef != null) {
        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry can't handle longs yet");
        }
        int tf = (int) termFreq;
        //TODO: figure out how to avoid Stringifying this
        //to get codepoint count
        String t = bytesRef.utf8ToString();
        int len = t.codePointCount(0, t.length());
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }
        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    if (tokenCountInt > 0) {
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }
    fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}
Also used : Terms(org.apache.lucene.index.Terms) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 85 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.

In the class WordCountIndexSource, method computeWordCount:

/**
 * Scans every document's term vector for the predicate's attribute, sums
 * per-term total frequencies across the table, and populates
 * {@code sortedWordCountMap} (descending by count) and
 * {@code wordCountIterator}.
 *
 * @throws TexeraException wrapping any {@link IOException} from Lucene
 */
private void computeWordCount() throws TexeraException {
    try {
        HashMap<String, Integer> wordCountMap = new HashMap<>();
        DataReader dataReader = RelationManager.getInstance().getTableDataReader(predicate.getTableName(), new MatchAllDocsQuery());
        dataReader.open();
        IndexReader luceneIndexReader = dataReader.getLuceneIndexReader();
        for (int i = 0; i < luceneIndexReader.numDocs(); i++) {
            Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
            // getTermVector returns null when the doc stores no vector for
            // this attribute; the original dereferenced it unconditionally (NPE).
            if (termVector == null) {
                continue;
            }
            TermsEnum termsEnum = termVector.iterator();
            while (termsEnum.next() != null) {
                // merge() replaces the null-checked get/put dance and sums
                // the term's total frequency into the running count.
                wordCountMap.merge(termsEnum.term().utf8ToString(), (int) termsEnum.totalTermFreq(), Integer::sum);
            }
        }
        // NOTE(review): these closes are skipped if an exception is thrown
        // above; consider try-with-resources if both types are AutoCloseable.
        luceneIndexReader.close();
        dataReader.close();
        sortedWordCountMap = wordCountMap.entrySet().stream().sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue())).collect(Collectors.toList());
        wordCountIterator = sortedWordCountMap.iterator();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
}
Also used : DataReader(edu.uci.ics.texera.storage.DataReader) HashMap(java.util.HashMap) IndexReader(org.apache.lucene.index.IndexReader) Terms(org.apache.lucene.index.Terms) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) IOException(java.io.IOException) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10