Use of org.apache.lucene.index.TermsEnum in project pyramid by cheng-li.
The class ESIndex, method getTermStats.
/**
* df is from one shard!!!
* @param id
* @return term statistics from one doc
* @throws IOException
*/
public Set<TermStat> getTermStats(String field, String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(false)
            .setFieldStatistics(false)
            .setTermStatistics(true)
            .setSelectedFields(field)
            .execute().actionGet();
    Terms terms = response.getFields().terms(field);
    Set<TermStat> set = new HashSet<>();
    // if the field is empty, terms==null
    if (terms == null) {
        return set;
    }
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        int tf = iterator.docsAndPositions(null, null).freq();
        int df = iterator.docFreq();
        DefaultSimilarity defaultSimilarity = new DefaultSimilarity();
        /**
         * from lucene
         */
        /**
         * tf is just tf, not square root of tf as in lucene
         */
        /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
        float tfidf = tf * defaultSimilarity.idf(df, this.numDocs);
        TermStat termStat = new TermStat(term);
        termStat.setTf(tf).setDf(df).setTfidf(tfidf);
        set.add(termStat);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgramInfos for " + id + " = " + stopWatch);
    }
    return set;
}
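The idf in the tfidf line above is Lucene's classic DefaultSimilarity idf, implemented as log(numDocs/(docFreq+1)) + 1, applied here to the raw term frequency (no square root). A minimal standalone sketch of the same computation, assuming nothing beyond java.lang.Math (the helper name below is illustrative, not part of the pyramid project):

    // Illustrative sketch: tf * idf with Lucene's classic idf = log(numDocs / (docFreq + 1)) + 1,
    // using the raw tf rather than Lucene's sqrt(tf).
    public static float tfidf(int tf, int docFreq, int numDocs) {
        float idf = (float) (Math.log((double) numDocs / (double) (docFreq + 1)) + 1.0);
        return tf * idf;
    }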
Use of org.apache.lucene.index.TermsEnum in project pyramid by cheng-li.
The class ESIndex, method getTermVectorWithException.
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(true)
            .setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();
    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        int tf = iterator.docsAndPositions(null, null).freq();
        // must declare docsAndPositionsEnum as a local variable and reuse it for positions
        DocsAndPositionsEnum docsAndPositionsEnum = iterator.docsAndPositions(null, null);
        for (int j = 0; j < tf; j++) {
            int pos = docsAndPositionsEnum.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
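In Lucene 5.x and later, DocsAndPositionsEnum was merged into PostingsEnum and TermsEnum.iterator() no longer takes a reuse argument, so the pattern above changes slightly. A hedged sketch of the same position-to-term mapping against the newer API, assuming the Terms instance comes from a single-document term vector (the method name and structure are illustrative, not the pyramid code):

    // Illustrative sketch for Lucene 5.x+. Needs org.apache.lucene.index.{Terms, TermsEnum, PostingsEnum},
    // org.apache.lucene.search.DocIdSetIterator, org.apache.lucene.util.BytesRef, java.util.*.
    private static Map<Integer, String> positionsFromTermVector(Terms terms) throws IOException {
        Map<Integer, String> map = new HashMap<>();
        if (terms == null) {
            return map;
        }
        TermsEnum iterator = terms.iterator();
        BytesRef term;
        PostingsEnum postings = null;
        while ((term = iterator.next()) != null) {
            postings = iterator.postings(postings, PostingsEnum.POSITIONS);
            if (postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue; // a term vector holds exactly one document
            }
            int tf = postings.freq();
            for (int j = 0; j < tf; j++) {
                map.put(postings.nextPosition(), term.utf8ToString());
            }
        }
        return map;
    }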
Use of org.apache.lucene.index.TermsEnum in project cogcomp-nlp by CogComp.
The class TermIterator, method run.
public void run() {
    TermsEnum te;
    try {
        te = terms.iterator();
        int i = 0;
        while (te.next() != null) {
            int freq = te.docFreq();
            String termString = te.term().utf8ToString();
            hasTerm(i++, termString, freq);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
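The loop above is the general TermsEnum pattern: next() advances and returns the current term bytes (or null at the end), and docFreq()/totalTermFreq() report statistics for the current term. A self-contained sketch that builds a tiny in-memory index and walks its terms the same way, assuming a Lucene 5.x-7.x classpath (all class and field names here are illustrative):

    // Illustrative, self-contained TermsEnum walk over an in-memory index (Lucene 5.x-7.x).
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.*;
    import org.apache.lucene.index.*;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.BytesRef;

    public class TermsEnumDemo {
        public static void main(String[] args) throws Exception {
            RAMDirectory dir = new RAMDirectory();
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                doc.add(new TextField("body", "lucene terms enum lucene", Field.Store.NO));
                writer.addDocument(doc);
            }
            try (IndexReader reader = DirectoryReader.open(dir)) {
                LeafReader leaf = reader.leaves().get(0).reader();
                Terms terms = leaf.terms("body");
                TermsEnum te = terms.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    System.out.println(term.utf8ToString() + " df=" + te.docFreq() + " ttf=" + te.totalTermFreq());
                }
            }
        }
    }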
Use of org.apache.lucene.index.TermsEnum in project tika by apache.
The class LuceneTokenCounter, method count.
void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        //if there were no terms
        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    while (bytesRef != null) {
        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry can't handle longs yet");
        }
        int tf = (int) termFreq;
        //TODO: figure out how to avoid Stringifying this
        //to get codepoint count
        String t = bytesRef.utf8ToString();
        int len = t.codePointCount(0, t.length());
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }
        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    if (tokenCountInt > 0) {
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }
    fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}
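For reference, the accumulation above uses p = tf / tokenCount per term with log base 2, then applies the -1.0/tokenCount factor at the end, exactly as in the Tika source. A plain Shannon entropy over a term-frequency map would apply only the sign flip; a small hypothetical helper for comparison (not part of Tika):

    // Illustrative sketch: Shannon entropy H = -sum(p * log2(p)) over term frequencies.
    static double shannonEntropy(Map<String, Integer> termFreqs, long tokenCount) {
        double h = 0.0d;
        for (int tf : termFreqs.values()) {
            double p = (double) tf / (double) tokenCount;
            if (p > 0.0d) {
                h += p * (Math.log(p) / Math.log(2.0d));
            }
        }
        return -h;
    }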
Use of org.apache.lucene.index.TermsEnum in project textdb by TextDB.
The class WordCountIndexSource, method computeWordCount.
private void computeWordCount() throws TexeraException {
    try {
        HashMap<String, Integer> wordCountMap = new HashMap<>();
        DataReader dataReader = RelationManager.getInstance()
                .getTableDataReader(predicate.getTableName(), new MatchAllDocsQuery());
        dataReader.open();
        IndexReader luceneIndexReader = dataReader.getLuceneIndexReader();
        for (int i = 0; i < luceneIndexReader.numDocs(); i++) {
            Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
            TermsEnum termsEnum = termVector.iterator();
            while (termsEnum.next() != null) {
                String key = termsEnum.term().utf8ToString();
                wordCountMap.put(key, wordCountMap.get(key) == null
                        ? ((int) termsEnum.totalTermFreq())
                        : wordCountMap.get(key) + ((int) termsEnum.totalTermFreq()));
            }
        }
        luceneIndexReader.close();
        dataReader.close();
        sortedWordCountMap = wordCountMap.entrySet().stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .collect(Collectors.toList());
        wordCountIterator = sortedWordCountMap.iterator();
    } catch (IOException e) {
        throw new DataflowException(e);
    }
}
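The null-check accumulation on wordCountMap above can be written more compactly with Map.merge on Java 8+. A sketch of the equivalent update for the inner loop body (illustrative only, not the Texera code):

    // Equivalent accumulation: add this term's total frequency, inserting it on first sight.
    int tf = (int) termsEnum.totalTermFreq();
    wordCountMap.merge(key, tf, Integer::sum);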