Use of org.elasticsearch.action.termvector.TermVectorResponse in project pyramid by cheng-li.
From class ESIndex, method getTermStats.
/**
 * Note: the df value comes from a single shard only.
 * @param field the field from which to collect term statistics
 * @param id the document id
 * @return term statistics from one document
 * @throws IOException
 */
public Set<TermStat> getTermStats(String field, String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(false)
            .setFieldStatistics(false).setTermStatistics(true)
            .setSelectedFields(field)
            .execute().actionGet();
    Terms terms = response.getFields().terms(field);
    Set<TermStat> set = new HashSet<>();
    // if the field is empty, terms == null
    if (terms == null) {
        return set;
    }
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        int tf = iterator.docsAndPositions(null, null).freq();
        int df = iterator.docFreq();
        DefaultSimilarity defaultSimilarity = new DefaultSimilarity();
        // idf comes from Lucene's DefaultSimilarity,
        // implemented as log(numDocs/(docFreq+1)) + 1;
        // tf is the raw term frequency, not the square root of tf as in Lucene
        float tfidf = tf * defaultSimilarity.idf(df, this.numDocs);
        TermStat termStat = new TermStat(term);
        termStat.setTf(tf).setDf(df).setTfidf(tfidf);
        set.add(termStat);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgramInfos for " + id + " = " + stopWatch);
    }
    return set;
}
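For context, a minimal usage sketch (not part of the pyramid source): it ranks one document's terms by the tf-idf computed above. The field name "body", the document id "42", an ESIndex instance named index, and getters on TermStat matching the setters used above are all assumptions made for illustration.

// Hypothetical usage sketch, not from the pyramid project.
// Assumes "index" is a configured ESIndex and that TermStat exposes
// getTerm/getTf/getDf/getTfidf corresponding to the setters above.
Set<TermStat> stats = index.getTermStats("body", "42");
List<TermStat> ranked = new ArrayList<>(stats);
ranked.sort(Comparator.comparingDouble(TermStat::getTfidf).reversed());
for (TermStat stat : ranked) {
    System.out.println(stat.getTerm() + "  tf=" + stat.getTf()
            + "  df=" + stat.getDf() + "  tfidf=" + stat.getTfidf());
}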
Use of org.elasticsearch.action.termvector.TermVectorResponse in project pyramid by cheng-li.
From class ESIndex, method getTermVectorWithException.
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true)
            .setFieldStatistics(false).setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();
    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    // if the field is empty, terms == null
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator(null);
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        int tf = iterator.docsAndPositions(null, null).freq();
        // must declare docsAndPositionsEnum as a local variable and reuse it for positions
        DocsAndPositionsEnum docsAndPositionsEnum = iterator.docsAndPositions(null, null);
        for (int j = 0; j < tf; j++) {
            int pos = docsAndPositionsEnum.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
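The returned map is keyed by token position, so the analyzed token order can be recovered by walking the positions in ascending order. A small hypothetical sketch, using a hand-built map in place of the (private) method's result:

// Hypothetical sketch: rebuild the analyzed token sequence from a
// position -> term map like the one produced above. Positions can have gaps
// (for example where stop words were removed), so the keys are simply sorted.
Map<Integer, String> termVector = new HashMap<>();
termVector.put(0, "term");
termVector.put(2, "vectors"); // gap at position 1, e.g. a removed stop word
List<String> tokensInOrder = termVector.entrySet().stream()
        .sorted(Map.Entry.comparingByKey())
        .map(Map.Entry::getValue)
        .collect(Collectors.toList());
System.out.println(String.join(" ", tokensInOrder)); // prints "term vectors"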
Use of org.elasticsearch.action.termvector.TermVectorResponse in project pyramid by cheng-li.
From class ESIndex, method getTerms.
/**
 * @param id the document id
 * @return the stemmed terms in the document's body field
 * @throws IOException
 */
public Set<String> getTerms(String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(false)
            .setFieldStatistics(false)
            .setSelectedFields(this.bodyField)
            .execute().actionGet();
    Terms terms = response.getFields().terms(this.bodyField);
    TermsEnum iterator = terms.iterator(null);
    Set<String> termsSet = new HashSet<>();
    for (int i = 0; i < terms.size(); i++) {
        String term = iterator.next().utf8ToString();
        termsSet.add(term);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgrams from doc " + id + " = " + stopWatch
                + " It has " + termsSet.size() + " ngrams");
    }
    return termsSet;
}
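A hypothetical usage sketch, again assuming an ESIndex instance named index and made-up document ids: the stemmed term sets returned above can be compared directly, for example with a Jaccard overlap (exception handling omitted).

// Hypothetical sketch, not from the pyramid project:
// Jaccard overlap of the stemmed term sets of two documents via getTerms().
Set<String> termsA = index.getTerms("42");
Set<String> termsB = index.getTerms("43");
Set<String> shared = new HashSet<>(termsA);
shared.retainAll(termsB);
Set<String> union = new HashSet<>(termsA);
union.addAll(termsB);
double jaccard = union.isEmpty() ? 0.0 : (double) shared.size() / union.size();
System.out.println("shared terms: " + shared.size() + ", Jaccard = " + jaccard);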