Use of org.apache.lucene.index.PostingsEnum in project pyramid by cheng-li:
class ESIndex, method getTermVectorWithException.
/**
 * Fetches the term vector of one document and returns a map from token
 * position to term text for the given field.
 *
 * @param field the field whose term vector is requested
 * @param id    the document id
 * @return map from position to term; empty when the field has no terms
 * @throws IOException if reading the term vector fails
 */
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(true)
            .setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();
    Map<Integer, String> positionToTerm = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return positionToTerm;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postings = null;
    BytesRef termBytes;
    while ((termBytes = termsEnum.next()) != null) {
        String term = termBytes.utf8ToString();
        postings = termsEnum.postings(postings, PostingsEnum.ALL);
        // only one document can match, since the vector was fetched by id;
        // advance onto it before reading freq/positions
        postings.nextDoc();
        int freq = postings.freq();
        for (int occurrence = 0; occurrence < freq; occurrence++) {
            positionToTerm.put(postings.nextPosition(), term);
        }
    }
    return positionToTerm;
}
Use of org.apache.lucene.index.PostingsEnum in project pyramid by cheng-li:
class ESIndex, method getTermStats.
/**
 * Returns per-term statistics (tf, df, tf-idf) for one document's field.
 * Note that df is from one shard only!
 *
 * @param field the field whose term statistics are requested
 * @param id    the document id
 * @return term statistics from one doc; empty when the field has no terms
 * @throws IOException if reading the term vector fails
 */
public Set<TermStat> getTermStats(String field, String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(false)
            .setFieldStatistics(false)
            .setTermStatistics(true)
            .setSelectedFields(field)
            .execute().actionGet();
    Terms terms = response.getFields().terms(field);
    Set<TermStat> set = new HashSet<>();
    // if the field is empty, terms == null
    if (terms == null) {
        return set;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    // Hoisted out of the loop: the similarity is stateless, no need to
    // allocate a new instance per term.
    ClassicSimilarity defaultSimilarity = new ClassicSimilarity();
    // Iterate until the enum is exhausted instead of counting up to
    // terms.size(): Lucene documents size() as possibly -1 when unknown,
    // which would have skipped every term in the old index-based loop.
    BytesRef termBytes;
    while ((termBytes = iterator.next()) != null) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings);
        // NOTE(review): freq() is read without a prior nextDoc(); the sibling
        // term-vector method advances first — confirm this works for this
        // postings implementation.
        int tf = postings.freq();
        int df = iterator.docFreq();
        // tf is the raw term frequency, not sqrt(tf) as in Lucene scoring.
        // Lucene's ClassicSimilarity implements idf as
        // log(numDocs/(docFreq+1)) + 1.
        float tfidf = tf * defaultSimilarity.idf(df, this.numDocs);
        TermStat termStat = new TermStat(term);
        termStat.setTf(tf).setDf(df).setTfidf(tfidf);
        set.add(termStat);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgramInfos for " + id + " = " + stopWatch);
    }
    return set;
}
Use of org.apache.lucene.index.PostingsEnum in project crate by crate:
class PrunePostingsMergePolicy, method wrapReader.
/**
 * Wraps a reader so that, for the given id field, postings enumeration only
 * yields live (non-deleted) documents. Readers with no deletions are
 * returned unwrapped.
 */
private static CodecReader wrapReader(CodecReader reader, String idField) {
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
// no deleted docs - we are good!
return reader;
}
final boolean fullyDeletedSegment = reader.numDocs() == 0;
return new FilterCodecReader(reader) {
@Override
public FieldsProducer getPostingsReader() {
FieldsProducer postingsReader = super.getPostingsReader();
if (postingsReader == null) {
return null;
}
// delegate everything to the underlying producer, except terms() for
// the id field, which gets the live-docs filtering wrapper below
return new FieldsProducer() {
@Override
public void close() throws IOException {
postingsReader.close();
}
@Override
public void checkIntegrity() throws IOException {
postingsReader.checkIntegrity();
}
@Override
public Iterator<String> iterator() {
return postingsReader.iterator();
}
@Override
public Terms terms(String field) throws IOException {
Terms in = postingsReader.terms(field);
// only the id field is filtered; all other fields pass through as-is
if (idField.equals(field) && in != null) {
return new FilterLeafReader.FilterTerms(in) {
@Override
public TermsEnum iterator() throws IOException {
TermsEnum iterator = super.iterator();
// second ctor arg false: accept() decides per term, no seceding
return new FilteredTermsEnum(iterator, false) {
// reused across accept() calls to avoid reallocating per term
private PostingsEnum internal;
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
if (fullyDeletedSegment) {
// short-cut this if we don't match anything
return AcceptStatus.END;
}
// a term is kept only if at least one live doc remains for it
internal = postings(internal, PostingsEnum.NONE);
if (internal.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
return AcceptStatus.YES;
}
return AcceptStatus.NO;
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
// reuse the wrapper when possible, resetting it around the fresh delegate
if (reuse instanceof OnlyLiveDocsPostingsEnum) {
OnlyLiveDocsPostingsEnum reuseInstance = (OnlyLiveDocsPostingsEnum) reuse;
reuseInstance.reset(super.postings(reuseInstance.in, flags));
return reuseInstance;
}
return new OnlyLiveDocsPostingsEnum(super.postings(null, flags), liveDocs);
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
// impacts would bypass the live-docs filtering; not supported here
throw new UnsupportedOperationException();
}
};
}
};
} else {
return in;
}
}
@Override
public int size() {
return postingsReader.size();
}
@Override
public long ramBytesUsed() {
return postingsReader.ramBytesUsed();
}
};
}
// caching is disabled: the filtered view must not be cached as the core reader
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
};
}
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic:
class TermVectorsFilter, method selectBestTerms.
/**
 * Selects, for each requested field, the terms with the highest tf-idf
 * scores and records them in {@code scoreTerms}; per-field counts of the
 * retained terms are stored in {@code sizes}.
 *
 * @throws IOException if reading terms or statistics fails
 */
public void selectBestTerms() throws IOException {
    PostingsEnum docsEnum = null;
    for (String fieldName : fields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name, capped at maxNumTerms entries
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        // score every term of this field's term vector with tf-idf
        for (BytesRef termBytesRef = termsEnum.next(); termBytesRef != null; termBytesRef = termsEnum.next()) {
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            int freq = getTermFreq(termsEnum, docsEnum);
            // remove noise words
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // now call on docFreq
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // drain the queue: retain the best terms for quick lookups
        int retained = 0;
        for (ScoreTerm best = queue.pop(); best != null; best = queue.pop()) {
            scoreTerms.put(new Term(best.field, best.word), best);
            retained++;
        }
        sizes.put(fieldName, retained);
    }
}
Use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic:
class TermVectorsWriter, method setFields.
/**
 * Serializes the given term vectors into the response output, honoring the
 * requested flags, optional field selection, aggregated dfs statistics, and
 * an optional score-based term filter.
 *
 * NOTE(review): the startField/startTerm/write* call sequence is stateful
 * and order-dependent; do not reorder the calls below.
 */
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
int numFieldsWritten = 0;
// reused across terms/fields to avoid reallocating postings enums
PostingsEnum docsAndPosEnum = null;
PostingsEnum docsEnum = null;
boolean hasScores = termVectorsFilter != null;
for (String field : termVectorsByField) {
// skip fields the caller did not select
if ((selectedFields != null) && (!selectedFields.contains(field))) {
continue;
}
Terms fieldTermVector = termVectorsByField.terms(field);
Terms topLevelTerms = topLevelFields.terms(field);
// if no terms found, take the retrieved term vector fields for stats
if (topLevelTerms == null) {
topLevelTerms = EMPTY_TERMS;
}
TermsEnum topLevelIterator = topLevelTerms.iterator();
// a flag is only honored if the term vector actually stores that data
boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
long termsSize = fieldTermVector.size();
if (hasScores) {
// with filtering, at most the filter's surviving terms get written
termsSize = Math.min(termsSize, termVectorsFilter.size(field));
}
startField(field, termsSize, positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) {
// prefer aggregated (cross-shard) stats when available
if (dfs != null) {
writeFieldStatistics(dfs.fieldStatistics().get(field));
} else {
writeFieldStatistics(topLevelTerms);
}
}
TermsEnum iterator = fieldTermVector.iterator();
final boolean useDocsAndPos = positions || offsets || payloads;
while (iterator.next() != null) {
// iterate all terms of the current field
BytesRef termBytesRef = iterator.term();
Term term = new Term(field, termBytesRef);
// with filtering we only keep the best terms
if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
continue;
}
startTerm(termBytesRef);
if (flags.contains(Flag.TermStatistics)) {
// get the doc frequency
if (dfs != null) {
final TermStatistics statistics = dfs.termStatistics().get(term);
// a term missing from dfs gets zeroed statistics rather than being skipped
writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
} else {
boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
if (foundTerm) {
writeTermStatistics(topLevelIterator);
} else {
writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
}
}
}
if (useDocsAndPos) {
// given we have pos or offsets
docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
} else {
// if we do not have the positions stored, we need to
// get the frequency from a PostingsEnum.
docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
}
if (hasScores) {
writeScoreTerm(termVectorsFilter.getScoreTerm(term));
}
}
numFieldsWritten++;
}
response.setTermVectorsField(output);
response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
Aggregations