Use of org.apache.lucene.index.PostingsEnum in project Anserini by castorini.
From the class IndexReaderUtils, the method getTermPositions:
/**
 * Returns the term position mapping for a particular document. Note that this method explicitly returns
 * {@code null} if the document does not exist (as opposed to an empty map), so that the caller is forced
 * to handle this case.
 *
 * @param reader index reader
 * @param docid collection docid
 * @return term position mapping for a particular document, or {@code null} if the document does not exist
 * @throws IOException if an error is encountered during the index lookup
 * @throws NotStoredException if the term vector is not stored
 */
public static Map<String, List<Integer>> getTermPositions(IndexReader reader, String docid) throws IOException, NotStoredException {
  int ldocid = convertDocidToLuceneDocid(reader, docid);
  if (ldocid == -1) {
    return null;
  }
  Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS);
  if (terms == null) {
    throw new NotStoredException("Document vector not stored!");
  }
  TermsEnum termIter = terms.iterator();
  if (termIter == null) {
    throw new NotStoredException("Document vector not stored!");
  }
  Map<String, List<Integer>> termPosition = new HashMap<>();
  PostingsEnum positionIter = null;
  while ((termIter.next()) != null) {
    List<Integer> positions = new ArrayList<>();
    // In a single-document term vector, totalTermFreq() equals the term's frequency in this document.
    long termFreq = termIter.totalTermFreq();
    positionIter = termIter.postings(positionIter, PostingsEnum.POSITIONS);
    // Advance to the only document in the term vector before reading positions.
    positionIter.nextDoc();
    for (int i = 0; i < termFreq; i++) {
      positions.add(positionIter.nextPosition());
    }
    termPosition.put(termIter.term().utf8ToString(), positions);
  }
  return termPosition;
}
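A minimal caller sketch for this method (the index path "/path/to/index" and docid "doc1" are illustrative placeholders, not Anserini fixtures):

try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
  Map<String, List<Integer>> positions = IndexReaderUtils.getTermPositions(reader, "doc1");
  if (positions == null) {
    System.out.println("no such document"); // null, not an empty map, signals a missing docid
  } else {
    positions.forEach((term, posList) -> System.out.println(term + " -> " + posList));
  }
}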
Use of org.apache.lucene.index.PostingsEnum in project ltr4l by LTR4L.
From the class FieldFeatureTFExtractorFactory, the method create:
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
  FieldFeatureExtractor[] extractors = new FieldFeatureExtractor[terms.length];
  int i = 0;
  for (Term term : terms) {
    final TermsEnum termsEnum = getTermsEnum(context, term);
    if (termsEnum == null) {
      extractors[i] = new FieldFeatureNullExtractor();
    } else {
      extractors[i] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));
      // Request a second PostingsEnum without reuse, so that pre-scanning the matching
      // docs here does not advance the enum handed to the extractor above.
      PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
        allDocs.add(docId);
      }
    }
    i++;
  }
  return extractors;
}
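The getTermsEnum helper is not shown on this page; for context, a minimal Lucene-style helper of that shape might look like the sketch below (an illustrative assumption, not ltr4l's actual implementation):

protected TermsEnum getTermsEnum(LeafReaderContext context, Term term) throws IOException {
  Terms terms = context.reader().terms(term.field());
  if (terms == null) {
    return null; // field is absent from this segment
  }
  TermsEnum termsEnum = terms.iterator();
  return termsEnum.seekExact(term.bytes()) ? termsEnum : null; // null when the term is missing
}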
Use of org.apache.lucene.index.PostingsEnum in project OpenGrok by OpenGrok.
From the class SuggesterSearcher, the method suggest:
private List<LookupResultItem> suggest(final Query query, final LeafReaderContext leafReaderContext,
    final String project, final SuggesterQuery suggesterQuery, final PopularityCounter searchCounts)
    throws IOException {
  if (Thread.currentThread().isInterrupted()) {
    interrupted = true;
    return Collections.emptyList();
  }
  boolean shouldLeaveOutSameTerms = shouldLeaveOutSameTerms(query, suggesterQuery);
  Set<BytesRef> tokensAlreadyIncluded = null;
  if (shouldLeaveOutSameTerms) {
    tokensAlreadyIncluded = SuggesterUtils.intoTermsExceptPhraseQuery(query).stream()
        .filter(t -> t.field().equals(suggesterQuery.getField()))
        .map(Term::bytes)
        .collect(Collectors.toSet());
  }
  boolean needsDocumentIds = query != null && !(query instanceof MatchAllDocsQuery);
  ComplexQueryData complexQueryData = null;
  if (needsDocumentIds) {
    complexQueryData = getComplexQueryData(query, leafReaderContext);
    if (interrupted) {
      return Collections.emptyList();
    }
  }
  Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());
  TermsEnum termsEnum = suggesterQuery.getTermsEnumForSuggestions(terms);
  LookupPriorityQueue queue = new LookupPriorityQueue(resultSize);
  boolean needPositionsAndFrequencies = needPositionsAndFrequencies(query);
  PostingsEnum postingsEnum = null;
  BytesRef term = termsEnum.next();
  while (term != null) {
    if (Thread.currentThread().isInterrupted()) {
      interrupted = true;
      break;
    }
    // Reuse the PostingsEnum across terms, requesting only the data the scoring path needs.
    if (needPositionsAndFrequencies) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.POSITIONS | PostingsEnum.FREQS);
    } else {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    }
    int score = 0;
    if (!needsDocumentIds) {
      score = normalizeDocumentFrequency(termsEnum.docFreq(), numDocs);
    } else if (needPositionsAndFrequencies) {
      score = getPhraseScore(complexQueryData, leafReaderContext.docBase, postingsEnum);
    } else if (complexQueryData != null) {
      score = getDocumentFrequency(complexQueryData.documentIds, leafReaderContext.docBase, postingsEnum);
    }
    if (score > 0) {
      if (!shouldLeaveOutSameTerms || !tokensAlreadyIncluded.contains(term)) {
        score += searchCounts.get(term) * TERM_ALREADY_SEARCHED_MULTIPLIER;
        if (queue.canInsert(score)) {
          queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
        }
      }
    }
    term = termsEnum.next();
  }
  return queue.getResult();
}
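Note that in Lucene, PostingsEnum.POSITIONS already implies FREQS, so the OR above is harmless but redundant. For readers unfamiliar with the flag argument, here is a minimal sketch of consuming a PostingsEnum requested with positions (standard Lucene API, independent of OpenGrok; termsEnum is assumed to be any positioned TermsEnum):

PostingsEnum pe = termsEnum.postings(null, PostingsEnum.POSITIONS | PostingsEnum.FREQS);
while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  int freq = pe.freq();               // valid because FREQS was requested
  for (int i = 0; i < freq; i++) {
    int position = pe.nextPosition(); // valid because POSITIONS was requested
  }
}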
Use of org.apache.lucene.index.PostingsEnum in project crate by crate.
From the class ShardSplittingQuery, the method findSplitDocs:
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard,
    LeafReader leafReader, IntConsumer consumer) throws IOException {
  Terms terms = leafReader.terms(idField);
  TermsEnum iterator = terms.iterator();
  BytesRef idTerm;
  PostingsEnum postingsEnum = null;
  while ((idTerm = iterator.next()) != null) {
    if (includeInShard.test(idTerm) == false) {
      // Reuse the PostingsEnum across terms and report every doc that must leave this shard.
      postingsEnum = iterator.postings(postingsEnum);
      int doc;
      while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        consumer.accept(doc);
      }
    }
  }
}
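A hedged usage sketch (the "_id" field name and the belongsToTargetShard predicate are illustrative assumptions, not crate's actual call site):

FixedBitSet docsToDelete = new FixedBitSet(leafReader.maxDoc());
findSplitDocs("_id", id -> belongsToTargetShard(id), leafReader, docsToDelete::set);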