Search in sources :

Example 86 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project Anserini by castorini.

the class IndexReaderUtils method getTermPositions.

/**
 * Builds, for a single document, a map from each term in its stored term vector to the list of
 * positions reported for that term. A missing document is signalled with {@code null} rather than
 * an empty map, so callers must deal with that case explicitly.
 *
 * @param reader index reader
 * @param docid collection docid
 * @return map from term text to its positions in the document, or {@code null} if the docid
 *         cannot be resolved to a Lucene docid
 * @throws IOException if error encountered while reading the index
 * @throws NotStoredException if the term vector is not stored
 */
public static Map<String, List<Integer>> getTermPositions(IndexReader reader, String docid) throws IOException, NotStoredException {
    final int luceneDocid = convertDocidToLuceneDocid(reader, docid);
    if (luceneDocid == -1) {
        // Unknown collection docid: deliberately null, not an empty map.
        return null;
    }

    final Terms docVector = reader.getTermVector(luceneDocid, IndexArgs.CONTENTS);
    if (docVector == null) {
        throw new NotStoredException("Document vector not stored!");
    }

    final TermsEnum vectorTerms = docVector.iterator();
    if (vectorTerms == null) {
        throw new NotStoredException("Document vector not stored!");
    }

    final Map<String, List<Integer>> positionsByTerm = new HashMap<>();
    // The postings enum is handed back to postings() on each iteration so it can be reused.
    PostingsEnum postings = null;
    while (vectorTerms.next() != null) {
        final long occurrences = vectorTerms.totalTermFreq();
        postings = vectorTerms.postings(postings, PostingsEnum.POSITIONS);
        // Advance onto the first posting before reading positions.
        postings.nextDoc();

        final List<Integer> termPositions = new ArrayList<>();
        for (long remaining = occurrences; remaining > 0; remaining--) {
            termPositions.add(postings.nextPosition());
        }
        positionsByTerm.put(vectorTerms.term().utf8ToString(), termPositions);
    }
    return positionsByTerm;
}
Also used : HashMap(java.util.HashMap) Terms(org.apache.lucene.index.Terms) MultiTerms(org.apache.lucene.index.MultiTerms) ArrayList(java.util.ArrayList) TermsEnum(org.apache.lucene.index.TermsEnum) ArrayList(java.util.ArrayList) List(java.util.List) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Example 87 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project ltr4l by LTR4L.

the class FieldFeatureTFExtractorFactory method create.

/**
 * Creates one extractor per entry of {@code terms} for the given leaf context, and adds to
 * {@code allDocs} every doc id found in the postings of those terms.
 *
 * @param context leaf reader context passed to {@code getTermsEnum}
 * @param allDocs set that receives the doc ids of all postings visited; mutated in place
 * @return extractors in the same order as {@code terms}; a {@code FieldFeatureNullExtractor}
 *         is used wherever {@code getTermsEnum} returns {@code null}
 * @throws IOException if reading postings fails
 */
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
    final FieldFeatureExtractor[] result = new FieldFeatureExtractor[terms.length];
    for (int slot = 0; slot < terms.length; slot++) {
        final Term term = terms[slot];
        final TermsEnum termsEnum = getTermsEnum(context, term);
        if (termsEnum == null) {
            result[slot] = new FieldFeatureNullExtractor();
            continue;
        }

        result[slot] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));

        // Request a second enum (passing null, i.e. no reuse) so that walking it here
        // does not advance the one just handed to the extractor.
        final PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
        int docId = docs.nextDoc();
        while (docId != PostingsEnum.NO_MORE_DOCS) {
            allDocs.add(docId);
            docId = docs.nextDoc();
        }
    }
    return result;
}
Also used : Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 88 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project OpenGrok by OpenGrok.

the class SuggesterSearcher method suggest.

/**
 * Collects suggestions for one index segment by walking the candidate terms supplied by
 * {@code suggesterQuery}, scoring each one, and keeping the best in a bounded priority queue.
 *
 * <p>If the current thread is interrupted, the {@code interrupted} field is set; an interruption
 * detected before iteration starts yields an empty list, while one detected mid-iteration stops
 * the loop and returns whatever has been queued so far.
 *
 * @param query query restricting the documents considered; {@code null} or a
 *              {@link MatchAllDocsQuery} means no document restriction is applied
 * @param leafReaderContext segment to search
 * @param project project name attached to every produced result item
 * @param suggesterQuery query that supplies the field and the candidate terms
 * @param searchCounts counter consulted to boost the score of a term
 * @return the contents of the result queue for this segment
 * @throws IOException if reading the index fails
 */
private List<LookupResultItem> suggest(final Query query, final LeafReaderContext leafReaderContext, final String project, final SuggesterQuery suggesterQuery, final PopularityCounter searchCounts) throws IOException {
    if (Thread.currentThread().isInterrupted()) {
        interrupted = true;
        return Collections.emptyList();
    }

    final boolean skipQueryTerms = shouldLeaveOutSameTerms(query, suggesterQuery);
    // Only populated when terms already present in the query must be left out.
    final Set<BytesRef> queryTokens = skipQueryTerms
            ? SuggesterUtils.intoTermsExceptPhraseQuery(query).stream()
                    .filter(t -> t.field().equals(suggesterQuery.getField()))
                    .map(Term::bytes)
                    .collect(Collectors.toSet())
            : null;

    final boolean needsDocumentIds = !(query == null || query instanceof MatchAllDocsQuery);
    ComplexQueryData complexQueryData = null;
    if (needsDocumentIds) {
        complexQueryData = getComplexQueryData(query, leafReaderContext);
        // The flag is re-checked right after the call; presumably the helper can set it.
        if (interrupted) {
            return Collections.emptyList();
        }
    }

    final Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());
    final TermsEnum termsEnum = suggesterQuery.getTermsEnumForSuggestions(terms);
    final LookupPriorityQueue queue = new LookupPriorityQueue(resultSize);

    final boolean withPositions = needPositionsAndFrequencies(query);
    // The flags do not change per term, so compute them once.
    final int postingsFlags = withPositions
            ? PostingsEnum.POSITIONS | PostingsEnum.FREQS
            : PostingsEnum.NONE;

    PostingsEnum postings = null;
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        if (Thread.currentThread().isInterrupted()) {
            interrupted = true;
            break;
        }

        postings = termsEnum.postings(postings, postingsFlags);

        int score = 0;
        if (!needsDocumentIds) {
            score = normalizeDocumentFrequency(termsEnum.docFreq(), numDocs);
        } else if (withPositions) {
            score = getPhraseScore(complexQueryData, leafReaderContext.docBase, postings);
        } else if (complexQueryData != null) {
            score = getDocumentFrequency(complexQueryData.documentIds, leafReaderContext.docBase, postings);
        }

        if (score <= 0) {
            continue;
        }
        if (skipQueryTerms && queryTokens.contains(term)) {
            continue;
        }

        score += searchCounts.get(term) * TERM_ALREADY_SEARCHED_MULTIPLIER;
        if (queue.canInsert(score)) {
            queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
        }
    }
    return queue.getResult();
}
Also used : Query(org.apache.lucene.search.Query) LeafCollector(org.apache.lucene.search.LeafCollector) Term(org.apache.lucene.index.Term) IntsHolder(org.opengrok.suggest.query.data.IntsHolder) Scorable(org.apache.lucene.search.Scorable) CustomPhraseQuery(org.opengrok.suggest.query.customized.CustomPhraseQuery) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) PopularityCounter(org.opengrok.suggest.popular.PopularityCounter) TermsEnum(org.apache.lucene.index.TermsEnum) SuggesterRangeQuery(org.opengrok.suggest.query.SuggesterRangeQuery) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator) BitIntsHolder(org.opengrok.suggest.query.data.BitIntsHolder) PostingsEnum(org.apache.lucene.index.PostingsEnum) Terms(org.apache.lucene.index.Terms) Scorer(org.apache.lucene.search.Scorer) BytesRef(org.apache.lucene.util.BytesRef) Set(java.util.Set) IOException(java.io.IOException) Collector(org.apache.lucene.search.Collector) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Logger(java.util.logging.Logger) SuggesterQuery(org.opengrok.suggest.query.SuggesterQuery) Collectors(java.util.stream.Collectors) BooleanClause(org.apache.lucene.search.BooleanClause) ScoreMode(org.apache.lucene.search.ScoreMode) List(java.util.List) BooleanQuery(org.apache.lucene.search.BooleanQuery) Collections(java.util.Collections) IndexReader(org.apache.lucene.index.IndexReader) PhraseScorer(org.opengrok.suggest.query.PhraseScorer) IndexSearcher(org.apache.lucene.search.IndexSearcher) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermsEnum(org.apache.lucene.index.TermsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 89 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project crate by crate.

the class ShardSplittingQuery method findSplitDocs.

/**
 * Walks every term of {@code idField} in the given segment and, for each term rejected by
 * {@code includeInShard}, passes the doc ids of that term's postings to {@code consumer}.
 *
 * @param idField name of the field whose terms are tested
 * @param includeInShard predicate over the term bytes; postings are reported only for terms
 *                       for which it returns {@code false}
 * @param leafReader segment reader to scan
 * @param consumer receives each segment-local doc id of a rejected term
 * @throws IOException if reading terms or postings fails
 */
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader, IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    if (terms == null) {
        // LeafReader.terms returns null when the field has no indexed terms in this
        // segment; there is nothing to report then, so avoid the NullPointerException.
        return;
    }
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    // Handed back to postings() on every call so the enum can be reused across terms.
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)89 TermsEnum (org.apache.lucene.index.TermsEnum)62 BytesRef (org.apache.lucene.util.BytesRef)62 Terms (org.apache.lucene.index.Terms)51 Term (org.apache.lucene.index.Term)23 Fields (org.apache.lucene.index.Fields)18 ArrayList (java.util.ArrayList)17 LeafReader (org.apache.lucene.index.LeafReader)17 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)16 Document (org.apache.lucene.document.Document)13 IOException (java.io.IOException)12 IndexReader (org.apache.lucene.index.IndexReader)12 Bits (org.apache.lucene.util.Bits)11 Directory (org.apache.lucene.store.Directory)10 TextField (org.apache.lucene.document.TextField)9 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 HashMap (java.util.HashMap)6 List (java.util.List)6 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6