Example 66 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

The class MultiTermQueryConstantScoreWrapper, method createWeight:

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {

        /**
         * Try to collect terms from the given terms enum and return true iff all
         * terms could be collected. If {@code false} is returned, the enum is
         * left positioned on the next term.
         */
        private boolean collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
            final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
            for (int i = 0; i < threshold; ++i) {
                final BytesRef term = termsEnum.next();
                if (term == null) {
                    return true;
                }
                TermState state = termsEnum.termState();
                terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
            }
            return termsEnum.next() == null;
        }

        /**
         * On the given leaf context, try to either rewrite to a disjunction if
         * there are few terms, or build a bitset containing matching docs.
         */
        private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(query.field);
            if (terms == null) {
                // field does not exist
                return new WeightOrDocIdSet((DocIdSet) null);
            }
            final TermsEnum termsEnum = query.getTermsEnum(terms);
            assert termsEnum != null;
            PostingsEnum docs = null;
            final List<TermAndState> collectedTerms = new ArrayList<>();
            if (collectTerms(context, termsEnum, collectedTerms)) {
                // build a boolean query
                BooleanQuery.Builder bq = new BooleanQuery.Builder();
                for (TermAndState t : collectedTerms) {
                    final TermContext termContext = new TermContext(searcher.getTopReaderContext());
                    termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
                    bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
                }
                Query q = new ConstantScoreQuery(bq.build());
                final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
                return new WeightOrDocIdSet(weight);
            }
            // Too many terms: go back to the terms we already collected and start building the bit set
            DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
            if (collectedTerms.isEmpty() == false) {
                TermsEnum termsEnum2 = terms.iterator();
                for (TermAndState t : collectedTerms) {
                    termsEnum2.seekExact(t.term, t.state);
                    docs = termsEnum2.postings(docs, PostingsEnum.NONE);
                    builder.add(docs);
                }
            }
            // Then keep filling the bit set with remaining terms
            do {
                docs = termsEnum.postings(docs, PostingsEnum.NONE);
                builder.add(docs);
            } while (termsEnum.next() != null);
            return new WeightOrDocIdSet(builder.build());
        }

        private Scorer scorer(DocIdSet set) throws IOException {
            if (set == null) {
                return null;
            }
            final DocIdSetIterator disi = set.iterator();
            if (disi == null) {
                return null;
            }
            return new ConstantScoreScorer(this, score(), disi);
        }

        @Override
        public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
            final WeightOrDocIdSet weightOrBitSet = rewrite(context);
            if (weightOrBitSet.weight != null) {
                return weightOrBitSet.weight.bulkScorer(context);
            } else {
                final Scorer scorer = scorer(weightOrBitSet.set);
                if (scorer == null) {
                    return null;
                }
                return new DefaultBulkScorer(scorer);
            }
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            final WeightOrDocIdSet weightOrBitSet = rewrite(context);
            if (weightOrBitSet.weight != null) {
                return weightOrBitSet.weight.scorer(context);
            } else {
                return scorer(weightOrBitSet.set);
            }
        }
    };
}
Also used : DocIdSetBuilder(org.apache.lucene.util.DocIdSetBuilder) ArrayList(java.util.ArrayList) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) ArrayList(java.util.ArrayList) List(java.util.List) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) TermState(org.apache.lucene.index.TermState) DocIdSetBuilder(org.apache.lucene.util.DocIdSetBuilder)
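
Both branches above rest on the same PostingsEnum idiom: when only doc ids are needed, postings are requested with the PostingsEnum.NONE flag and the previous enum is passed back as the reuse argument, then drained into a DocIdSetBuilder. A minimal self-contained sketch of that idiom follows; it is not part of MultiTermQueryConstantScoreWrapper, and the reader and field name are assumed to come from the caller.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.DocIdSetBuilder;

public class DocsWithFieldSketch {

    /** Collects the ids of all documents in this segment that have at least one term in {@code field}. */
    static DocIdSet docsWithField(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            // field does not exist in this segment
            return DocIdSet.EMPTY;
        }
        DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
        TermsEnum termsEnum = terms.iterator();
        // reused across terms; only doc ids are needed, hence PostingsEnum.NONE
        PostingsEnum postings = null;
        while (termsEnum.next() != null) {
            postings = termsEnum.postings(postings, PostingsEnum.NONE);
            // DocIdSetBuilder deduplicates doc ids that appear under several terms
            builder.add(postings);
        }
        return builder.build();
    }
}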

Example 67 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

The class TermInSetQuery, method createWeight:

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {

        @Override
        public void extractTerms(Set<Term> terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters
        }

        /**
         * On the given leaf context, try to either rewrite to a disjunction if
         * there are few matching terms, or build a bitset containing matching docs.
         */
        private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
            final LeafReader reader = context.reader();
            final Fields fields = reader.fields();
            Terms terms = fields.terms(field);
            if (terms == null) {
                return null;
            }
            TermsEnum termsEnum = terms.iterator();
            PostingsEnum docs = null;
            TermIterator iterator = termData.iterator();
            // We will first try to collect up to 'threshold' terms into 'matchingTerms'
            // if there are too many terms, we will fall back to building the 'builder'
            final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
            assert termData.size() > threshold : "Query should have been rewritten";
            List<TermAndState> matchingTerms = new ArrayList<>(threshold);
            DocIdSetBuilder builder = null;
            for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
                assert field.equals(iterator.field());
                if (termsEnum.seekExact(term)) {
                    if (matchingTerms == null) {
                        docs = termsEnum.postings(docs, PostingsEnum.NONE);
                        builder.add(docs);
                    } else if (matchingTerms.size() < threshold) {
                        matchingTerms.add(new TermAndState(field, termsEnum));
                    } else {
                        assert matchingTerms.size() == threshold;
                        builder = new DocIdSetBuilder(reader.maxDoc(), terms);
                        docs = termsEnum.postings(docs, PostingsEnum.NONE);
                        builder.add(docs);
                        for (TermAndState t : matchingTerms) {
                            t.termsEnum.seekExact(t.term, t.state);
                            docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
                            builder.add(docs);
                        }
                        matchingTerms = null;
                    }
                }
            }
            if (matchingTerms != null) {
                assert builder == null;
                BooleanQuery.Builder bq = new BooleanQuery.Builder();
                for (TermAndState t : matchingTerms) {
                    final TermContext termContext = new TermContext(searcher.getTopReaderContext());
                    termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
                    bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
                }
                Query q = new ConstantScoreQuery(bq.build());
                final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
                return new WeightOrDocIdSet(weight);
            } else {
                assert builder != null;
                return new WeightOrDocIdSet(builder.build());
            }
        }

        private Scorer scorer(DocIdSet set) throws IOException {
            if (set == null) {
                return null;
            }
            final DocIdSetIterator disi = set.iterator();
            if (disi == null) {
                return null;
            }
            return new ConstantScoreScorer(this, score(), disi);
        }

        @Override
        public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
            final WeightOrDocIdSet weightOrBitSet = rewrite(context);
            if (weightOrBitSet == null) {
                return null;
            } else if (weightOrBitSet.weight != null) {
                return weightOrBitSet.weight.bulkScorer(context);
            } else {
                final Scorer scorer = scorer(weightOrBitSet.set);
                if (scorer == null) {
                    return null;
                }
                return new DefaultBulkScorer(scorer);
            }
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            final WeightOrDocIdSet weightOrBitSet = rewrite(context);
            if (weightOrBitSet == null) {
                return null;
            } else if (weightOrBitSet.weight != null) {
                return weightOrBitSet.weight.scorer(context);
            } else {
                return scorer(weightOrBitSet.set);
            }
        }
    };
}
Also used : SortedSet(java.util.SortedSet) Set(java.util.Set) DocIdSetBuilder(org.apache.lucene.util.DocIdSetBuilder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) ArrayList(java.util.ArrayList) TermContext(org.apache.lucene.index.TermContext) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) LeafReader(org.apache.lucene.index.LeafReader) PrefixCodedTerms(org.apache.lucene.index.PrefixCodedTerms) Terms(org.apache.lucene.index.Terms) TermIterator(org.apache.lucene.index.PrefixCodedTerms.TermIterator) Term(org.apache.lucene.index.Term) Fields(org.apache.lucene.index.Fields) DocIdSetBuilder(org.apache.lucene.util.DocIdSetBuilder)
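
For context, TermInSetQuery is normally built from a large collection of values on a single field and searched like any other query; whether the weight above rewrites to a boolean disjunction or falls back to the PostingsEnum/bitset path depends on how many of those terms actually match. A hypothetical usage sketch, in which the "id" field name and the caller-supplied directory are assumptions:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class TermInSetQuerySketch {

    /** Matches documents whose "id" field equals any of the given values. */
    static TopDocs findByIds(Directory dir, Collection<String> ids) throws IOException {
        List<BytesRef> terms = new ArrayList<>(ids.size());
        for (String id : ids) {
            terms.add(new BytesRef(id));
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Few matching terms: the weight rewrites to a constant-score boolean disjunction.
            // Many matching terms: it falls back to the PostingsEnum/DocIdSetBuilder path above.
            return searcher.search(new TermInSetQuery("id", terms), 10);
        }
    }
}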

Example 68 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

The class QueryElevationComponent, method getBoostDocs:

public static IntIntHashMap getBoostDocs(SolrIndexSearcher indexSearcher, Map<BytesRef, Integer> boosted, Map context) throws IOException {
    IntIntHashMap boostDocs = null;
    if (boosted != null) {
        // Check whether the boosted docs were already computed and placed in the context
        // by another caller.
        if (context != null) {
            boostDocs = (IntIntHashMap) context.get(BOOSTED_DOCIDS);
        }
        if (boostDocs != null) {
            return boostDocs;
        }
        //Not in the context yet so load it.
        SchemaField idField = indexSearcher.getSchema().getUniqueKeyField();
        String fieldName = idField.getName();
        HashSet<BytesRef> localBoosts = new HashSet<>(boosted.size() * 2);
        Iterator<BytesRef> boostedIt = boosted.keySet().iterator();
        while (boostedIt.hasNext()) {
            localBoosts.add(boostedIt.next());
        }
        boostDocs = new IntIntHashMap(boosted.size());
        List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
        PostingsEnum postingsEnum = null;
        for (LeafReaderContext leaf : leaves) {
            LeafReader reader = leaf.reader();
            int docBase = leaf.docBase;
            Bits liveDocs = reader.getLiveDocs();
            Terms terms = reader.terms(fieldName);
            TermsEnum termsEnum = terms.iterator();
            Iterator<BytesRef> it = localBoosts.iterator();
            while (it.hasNext()) {
                BytesRef ref = it.next();
                if (termsEnum.seekExact(ref)) {
                    postingsEnum = termsEnum.postings(postingsEnum);
                    int doc = postingsEnum.nextDoc();
                    while (doc != PostingsEnum.NO_MORE_DOCS && liveDocs != null && liveDocs.get(doc) == false) {
                        doc = postingsEnum.nextDoc();
                    }
                    if (doc != PostingsEnum.NO_MORE_DOCS) {
                        //Found the document.
                        int p = boosted.get(ref);
                        boostDocs.put(doc + docBase, p);
                        it.remove();
                    }
                }
            }
        }
    }
    if (context != null) {
        context.put(BOOSTED_DOCIDS, boostDocs);
    }
    return boostDocs;
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) TermsEnum(org.apache.lucene.index.TermsEnum) SchemaField(org.apache.solr.schema.SchemaField) IntIntHashMap(com.carrotsearch.hppc.IntIntHashMap) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) Bits(org.apache.lucene.util.Bits) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)
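
The heart of getBoostDocs is resolving each unique-key term to a live document per segment and adding the segment's docBase to form a global doc id. The sketch below isolates that lookup outside of Solr; the "id" field name and the caller-supplied reader are assumptions, not part of the project code.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class UniqueKeyLookupSketch {

    /** Returns the global doc id of the live document whose "id" term equals {@code key}, or -1. */
    static int lookup(IndexReader reader, BytesRef key) throws IOException {
        PostingsEnum postings = null; // reused across segments
        for (LeafReaderContext leaf : reader.leaves()) {
            LeafReader segment = leaf.reader();
            Terms terms = segment.terms("id");
            if (terms == null) {
                continue; // field absent in this segment
            }
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum.seekExact(key)) {
                Bits liveDocs = segment.getLiveDocs();
                postings = termsEnum.postings(postings, PostingsEnum.NONE);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    if (liveDocs == null || liveDocs.get(doc)) {
                        return leaf.docBase + doc; // first live match wins
                    }
                }
            }
        }
        return -1;
    }
}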

Example 69 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

The class DirectoryTaxonomyWriter, method perhapsFillCache:

// we need to guarantee that if several threads call this concurrently, only
// one executes it, and after it returns, the cache is updated and is either
// complete or not.
private synchronized void perhapsFillCache() throws IOException {
    if (cacheMisses.get() < cacheMissesUntilFill) {
        return;
    }
    if (!shouldFillCache) {
        // we already filled the cache once, there's no need to re-fill it
        return;
    }
    shouldFillCache = false;
    initReaderManager();
    boolean aborted = false;
    DirectoryReader reader = readerManager.acquire();
    try {
        PostingsEnum postingsEnum = null;
        for (LeafReaderContext ctx : reader.leaves()) {
            Terms terms = ctx.reader().terms(Consts.FULL);
            if (terms != null) {
                // cannot really happen, but be on the safe side
                // TODO: share per-segment TermsEnum here!
                TermsEnum termsEnum = terms.iterator();
                while (termsEnum.next() != null) {
                    if (!cache.isFull()) {
                        BytesRef t = termsEnum.term();
                        // Since we guarantee uniqueness of categories, each term has exactly
                        // one document. Also, since we do not allow removing categories (and
                        // hence documents), there are no deletions in the index. Therefore, it
                        // is sufficient to call next(), and then doc(), exactly once with no
                        // 'validation' checks.
                        FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(t.utf8ToString()));
                        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                        boolean res = cache.put(cp, postingsEnum.nextDoc() + ctx.docBase);
                        assert !res : "entries should not have been evicted from the cache";
                    } else {
                        // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
                        aborted = true;
                        break;
                    }
                }
            }
            if (aborted) {
                break;
            }
        }
    } finally {
        readerManager.release(reader);
    }
    cacheIsComplete = !aborted;
    if (cacheIsComplete) {
        synchronized (this) {
            // everything is in the cache, so no need to keep readerManager open.
            // this block is executed in a sync block so that it works well with
            // initReaderManager called in parallel.
            readerManager.close();
            readerManager = null;
            initializedReaderManager = false;
        }
    }
}
Also used : DirectoryReader(org.apache.lucene.index.DirectoryReader) FacetLabel(org.apache.lucene.facet.taxonomy.FacetLabel) Terms(org.apache.lucene.index.Terms) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
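
The fill loop above leans on one detail: each category is stored as a single term whose text is the encoded path, so FacetsConfig.pathToString/stringToPath is the round trip that turns terms back into FacetLabel instances. A tiny standalone sketch of that round trip; the example values are made up:

import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;

public class CategoryPathCodecSketch {

    public static void main(String[] args) {
        FacetLabel label = new FacetLabel("Author", "Lisa");
        // Encode the path to the single term text the taxonomy index stores...
        String asTerm = FacetsConfig.pathToString(label.components, label.length);
        // ...and decode it back, exactly as the cache-filling loop above does.
        FacetLabel decoded = new FacetLabel(FacetsConfig.stringToPath(asTerm));
        System.out.println(label.equals(decoded)); // true
    }
}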

Example 70 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

The class DirectoryTaxonomyWriter, method findCategory:

/**
   * Look up the given category in the cache and/or the on-disk storage,
   * returning the category's ordinal, or a negative number in case the
   * category does not yet exist in the taxonomy.
   */
protected synchronized int findCategory(FacetLabel categoryPath) throws IOException {
    // If we can find the category in the cache, or we know the cache is
    // complete, we can return the response directly from it
    int res = cache.get(categoryPath);
    if (res >= 0 || cacheIsComplete) {
        return res;
    }
    cacheMisses.incrementAndGet();
    // After a few cache misses, it makes sense to read all the categories
    // from disk and into the cache. The reason not to do this on the first
    // cache miss (or even when opening the writer) is that it will
    // significantly slow down the case when a taxonomy is opened just to
    // add one category. The idea of only spending a long time on reading
    // after enough time was spent on cache misses is known as an "online
    // algorithm".
    perhapsFillCache();
    res = cache.get(categoryPath);
    if (res >= 0 || cacheIsComplete) {
        // If, after filling the cache from the info on disk, the category is in it,
        // or the cache is complete, return whatever cache.get returned.
        return res;
    }
    // if we get here, it means the category is not in the cache, and it is not
    // complete, and therefore we must look for the category on disk.
    // We need to get an answer from the on-disk index.
    initReaderManager();
    int doc = -1;
    DirectoryReader reader = readerManager.acquire();
    try {
        final BytesRef catTerm = new BytesRef(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
        // reuse
        PostingsEnum docs = null;
        for (LeafReaderContext ctx : reader.leaves()) {
            Terms terms = ctx.reader().terms(Consts.FULL);
            if (terms != null) {
                // TODO: share per-segment TermsEnum here!
                TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(catTerm)) {
                    // the taxonomy has no deletes, so there is no need to check live docs
                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
                    // if the term was found, we know it has exactly one document.
                    doc = docs.nextDoc() + ctx.docBase;
                    break;
                }
            }
        }
    } finally {
        readerManager.release(reader);
    }
    if (doc > 0) {
        addToCache(categoryPath, doc);
    }
    return doc;
}
Also used : DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
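
Both DirectoryTaxonomyWriter examples back the same public operation: addCategory first consults the cache and findCategory (which perhapsFillCache may have populated) before assigning a new ordinal. A hypothetical usage, assuming an in-memory directory, might look like this:

import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TaxonomyWriterSketch {

    public static void main(String[] args) throws Exception {
        // In-memory directory, for the sketch only.
        Directory taxoDir = new RAMDirectory();
        try (DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir)) {
            int first = taxoWriter.addCategory(new FacetLabel("Author", "Lisa"));
            // The second call is resolved through the cache (or findCategory) and returns the same ordinal.
            int second = taxoWriter.addCategory(new FacetLabel("Author", "Lisa"));
            System.out.println(first == second); // true
            taxoWriter.commit();
        }
    }
}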

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 80 usages
BytesRef (org.apache.lucene.util.BytesRef): 59 usages
TermsEnum (org.apache.lucene.index.TermsEnum): 56 usages
Terms (org.apache.lucene.index.Terms): 47 usages
Fields (org.apache.lucene.index.Fields): 18 usages
LeafReader (org.apache.lucene.index.LeafReader): 17 usages
Term (org.apache.lucene.index.Term): 17 usages
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 15 usages
Document (org.apache.lucene.document.Document): 13 usages
ArrayList (java.util.ArrayList): 12 usages
Bits (org.apache.lucene.util.Bits): 11 usages
IndexReader (org.apache.lucene.index.IndexReader): 10 usages
TextField (org.apache.lucene.document.TextField): 9 usages
Directory (org.apache.lucene.store.Directory): 9 usages
IOException (java.io.IOException): 8 usages
DirectoryReader (org.apache.lucene.index.DirectoryReader): 7 usages
IndexWriter (org.apache.lucene.index.IndexWriter): 6 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6 usages
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5 usages
XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder): 5 usages