Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class MultiTermQueryConstantScoreWrapper, method createWeight.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new ConstantScoreWeight(this, boost) {

    /** Try to collect terms from the given terms enum and return true iff all
     *  terms could be collected. If {@code false} is returned, the enum is
     *  left positioned on the next term. */
    private boolean collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
      final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
      for (int i = 0; i < threshold; ++i) {
        final BytesRef term = termsEnum.next();
        if (term == null) {
          return true;
        }
        TermState state = termsEnum.termState();
        terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, termsEnum.docFreq(), termsEnum.totalTermFreq()));
      }
      return termsEnum.next() == null;
    }

    /**
     * On the given leaf context, try to either rewrite to a disjunction if
     * there are few terms, or build a bitset containing matching docs.
     */
    private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
      final Terms terms = context.reader().terms(query.field);
      if (terms == null) {
        // field does not exist
        return new WeightOrDocIdSet((DocIdSet) null);
      }

      final TermsEnum termsEnum = query.getTermsEnum(terms);
      assert termsEnum != null;

      PostingsEnum docs = null;

      final List<TermAndState> collectedTerms = new ArrayList<>();
      if (collectTerms(context, termsEnum, collectedTerms)) {
        // build a boolean query
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (TermAndState t : collectedTerms) {
          final TermContext termContext = new TermContext(searcher.getTopReaderContext());
          termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
          bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
        }
        Query q = new ConstantScoreQuery(bq.build());
        final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
        return new WeightOrDocIdSet(weight);
      }

      // Too many terms: go back to the terms we already collected and start building the bit set
      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
      if (collectedTerms.isEmpty() == false) {
        TermsEnum termsEnum2 = terms.iterator();
        for (TermAndState t : collectedTerms) {
          termsEnum2.seekExact(t.term, t.state);
          docs = termsEnum2.postings(docs, PostingsEnum.NONE);
          builder.add(docs);
        }
      }

      // Then keep filling the bit set with remaining terms
      do {
        docs = termsEnum.postings(docs, PostingsEnum.NONE);
        builder.add(docs);
      } while (termsEnum.next() != null);

      return new WeightOrDocIdSet(builder.build());
    }

    private Scorer scorer(DocIdSet set) throws IOException {
      if (set == null) {
        return null;
      }
      final DocIdSetIterator disi = set.iterator();
      if (disi == null) {
        return null;
      }
      return new ConstantScoreScorer(this, score(), disi);
    }

    @Override
    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet weightOrBitSet = rewrite(context);
      if (weightOrBitSet.weight != null) {
        return weightOrBitSet.weight.bulkScorer(context);
      } else {
        final Scorer scorer = scorer(weightOrBitSet.set);
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet weightOrBitSet = rewrite(context);
      if (weightOrBitSet.weight != null) {
        return weightOrBitSet.weight.scorer(context);
      } else {
        return scorer(weightOrBitSet.set);
      }
    }
  };
}
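Both branches above lean on the same PostingsEnum idiom: request postings from the TermsEnum with PostingsEnum.NONE (no frequencies or positions are needed) and pass the previous enum back in so it can be reused. The following is a minimal, self-contained sketch of that idiom, assuming only a LeafReader and an example field name ("body"); it is an illustration, not code from the project.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.DocIdSetBuilder;

public class PostingsEnumSketch {
  /** Collects the documents of every term of a field into a DocIdSet,
   *  reusing a single PostingsEnum across terms. */
  static DocIdSet docsOfAllTerms(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return DocIdSet.EMPTY; // field does not exist in this segment
    }
    DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postings = null; // reused across terms, as in the snippet above
    while (termsEnum.next() != null) {
      postings = termsEnum.postings(postings, PostingsEnum.NONE); // no freqs/positions needed
      builder.add(postings);
    }
    return builder.build();
  }
}

DocIdSetBuilder picks a sparse or dense representation on its own, which is why the rewrite above can use it regardless of how many terms end up matching.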
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TermInSetQuery, method createWeight.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new ConstantScoreWeight(this, boost) {

    @Override
    public void extractTerms(Set<Term> terms) {
      // no-op
      // This query is for abuse cases when the number of terms is too high to
      // run efficiently as a BooleanQuery. So likewise we hide its terms in
      // order to protect highlighters
    }

    /**
     * On the given leaf context, try to either rewrite to a disjunction if
     * there are few matching terms, or build a bitset containing matching docs.
     */
    private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
      final LeafReader reader = context.reader();

      final Fields fields = reader.fields();
      Terms terms = fields.terms(field);
      if (terms == null) {
        return null;
      }
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      TermIterator iterator = termData.iterator();

      // We will first try to collect up to 'threshold' terms into 'matchingTerms';
      // if there are too many terms, we will fall back to building the 'builder'
      final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
      assert termData.size() > threshold : "Query should have been rewritten";
      List<TermAndState> matchingTerms = new ArrayList<>(threshold);
      DocIdSetBuilder builder = null;

      for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
        assert field.equals(iterator.field());
        if (termsEnum.seekExact(term)) {
          if (matchingTerms == null) {
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            builder.add(docs);
          } else if (matchingTerms.size() < threshold) {
            matchingTerms.add(new TermAndState(field, termsEnum));
          } else {
            assert matchingTerms.size() == threshold;
            builder = new DocIdSetBuilder(reader.maxDoc(), terms);
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            builder.add(docs);
            for (TermAndState t : matchingTerms) {
              t.termsEnum.seekExact(t.term, t.state);
              docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
              builder.add(docs);
            }
            matchingTerms = null;
          }
        }
      }

      if (matchingTerms != null) {
        assert builder == null;
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (TermAndState t : matchingTerms) {
          final TermContext termContext = new TermContext(searcher.getTopReaderContext());
          termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
          bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
        }
        Query q = new ConstantScoreQuery(bq.build());
        final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
        return new WeightOrDocIdSet(weight);
      } else {
        assert builder != null;
        return new WeightOrDocIdSet(builder.build());
      }
    }

    private Scorer scorer(DocIdSet set) throws IOException {
      if (set == null) {
        return null;
      }
      final DocIdSetIterator disi = set.iterator();
      if (disi == null) {
        return null;
      }
      return new ConstantScoreScorer(this, score(), disi);
    }

    @Override
    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet weightOrBitSet = rewrite(context);
      if (weightOrBitSet == null) {
        return null;
      } else if (weightOrBitSet.weight != null) {
        return weightOrBitSet.weight.bulkScorer(context);
      } else {
        final Scorer scorer = scorer(weightOrBitSet.set);
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet weightOrBitSet = rewrite(context);
      if (weightOrBitSet == null) {
        return null;
      } else if (weightOrBitSet.weight != null) {
        return weightOrBitSet.weight.scorer(context);
      } else {
        return scorer(weightOrBitSet.set);
      }
    }
  };
}
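For context, a TermInSetQuery is built from the raw term bytes of a single field; whether it rewrites to a BooleanQuery of TermQuery clauses or builds a per-segment bit set is decided by the rewrite above. A hedged usage sketch follows; the field name "id" and the id values are made up, and the constructor shown (field plus a Collection of BytesRef) may vary slightly between Lucene versions.

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class TermInSetQuerySketch {
  /** Looks up a handful of ids with a single TermInSetQuery instead of a
   *  BooleanQuery of TermQuery clauses. Field name and values are illustrative only. */
  static TopDocs findByIds(Directory dir) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      Query q = new TermInSetQuery("id",
          Arrays.asList(new BytesRef("doc-1"), new BytesRef("doc-7"), new BytesRef("doc-42")));
      return searcher.search(q, 10);
    }
  }
}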
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class QueryElevationComponent, method getBoostDocs.
public static IntIntHashMap getBoostDocs(SolrIndexSearcher indexSearcher, Map<BytesRef, Integer> boosted, Map context) throws IOException {
  IntIntHashMap boostDocs = null;

  if (boosted != null) {

    //First see if it's already in the request context. Could have been put there
    //by another caller.
    if (context != null) {
      boostDocs = (IntIntHashMap) context.get(BOOSTED_DOCIDS);
    }

    if (boostDocs != null) {
      return boostDocs;
    }

    //Not in the context yet so load it.
    SchemaField idField = indexSearcher.getSchema().getUniqueKeyField();
    String fieldName = idField.getName();
    HashSet<BytesRef> localBoosts = new HashSet<>(boosted.size() * 2);
    Iterator<BytesRef> boostedIt = boosted.keySet().iterator();
    while (boostedIt.hasNext()) {
      localBoosts.add(boostedIt.next());
    }

    boostDocs = new IntIntHashMap(boosted.size());

    List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
    PostingsEnum postingsEnum = null;
    for (LeafReaderContext leaf : leaves) {
      LeafReader reader = leaf.reader();
      int docBase = leaf.docBase;
      Bits liveDocs = reader.getLiveDocs();
      Terms terms = reader.terms(fieldName);
      TermsEnum termsEnum = terms.iterator();
      Iterator<BytesRef> it = localBoosts.iterator();
      while (it.hasNext()) {
        BytesRef ref = it.next();
        if (termsEnum.seekExact(ref)) {
          postingsEnum = termsEnum.postings(postingsEnum);
          int doc = postingsEnum.nextDoc();
          while (doc != PostingsEnum.NO_MORE_DOCS && liveDocs != null && liveDocs.get(doc) == false) {
            doc = postingsEnum.nextDoc();
          }
          if (doc != PostingsEnum.NO_MORE_DOCS) {
            //Found the document.
            int p = boosted.get(ref);
            boostDocs.put(doc + docBase, p);
            it.remove();
          }
        }
      }
    }
  }

  if (context != null) {
    context.put(BOOSTED_DOCIDS, boostDocs);
  }

  return boostDocs;
}
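The core of getBoostDocs is the inner loop: seek the unique-key term, then walk the PostingsEnum past deleted documents using the segment's live-docs bitset before accepting a hit. Below is a stripped-down sketch of just that lookup for a single key; the class and method names are illustrative and not part of the Solr API.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class LiveDocLookupSketch {
  /** Returns the segment-local docid of the first live document containing {@code key},
   *  or -1 if the term is absent or only matches deleted documents. */
  static int firstLiveDoc(LeafReader reader, String field, BytesRef key) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return -1;
    }
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(key) == false) {
      return -1;
    }
    Bits liveDocs = reader.getLiveDocs(); // null when the segment has no deletions
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
    for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
      if (liveDocs == null || liveDocs.get(doc)) {
        return doc; // first live match; add the leaf's docBase for a global docid
      }
    }
    return -1;
  }
}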
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class DirectoryTaxonomyWriter, method perhapsFillCache.
// we need to guarantee that if several threads call this concurrently, only
// one executes it, and after it returns, the cache is updated and is either
// complete or not.
private synchronized void perhapsFillCache() throws IOException {
  if (cacheMisses.get() < cacheMissesUntilFill) {
    return;
  }

  if (!shouldFillCache) {
    // we already filled the cache once, there's no need to re-fill it
    return;
  }
  shouldFillCache = false;

  initReaderManager();

  boolean aborted = false;
  DirectoryReader reader = readerManager.acquire();
  try {
    PostingsEnum postingsEnum = null;
    for (LeafReaderContext ctx : reader.leaves()) {
      Terms terms = ctx.reader().terms(Consts.FULL);
      if (terms != null) {
        // cannot really happen, but be on the safe side
        // TODO: share per-segment TermsEnum here!
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
          if (!cache.isFull()) {
            BytesRef t = termsEnum.term();
            // Since we guarantee uniqueness of categories, each term has exactly
            // one document. Also, since we do not allow removing categories (and
            // hence documents), there are no deletions in the index. Therefore, it
            // is sufficient to call next(), and then doc(), exactly once with no
            // 'validation' checks.
            FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(t.utf8ToString()));
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            boolean res = cache.put(cp, postingsEnum.nextDoc() + ctx.docBase);
            assert !res : "entries should not have been evicted from the cache";
          } else {
            // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
            aborted = true;
            break;
          }
        }
      }
      if (aborted) {
        break;
      }
    }
  } finally {
    readerManager.release(reader);
  }

  cacheIsComplete = !aborted;
  if (cacheIsComplete) {
    synchronized (this) {
      // everything is in the cache, so no need to keep readerManager open.
      // this block is executed in a sync block so that it works well with
      // initReaderManager called in parallel.
      readerManager.close();
      readerManager = null;
      initializedReaderManager = false;
    }
  }
}
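The cache fill above works because every taxonomy category is stored as one term whose text is the flattened category path; FacetsConfig.pathToString and FacetsConfig.stringToPath convert between the two forms, as the snippet does when it rebuilds a FacetLabel from the term text. A small round-trip sketch, with invented path components:

import java.util.Arrays;
import org.apache.lucene.facet.FacetsConfig;

public class CategoryPathRoundTrip {
  public static void main(String[] args) {
    String[] path = {"Author", "Books", "Lucene in Action"};
    // Flatten the components into the single string that is indexed as a term...
    String flat = FacetsConfig.pathToString(path);
    // ...and recover the original components from that term text.
    String[] back = FacetsConfig.stringToPath(flat);
    System.out.println(Arrays.equals(path, back)); // true
  }
}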
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class DirectoryTaxonomyWriter, method findCategory.
/**
 * Look up the given category in the cache and/or the on-disk storage,
 * returning the category's ordinal, or a negative number in case the
 * category does not yet exist in the taxonomy.
 */
protected synchronized int findCategory(FacetLabel categoryPath) throws IOException {
  // If we can find the category in the cache, or we know the cache is
  // complete, we can return the response directly from it
  int res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    return res;
  }

  cacheMisses.incrementAndGet();
  // After a few cache misses, it makes sense to read all the categories
  // from disk and into the cache. The reason not to do this on the first
  // cache miss (or even when opening the writer) is that it will
  // significantly slow down the case when a taxonomy is opened just to
  // add one category. The idea of only spending a long time on reading
  // after enough time was spent on cache misses is known as an "online
  // algorithm".
  perhapsFillCache();
  res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    // if the category was found in the cache after filling it from disk,
    // or the cache is complete, return whatever cache.get returned.
    return res;
  }

  // if we get here, it means the category is not in the cache, and the cache is
  // not complete, and therefore we must look for the category on disk.

  // We need to get an answer from the on-disk index.
  initReaderManager();

  int doc = -1;
  DirectoryReader reader = readerManager.acquire();
  try {
    final BytesRef catTerm = new BytesRef(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
    PostingsEnum docs = null; // reused across segments
    for (LeafReaderContext ctx : reader.leaves()) {
      Terms terms = ctx.reader().terms(Consts.FULL);
      if (terms != null) {
        // TODO: share per-segment TermsEnum here!
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(catTerm)) {
          // liveDocs=null because the taxonomy has no deletes
          docs = termsEnum.postings(docs, 0);
          // if the term was found, we know it has exactly one document.
          doc = docs.nextDoc() + ctx.docBase;
          break;
        }
      }
    }
  } finally {
    readerManager.release(reader);
  }

  if (doc > 0) {
    addToCache(categoryPath, doc);
  }

  return doc;
}
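In normal use, findCategory is reached indirectly through addCategory, which returns the existing ordinal when the lookup succeeds and only creates a new category when it comes back negative. A hedged usage sketch of that entry point; the index path is an example only.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TaxonomyWriterSketch {
  public static void main(String[] args) throws IOException {
    Directory taxoDir = FSDirectory.open(Paths.get("/tmp/taxo")); // example path
    try (DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir)) {
      // addCategory() returns the category ordinal; internally it calls findCategory()
      // first and only creates a new category document when the lookup is negative.
      int ord1 = taxoWriter.addCategory(new FacetLabel("Author", "Bob"));
      int ord2 = taxoWriter.addCategory(new FacetLabel("Author", "Bob")); // same ordinal again
      System.out.println(ord1 == ord2); // true
      taxoWriter.commit();
    }
    taxoDir.close();
  }
}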