use of org.apache.lucene.index.LeafReaderContext in project lucene-solr by apache.
the class DocValuesFacets method getCounts.
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
SchemaField schemaField = searcher.getSchema().getField(fieldName);
FieldType ft = schemaField.getType();
NamedList<Integer> res = new NamedList<>();
// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
// for term lookups only
final SortedSetDocValues si;
// for mapping per-segment ords to global ones
OrdinalMap ordinalMap = null;
if (multiValued) {
si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
ordinalMap = ((MultiSortedSetDocValues) si).mapping;
}
} else {
SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
si = single == null ? null : DocValues.singleton(single);
if (single instanceof MultiDocValues.MultiSortedDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
}
}
if (si == null) {
return finalize(res, searcher, schemaField, docs, -1, missing);
}
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
}
final BytesRefBuilder prefixRef;
if (prefix == null) {
prefixRef = null;
} else if (prefix.length() == 0) {
prefix = null;
prefixRef = null;
} else {
prefixRef = new BytesRefBuilder();
prefixRef.copyChars(prefix);
}
int startTermIndex, endTermIndex;
if (prefix != null) {
startTermIndex = (int) si.lookupTerm(prefixRef.get());
if (startTermIndex < 0)
startTermIndex = -startTermIndex - 1;
prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = (int) si.lookupTerm(prefixRef.get());
assert endTermIndex < 0;
endTermIndex = -endTermIndex - 1;
} else {
startTermIndex = -1;
endTermIndex = (int) si.getValueCount();
}
final int nTerms = endTermIndex - startTermIndex;
int missingCount = -1;
final CharsRefBuilder charsRef = new CharsRefBuilder();
if (nTerms > 0 && docs.size() >= mincount) {
// count collection array only needs to be as big as the number of terms we are
// going to collect counts for.
final int[] counts = new int[nTerms];
if (fdebug != null) {
fdebug.putInfoItem("numBuckets", nTerms);
}
Filter filter = docs.getTopFilter();
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
LeafReaderContext leaf = leaves.get(subIndex);
// solr docsets already exclude any deleted docs
DocIdSet dis = filter.getDocIdSet(leaf, null);
DocIdSetIterator disi = null;
if (dis != null) {
disi = dis.iterator();
}
if (disi != null) {
if (multiValued) {
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySortedSet();
}
final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
if (singleton != null) {
// some codecs may optimize SORTED_SET storage for single-valued fields
accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
} else {
accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
}
} else {
SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySorted();
}
accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
}
}
}
if (startTermIndex == -1) {
missingCount = counts[0];
}
// IDEA: we could also maintain a count of "other"... everything that fell outside
// of the top 'N'
int off = offset;
int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
maxsize = Math.min(maxsize, nTerms);
LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);
// the smallest value in the top 'N' values
int min = mincount - 1;
for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
int c = counts[i];
if (c > min) {
if (termFilter != null) {
final BytesRef term = si.lookupOrd(startTermIndex + i);
if (!termFilter.test(term)) {
continue;
}
}
// smaller term numbers sort higher, so subtract the term number instead
long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
boolean displaced = queue.insert(pair);
if (displaced)
min = (int) (queue.top() >>> 32);
}
}
// if we are deep paging, we don't have to order the highest "offset" counts.
int collectCount = Math.max(0, queue.size() - off);
assert collectCount <= lim;
// the start and end indexes of our list "sorted" (starting with the highest value)
int sortedIdxStart = queue.size() - (collectCount - 1);
int sortedIdxEnd = queue.size() + 1;
final long[] sorted = queue.sort(collectCount);
for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
long pair = sorted[i];
int c = (int) (pair >>> 32);
int tnum = Integer.MAX_VALUE - (int) pair;
final BytesRef term = si.lookupOrd(startTermIndex + tnum);
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
} else {
// add results in index order
int i = (startTermIndex == -1) ? 1 : 0;
if (mincount <= 0 && termFilter == null) {
// if mincount<=0 and we're not examining the values for the term filter, then
// we won't discard any terms and we know exactly where to start.
i += off;
off = 0;
}
for (; i < nTerms; i++) {
int c = counts[i];
if (c < mincount)
continue;
BytesRef term = null;
if (termFilter != null) {
term = si.lookupOrd(startTermIndex + i);
if (!termFilter.test(term)) {
continue;
}
}
if (--off >= 0)
continue;
if (--lim < 0)
break;
if (term == null) {
term = si.lookupOrd(startTermIndex + i);
}
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
use of org.apache.lucene.index.LeafReaderContext in project lucene-solr by apache.
the class DocValuesStats method getCounts.
public static StatsValues getCounts(SolrIndexSearcher searcher, StatsField statsField, DocSet docs, String[] facet) throws IOException {
final SchemaField schemaField = statsField.getSchemaField();
assert null != statsField.getSchemaField() : "DocValuesStats requires a StatsField using a SchemaField";
final String fieldName = schemaField.getName();
final FieldType ft = schemaField.getType();
final StatsValues res = StatsValuesFactory.createStatsValues(statsField);
//Initialize facetstats, if facets have been passed in
final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
int upto = 0;
for (String facetField : facet) {
SchemaField fsf = searcher.getSchema().getField(facetField);
if (fsf.multiValued()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stats can only facet on single-valued fields, not: " + facetField);
}
SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
facetStats[upto++] = new FieldFacetStats(searcher, facetSchemaField, statsField);
}
// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
// for term lookups only
SortedSetDocValues si;
// for mapping per-segment ords to global ones
OrdinalMap ordinalMap = null;
if (multiValued) {
si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
if (si instanceof MultiSortedSetDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedSetDocValues) si).mapping;
}
} else {
SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
si = single == null ? null : DocValues.singleton(single);
if (single instanceof MultiDocValues.MultiSortedDocValues) {
ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
}
}
if (si == null) {
si = DocValues.emptySortedSet();
}
if (si.getValueCount() >= Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
}
int missingDocCountTotal = 0;
final int nTerms = (int) si.getValueCount();
// count collection array only needs to be as big as the number of terms we are
// going to collect counts for.
final int[] counts = new int[nTerms];
Filter filter = docs.getTopFilter();
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
LeafReaderContext leaf = leaves.get(subIndex);
// solr docsets already exclude any deleted docs
DocIdSet dis = filter.getDocIdSet(leaf, null);
DocIdSetIterator disi = null;
if (dis != null) {
disi = dis.iterator();
}
if (disi != null) {
int docBase = leaf.docBase;
if (multiValued) {
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySortedSet();
}
SortedDocValues singleton = DocValues.unwrapSingleton(sub);
if (singleton != null) {
// some codecs may optimize SORTED_SET storage for single-valued fields
missingDocCountTotal += accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
} else {
missingDocCountTotal += accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
}
} else {
SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
if (sub == null) {
sub = DocValues.emptySorted();
}
missingDocCountTotal += accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
}
}
}
// add results in index order
for (int ord = 0; ord < counts.length; ord++) {
int count = counts[ord];
if (count > 0) {
final BytesRef value = si.lookupOrd(ord);
res.accumulate(value, count);
for (FieldFacetStats f : facetStats) {
f.accumulateTermNum(ord, value);
}
}
}
res.addMissing(missingDocCountTotal);
if (facetStats.length > 0) {
for (FieldFacetStats f : facetStats) {
Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
f.accumulateMissing();
res.addFacet(f.name, facetStatsValues);
}
}
return res;
}
use of org.apache.lucene.index.LeafReaderContext in project lucene-solr by apache.
the class IntervalFacets method getCountString.
private void getCountString() throws IOException {
Filter filter = docs.getTopFilter();
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
LeafReaderContext leaf = leaves.get(subIndex);
// solr docsets already exclude any deleted docs
DocIdSet dis = filter.getDocIdSet(leaf, null);
if (dis == null) {
continue;
}
DocIdSetIterator disi = dis.iterator();
if (disi != null) {
if (schemaField.multiValued()) {
SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(schemaField.getName());
if (sub == null) {
continue;
}
final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
if (singleton != null) {
// some codecs may optimize SORTED_SET storage for single-valued fields
accumIntervalsSingle(singleton, disi, dis.bits());
} else {
accumIntervalsMulti(sub, disi, dis.bits());
}
} else {
SortedDocValues sub = leaf.reader().getSortedDocValues(schemaField.getName());
if (sub == null) {
continue;
}
accumIntervalsSingle(sub, disi, dis.bits());
}
}
}
}
use of org.apache.lucene.index.LeafReaderContext in project lucene-solr by apache.
the class IntervalFacets method getCountNumeric.
private void getCountNumeric() throws IOException {
final FieldType ft = schemaField.getType();
final String fieldName = schemaField.getName();
final NumberType numericType = ft.getNumberType();
if (numericType == null) {
throw new IllegalStateException();
}
final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
LeafReaderContext ctx = null;
NumericDocValues longs = null;
for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
final int doc = docsIt.nextDoc();
if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
do {
ctx = ctxIt.next();
} while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
assert doc >= ctx.docBase;
switch(numericType) {
case LONG:
case DATE:
case INTEGER:
longs = DocValues.getNumeric(ctx.reader(), fieldName);
break;
case FLOAT:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
case DOUBLE:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
default:
throw new AssertionError();
}
}
int valuesDocID = longs.docID();
if (valuesDocID < doc - ctx.docBase) {
valuesDocID = longs.advance(doc - ctx.docBase);
}
if (valuesDocID == doc - ctx.docBase) {
accumIntervalWithValue(longs.longValue());
}
}
}
use of org.apache.lucene.index.LeafReaderContext in project lucene-solr by apache.
the class NumericFacets method getCountsSingleValue.
private static NamedList<Integer> getCountsSingleValue(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
boolean zeros = mincount <= 0;
mincount = Math.max(mincount, 1);
final SchemaField sf = searcher.getSchema().getField(fieldName);
final FieldType ft = sf.getType();
final NumberType numericType = ft.getNumberType();
if (numericType == null) {
throw new IllegalStateException();
}
// We don't return zeros when using PointFields or when index=false
zeros = zeros && !ft.isPointField() && sf.indexed();
final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
// 1. accumulate
final HashTable hashTable = new HashTable(true);
final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
LeafReaderContext ctx = null;
NumericDocValues longs = null;
int missingCount = 0;
for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
final int doc = docsIt.nextDoc();
if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
do {
ctx = ctxIt.next();
} while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
assert doc >= ctx.docBase;
switch(numericType) {
case LONG:
case DATE:
case INTEGER:
// Long, Date and Integer
longs = DocValues.getNumeric(ctx.reader(), fieldName);
break;
case FLOAT:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
case DOUBLE:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
default:
throw new AssertionError("Unexpected type: " + numericType);
}
}
int valuesDocID = longs.docID();
if (valuesDocID < doc - ctx.docBase) {
valuesDocID = longs.advance(doc - ctx.docBase);
}
if (valuesDocID == doc - ctx.docBase) {
hashTable.add(doc, longs.longValue(), 1);
} else {
++missingCount;
}
}
// 2. select top-k facet values
final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
final PriorityQueue<Entry> pq;
if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
pq = new PriorityQueue<Entry>(pqSize) {
@Override
protected boolean lessThan(Entry a, Entry b) {
if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
return true;
} else {
return false;
}
}
};
} else {
pq = new PriorityQueue<Entry>(pqSize) {
@Override
protected boolean lessThan(Entry a, Entry b) {
return a.bits > b.bits;
}
};
}
Entry e = null;
for (int i = 0; i < hashTable.bits.length; ++i) {
if (hashTable.counts[i] >= mincount) {
if (e == null) {
e = new Entry();
}
e.bits = hashTable.bits[i];
e.count = hashTable.counts[i];
e.docID = hashTable.docIDs[i];
e = pq.insertWithOverflow(e);
}
}
// 4. build the NamedList
final ValueSource vs = ft.getValueSource(sf, null);
final NamedList<Integer> result = new NamedList<>();
// to be merged with terms from the terms dict
if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
// Only keep items we're interested in
final Deque<Entry> counts = new ArrayDeque<>();
while (pq.size() > offset) {
counts.addFirst(pq.pop());
}
// Entries from the PQ first, then using the terms dictionary
for (Entry entry : counts) {
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
}
if (zeros && (limit < 0 || result.size() < limit)) {
// need to merge with the term dict
if (!sf.indexed() && !sf.hasDocValues()) {
throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is neither indexed nor docValues");
}
// Add zeros until there are limit results
final Set<String> alreadySeen = new HashSet<>();
while (pq.size() > 0) {
Entry entry = pq.pop();
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
}
for (int i = 0; i < result.size(); ++i) {
alreadySeen.add(result.getName(i));
}
final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator();
BytesRef term;
switch(termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRefBuilder spare = new CharsRefBuilder();
for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
++skipped;
}
term = termsEnum.next();
}
for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
result.add(termStr, 0);
}
}
}
}
} else {
// => Merge the PQ and the terms dictionary on the fly
if (!sf.indexed()) {
throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed");
}
final Map<String, Integer> counts = new HashMap<>();
while (pq.size() > 0) {
final Entry entry = pq.pop();
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
}
final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator();
BytesRef term;
switch(termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRefBuilder spare = new CharsRefBuilder();
for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
term = termsEnum.next();
}
for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
Integer count = counts.get(termStr);
if (count == null) {
count = 0;
}
result.add(termStr, count);
}
}
}
if (missing) {
result.add(null, missingCount);
}
return result;
}
Aggregations