Use of org.apache.solr.search.DocSet in project lucene-solr by apache.
The class UnInvertedField, method collectDocsGeneric.
// called from FieldFacetProcessor
// TODO: do a callback version that can be specialized!
public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
  use.incrementAndGet();
  int startTermIndex = processor.startTermIndex;
  int endTermIndex = processor.endTermIndex;
  int nTerms = processor.nTerms;
  DocSet docs = processor.fcontext.base;
  int uniqueTerms = 0;
  final CountSlotAcc countAcc = processor.countAcc;
  for (TopTerm tt : bigTerms.values()) {
    if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
      // handle the biggest terms: straight set intersection with the base set
      try (DocSet intersection = searcher.getDocSet(tt.termQuery, docs)) {
        int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex);
        countAcc.incrementCount(tt.termNum - startTermIndex, collected);
        if (collected > 0) {
          uniqueTerms++;
        }
      }
    }
  }
  if (termInstances > 0) {
    final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    LeafReaderContext ctx = null;
    int segBase = 0;
    int segMax;
    int adjustedMax = 0;
    // TODO: handle facet.prefix here!!!
    DocIterator iter = docs.iterator();
    while (iter.hasNext()) {
      int doc = iter.nextDoc();
      if (doc >= adjustedMax) {
        // advance to the segment (leaf) that contains this doc
        do {
          ctx = ctxIt.next();
          if (ctx == null) {
            // should be impossible
            throw new RuntimeException("INTERNAL FACET ERROR");
          }
          segBase = ctx.docBase;
          segMax = ctx.reader().maxDoc();
          adjustedMax = segBase + segMax;
        } while (doc >= adjustedMax);
        assert doc >= ctx.docBase;
        processor.setNextReaderFirstPhase(ctx);
      }
      int segDoc = doc - segBase;
      int code = index[doc];
      if ((code & 0xff) == 1) {
        // low byte == 1: the term list lives in one of the shared byte arrays,
        // at offset pos
        int pos = code >>> 8;
        int whichArray = (doc >>> 16) & 0xff;
        byte[] arr = tnums[whichArray];
        int tnum = 0;
        for (;;) {
          // decode one variable-byte delta: 7 bits per byte, high bit = continuation
          int delta = 0;
          for (;;) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);
            if ((b & 0x80) == 0)
              break;
          }
          if (delta == 0)
            break; // a zero delta terminates the list
          tnum += delta - TNUM_OFFSET;
          int arrIdx = tnum - startTermIndex;
          if (arrIdx < 0)
            continue;
          if (arrIdx >= nTerms)
            break;
          countAcc.incrementCount(arrIdx, 1);
          processor.collectFirstPhase(segDoc, arrIdx);
        }
      } else {
        // the term list is packed directly into the int code, consumed a byte at a time
        int tnum = 0;
        int delta = 0;
        for (;;) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            if (delta == 0)
              break;
            tnum += delta - TNUM_OFFSET;
            int arrIdx = tnum - startTermIndex;
            if (arrIdx >= 0) {
              if (arrIdx >= nTerms)
                break;
              countAcc.incrementCount(arrIdx, 1);
              processor.collectFirstPhase(segDoc, arrIdx);
            }
            delta = 0;
          }
          code >>>= 8;
        }
      }
    }
  }
}
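Both branches above decode the same packed representation: per-document term numbers are delta-encoded as variable-length integers, seven bits per byte with the high bit as a continuation flag, and each delta is offset by TNUM_OFFSET so a zero value can terminate the list. Below is a minimal standalone sketch of that decoding, assuming the same layout and a TNUM_OFFSET of 2 (the constant's value is not shown in the snippet above):

  import java.util.ArrayList;
  import java.util.List;

  class TnumDecoder {
    static final int TNUM_OFFSET = 2; // assumed value; not shown in the snippet above

    /** Decodes delta+offset encoded term numbers starting at pos; a zero vInt ends the list. */
    static List<Integer> decodeTermNums(byte[] arr, int pos) {
      List<Integer> tnums = new ArrayList<>();
      int tnum = 0;
      for (;;) {
        int delta = 0;
        for (;;) {                     // read one vInt: 7 payload bits per byte,
          byte b = arr[pos++];         // most significant group first
          delta = (delta << 7) | (b & 0x7f);
          if ((b & 0x80) == 0) break;  // high bit clear marks the last byte
        }
        if (delta == 0) break;         // a zero-valued vInt terminates the list
        tnum += delta - TNUM_OFFSET;   // undo the offset added at encode time
        tnums.add(tnum);
      }
      return tnums;
    }
  }

For example, the bytes { 5, 3, 0 } decode to term numbers 3 and 4: deltas 5 and 3, each reduced by the offset of 2, accumulated in order.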
Use of org.apache.solr.search.DocSet in project lucene-solr by apache.
The class UnInvertedField, method getCounts.
private void getCounts(FacetFieldProcessorByArrayUIF processor, CountSlotAcc counts) throws IOException {
  DocSet docs = processor.fcontext.base;
  int baseSize = docs.size();
  int maxDoc = searcher.maxDoc();
  // what about allBuckets?
  if (baseSize < processor.effectiveMincount) {
    return;
  }
  final int[] index = this.index;
  boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
  if (doNegative) {
    FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
    bs.flip(0, maxDoc);
    // TODO: when iterator across negative elements is available, use that
    // instead of creating a new bitset and inverting.
    docs = new BitDocSet(bs, maxDoc - baseSize);
    // simply negating will mean that we have deleted docs in the set.
    // that should be OK, as their entries in our table should be empty.
  }
  // For the biggest terms, do straight set intersections
  for (TopTerm tt : bigTerms.values()) {
    // TODO: counts could be deferred if sorting by index order
    counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs));
  }
  if (termInstances > 0) {
    DocIterator iter = docs.iterator();
    while (iter.hasNext()) {
      int doc = iter.nextDoc();
      int code = index[doc];
      if ((code & 0xff) == 1) {
        int pos = code >>> 8;
        int whichArray = (doc >>> 16) & 0xff;
        byte[] arr = tnums[whichArray];
        int tnum = 0;
        for (;;) {
          int delta = 0;
          for (;;) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);
            if ((b & 0x80) == 0)
              break;
          }
          if (delta == 0)
            break;
          tnum += delta - TNUM_OFFSET;
          counts.incrementCount(tnum, 1);
        }
      } else {
        int tnum = 0;
        int delta = 0;
        for (;;) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            if (delta == 0)
              break;
            tnum += delta - TNUM_OFFSET;
            counts.incrementCount(tnum, 1);
            delta = 0;
          }
          code >>>= 8;
        }
      }
    }
  }
  if (doNegative) {
    for (int i = 0; i < numTermsInField; i++) {
      // counts[i] = maxTermCounts[i] - counts[i];
      counts.incrementCount(i, maxTermCounts[i] - counts.getCount(i) * 2);
    }
  }
  /*** TODO - future optimization to handle allBuckets
  if (processor.allBucketsSlot >= 0) {
    int all = 0;  // overflow potential
    for (int i = 0; i < numTermsInField; i++) {
      all += counts.getCount(i);
    }
    counts.incrementCount(processor.allBucketsSlot, all);
  }
  ***/
}
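The doNegative branch is worth unpacking: when the base set covers more than half the index, it is cheaper to iterate the smaller complement set and fix the counts up afterwards. After the complement pass, each slot holds the complement count c; since term i occurs in maxTermCounts[i] documents overall, the base count is maxTermCounts[i] - c, which the final loop reaches by adding maxTermCounts[i] - 2*c to the slot (c + total - 2c = total - c). A minimal sketch of that arithmetic with illustrative values:

  class DoNegativeArithmetic {
    public static void main(String[] args) {
      // Illustrative values: term i appears in 10 docs overall,
      // 3 of them in the complement of the base set.
      int total = 10;           // maxTermCounts[i]
      int complementCount = 3;  // accumulated while iterating the complement
      int slot = complementCount;
      slot += total - 2 * complementCount;  // the increment getCounts applies
      // slot is now total - complementCount = 7, the count within the base set
      System.out.println(slot);
    }
  }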
Use of org.apache.solr.search.DocSet in project lucene-solr by apache.
The class SimpleFacets, method getFacetTermEnumCounts.
/**
* Returns a list of terms in the specified field along with the
* corresponding count of documents in the set that match that constraint.
* This method uses the FilterCache to get the intersection count between <code>docs</code>
* and the DocSet for each term in the filter.
*
* @see FacetParams#FACET_LIMIT
* @see FacetParams#FACET_ZEROS
* @see FacetParams#FACET_MISSING
*/
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field,
    int offset, int limit, int mincount, boolean missing, String sort, String prefix,
    Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
  /* :TODO: potential optimization...
   * cache the Terms with the highest docFreq and try them first
   * don't enum if we get our max from them
   */
  // Minimum term docFreq in order to use the filterCache for that term.
  int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
  // make sure we have a set that is fast for random access, if we will use it for that
  DocSet fastForRandomSet = docs;
  if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
    SortedIntDocSet sset = (SortedIntDocSet) docs;
    fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
  }
  IndexSchema schema = searcher.getSchema();
  FieldType ft = schema.getFieldType(field);
  assert !ft.isPointField() : "Point Fields don't support enum method";
  LeafReader r = searcher.getSlowAtomicReader();
  boolean sortByCount = sort.equals("count") || sort.equals("true");
  final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
  final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
      sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
  final NamedList<Integer> res = new NamedList<>();
  // the smallest value in the top 'N' values
  int min = mincount - 1;
  int off = offset;
  int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
  BytesRef prefixTermBytes = null;
  if (prefix != null) {
    String indexedPrefix = ft.toInternal(prefix);
    prefixTermBytes = new BytesRef(indexedPrefix);
  }
  Fields fields = r.fields();
  Terms terms = fields == null ? null : fields.terms(field);
  TermsEnum termsEnum = null;
  SolrIndexSearcher.DocsEnumState deState = null;
  BytesRef term = null;
  if (terms != null) {
    termsEnum = terms.iterator();
    if (prefixTermBytes != null) {
      if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }
  }
  PostingsEnum postingsEnum = null;
  CharsRefBuilder charsRef = new CharsRefBuilder();
  if (docs.size() >= mincount) {
    while (term != null) {
      if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
        break;
      if (termFilter == null || termFilter.test(term)) {
        int df = termsEnum.docFreq();
        // If we are sorting, we can use df > min (rather than >=) since we
        // are going in index order. For certain term distributions this can
        // make a large difference (for example, many terms with df=1).
        if (df > 0 && df > min) {
          int c;
          if (df >= minDfFilterCache) {
            // use the filterCache to get the count for this term
            if (deState == null) {
              deState = new SolrIndexSearcher.DocsEnumState();
              deState.fieldName = field;
              deState.liveDocs = r.getLiveDocs();
              deState.termsEnum = termsEnum;
              deState.postingsEnum = postingsEnum;
            }
            if (intersectsCheck) {
              c = searcher.intersects(docs, deState) ? 1 : 0;
            } else {
              c = searcher.numDocs(docs, deState);
            }
            postingsEnum = deState.postingsEnum;
          } else {
            // iterate over TermDocs to calculate the intersection
            // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
            // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
            // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            c = 0;
            if (postingsEnum instanceof MultiPostingsEnum) {
              MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
              int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
              SEGMENTS_LOOP:
              for (int subindex = 0; subindex < numSubs; subindex++) {
                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                if (sub.postingsEnum == null)
                  continue;
                int base = sub.slice.start;
                int docid;
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid + base)) {
                    c++;
                    if (intersectsCheck) {
                      assert c == 1;
                      break SEGMENTS_LOOP;
                    }
                  }
                }
              }
            } else {
              int docid;
              while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (fastForRandomSet.exists(docid)) {
                  c++;
                  if (intersectsCheck) {
                    assert c == 1;
                    break;
                  }
                }
              }
            }
          }
          if (sortByCount) {
            if (c > min) {
              BytesRef termCopy = BytesRef.deepCopyOf(term);
              queue.add(new CountPair<>(termCopy, c));
              if (queue.size() >= maxsize)
                min = queue.last().val;
            }
          } else {
            if (c >= mincount && --off < 0) {
              if (--lim < 0)
                break;
              ft.indexedToReadable(term, charsRef);
              res.add(charsRef.toString(), c);
            }
          }
        }
      }
      term = termsEnum.next();
    }
  }
  if (sortByCount) {
    for (CountPair<BytesRef, Integer> p : queue) {
      if (--off >= 0)
        continue;
      if (--lim < 0)
        break;
      ft.indexedToReadable(p.key, charsRef);
      res.add(charsRef.toString(), p.val);
    }
  }
  if (missing) {
    res.add(null, getFieldMissingCount(searcher, docs, field));
  }
  return res;
}
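When sorting by count, the method keeps at most offset + limit (term, count) pairs in a BoundedTreeSet and raises min as the set fills, so low-frequency terms can be rejected by the df > min test without being counted at all. A minimal sketch of the same bounded top-N idea using a plain PriorityQueue; the TermCount record and topN method are hypothetical, not Solr API:

  import java.util.PriorityQueue;

  class TopNByCount {
    // Hypothetical pair type; the snippet above uses CountPair<BytesRef, Integer>.
    record TermCount(String term, int count) {}

    /** Keeps the maxsize highest-count entries (maxsize >= 1 assumed). */
    static PriorityQueue<TermCount> topN(Iterable<TermCount> candidates, int maxsize) {
      // min-heap on count: the head is the smallest entry currently kept
      PriorityQueue<TermCount> queue =
          new PriorityQueue<>((a, b) -> Integer.compare(a.count(), b.count()));
      int min = Integer.MIN_VALUE; // smallest count still worth inserting
      for (TermCount tc : candidates) {
        if (tc.count() <= min) continue; // cheap skip, like the df > min test above
        queue.add(tc);
        if (queue.size() > maxsize) {
          queue.poll();                  // evict the current minimum
          min = queue.peek().count();    // raise the admission bar
        }
      }
      return queue;
    }
  }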
Use of org.apache.solr.search.DocSet in project lucene-solr by apache.
The class SimpleFacets, method parseParams.
protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
  SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
  DocSet docs = docsOrig;
  String facetValue = param;
  String key = param;
  List<String> tags = Collections.emptyList();
  int threads = -1;
  if (localParams == null) {
    SolrParams params = global;
    SolrParams required = new RequiredSolrParams(params);
    return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
  }
  SolrParams params = SolrParams.wrapDefaults(localParams, global);
  SolrParams required = new RequiredSolrParams(params);
  // remove local params unless it's a query
  if (type != FacetParams.FACET_QUERY) {
    // TODO Cut over to an Enum here
    facetValue = localParams.get(CommonParams.VALUE);
  }
  // reset the default key now that localParams have been removed
  key = facetValue;
  // allow an explicit key to override it
  key = localParams.get(CommonParams.OUTPUT_KEY, key);
  String tagStr = localParams.get(CommonParams.TAG);
  tags = tagStr == null ? Collections.<String>emptyList() : StrUtils.splitSmart(tagStr, ',');
  String threadStr = localParams.get(CommonParams.THREADS);
  if (threadStr != null) {
    threads = Integer.parseInt(threadStr);
  }
  // figure out if we need a new base DocSet
  String excludeStr = localParams.get(CommonParams.EXCLUDE);
  if (excludeStr == null)
    return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
  List<String> excludeTagList = StrUtils.splitSmart(excludeStr, ',');
  docs = computeDocSet(docs, excludeTagList);
  return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
}
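In practice, the local params parsed here come from facet parameters of the form {!key=... tag=... ex=...}field. A short SolrJ sketch of a request that exercises this path; the tag, key, and field names ("colorTag", "colors", "color") are made up for illustration:

  import org.apache.solr.client.solrj.SolrQuery;

  class FacetLocalParamsExample {
    static SolrQuery buildRequest() {
      SolrQuery q = new SolrQuery("*:*");
      q.addFilterQuery("{!tag=colorTag}color:red");  // tag the filter query
      q.setFacet(true);
      // parseParams reads the {!...} local params: report under key "colors",
      // and exclude the filter tagged "colorTag" from this facet's base DocSet,
      // which is what triggers the computeDocSet call above
      q.add("facet.field", "{!ex=colorTag key=colors}color");
      return q;
    }
  }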
Use of org.apache.solr.search.DocSet in project lucene-solr by apache.
The class HighlighterTest, method payloadFilteringSpanQuery.
@Test
public void payloadFilteringSpanQuery() throws IOException {
  clearIndex();
  String FIELD_NAME = "payloadDelimited";
  assertU(adoc("id", "0", FIELD_NAME, "word|7 word|2"));
  assertU(commit());
  // We search at a lower level than typical Solr tests because there's no QParser for payloads.
  // Create a query matching this payload.
  Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "word")),
      Collections.singletonList(new BytesRef(new byte[] { 0, 0, 0, 7 }))); // bytes for integer 7
  // invoke the highlight component... the hard way
  final SearchComponent hlComp = h.getCore().getSearchComponent("highlight");
  SolrQueryRequest req = req("hl", "true", "hl.fl", FIELD_NAME,
      HighlightParams.USE_PHRASE_HIGHLIGHTER, "true");
  try {
    SolrQueryResponse resp = new SolrQueryResponse();
    ResponseBuilder rb = new ResponseBuilder(req, resp, Collections.singletonList(hlComp));
    rb.setHighlightQuery(query);
    rb.setResults(req.getSearcher().getDocListAndSet(query, (DocSet) null, null, 0, 1));
    // highlight:
    hlComp.prepare(rb);
    hlComp.process(rb);
    // inspect the response
    final String[] snippets = (String[]) resp.getValues().findRecursive("highlighting", "0", FIELD_NAME);
    assertEquals("<em>word|7</em> word|2", snippets[0]);
  } finally {
    req.close();
  }
}
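The payload bytes { 0, 0, 0, 7 } in the query are the big-endian four-byte encoding of the integer 7 attached to the token "word|7" by an integer payload encoder. A minimal sketch of that byte layout (Lucene's PayloadHelper offers equivalent helpers, but this makes the encoding explicit):

  class PayloadBytes {
    /** Big-endian 4-byte encoding of an int, matching the payload bytes in the test. */
    static byte[] encodeIntPayload(int value) {
      return new byte[] {
          (byte) (value >>> 24), (byte) (value >>> 16),
          (byte) (value >>> 8),  (byte) value };
    }
    // encodeIntPayload(7) -> { 0, 0, 0, 7 }, the BytesRef the SpanPayloadCheckQuery matches
  }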