
Example 6 with DocSet

Use of org.apache.solr.search.DocSet in project lucene-solr by apache.

The class UnInvertedField, method collectDocsGeneric.

// called from FieldFacetProcessor
// TODO: do a callback version that can be specialized!
public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
    use.incrementAndGet();
    int startTermIndex = processor.startTermIndex;
    int endTermIndex = processor.endTermIndex;
    int nTerms = processor.nTerms;
    DocSet docs = processor.fcontext.base;
    int uniqueTerms = 0;
    final CountSlotAcc countAcc = processor.countAcc;
    for (TopTerm tt : bigTerms.values()) {
        if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
            // handle the biggest terms
            try (DocSet intersection = searcher.getDocSet(tt.termQuery, docs)) {
                int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex);
                countAcc.incrementCount(tt.termNum - startTermIndex, collected);
                if (collected > 0) {
                    uniqueTerms++;
                }
            }
        }
    }
    if (termInstances > 0) {
        final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
        final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
        LeafReaderContext ctx = null;
        int segBase = 0;
        int segMax;
        int adjustedMax = 0;
        // TODO: handle facet.prefix here!!!
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
            int doc = iter.nextDoc();
            if (doc >= adjustedMax) {
                do {
                    ctx = ctxIt.next();
                    if (ctx == null) {
                        // should be impossible
                        throw new RuntimeException("INTERNAL FACET ERROR");
                    }
                    segBase = ctx.docBase;
                    segMax = ctx.reader().maxDoc();
                    adjustedMax = segBase + segMax;
                } while (doc >= adjustedMax);
                assert doc >= ctx.docBase;
                processor.setNextReaderFirstPhase(ctx);
            }
            int segDoc = doc - segBase;
            int code = index[doc];
            if ((code & 0xff) == 1) {
                int pos = code >>> 8;
                int whichArray = (doc >>> 16) & 0xff;
                byte[] arr = tnums[whichArray];
                int tnum = 0;
                for (; ; ) {
                    int delta = 0;
                    for (; ; ) {
                        byte b = arr[pos++];
                        delta = (delta << 7) | (b & 0x7f);
                        if ((b & 0x80) == 0)
                            break;
                    }
                    if (delta == 0)
                        break;
                    tnum += delta - TNUM_OFFSET;
                    int arrIdx = tnum - startTermIndex;
                    if (arrIdx < 0)
                        continue;
                    if (arrIdx >= nTerms)
                        break;
                    countAcc.incrementCount(arrIdx, 1);
                    processor.collectFirstPhase(segDoc, arrIdx);
                }
            } else {
                int tnum = 0;
                int delta = 0;
                for (; ; ) {
                    delta = (delta << 7) | (code & 0x7f);
                    if ((code & 0x80) == 0) {
                        if (delta == 0)
                            break;
                        tnum += delta - TNUM_OFFSET;
                        int arrIdx = tnum - startTermIndex;
                        if (arrIdx >= 0) {
                            if (arrIdx >= nTerms)
                                break;
                            countAcc.incrementCount(arrIdx, 1);
                            processor.collectFirstPhase(segDoc, arrIdx);
                        }
                        delta = 0;
                    }
                    code >>>= 8;
                }
            }
        }
    }
}
Also used : DocIterator(org.apache.solr.search.DocIterator) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BitDocSet(org.apache.solr.search.BitDocSet) DocSet(org.apache.solr.search.DocSet)
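The innermost loops above decode each document's term numbers from a variable-length byte stream in tnums: 7 payload bits per byte with the high bit as a continuation flag, deltas offset by TNUM_OFFSET, and a decoded delta of 0 terminating the list. A standalone sketch of just that decoding (hypothetical helper, not part of UnInvertedField; it simply mirrors the layout the loops above read) may make the scheme easier to follow:

// Hypothetical standalone decoder mirroring the loops above; arr, pos and tnumOffset follow the same layout
static java.util.List<Integer> decodeTermNums(byte[] arr, int pos, int tnumOffset) {
    java.util.List<Integer> termNums = new java.util.ArrayList<>();
    int tnum = 0;
    for (;;) {
        int delta = 0;
        for (;;) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);  // accumulate 7 payload bits per byte
            if ((b & 0x80) == 0)
                break;                          // high bit clear: last byte of this vInt
        }
        if (delta == 0)
            break;                              // a zero delta terminates the document's term list
        tnum += delta - tnumOffset;             // deltas are offset so that 0 can serve as the terminator
        termNums.add(tnum);
    }
    return termNums;
}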

Example 7 with DocSet

Use of org.apache.solr.search.DocSet in project lucene-solr by apache.

The class UnInvertedField, method getCounts.

private void getCounts(FacetFieldProcessorByArrayUIF processor, CountSlotAcc counts) throws IOException {
    DocSet docs = processor.fcontext.base;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();
    // what about allBuckets?
    if (baseSize < processor.effectiveMincount) {
        return;
    }
    final int[] index = this.index;
    boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
    if (doNegative) {
        FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
        bs.flip(0, maxDoc);
        // TODO: when iterator across negative elements is available, use that
        // instead of creating a new bitset and inverting.
        docs = new BitDocSet(bs, maxDoc - baseSize);
        // simply negating will mean that we have deleted docs in the set.
        // that should be OK, as their entries in our table should be empty.
    }
    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
        // TODO: counts could be deferred if sorting by index order
        counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs));
    }
    if (termInstances > 0) {
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
            int doc = iter.nextDoc();
            int code = index[doc];
            if ((code & 0xff) == 1) {
                int pos = code >>> 8;
                int whichArray = (doc >>> 16) & 0xff;
                byte[] arr = tnums[whichArray];
                int tnum = 0;
                for (; ; ) {
                    int delta = 0;
                    for (; ; ) {
                        byte b = arr[pos++];
                        delta = (delta << 7) | (b & 0x7f);
                        if ((b & 0x80) == 0)
                            break;
                    }
                    if (delta == 0)
                        break;
                    tnum += delta - TNUM_OFFSET;
                    counts.incrementCount(tnum, 1);
                }
            } else {
                int tnum = 0;
                int delta = 0;
                for (; ; ) {
                    delta = (delta << 7) | (code & 0x7f);
                    if ((code & 0x80) == 0) {
                        if (delta == 0)
                            break;
                        tnum += delta - TNUM_OFFSET;
                        counts.incrementCount(tnum, 1);
                        delta = 0;
                    }
                    code >>>= 8;
                }
            }
        }
    }
    if (doNegative) {
        for (int i = 0; i < numTermsInField; i++) {
            //       counts[i] = maxTermCounts[i] - counts[i];
            counts.incrementCount(i, maxTermCounts[i] - counts.getCount(i) * 2);
        }
    }
/*** TODO - future optimization to handle allBuckets
    if (processor.allBucketsSlot >= 0) {
      int all = 0;  // overflow potential
      for (int i=0; i<numTermsInField; i++) {
        all += counts.getCount(i);
      }
      counts.incrementCount(processor.allBucketsSlot, all);
    }
     ***/
}
Also used : BitDocSet(org.apache.solr.search.BitDocSet) DocIterator(org.apache.solr.search.DocIterator) FixedBitSet(org.apache.lucene.util.FixedBitSet) DocSet(org.apache.solr.search.DocSet)
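When doNegative is taken, the main loop counts over the complement of the base set, and the final loop converts those complement counts back: if cNeg documents outside the base set contain term i, the base count is maxTermCounts[i] - cNeg, which is exactly what adding maxTermCounts[i] - 2*cNeg to cNeg produces. A plain-int sketch of that correction (hypothetical names, not the Solr CountSlotAcc API):

// Hypothetical illustration of the doNegative correction: count[i] holds cNeg, the number of
// complement documents containing term i; afterwards it holds total[i] - cNeg, the base count.
static void flipComplementCounts(int[] count, int[] total) {
    for (int i = 0; i < count.length; i++) {
        int cNeg = count[i];
        count[i] += total[i] - 2 * cNeg;   // count[i] == total[i] - cNeg afterwards
    }
}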

Example 8 with DocSet

Use of org.apache.solr.search.DocSet in project lucene-solr by apache.

The class SimpleFacets, method getFacetTermEnumCounts.

/**
   * Returns a list of terms in the specified field along with the 
   * corresponding count of documents in the set that match that constraint.
   * This method uses the FilterCache to get the intersection count between <code>docs</code>
   * and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
    /* :TODO: potential optimization...
    * cache the Terms with the highest docFreq and try them first
    * don't enum if we get our max from them
    */
    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
        SortedIntDocSet sset = (SortedIntDocSet) docs;
        fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }
    IndexSchema schema = searcher.getSchema();
    FieldType ft = schema.getFieldType(field);
    assert !ft.isPointField() : "Point Fields don't support enum method";
    LeafReader r = searcher.getSlowAtomicReader();
    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();
    // the smallest value in the top 'N' values    
    int min = mincount - 1;
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
    BytesRef prefixTermBytes = null;
    if (prefix != null) {
        String indexedPrefix = ft.toInternal(prefix);
        prefixTermBytes = new BytesRef(indexedPrefix);
    }
    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
        termsEnum = terms.iterator();
        if (prefixTermBytes != null) {
            if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
                termsEnum = null;
            } else {
                term = termsEnum.term();
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
    }
    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();
    if (docs.size() >= mincount) {
        while (term != null) {
            if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
                break;
            if (termFilter == null || termFilter.test(term)) {
                int df = termsEnum.docFreq();
                // Using df > min (rather than >=) while walking terms in index order can
                // make a large difference (for example, many terms with df=1).
                if (df > 0 && df > min) {
                    int c;
                    if (df >= minDfFilterCache) {
                        if (deState == null) {
                            deState = new SolrIndexSearcher.DocsEnumState();
                            deState.fieldName = field;
                            deState.liveDocs = r.getLiveDocs();
                            deState.termsEnum = termsEnum;
                            deState.postingsEnum = postingsEnum;
                        }
                        if (intersectsCheck) {
                            c = searcher.intersects(docs, deState) ? 1 : 0;
                        } else {
                            c = searcher.numDocs(docs, deState);
                        }
                        postingsEnum = deState.postingsEnum;
                    } else {
                        // iterate over TermDocs to calculate the intersection
                        // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it matter for this?
                        // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
                        // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
                        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                        c = 0;
                        if (postingsEnum instanceof MultiPostingsEnum) {
                            MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
                            int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                            SEGMENTS_LOOP: for (int subindex = 0; subindex < numSubs; subindex++) {
                                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                                if (sub.postingsEnum == null)
                                    continue;
                                int base = sub.slice.start;
                                int docid;
                                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                    if (fastForRandomSet.exists(docid + base)) {
                                        c++;
                                        if (intersectsCheck) {
                                            assert c == 1;
                                            break SEGMENTS_LOOP;
                                        }
                                    }
                                }
                            }
                        } else {
                            int docid;
                            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid)) {
                                    c++;
                                    if (intersectsCheck) {
                                        assert c == 1;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if (sortByCount) {
                        if (c > min) {
                            BytesRef termCopy = BytesRef.deepCopyOf(term);
                            queue.add(new CountPair<>(termCopy, c));
                            if (queue.size() >= maxsize)
                                min = queue.last().val;
                        }
                    } else {
                        if (c >= mincount && --off < 0) {
                            if (--lim < 0)
                                break;
                            ft.indexedToReadable(term, charsRef);
                            res.add(charsRef.toString(), c);
                        }
                    }
                }
            }
            term = termsEnum.next();
        }
    }
    if (sortByCount) {
        for (CountPair<BytesRef, Integer> p : queue) {
            if (--off >= 0)
                continue;
            if (--lim < 0)
                break;
            ft.indexedToReadable(p.key, charsRef);
            res.add(charsRef.toString(), p.val);
        }
    }
    if (missing) {
        res.add(null, getFieldMissingCount(searcher, docs, field));
    }
    return res;
}
Also used : SortedIntDocSet(org.apache.solr.search.SortedIntDocSet) HashDocSet(org.apache.solr.search.HashDocSet) TermsEnum(org.apache.lucene.index.TermsEnum) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) MultiPostingsEnum(org.apache.lucene.index.MultiPostingsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) LeafReader(org.apache.lucene.index.LeafReader) NamedList(org.apache.solr.common.util.NamedList) Terms(org.apache.lucene.index.Terms) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) FieldType(org.apache.solr.schema.FieldType) Fields(org.apache.lucene.index.Fields) IndexSchema(org.apache.solr.schema.IndexSchema) DocSet(org.apache.solr.search.DocSet) BitDocSet(org.apache.solr.search.BitDocSet)
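Per term, the method above chooses between two counting strategies: terms whose docFreq is at or above facet.enum.cache.minDf are counted through the filterCache (searcher.numDocs with a DocsEnumState), while rarer terms are counted by walking their postings and probing the random-access fastForRandomSet. A condensed sketch of that decision (a simplification, not the actual SimpleFacets code; it omits the MultiPostingsEnum and intersectsCheck branches shown above, and takes a pre-built deState as a parameter):

// Simplified per-term count mirroring the branch above; imports match the original example
private int countForTerm(SolrIndexSearcher searcher, DocSet docs, DocSet fastForRandomSet,
                         TermsEnum termsEnum, SolrIndexSearcher.DocsEnumState deState,
                         int minDfFilterCache) throws IOException {
    if (termsEnum.docFreq() >= minDfFilterCache) {
        // frequent term: let the searcher do a (potentially cached) set intersection
        return searcher.numDocs(docs, deState);
    }
    // rare term: iterate the postings and test membership in the random-access set
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
    int c = 0;
    int docid;
    while ((docid = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (fastForRandomSet.exists(docid)) {
            c++;
        }
    }
    return c;
}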

Example 9 with DocSet

Use of org.apache.solr.search.DocSet in project lucene-solr by apache.

The class SimpleFacets, method parseParams.

protected ParsedParams parseParams(String type, String param) throws SyntaxError, IOException {
    SolrParams localParams = QueryParsing.getLocalParams(param, req.getParams());
    DocSet docs = docsOrig;
    String facetValue = param;
    String key = param;
    List<String> tags = Collections.emptyList();
    int threads = -1;
    if (localParams == null) {
        SolrParams params = global;
        SolrParams required = new RequiredSolrParams(params);
        return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
    }
    SolrParams params = SolrParams.wrapDefaults(localParams, global);
    SolrParams required = new RequiredSolrParams(params);
    // remove local params unless it's a query
    if (type != FacetParams.FACET_QUERY) {
        // TODO Cut over to an Enum here
        facetValue = localParams.get(CommonParams.VALUE);
    }
    // reset the default key now that localParams have been removed
    key = facetValue;
    // allow explicit set of the key
    key = localParams.get(CommonParams.OUTPUT_KEY, key);
    String tagStr = localParams.get(CommonParams.TAG);
    tags = tagStr == null ? Collections.<String>emptyList() : StrUtils.splitSmart(tagStr, ',');
    String threadStr = localParams.get(CommonParams.THREADS);
    if (threadStr != null) {
        threads = Integer.parseInt(threadStr);
    }
    // figure out if we need a new base DocSet
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    if (excludeStr == null)
        return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
    List<String> excludeTagList = StrUtils.splitSmart(excludeStr, ',');
    docs = computeDocSet(docs, excludeTagList);
    return new ParsedParams(localParams, params, required, facetValue, docs, key, tags, threads);
}
Also used : RequiredSolrParams(org.apache.solr.common.params.RequiredSolrParams) SolrParams(org.apache.solr.common.params.SolrParams) HashDocSet(org.apache.solr.search.HashDocSet) DocSet(org.apache.solr.search.DocSet) SortedIntDocSet(org.apache.solr.search.SortedIntDocSet) BitDocSet(org.apache.solr.search.BitDocSet)
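parseParams is what makes the standard local-params faceting syntax work: key renames the facet in the response, ex recomputes the base DocSet without the filters carrying the named tags (via computeDocSet), and threads controls parallelism. A small illustration of the request-side parameters it parses (field and tag names are made up for the example):

// Hypothetical request illustrating the local-params syntax handled by parseParams above
// (org.apache.solr.common.params.ModifiableSolrParams)
ModifiableSolrParams p = new ModifiableSolrParams();
p.add("fq", "{!tag=COLOR}color:blue");                 // tagged filter query
p.add("facet", "true");
p.add("facet.field", "{!ex=COLOR key=colors}color");   // exclude the tagged fq, rename the facet to "colors"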

Example 10 with DocSet

Use of org.apache.solr.search.DocSet in project lucene-solr by apache.

The class HighlighterTest, method payloadFilteringSpanQuery.

@Test
public void payloadFilteringSpanQuery() throws IOException {
    clearIndex();
    String FIELD_NAME = "payloadDelimited";
    assertU(adoc("id", "0", FIELD_NAME, "word|7 word|2"));
    assertU(commit());
    //We search at a lower level than typical Solr tests because there's no QParser for payloads
    //Create query matching this payload
    // payload bytes for the integer 7
    Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "word")),
            Collections.singletonList(new BytesRef(new byte[] { 0, 0, 0, 7 })));
    //invoke highlight component... the hard way
    final SearchComponent hlComp = h.getCore().getSearchComponent("highlight");
    SolrQueryRequest req = req("hl", "true", "hl.fl", FIELD_NAME, HighlightParams.USE_PHRASE_HIGHLIGHTER, "true");
    try {
        SolrQueryResponse resp = new SolrQueryResponse();
        ResponseBuilder rb = new ResponseBuilder(req, resp, Collections.singletonList(hlComp));
        rb.setHighlightQuery(query);
        rb.setResults(req.getSearcher().getDocListAndSet(query, (DocSet) null, null, 0, 1));
        //highlight:
        hlComp.prepare(rb);
        hlComp.process(rb);
        //inspect response
        final String[] snippets = (String[]) resp.getValues().findRecursive("highlighting", "0", FIELD_NAME);
        assertEquals("<em>word|7</em> word|2", snippets[0]);
    } finally {
        req.close();
    }
}
Also used : SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) SolrQueryResponse(org.apache.solr.response.SolrQueryResponse) Query(org.apache.lucene.search.Query) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) SpanPayloadCheckQuery(org.apache.lucene.queries.payloads.SpanPayloadCheckQuery) SearchComponent(org.apache.solr.handler.component.SearchComponent) Term(org.apache.lucene.index.Term) ResponseBuilder(org.apache.solr.handler.component.ResponseBuilder) BytesRef(org.apache.lucene.util.BytesRef) DocSet(org.apache.solr.search.DocSet) Test(org.junit.Test)
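The byte array { 0, 0, 0, 7 } is the 4-byte big-endian encoding of the int 7, which is how an integer payload for the delimited token "word|7" ends up stored (the test's inline comment, "bytes for integer 7", refers to this; the layout matches what Lucene's PayloadHelper.encodeInt produces). A minimal sketch of that encoding (hypothetical helper name):

// Hypothetical helper showing why { 0, 0, 0, 7 } matches the payload of "word|7"
static byte[] encodeIntPayload(int value) {
    return new byte[] {
        (byte) (value >>> 24),
        (byte) (value >>> 16),
        (byte) (value >>> 8),
        (byte) value
    };
}
// encodeIntPayload(7) -> { 0, 0, 0, 7 }, the bytes wrapped in the BytesRef passed to SpanPayloadCheckQuery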

Aggregations

DocSet (org.apache.solr.search.DocSet): 24
BitDocSet (org.apache.solr.search.BitDocSet): 12
Query (org.apache.lucene.search.Query): 9
HashDocSet (org.apache.solr.search.HashDocSet): 6
SolrIndexSearcher (org.apache.solr.search.SolrIndexSearcher): 6
SortedIntDocSet (org.apache.solr.search.SortedIntDocSet): 6
SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap): 5
FieldType (org.apache.solr.schema.FieldType): 5
ArrayList (java.util.ArrayList): 4
BytesRef (org.apache.lucene.util.BytesRef): 4
NamedList (org.apache.solr.common.util.NamedList): 4
SchemaField (org.apache.solr.schema.SchemaField): 4
IdentityHashMap (java.util.IdentityHashMap): 3
Map (java.util.Map): 3
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 3
Term (org.apache.lucene.index.Term): 3
SolrException (org.apache.solr.common.SolrException): 3
SolrParams (org.apache.solr.common.params.SolrParams): 3
QParser (org.apache.solr.search.QParser): 3
SyntaxError (org.apache.solr.search.SyntaxError): 3