
Example 21 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project (Apache).

From the class BaseDocValuesFormatTestCase, method doTestRandomAdvance.

private void doTestRandomAdvance(FieldCreator fieldCreator) throws IOException {
    Analyzer analyzer = new MockAnalyzer(random());
    Directory directory = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
    conf.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), directory, conf);
    int numChunks = atLeast(10);
    int id = 0;
    Set<Integer> missingSet = new HashSet<>();
    for (int i = 0; i < numChunks; i++) {
        // change sparseness for each chunk
        double sparseChance = random().nextDouble();
        int docCount = atLeast(1000);
        for (int j = 0; j < docCount; j++) {
            Document doc = new Document();
            doc.add(new StoredField("id", id));
            if (random().nextDouble() > sparseChance) {
                doc.add(fieldCreator.next());
            } else {
                missingSet.add(id);
            }
            id++;
            w.addDocument(doc);
        }
    }
    if (random().nextBoolean()) {
        w.forceMerge(1);
    }
    // Now search the index:
    IndexReader r = w.getReader();
    BitSet missing = new FixedBitSet(r.maxDoc());
    for (int docID = 0; docID < r.maxDoc(); docID++) {
        Document doc = r.document(docID);
        if (missingSet.contains(doc.getField("id").numericValue())) {
            missing.set(docID);
        }
    }
    for (int iter = 0; iter < 100; iter++) {
        DocIdSetIterator values = fieldCreator.iterator(r);
        assertEquals(-1, values.docID());
        while (true) {
            int docID;
            if (random().nextBoolean()) {
                docID = values.nextDoc();
            } else {
                int range;
                if (random().nextInt(10) == 7) {
                    // big jump
                    range = r.maxDoc() - values.docID();
                } else {
                    // small jump
                    range = 25;
                }
                int inc = TestUtil.nextInt(random(), 1, range);
                docID = values.advance(values.docID() + inc);
            }
            if (docID == NO_MORE_DOCS) {
                break;
            }
            assertFalse(missing.get(docID));
        }
    }
    IOUtils.close(r, w, directory);
}
Also used: BitSet (org.apache.lucene.util.BitSet) FixedBitSet (org.apache.lucene.util.FixedBitSet) MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) Analyzer (org.apache.lucene.analysis.Analyzer) Document (org.apache.lucene.document.Document) StoredField (org.apache.lucene.document.StoredField) DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator) Directory (org.apache.lucene.store.Directory) HashSet (java.util.HashSet)
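
For reference, a minimal sketch (not taken from the project source; the consume helper name is illustrative) of the iterator contract the random-advance test above exercises: docID() starts at -1, nextDoc() and advance(target) only move forward, and both return NO_MORE_DOCS once the iterator is exhausted.

private static void consume(DocIdSetIterator values) throws IOException {
    // unpositioned until the first nextDoc()/advance() call
    assert values.docID() == -1;
    for (int docID = values.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = values.nextDoc()) {
        // process docID here; advance(target) may be used instead of nextDoc()
        // to jump to the first document >= target, as the test does at random
    }
}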

Example 22 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project (Apache).

From the class DocValuesFacets, method getCounts.

public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, FacetDebugInfo fdebug) throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<>();
    // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
    final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
    // for term lookups only
    final SortedSetDocValues si;
    // for mapping per-segment ords to global ones
    OrdinalMap ordinalMap = null;
    if (multiValued) {
        si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
        if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
            ordinalMap = ((MultiSortedSetDocValues) si).mapping;
        }
    } else {
        SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
        si = single == null ? null : DocValues.singleton(single);
        if (single instanceof MultiDocValues.MultiSortedDocValues) {
            ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
        }
    }
    if (si == null) {
        return finalize(res, searcher, schemaField, docs, -1, missing);
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
        throw new UnsupportedOperationException("Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
    }
    final BytesRefBuilder prefixRef;
    if (prefix == null) {
        prefixRef = null;
    } else if (prefix.length() == 0) {
        prefix = null;
        prefixRef = null;
    } else {
        prefixRef = new BytesRefBuilder();
        prefixRef.copyChars(prefix);
    }
    int startTermIndex, endTermIndex;
    if (prefix != null) {
        startTermIndex = (int) si.lookupTerm(prefixRef.get());
        if (startTermIndex < 0)
            startTermIndex = -startTermIndex - 1;
        prefixRef.append(UnicodeUtil.BIG_TERM);
        endTermIndex = (int) si.lookupTerm(prefixRef.get());
        assert endTermIndex < 0;
        endTermIndex = -endTermIndex - 1;
    } else {
        startTermIndex = -1;
        endTermIndex = (int) si.getValueCount();
    }
    final int nTerms = endTermIndex - startTermIndex;
    int missingCount = -1;
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    if (nTerms > 0 && docs.size() >= mincount) {
        // count collection array only needs to be as big as the number of terms we are
        // going to collect counts for.
        final int[] counts = new int[nTerms];
        if (fdebug != null) {
            fdebug.putInfoItem("numBuckets", nTerms);
        }
        Filter filter = docs.getTopFilter();
        List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
        for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
            LeafReaderContext leaf = leaves.get(subIndex);
            // solr docsets already exclude any deleted docs
            DocIdSet dis = filter.getDocIdSet(leaf, null);
            DocIdSetIterator disi = null;
            if (dis != null) {
                disi = dis.iterator();
            }
            if (disi != null) {
                if (multiValued) {
                    SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
                    if (sub == null) {
                        sub = DocValues.emptySortedSet();
                    }
                    final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
                    if (singleton != null) {
                        // some codecs may optimize SORTED_SET storage for single-valued fields
                        accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
                    } else {
                        accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
                    }
                } else {
                    SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
                    if (sub == null) {
                        sub = DocValues.emptySorted();
                    }
                    accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
                }
            }
        }
        if (startTermIndex == -1) {
            missingCount = counts[0];
        }
        // IDEA: we could also maintain a count of "other"... everything that fell outside
        // of the top 'N'
        int off = offset;
        int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
        if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
            int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
            maxsize = Math.min(maxsize, nTerms);
            LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);
            // the smallest value in the top 'N' values
            int min = mincount - 1;
            for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
                int c = counts[i];
                if (c > min) {
                    if (termFilter != null) {
                        final BytesRef term = si.lookupOrd(startTermIndex + i);
                        if (!termFilter.test(term)) {
                            continue;
                        }
                    }
                    // smaller term numbers sort higher, so subtract the term number instead
                    long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
                    boolean displaced = queue.insert(pair);
                    if (displaced)
                        min = (int) (queue.top() >>> 32);
                }
            }
            // if we are deep paging, we don't have to order the highest "offset" counts.
            int collectCount = Math.max(0, queue.size() - off);
            assert collectCount <= lim;
            // the start and end indexes of our list "sorted" (starting with the highest value)
            int sortedIdxStart = queue.size() - (collectCount - 1);
            int sortedIdxEnd = queue.size() + 1;
            final long[] sorted = queue.sort(collectCount);
            for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
                long pair = sorted[i];
                int c = (int) (pair >>> 32);
                int tnum = Integer.MAX_VALUE - (int) pair;
                final BytesRef term = si.lookupOrd(startTermIndex + tnum);
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
            }
        } else {
            // add results in index order
            int i = (startTermIndex == -1) ? 1 : 0;
            if (mincount <= 0 && termFilter == null) {
                // if mincount<=0 and we're not examining the values for the term filter, then
                // we won't discard any terms and we know exactly where to start.
                i += off;
                off = 0;
            }
            for (; i < nTerms; i++) {
                int c = counts[i];
                if (c < mincount)
                    continue;
                BytesRef term = null;
                if (termFilter != null) {
                    term = si.lookupOrd(startTermIndex + i);
                    if (!termFilter.test(term)) {
                        continue;
                    }
                }
                if (--off >= 0)
                    continue;
                if (--lim < 0)
                    break;
                if (term == null) {
                    term = si.lookupOrd(startTermIndex + i);
                }
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
            }
        }
    }
    return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
Also used: MultiSortedSetDocValues (org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues) DocIdSet (org.apache.lucene.search.DocIdSet) MultiDocValues (org.apache.lucene.index.MultiDocValues) OrdinalMap (org.apache.lucene.index.MultiDocValues.OrdinalMap) LeafReaderContext (org.apache.lucene.index.LeafReaderContext) CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder) BytesRef (org.apache.lucene.util.BytesRef) BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) LongPriorityQueue (org.apache.solr.util.LongPriorityQueue) NamedList (org.apache.solr.common.util.NamedList) SortedDocValues (org.apache.lucene.index.SortedDocValues) FieldType (org.apache.solr.schema.FieldType) SchemaField (org.apache.solr.schema.SchemaField) SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues) Filter (org.apache.solr.search.Filter) DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator)
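
The per-segment counting above happens in accumSingle and accumMulti, which are not shown. As a hedged sketch only (illustrative names, not the exact Solr implementation), a single-valued accumulator would walk the DocIdSetIterator, read each document's segment-local ordinal from the SortedDocValues, map it to a global ordinal through the OrdinalMap when one is present, and increment the matching counts slot (LongValues here is org.apache.lucene.util.LongValues):

static void accumSingleSketch(int[] counts, int startTermIndex, SortedDocValues dv,
                              DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
    LongValues toGlobal = map == null ? null : map.getGlobalOrds(subIndex);
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        // -1 means the document has no value for the field
        int term = -1;
        if (dv.advanceExact(doc)) {
            // segment-local ordinal, mapped to the global ordinal space if needed
            term = dv.ordValue();
            if (toGlobal != null) {
                term = (int) toGlobal.get(term);
            }
        }
        // with startTermIndex == -1, missing documents land in counts[0]
        int arrIdx = term - startTermIndex;
        if (arrIdx >= 0 && arrIdx < counts.length) {
            counts[arrIdx]++;
        }
    }
}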

Example 23 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project (Apache).

From the class DocValuesStats, method getCounts.

public static StatsValues getCounts(SolrIndexSearcher searcher, StatsField statsField, DocSet docs, String[] facet) throws IOException {
    final SchemaField schemaField = statsField.getSchemaField();
    assert null != statsField.getSchemaField() : "DocValuesStats requires a StatsField using a SchemaField";
    final String fieldName = schemaField.getName();
    final FieldType ft = schemaField.getType();
    final StatsValues res = StatsValuesFactory.createStatsValues(statsField);
    //Initialize facetstats, if facets have been passed in
    final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
    int upto = 0;
    for (String facetField : facet) {
        SchemaField fsf = searcher.getSchema().getField(facetField);
        if (fsf.multiValued()) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stats can only facet on single-valued fields, not: " + facetField);
        }
        SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
        facetStats[upto++] = new FieldFacetStats(searcher, facetSchemaField, statsField);
    }
    // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
    final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();
    // for term lookups only
    SortedSetDocValues si;
    // for mapping per-segment ords to global ones
    OrdinalMap ordinalMap = null;
    if (multiValued) {
        si = searcher.getSlowAtomicReader().getSortedSetDocValues(fieldName);
        if (si instanceof MultiSortedSetDocValues) {
            ordinalMap = ((MultiDocValues.MultiSortedSetDocValues) si).mapping;
        }
    } else {
        SortedDocValues single = searcher.getSlowAtomicReader().getSortedDocValues(fieldName);
        si = single == null ? null : DocValues.singleton(single);
        if (single instanceof MultiDocValues.MultiSortedDocValues) {
            ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
        }
    }
    if (si == null) {
        si = DocValues.emptySortedSet();
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
        throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
    }
    int missingDocCountTotal = 0;
    final int nTerms = (int) si.getValueCount();
    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];
    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
        LeafReaderContext leaf = leaves.get(subIndex);
        // solr docsets already exclude any deleted docs
        DocIdSet dis = filter.getDocIdSet(leaf, null);
        DocIdSetIterator disi = null;
        if (dis != null) {
            disi = dis.iterator();
        }
        if (disi != null) {
            int docBase = leaf.docBase;
            if (multiValued) {
                SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
                if (sub == null) {
                    sub = DocValues.emptySortedSet();
                }
                SortedDocValues singleton = DocValues.unwrapSingleton(sub);
                if (singleton != null) {
                    // some codecs may optimize SORTED_SET storage for single-valued fields
                    missingDocCountTotal += accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
                } else {
                    missingDocCountTotal += accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
                }
            } else {
                SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
                if (sub == null) {
                    sub = DocValues.emptySorted();
                }
                missingDocCountTotal += accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
            }
        }
    }
    // add results in index order
    for (int ord = 0; ord < counts.length; ord++) {
        int count = counts[ord];
        if (count > 0) {
            final BytesRef value = si.lookupOrd(ord);
            res.accumulate(value, count);
            for (FieldFacetStats f : facetStats) {
                f.accumulateTermNum(ord, value);
            }
        }
    }
    res.addMissing(missingDocCountTotal);
    if (facetStats.length > 0) {
        for (FieldFacetStats f : facetStats) {
            Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
            f.accumulateMissing();
            res.addFacet(f.name, facetStatsValues);
        }
    }
    return res;
}
Also used: MultiSortedSetDocValues (org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues) DocIdSet (org.apache.lucene.search.DocIdSet) MultiDocValues (org.apache.lucene.index.MultiDocValues) OrdinalMap (org.apache.lucene.index.MultiDocValues.OrdinalMap) SortedDocValues (org.apache.lucene.index.SortedDocValues) FieldType (org.apache.solr.schema.FieldType) SchemaField (org.apache.solr.schema.SchemaField) FieldFacetStats (org.apache.solr.handler.component.FieldFacetStats) SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues) Filter (org.apache.solr.search.Filter) LeafReaderContext (org.apache.lucene.index.LeafReaderContext) StatsValues (org.apache.solr.handler.component.StatsValues) DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator) SolrException (org.apache.solr.common.SolrException) BytesRef (org.apache.lucene.util.BytesRef)
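
The multi-valued counterpart, again as a hedged sketch rather than the actual accumMulti in Solr, iterates every ordinal of each matching document via SortedSetDocValues.nextOrd() and maps each one through the OrdinalMap before counting; counts here is indexed directly by global ordinal, matching the nTerms-sized array above:

static int accumMultiSketch(int[] counts, SortedSetDocValues dv, DocIdSetIterator disi,
                            int subIndex, OrdinalMap map) throws IOException {
    LongValues toGlobal = map == null ? null : map.getGlobalOrds(subIndex);
    int missingCount = 0;
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        if (!dv.advanceExact(doc)) {
            // document has no value for the field
            missingCount++;
            continue;
        }
        for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
            int globalOrd = toGlobal == null ? (int) ord : (int) toGlobal.get(ord);
            // every ordinal of every matching document contributes one count
            counts[globalOrd]++;
        }
    }
    return missingCount;
}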

Example 24 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project (Apache).

From the class IntervalFacets, method getCountString.

private void getCountString() throws IOException {
    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
        LeafReaderContext leaf = leaves.get(subIndex);
        // solr docsets already exclude any deleted docs
        DocIdSet dis = filter.getDocIdSet(leaf, null);
        if (dis == null) {
            continue;
        }
        DocIdSetIterator disi = dis.iterator();
        if (disi != null) {
            if (schemaField.multiValued()) {
                SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(schemaField.getName());
                if (sub == null) {
                    continue;
                }
                final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
                if (singleton != null) {
                    // some codecs may optimize SORTED_SET storage for single-valued fields
                    accumIntervalsSingle(singleton, disi, dis.bits());
                } else {
                    accumIntervalsMulti(sub, disi, dis.bits());
                }
            } else {
                SortedDocValues sub = leaf.reader().getSortedDocValues(schemaField.getName());
                if (sub == null) {
                    continue;
                }
                accumIntervalsSingle(sub, disi, dis.bits());
            }
        }
    }
}
Also used : SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) Filter(org.apache.solr.search.Filter) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) DocIdSet(org.apache.lucene.search.DocIdSet) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator) SortedDocValues(org.apache.lucene.index.SortedDocValues)
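
Unlike the previous examples, the accumIntervalsSingle/accumIntervalsMulti calls above also receive dis.bits(), the optional random-access view of the same DocIdSet. A small hedged sketch (not project code) of how the two views relate:

DocIdSet dis = filter.getDocIdSet(leaf, null);
if (dis != null) {
    // bits() may return null; random access is optional for a DocIdSet
    Bits bits = dis.bits();
    // iterator() may also return null for an empty set
    DocIdSetIterator disi = dis.iterator();
    if (disi != null) {
        for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
            // when bits is non-null it answers membership without consuming the iterator
            assert bits == null || bits.get(doc);
        }
    }
}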

Example 25 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project (Apache).

From the class TestDocIdSetBuilder, method testRandom.

public void testRandom() throws IOException {
    final int maxDoc = TestUtil.nextInt(random(), 1, 10000000);
    for (int i = 1; i < maxDoc / 2; i <<= 1) {
        final int numDocs = TestUtil.nextInt(random(), 1, i);
        final FixedBitSet docs = new FixedBitSet(maxDoc);
        int c = 0;
        while (c < numDocs) {
            final int d = random().nextInt(maxDoc);
            if (docs.get(d) == false) {
                docs.set(d);
                c += 1;
            }
        }
        final int[] array = new int[numDocs + random().nextInt(100)];
        DocIdSetIterator it = new BitSetIterator(docs, 0L);
        int j = 0;
        for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
            array[j++] = doc;
        }
        assertEquals(numDocs, j);
        // add some duplicates
        while (j < array.length) {
            array[j++] = array[random().nextInt(numDocs)];
        }
        // shuffle
        for (j = array.length - 1; j >= 1; --j) {
            final int k = random().nextInt(j);
            int tmp = array[j];
            array[j] = array[k];
            array[k] = tmp;
        }
        // add docs out of order
        DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
        for (j = 0; j < array.length; ) {
            final int l = TestUtil.nextInt(random(), 1, array.length - j);
            DocIdSetBuilder.BulkAdder adder = null;
            for (int k = 0, budget = 0; k < l; ++k) {
                if (budget == 0 || rarely()) {
                    budget = TestUtil.nextInt(random(), 1, l - k + 5);
                    adder = builder.grow(budget);
                }
                adder.add(array[j++]);
                budget--;
            }
        }
        final DocIdSet expected = new BitDocIdSet(docs);
        final DocIdSet actual = builder.build();
        assertEquals(expected, actual);
    }
}
Also used : DocIdSet(org.apache.lucene.search.DocIdSet) DocIdSetIterator(org.apache.lucene.search.DocIdSetIterator)
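
A minimal usage sketch of DocIdSetBuilder itself, assuming a hypothetical int[] docIDs and an int maxDoc (both names are illustrative): reserve capacity with grow(), add documents in any order (duplicates are allowed), and call build() exactly once.

DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
// budget for the adds below; grow() may be called again if more space is needed
DocIdSetBuilder.BulkAdder adder = builder.grow(docIDs.length);
for (int doc : docIDs) {
    // out-of-order and duplicate docIDs are accepted
    adder.add(doc);
}
// the builder must not be reused after build()
DocIdSet set = builder.build();
// iterates the deduplicated docIDs in increasing order
DocIdSetIterator it = set.iterator();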

Aggregations

DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator): 68
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 28
Scorer (org.apache.lucene.search.Scorer): 15
DocIdSet (org.apache.lucene.search.DocIdSet): 14
Weight (org.apache.lucene.search.Weight): 12
ConstantScoreScorer (org.apache.lucene.search.ConstantScoreScorer): 10
BitSet (org.apache.lucene.util.BitSet): 10
Bits (org.apache.lucene.util.Bits): 10
BytesRef (org.apache.lucene.util.BytesRef): 10
IOException (java.io.IOException): 9
MatchingDocs (org.apache.lucene.facet.FacetsCollector.MatchingDocs): 8
IndexSearcher (org.apache.lucene.search.IndexSearcher): 8
SortedDocValues (org.apache.lucene.index.SortedDocValues): 7
ConstantScoreWeight (org.apache.lucene.search.ConstantScoreWeight): 7
TwoPhaseIterator (org.apache.lucene.search.TwoPhaseIterator): 7
SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues): 6
Query (org.apache.lucene.search.Query): 6
Document (org.apache.lucene.document.Document): 5
IndexReader (org.apache.lucene.index.IndexReader): 5
LeafReader (org.apache.lucene.index.LeafReader): 5