
Example 61 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project by Apache.

From the class PointInSetIncludingScoreQuery, method createWeight.

@Override
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new Weight(this) {

        @Override
        public void extractTerms(Set<Term> terms) {
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            Scorer scorer = scorer(context);
            if (scorer != null) {
                int target = scorer.iterator().advance(doc);
                if (doc == target) {
                    return Explanation.match(scorer.score(), "A match");
                }
            }
            return Explanation.noMatch("Not a match");
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            LeafReader reader = context.reader();
            FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
            if (fieldInfo == null) {
                return null;
            }
            if (fieldInfo.getPointDimensionCount() != 1) {
                throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=1");
            }
            if (fieldInfo.getPointNumBytes() != bytesPerDim) {
                throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
            }
            PointValues values = reader.getPointValues(field);
            if (values == null) {
                return null;
            }
            FixedBitSet result = new FixedBitSet(reader.maxDoc());
            float[] scores = new float[reader.maxDoc()];
            // The visitor intersects the query's points with the indexed point values,
            // marking matching docs in 'result' and recording a score per matching doc.
            values.intersect(new MergePointVisitor(sortedPackedPoints, result, scores));
            return new Scorer(this) {

                DocIdSetIterator disi = new BitSetIterator(result, 10L);

                @Override
                public float score() throws IOException {
                    return scores[docID()];
                }

                @Override
                public int freq() throws IOException {
                    return 1;
                }

                @Override
                public int docID() {
                    return disi.docID();
                }

                @Override
                public DocIdSetIterator iterator() {
                    return disi;
                }
            };
        }
    };
}
Also used: BitSetIterator (org.apache.lucene.util.BitSetIterator), FixedBitSet (org.apache.lucene.util.FixedBitSet), Set (java.util.Set), LeafReader (org.apache.lucene.index.LeafReader), Scorer (org.apache.lucene.search.Scorer), Weight (org.apache.lucene.search.Weight), LongPoint (org.apache.lucene.document.LongPoint), DoublePoint (org.apache.lucene.document.DoublePoint), IntPoint (org.apache.lucene.document.IntPoint), FloatPoint (org.apache.lucene.document.FloatPoint), PointValues (org.apache.lucene.index.PointValues), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator), FieldInfo (org.apache.lucene.index.FieldInfo)
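
As a side note (not part of the Lucene source above), here is a minimal sketch of how a Scorer built this way is typically consumed: exhaust its DocIdSetIterator with nextDoc() and read the score at each matching document. The drain method name is illustrative only; it assumes org.apache.lucene.search.Scorer, org.apache.lucene.search.DocIdSetIterator and java.io.IOException are imported.

static void drain(Scorer scorer) throws IOException {
    DocIdSetIterator disi = scorer.iterator();
    // nextDoc() walks the set bits of the FixedBitSet; NO_MORE_DOCS signals exhaustion
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        float score = scorer.score();
        // consume (doc, score) here; in the example above score() reads scores[doc]
    }
}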

Example 62 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project by Apache.

From the class AbstractPrefixTreeQuery, method createWeight.

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            DocIdSet docSet = getDocIdSet(context);
            if (docSet == null) {
                return null;
            }
            DocIdSetIterator disi = docSet.iterator();
            if (disi == null) {
                return null;
            }
            return new ConstantScoreScorer(this, score(), disi);
        }
    };
}
Also used: ConstantScoreScorer (org.apache.lucene.search.ConstantScoreScorer), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), DocIdSet (org.apache.lucene.search.DocIdSet), DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator), ConstantScoreWeight (org.apache.lucene.search.ConstantScoreWeight)
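
For context, a hedged sketch of the simplest kind of DocIdSet that a getDocIdSet implementation could hand back here: matching docs marked in a FixedBitSet and wrapped in a BitDocIdSet (org.apache.lucene.util.BitDocIdSet), whose iterator() is then consumed by ConstantScoreScorer. The buildDocIdSet name and the omitted matching logic are placeholders, not the actual AbstractPrefixTreeQuery implementation.

DocIdSet buildDocIdSet(LeafReaderContext context) throws IOException {
    FixedBitSet bits = new FixedBitSet(context.reader().maxDoc());
    // ... set a bit for every matching doc in this segment (details omitted) ...
    // BitDocIdSet.iterator() returns a BitSetIterator over the set bits
    return new BitDocIdSet(bits);
}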

Example 63 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project by Apache.

From the class TestRandomSamplingFacetsCollector, method testRandomSampling.

public void testRandomSampling() throws Exception {
    Directory dir = newDirectory();
    Directory taxoDir = newDirectory();
    Random random = random();
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(random, dir);
    FacetsConfig config = new FacetsConfig();
    final int numCategories = 10;
    int numDocs = atLeast(10000);
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
        writer.addDocument(config.build(taxoWriter, doc));
    }
    writer.forceMerge(CHI_SQUARE_VALUES.length - 1);
    // NRT open
    IndexSearcher searcher = newSearcher(writer.getReader());
    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.close(writer, taxoWriter);
    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
    // There should be no divisions by zero
    searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
    // There should be no divisions by zero and no null result
    assertNotNull(collectRandomZeroResults.getMatchingDocs());
    // There should be no results at all
    for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
        assertEquals(0, doc.totalHits);
    }
    // Now start searching and retrieve results.
    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
    // 10% of total docs, 20% of the hits
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
    FacetsCollector fc = new FacetsCollector();
    searcher.search(query, MultiCollector.wrap(fc, random10Percent));
    final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();
    // count the total hits and sampled docs, also store the number of sampled
    // docs per segment
    int totalSampledDocs = 0, totalHits = 0;
    int[] numSampledDocs = new int[matchingDocs.size()];
    //    System.out.println("numSegments=" + numSampledDocs.length);
    for (int i = 0; i < numSampledDocs.length; i++) {
        MatchingDocs md = matchingDocs.get(i);
        final DocIdSetIterator iter = md.bits.iterator();
        while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i];
        totalSampledDocs += numSampledDocs[i];
        totalHits += md.totalHits;
    }
    // compute the chi-square value for the sampled documents' distribution
    float chi_square = 0;
    for (int i = 0; i < numSampledDocs.length; i++) {
        MatchingDocs md = matchingDocs.get(i);
        float ei = (float) md.totalHits / totalHits;
        if (ei > 0.0f) {
            float oi = (float) numSampledDocs[i] / totalSampledDocs;
            chi_square += (Math.pow(ei - oi, 2) / ei);
        }
    }
    // Verify that the chi-square value isn't too big. According to
    // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
    // we basically verify that there is a really small chance of hitting a very
    // bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends
    // on the number of segments.
    assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);
    // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
    final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
    final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
    for (int i = 0; i < amortized10Result.labelValues.length; i++) {
        LabelAndValue amortized = amortized10Result.labelValues[i];
        LabelAndValue sampled = random10Result.labelValues[i];
        // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count 
        assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
    }
    IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), TermQuery (org.apache.lucene.search.TermQuery), FastTaxonomyFacetCounts (org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts), TaxonomyReader (org.apache.lucene.facet.taxonomy.TaxonomyReader), DirectoryTaxonomyReader (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader), MatchingDocs (org.apache.lucene.facet.FacetsCollector.MatchingDocs), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter), Random (java.util.Random), StringField (org.apache.lucene.document.StringField), DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
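
The per-segment counting loop in the test above is the standard exhaust-the-iterator idiom. Pulled out on its own it might look like the following sketch; countDocs is a hypothetical helper, not part of the test.

static int countDocs(DocIdSetIterator iter) throws IOException {
    int count = 0;
    // nextDoc() returns NO_MORE_DOCS once the underlying doc id set is exhausted
    while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        count++;
    }
    return count;
}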

Example 64 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project by Apache.

From the class UnifiedHighlighter, method highlightFieldsAsObjects.

/**
   * Expert: highlights the top-N passages from multiple fields,
   * for the provided int[] docids, to custom Object as
   * returned by the {@link PassageFormatter}.  Use
   * this API to render to something other than String.
   *
   * @param fieldsIn      field names to highlight. Must have a stored string value.
   * @param query         query to highlight.
   * @param docIdsIn      containing the document IDs to highlight.
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
   *                      form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets
   * corresponding to the documents in <code>docIdsIn</code>.
   * If no highlights were found for a document, the
   * first {@code maxPassages} from the field will
   * be returned.
   * @throws IOException              if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *                                  {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
protected Map<String, Object[]> highlightFieldsAsObjects(String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
    if (fieldsIn.length < 1) {
        throw new IllegalArgumentException("fieldsIn must not be empty");
    }
    if (fieldsIn.length != maxPassagesIn.length) {
        throw new IllegalArgumentException("invalid number of maxPassagesIn");
    }
    if (searcher == null) {
        throw new IllegalStateException("This method requires that an indexSearcher was passed in the " + "constructor.  Perhaps you mean to call highlightWithoutSearcher?");
    }
    // Sort docs & fields for sequential i/o
    // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
    int[] docIds = new int[docIdsIn.length];
    // fill in ascending order; points into docIdsIn[]
    int[] docInIndexes = new int[docIds.length];
    // latter 2 are "out" params
    copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes);
    // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
    final String[] fields = new String[fieldsIn.length];
    final int[] maxPassages = new int[maxPassagesIn.length];
    // latter 2 are "out" params
    copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages);
    // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
    Set<Term> queryTerms = extractTerms(query);
    FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
    int numTermVectors = 0;
    int numPostings = 0;
    for (int f = 0; f < fields.length; f++) {
        FieldHighlighter fieldHighlighter = getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
        fieldHighlighters[f] = fieldHighlighter;
        switch(fieldHighlighter.getOffsetSource()) {
            case TERM_VECTORS:
                numTermVectors++;
                break;
            case POSTINGS:
                numPostings++;
                break;
            case POSTINGS_WITH_TERM_VECTORS:
                numTermVectors++;
                numPostings++;
                break;
            case ANALYSIS:
            case NONE_NEEDED:
            default:
                //do nothing
                break;
        }
    }
    int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);
    IndexReader indexReaderWithTermVecCache = (numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
    // [fieldIdx][docIdInIndex] of highlightDoc result
    Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
    // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
    DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
    for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
        // Load the field values of the first batch of document(s) (note: commonly all docs are in this batch)
        List<CharSequence[]> fieldValsByDoc = loadFieldValues(fields, docIdIter, cacheCharsThreshold);
        // Highlight in per-field order first, then by doc (better I/O pattern)
        for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
            //parallel to docIdsIn
            Object[] resultByDocIn = highlightDocsInByField[fieldIdx];
            FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
            for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
                //sorted order
                int docId = docIds[docIdx];
                CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
                if (content == null) {
                    continue;
                }
                IndexReader indexReader = (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS && indexReaderWithTermVecCache != null) ? indexReaderWithTermVecCache : searcher.getIndexReader();
                //original input order
                int docInIndex = docInIndexes[docIdx];
                assert resultByDocIn[docInIndex] == null;
                resultByDocIn[docInIndex] = fieldHighlighter.highlightFieldForDoc(indexReader, docId, content.toString());
            }
        }
        batchDocIdx += fieldValsByDoc.size();
    }
    assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
    // TODO reconsider the return type; since this is an "advanced" method, lets not return a Map?  Notice the only
    //    caller simply iterates it to build another structure.
    // field -> object highlights parallel to docIdsIn
    Map<String, Object[]> resultMap = new HashMap<>(fields.length);
    for (int f = 0; f < fields.length; f++) {
        resultMap.put(fields[f], highlightDocsInByField[f]);
    }
    return resultMap;
}
Also used: HashMap (java.util.HashMap), Term (org.apache.lucene.index.Term), IndexReader (org.apache.lucene.index.IndexReader), DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator)
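
The asDocIdSetIterator(docIds) call above turns the sorted doc id array into an iterator that loadFieldValues can consume in batches. Below is a hedged sketch of what such a wrapper could look like, written against only the public DocIdSetIterator contract (docID/nextDoc/advance/cost); the name asDocIdSetIteratorSketch is made up, and the real UnifiedHighlighter helper may be implemented differently.

static DocIdSetIterator asDocIdSetIteratorSketch(final int[] sortedDocIds) {
    return new DocIdSetIterator() {
        int idx = -1;

        @Override
        public int docID() {
            if (idx < 0) {
                return -1; // not positioned yet
            }
            return idx >= sortedDocIds.length ? NO_MORE_DOCS : sortedDocIds[idx];
        }

        @Override
        public int nextDoc() {
            idx++;
            return docID();
        }

        @Override
        public int advance(int target) {
            // linear skip to the first doc >= target, equivalent to slowAdvance
            int doc;
            while ((doc = nextDoc()) < target) {
            }
            return doc;
        }

        @Override
        public long cost() {
            return sortedDocIds.length;
        }
    };
}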

Example 65 with DocIdSetIterator

Use of org.apache.lucene.search.DocIdSetIterator in the lucene-solr project by Apache.

From the class FacetFieldProcessorByArrayDV, method collectDocs.

@Override
protected void collectDocs() throws IOException {
    int domainSize = fcontext.base.size();
    if (nTerms <= 0 || domainSize < effectiveMincount) {
        // TODO: what about allBuckets? missing bucket?
        return;
    }
    // TODO: refactor some of this logic into a base class
    boolean countOnly = collectAcc == null && allBucketsAcc == null;
    boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
    // Are we expecting many hits per bucket?
    // FUTURE: pro-rate for nTerms?
    // FUTURE: better take into account number of values in multi-valued fields.  This info is available for indexed fields.
    // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
    // One test: 5M doc index, faceting on a single-valued field with almost 1M unique values, crossover point where global counting was slower
    // than per-segment counting was a domain of 658k docs.  At that point, top 10 buckets had 6-7 matches each.
    // this was for heap docvalues produced by UninvertingReader
    // Since these values were randomly distributed, lets round our domain multiplier up to account for less random real world data.
    long domainMultiplier = multiValuedField ? 4L : 2L;
    // +3 to increase test coverage with small tests
    boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);
    // If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
    // then collect per-segment before mapping to global ords at the end.  This will save redundant seg->global ord mappings.
    // FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
    // the docid is not used)
    boolean canDoPerSeg = countOnly && fullRange;
    boolean accumSeg = manyHitsPerBucket && canDoPerSeg;
    // internal - override perSeg heuristic
    if (freq.perSeg != null)
        accumSeg = canDoPerSeg && freq.perSeg;
    final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
    Filter filter = fcontext.base.getTopFilter();
    for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
        LeafReaderContext subCtx = leaves.get(subIdx);
        setNextReaderFirstPhase(subCtx);
        // solr docsets already exclude any deleted docs
        DocIdSet dis = filter.getDocIdSet(subCtx, null);
        DocIdSetIterator disi = dis.iterator();
        SortedDocValues singleDv = null;
        SortedSetDocValues multiDv = null;
        if (multiValuedField) {
            // TODO: get sub from multi?
            multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
            if (multiDv == null) {
                multiDv = DocValues.emptySortedSet();
            }
            // this will be null if this is not a wrapped single valued docvalues.
            if (unwrap_singleValued_multiDv) {
                singleDv = DocValues.unwrapSingleton(multiDv);
            }
        } else {
            singleDv = subCtx.reader().getSortedDocValues(sf.getName());
            if (singleDv == null) {
                singleDv = DocValues.emptySorted();
            }
        }
        LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
        if (singleDv != null) {
            if (accumSeg) {
                collectPerSeg(singleDv, disi, toGlobal);
            } else {
                if (canDoPerSeg && toGlobal != null) {
                    collectCounts(singleDv, disi, toGlobal);
                } else {
                    collectDocs(singleDv, disi, toGlobal);
                }
            }
        } else {
            if (accumSeg) {
                collectPerSeg(multiDv, disi, toGlobal);
            } else {
                if (canDoPerSeg && toGlobal != null) {
                    collectCounts(multiDv, disi, toGlobal);
                } else {
                    collectDocs(multiDv, disi, toGlobal);
                }
            }
        }
    }
    // better GC
    reuse = null;
}
Also used: SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues), Filter (org.apache.solr.search.Filter), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), DocIdSet (org.apache.lucene.search.DocIdSet), LongValues (org.apache.lucene.util.LongValues), DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator), SortedDocValues (org.apache.lucene.index.SortedDocValues)
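
To make the interplay between disi and the doc values concrete, here is a hedged sketch of what a per-segment counting pass roughly amounts to, assuming the Lucene 7 iterator-style doc values API (where SortedDocValues is itself a DocIdSetIterator). The collectOrdsSketch name and the counts array are illustrative; the actual collectDocs/collectCounts/collectPerSeg helpers in FacetFieldProcessorByArrayDV are more involved.

static void collectOrdsSketch(SortedDocValues dv, DocIdSetIterator disi,
                              LongValues toGlobal, int[] counts) throws IOException {
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        // position the doc-values iterator on (or past) the current matching doc
        if (dv.docID() < doc) {
            dv.advance(doc);
        }
        if (dv.docID() == doc) {
            int segOrd = dv.ordValue();
            // map the segment-local ordinal to a global ordinal when an ordinal map is present
            int ord = toGlobal == null ? segOrd : (int) toGlobal.get(segOrd);
            counts[ord]++;
        }
    }
}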

Aggregations

DocIdSetIterator (org.apache.lucene.search.DocIdSetIterator): 68 usages
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 28 usages
Scorer (org.apache.lucene.search.Scorer): 15 usages
DocIdSet (org.apache.lucene.search.DocIdSet): 14 usages
Weight (org.apache.lucene.search.Weight): 12 usages
ConstantScoreScorer (org.apache.lucene.search.ConstantScoreScorer): 10 usages
BitSet (org.apache.lucene.util.BitSet): 10 usages
Bits (org.apache.lucene.util.Bits): 10 usages
BytesRef (org.apache.lucene.util.BytesRef): 10 usages
IOException (java.io.IOException): 9 usages
MatchingDocs (org.apache.lucene.facet.FacetsCollector.MatchingDocs): 8 usages
IndexSearcher (org.apache.lucene.search.IndexSearcher): 8 usages
SortedDocValues (org.apache.lucene.index.SortedDocValues): 7 usages
ConstantScoreWeight (org.apache.lucene.search.ConstantScoreWeight): 7 usages
TwoPhaseIterator (org.apache.lucene.search.TwoPhaseIterator): 7 usages
SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues): 6 usages
Query (org.apache.lucene.search.Query): 6 usages
Document (org.apache.lucene.document.Document): 5 usages
IndexReader (org.apache.lucene.index.IndexReader): 5 usages
LeafReader (org.apache.lucene.index.LeafReader): 5 usages