Search in sources :

Example 1 with EarlyTerminatingSortingCollector

use of org.apache.lucene.search.EarlyTerminatingSortingCollector in project lucene-solr by apache.

the class TestIndexSorting method testRandom3.

// pits index time sorting against query time sorting
public void testRandom3() throws Exception {
    int numDocs;
    if (TEST_NIGHTLY) {
        numDocs = atLeast(100000);
    } else {
        numDocs = atLeast(1000);
    }
    List<RandomDoc> docs = new ArrayList<>();
    Sort sort = randomSort();
    if (VERBOSE) {
        System.out.println("TEST: numDocs=" + numDocs + " use sort=" + sort);
    }
    // no index sorting, all search-time sorting:
    Directory dir1 = newFSDirectory(createTempDir());
    IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w1 = new IndexWriter(dir1, iwc1);
    // use index sorting:
    Directory dir2 = newFSDirectory(createTempDir());
    IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc2.setIndexSort(sort);
    IndexWriter w2 = new IndexWriter(dir2, iwc2);
    Set<Integer> toDelete = new HashSet<>();
    double deleteChance = random().nextDouble();
    for (int id = 0; id < numDocs; id++) {
        RandomDoc docValues = new RandomDoc(id);
        docs.add(docValues);
        if (VERBOSE) {
            System.out.println("TEST: doc id=" + id);
            System.out.println("  int=" + docValues.intValue);
            System.out.println("  long=" + docValues.longValue);
            System.out.println("  float=" + docValues.floatValue);
            System.out.println("  double=" + docValues.doubleValue);
            System.out.println("  bytes=" + new BytesRef(docValues.bytesValue));
        }
        Document doc = new Document();
        doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
        doc.add(new NumericDocValuesField("id", id));
        doc.add(new NumericDocValuesField("int", docValues.intValue));
        doc.add(new NumericDocValuesField("long", docValues.longValue));
        doc.add(new DoubleDocValuesField("double", docValues.doubleValue));
        doc.add(new FloatDocValuesField("float", docValues.floatValue));
        doc.add(new SortedDocValuesField("bytes", new BytesRef(docValues.bytesValue)));
        for (int value : docValues.intValues) {
            doc.add(new SortedNumericDocValuesField("multi_valued_int", value));
        }
        for (long value : docValues.longValues) {
            doc.add(new SortedNumericDocValuesField("multi_valued_long", value));
        }
        for (float value : docValues.floatValues) {
            doc.add(new SortedNumericDocValuesField("multi_valued_float", NumericUtils.floatToSortableInt(value)));
        }
        for (double value : docValues.doubleValues) {
            doc.add(new SortedNumericDocValuesField("multi_valued_double", NumericUtils.doubleToSortableLong(value)));
        }
        for (byte[] value : docValues.bytesValues) {
            doc.add(new SortedSetDocValuesField("multi_valued_bytes", new BytesRef(value)));
        }
        w1.addDocument(doc);
        w2.addDocument(doc);
        if (random().nextDouble() < deleteChance) {
            toDelete.add(id);
        }
    }
    for (int id : toDelete) {
        w1.deleteDocuments(new Term("id", Integer.toString(id)));
        w2.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    DirectoryReader r1 = DirectoryReader.open(w1);
    IndexSearcher s1 = newSearcher(r1);
    if (random().nextBoolean()) {
        int maxSegmentCount = TestUtil.nextInt(random(), 1, 5);
        if (VERBOSE) {
            System.out.println("TEST: now forceMerge(" + maxSegmentCount + ")");
        }
        w2.forceMerge(maxSegmentCount);
    }
    DirectoryReader r2 = DirectoryReader.open(w2);
    IndexSearcher s2 = newSearcher(r2);
    for (int iter = 0; iter < 100; iter++) {
        int numHits = TestUtil.nextInt(random(), 1, numDocs);
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter + " numHits=" + numHits);
        }
        TopFieldCollector c1 = TopFieldCollector.create(sort, numHits, true, true, true);
        s1.search(new MatchAllDocsQuery(), c1);
        TopDocs hits1 = c1.topDocs();
        TopFieldCollector c2 = TopFieldCollector.create(sort, numHits, true, true, true);
        EarlyTerminatingSortingCollector c3 = new EarlyTerminatingSortingCollector(c2, sort, numHits);
        s2.search(new MatchAllDocsQuery(), c3);
        TopDocs hits2 = c2.topDocs();
        if (VERBOSE) {
            System.out.println("  topDocs query-time sort: totalHits=" + hits1.totalHits);
            for (ScoreDoc scoreDoc : hits1.scoreDocs) {
                System.out.println("    " + scoreDoc.doc);
            }
            System.out.println("  topDocs index-time sort: totalHits=" + hits2.totalHits);
            for (ScoreDoc scoreDoc : hits2.scoreDocs) {
                System.out.println("    " + scoreDoc.doc);
            }
        }
        assertTrue(hits2.totalHits <= hits1.totalHits);
        assertEquals(hits2.scoreDocs.length, hits1.scoreDocs.length);
        for (int i = 0; i < hits2.scoreDocs.length; i++) {
            ScoreDoc hit1 = hits1.scoreDocs[i];
            ScoreDoc hit2 = hits2.scoreDocs[i];
            assertEquals(r1.document(hit1.doc).get("id"), r2.document(hit2.doc).get("id"));
            assertEquals(((FieldDoc) hit1).fields, ((FieldDoc) hit2).fields);
        }
    }
    IOUtils.close(r1, r2, w1, w2, dir1, dir2);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) DoubleDocValuesField(org.apache.lucene.document.DoubleDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Sort(org.apache.lucene.search.Sort) TopFieldCollector(org.apache.lucene.search.TopFieldCollector) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) BinaryPoint(org.apache.lucene.document.BinaryPoint) IntPoint(org.apache.lucene.document.IntPoint) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) EarlyTerminatingSortingCollector(org.apache.lucene.search.EarlyTerminatingSortingCollector) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField)

Example 2 with EarlyTerminatingSortingCollector

use of org.apache.lucene.search.EarlyTerminatingSortingCollector in project lucene-solr by apache.

the class AnalyzingInfixSuggester method lookup.

/**
   * This is an advanced method providing the capability to send down to the suggester any 
   * arbitrary lucene query to be used to filter the result of the suggester
   * 
   * @param key the keyword being looked for
   * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery.
   * @param num number of items to return
   * @param allTermsRequired all searched terms must match or not
   * @param doHighlight if true, the matching term will be highlighted in the search result
   * @return the result of the suggester
   * @throws IOException f the is IO exception while reading data from the index
   */
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
    if (searcherMgr == null) {
        throw new IllegalStateException("suggester was not built");
    }
    final BooleanClause.Occur occur;
    if (allTermsRequired) {
        occur = BooleanClause.Occur.MUST;
    } else {
        occur = BooleanClause.Occur.SHOULD;
    }
    BooleanQuery.Builder query;
    Set<String> matchedTokens;
    String prefixToken = null;
    try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
        //long t0 = System.currentTimeMillis();
        ts.reset();
        final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        String lastToken = null;
        query = new BooleanQuery.Builder();
        int maxEndOffset = -1;
        matchedTokens = new HashSet<>();
        while (ts.incrementToken()) {
            if (lastToken != null) {
                matchedTokens.add(lastToken);
                query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
            }
            lastToken = termAtt.toString();
            if (lastToken != null) {
                maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
            }
        }
        ts.end();
        if (lastToken != null) {
            Query lastQuery;
            if (maxEndOffset == offsetAtt.endOffset()) {
                // Use PrefixQuery (or the ngram equivalent) when
                // there was no trailing discarded chars in the
                // string (e.g. whitespace), so that if query does
                // not end with a space we show prefix matches for
                // that token:
                lastQuery = getLastTokenQuery(lastToken);
                prefixToken = lastToken;
            } else {
                // Use TermQuery for an exact match if there were
                // trailing discarded chars (e.g. whitespace), so
                // that if query ends with a space we only show
                // exact matches for that term:
                matchedTokens.add(lastToken);
                lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
            }
            if (lastQuery != null) {
                query.add(lastQuery, occur);
            }
        }
        if (contextQuery != null) {
            boolean allMustNot = true;
            for (BooleanClause clause : contextQuery.clauses()) {
                if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
                    allMustNot = false;
                    break;
                }
            }
            if (allMustNot) {
                // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
                for (BooleanClause clause : contextQuery.clauses()) {
                    query.add(clause);
                }
            } else if (allTermsRequired == false) {
                // We must carefully upgrade the query clauses to MUST:
                BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
                newQuery.add(query.build(), BooleanClause.Occur.MUST);
                newQuery.add(contextQuery, BooleanClause.Occur.MUST);
                query = newQuery;
            } else {
                // Add contextQuery as sub-query
                query.add(contextQuery, BooleanClause.Occur.MUST);
            }
        }
    }
    // TODO: we could allow blended sort here, combining
    // weight w/ score.  Now we ignore score and sort only
    // by weight:
    Query finalQuery = finishQuery(query, allTermsRequired);
    //System.out.println("finalQuery=" + finalQuery);
    // Sort by weight, descending:
    TopFieldCollector c = TopFieldCollector.create(SORT, num, true, false, false);
    // We sorted postings by weight during indexing, so we
    // only retrieve the first num hits now:
    Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
    List<LookupResult> results = null;
    SearcherManager mgr;
    IndexSearcher searcher;
    synchronized (searcherMgrLock) {
        // acquire & release on same SearcherManager, via local reference
        mgr = searcherMgr;
        searcher = mgr.acquire();
    }
    try {
        //System.out.println("got searcher=" + searcher);
        searcher.search(finalQuery, c2);
        TopFieldDocs hits = c.topDocs();
        // Slower way if postings are not pre-sorted by weight:
        // hits = searcher.search(query, null, num, SORT);
        results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
    } finally {
        mgr.release(searcher);
    }
    return results;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) BooleanQuery(org.apache.lucene.search.BooleanQuery) TokenStream(org.apache.lucene.analysis.TokenStream) Query(org.apache.lucene.search.Query) PrefixQuery(org.apache.lucene.search.PrefixQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) TopFieldDocs(org.apache.lucene.search.TopFieldDocs) SearcherManager(org.apache.lucene.search.SearcherManager) StringReader(java.io.StringReader) EarlyTerminatingSortingCollector(org.apache.lucene.search.EarlyTerminatingSortingCollector) TopFieldCollector(org.apache.lucene.search.TopFieldCollector) Collector(org.apache.lucene.search.Collector) TopFieldCollector(org.apache.lucene.search.TopFieldCollector) TermQuery(org.apache.lucene.search.TermQuery) Occur(org.apache.lucene.search.BooleanClause.Occur) Term(org.apache.lucene.index.Term) BooleanClause(org.apache.lucene.search.BooleanClause) EarlyTerminatingSortingCollector(org.apache.lucene.search.EarlyTerminatingSortingCollector) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Aggregations

EarlyTerminatingSortingCollector (org.apache.lucene.search.EarlyTerminatingSortingCollector)2 IndexSearcher (org.apache.lucene.search.IndexSearcher)2 TopFieldCollector (org.apache.lucene.search.TopFieldCollector)2 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 BinaryPoint (org.apache.lucene.document.BinaryPoint)1 Document (org.apache.lucene.document.Document)1 DoubleDocValuesField (org.apache.lucene.document.DoubleDocValuesField)1 FloatDocValuesField (org.apache.lucene.document.FloatDocValuesField)1 IntPoint (org.apache.lucene.document.IntPoint)1 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)1 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)1 SortedNumericDocValuesField (org.apache.lucene.document.SortedNumericDocValuesField)1 SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField)1