Use of org.apache.lucene.search.EarlyTerminatingSortingCollector in project lucene-solr by apache.
The class TestIndexSorting, method testRandom3.
// pits index-time sorting against query-time sorting
public void testRandom3() throws Exception {
  int numDocs;
  if (TEST_NIGHTLY) {
    numDocs = atLeast(100000);
  } else {
    numDocs = atLeast(1000);
  }
  List<RandomDoc> docs = new ArrayList<>();
  Sort sort = randomSort();
  if (VERBOSE) {
    System.out.println("TEST: numDocs=" + numDocs + " use sort=" + sort);
  }

  // no index sorting, all search-time sorting:
  Directory dir1 = newFSDirectory(createTempDir());
  IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w1 = new IndexWriter(dir1, iwc1);

  // use index sorting:
  Directory dir2 = newFSDirectory(createTempDir());
  IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc2.setIndexSort(sort);
  IndexWriter w2 = new IndexWriter(dir2, iwc2);

  Set<Integer> toDelete = new HashSet<>();
  double deleteChance = random().nextDouble();

  for (int id = 0; id < numDocs; id++) {
    RandomDoc docValues = new RandomDoc(id);
    docs.add(docValues);
    if (VERBOSE) {
      System.out.println("TEST: doc id=" + id);
      System.out.println("  int=" + docValues.intValue);
      System.out.println("  long=" + docValues.longValue);
      System.out.println("  float=" + docValues.floatValue);
      System.out.println("  double=" + docValues.doubleValue);
      System.out.println("  bytes=" + new BytesRef(docValues.bytesValue));
    }

    Document doc = new Document();
    doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
    doc.add(new NumericDocValuesField("id", id));
    doc.add(new NumericDocValuesField("int", docValues.intValue));
    doc.add(new NumericDocValuesField("long", docValues.longValue));
    doc.add(new DoubleDocValuesField("double", docValues.doubleValue));
    doc.add(new FloatDocValuesField("float", docValues.floatValue));
    doc.add(new SortedDocValuesField("bytes", new BytesRef(docValues.bytesValue)));
    for (int value : docValues.intValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_int", value));
    }
    for (long value : docValues.longValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_long", value));
    }
    for (float value : docValues.floatValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_float", NumericUtils.floatToSortableInt(value)));
    }
    for (double value : docValues.doubleValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_double", NumericUtils.doubleToSortableLong(value)));
    }
    for (byte[] value : docValues.bytesValues) {
      doc.add(new SortedSetDocValuesField("multi_valued_bytes", new BytesRef(value)));
    }

    w1.addDocument(doc);
    w2.addDocument(doc);
    if (random().nextDouble() < deleteChance) {
      toDelete.add(id);
    }
  }

  for (int id : toDelete) {
    w1.deleteDocuments(new Term("id", Integer.toString(id)));
    w2.deleteDocuments(new Term("id", Integer.toString(id)));
  }

  DirectoryReader r1 = DirectoryReader.open(w1);
  IndexSearcher s1 = newSearcher(r1);

  if (random().nextBoolean()) {
    int maxSegmentCount = TestUtil.nextInt(random(), 1, 5);
    if (VERBOSE) {
      System.out.println("TEST: now forceMerge(" + maxSegmentCount + ")");
    }
    w2.forceMerge(maxSegmentCount);
  }

  DirectoryReader r2 = DirectoryReader.open(w2);
  IndexSearcher s2 = newSearcher(r2);

  for (int iter = 0; iter < 100; iter++) {
    int numHits = TestUtil.nextInt(random(), 1, numDocs);
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter + " numHits=" + numHits);
    }

    TopFieldCollector c1 = TopFieldCollector.create(sort, numHits, true, true, true);
    s1.search(new MatchAllDocsQuery(), c1);
    TopDocs hits1 = c1.topDocs();

    TopFieldCollector c2 = TopFieldCollector.create(sort, numHits, true, true, true);
    EarlyTerminatingSortingCollector c3 = new EarlyTerminatingSortingCollector(c2, sort, numHits);
    s2.search(new MatchAllDocsQuery(), c3);
    TopDocs hits2 = c2.topDocs();

    if (VERBOSE) {
      System.out.println("  topDocs query-time sort: totalHits=" + hits1.totalHits);
      for (ScoreDoc scoreDoc : hits1.scoreDocs) {
        System.out.println("    " + scoreDoc.doc);
      }
      System.out.println("  topDocs index-time sort: totalHits=" + hits2.totalHits);
      for (ScoreDoc scoreDoc : hits2.scoreDocs) {
        System.out.println("    " + scoreDoc.doc);
      }
    }

    assertTrue(hits2.totalHits <= hits1.totalHits);
    assertEquals(hits2.scoreDocs.length, hits1.scoreDocs.length);
    for (int i = 0; i < hits2.scoreDocs.length; i++) {
      ScoreDoc hit1 = hits1.scoreDocs[i];
      ScoreDoc hit2 = hits2.scoreDocs[i];
      assertEquals(r1.document(hit1.doc).get("id"), r2.document(hit2.doc).get("id"));
      assertEquals(((FieldDoc) hit1).fields, ((FieldDoc) hit2).fields);
    }
  }

  IOUtils.close(r1, r2, w1, w2, dir1, dir2);
}
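What the test verifies, in short: when the index sort matches the query sort, EarlyTerminatingSortingCollector may stop collecting each segment once it has seen numHits documents, so totalHits can undercount, but the top hits and their sort values must match a full query-time sort. A minimal sketch of that pattern follows, assuming a searcher over an index that was written with a matching IndexWriterConfig.setIndexSort(sort); the "timestamp" field and the searcher variable are illustrative, not from the test:

// Illustrative sketch; assumes the index was written with
// IndexWriterConfig.setIndexSort(sort) using this same sort.
Sort sort = new Sort(new SortField("timestamp", SortField.Type.LONG));
TopFieldCollector top = TopFieldCollector.create(sort, 10, true, false, false);
// The wrapper throws CollectionTerminatedException per segment once 10
// docs have been collected; IndexSearcher catches it and moves on.
searcher.search(new MatchAllDocsQuery(), new EarlyTerminatingSortingCollector(top, sort, 10));
TopDocs hits = top.topDocs(); // hits.totalHits may be less than the true hit count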
Use of org.apache.lucene.search.EarlyTerminatingSortingCollector in project lucene-solr by apache.
The class AnalyzingInfixSuggester, method lookup.
/**
 * This is an advanced method providing the capability to send down to the suggester any
 * arbitrary Lucene query to be used to filter the results of the suggester.
 *
 * @param key the keyword being looked for
 * @param contextQuery an arbitrary Lucene query used to filter the results of the suggester. {@link #addContextToQuery} can be used to build this contextQuery.
 * @param num number of items to return
 * @param allTermsRequired whether all searched terms must match
 * @param doHighlight if true, the matching term will be highlighted in the search result
 * @return the result of the suggester
 * @throws IOException if there is an IO exception while reading data from the index
 */
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }

  final BooleanClause.Occur occur;
  if (allTermsRequired) {
    occur = BooleanClause.Occur.MUST;
  } else {
    occur = BooleanClause.Occur.SHOULD;
  }

  BooleanQuery.Builder query;
  Set<String> matchedTokens;
  String prefixToken = null;

  try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
    //long t0 = System.currentTimeMillis();
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    String lastToken = null;
    query = new BooleanQuery.Builder();
    int maxEndOffset = -1;
    matchedTokens = new HashSet<>();
    while (ts.incrementToken()) {
      if (lastToken != null) {
        matchedTokens.add(lastToken);
        query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
      }
      lastToken = termAtt.toString();
      if (lastToken != null) {
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      }
    }
    ts.end();

    if (lastToken != null) {
      Query lastQuery;
      if (maxEndOffset == offsetAtt.endOffset()) {
        // Use PrefixQuery (or the ngram equivalent) when
        // there were no trailing discarded chars in the
        // string (e.g. whitespace), so that if the query does
        // not end with a space we show prefix matches for
        // that token:
        lastQuery = getLastTokenQuery(lastToken);
        prefixToken = lastToken;
      } else {
        // Use TermQuery for an exact match if there were
        // trailing discarded chars (e.g. whitespace), so
        // that if the query ends with a space we only show
        // exact matches for that term:
        matchedTokens.add(lastToken);
        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
      }

      if (lastQuery != null) {
        query.add(lastQuery, occur);
      }
    }

    if (contextQuery != null) {
      boolean allMustNot = true;
      for (BooleanClause clause : contextQuery.clauses()) {
        if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
          allMustNot = false;
          break;
        }
      }
      if (allMustNot) {
        // All are MUST_NOT: add the contextQuery clauses to the main query directly (not as a sub-query)
        for (BooleanClause clause : contextQuery.clauses()) {
          query.add(clause);
        }
      } else if (allTermsRequired == false) {
        // We must carefully upgrade the query clauses to MUST:
        BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
        newQuery.add(query.build(), BooleanClause.Occur.MUST);
        newQuery.add(contextQuery, BooleanClause.Occur.MUST);
        query = newQuery;
      } else {
        // Add contextQuery as a sub-query
        query.add(contextQuery, BooleanClause.Occur.MUST);
      }
    }
  }

  // TODO: we could allow a blended sort here, combining
  // weight w/ score. Now we ignore score and sort only
  // by weight:
  Query finalQuery = finishQuery(query, allTermsRequired);
  //System.out.println("finalQuery=" + finalQuery);

  // Sort by weight, descending:
  TopFieldCollector c = TopFieldCollector.create(SORT, num, true, false, false);
  // We sorted postings by weight during indexing, so we
  // only retrieve the first num hits now:
  Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
  List<LookupResult> results = null;
  SearcherManager mgr;
  IndexSearcher searcher;
  synchronized (searcherMgrLock) {
    // acquire & release on the same SearcherManager, via a local reference
    mgr = searcherMgr;
    searcher = mgr.acquire();
  }
  try {
    //System.out.println("got searcher=" + searcher);
    searcher.search(finalQuery, c2);
    TopFieldDocs hits = c.topDocs();
    // Slower way if postings are not pre-sorted by weight:
    // hits = searcher.search(query, null, num, SORT);
    results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
  } finally {
    mgr.release(searcher);
  }
  return results;
}
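A possible calling pattern for this lookup variant, filtering suggestions by context while keeping the weight-sorted early termination. The suggester instance, the context value "electronics", and the prefix "phon" are illustrative assumptions, and the suggester is assumed to have been built with context-enabled inputs:

// Illustrative usage; "suggester", "electronics", and "phon" are assumptions.
BooleanQuery.Builder contextQuery = new BooleanQuery.Builder();
suggester.addContextToQuery(contextQuery, new BytesRef("electronics"), BooleanClause.Occur.MUST);
List<Lookup.LookupResult> results = suggester.lookup("phon", contextQuery.build(), 5, true, true);
for (Lookup.LookupResult result : results) {
  System.out.println(result.key + " (weight=" + result.value + ")");
}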