Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class SpellChecker, method suggestSimilar.
/**
* Suggest similar words (optionally restricted to a field of an index).
*
* <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
* is not the same as the edit distance strategy used to calculate the best
* matching spell-checked word from the hits that Lucene found, one usually has
* to request several suggestions (a larger numSug) in order to get the true best match.
*
* <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
* Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
*
* @param word the word you want a spell check done on
* @param numSug the number of suggested words
* @param ir the indexReader of the user index (can be null; see the field parameter)
* @param field the field of the user index: if field is not null, the suggested
* words are restricted to the words present in this field.
* @param suggestMode
* (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
* @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
* @throws IOException if the underlying index throws an {@link IOException}
* @throws AlreadyClosedException if the SpellChecker is already closed
* @return String[] the sorted list of suggested words, ordered by two criteria:
* first, the edit distance; second (only in restricted mode), the popularity
* of the suggested words in the field of the user index
*
*/
public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode, float accuracy) throws IOException {
// obtainSearcher calls ensureOpen
final IndexSearcher indexSearcher = obtainSearcher();
try {
if (ir == null || field == null) {
suggestMode = SuggestMode.SUGGEST_ALWAYS;
}
if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
ir = null;
field = null;
}
final int lengthWord = word.length();
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
final int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
// if the word exists in the real index and we don't care for word frequency, return the word itself
if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
return new String[] { word };
}
BooleanQuery.Builder query = new BooleanQuery.Builder();
String[] grams;
String key;
for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
// form key
key = "gram" + ng;
// form word into ngrams (allow dups too)
grams = formGrams(word, ng);
if (grams.length == 0) {
// hmm
continue;
}
if (bStart > 0) {
// should we boost prefixes?
// matches start of word
add(query, "start" + ng, grams[0], bStart);
}
if (bEnd > 0) {
// should we boost suffixes
// matches end of word
add(query, "end" + ng, grams[grams.length - 1], bEnd);
}
for (int i = 0; i < grams.length; i++) {
add(query, key, grams[i]);
}
}
int maxHits = 10 * numSug;
// System.out.println("Q: " + query);
ScoreDoc[] hits = indexSearcher.search(query.build(), maxHits).scoreDocs;
// System.out.println("HITS: " + hits.length());
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
// go thru more than 'maxr' matches in case the distance filter triggers
int stop = Math.min(hits.length, maxHits);
SuggestWord sugWord = new SuggestWord();
for (int i = 0; i < stop; i++) {
// get orig word
sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD);
// don't suggest a word for itself, that would be silly
if (sugWord.string.equals(word)) {
continue;
}
// edit distance
sugWord.score = sd.getDistance(word, sugWord.string);
if (sugWord.score < accuracy) {
continue;
}
if (ir != null && field != null) {
// use the user index
// freq in the index
sugWord.freq = ir.docFreq(new Term(field, sugWord.string));
// don't suggest a word that is not present in the field
if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
continue;
}
}
sugQueue.insertWithOverflow(sugWord);
if (sugQueue.size() == numSug) {
// if the queue is full, tighten accuracy to the queue's minimum score
accuracy = sugQueue.top().score;
}
sugWord = new SuggestWord();
}
// convert to an array of strings
String[] list = new String[sugQueue.size()];
for (int i = sugQueue.size() - 1; i >= 0; i--) {
list[i] = sugQueue.pop().string;
}
return list;
} finally {
releaseSearcher(indexSearcher);
}
}
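For orientation, here is a minimal sketch of how suggestSimilar is typically driven from application code. The directory paths, the "body" field name, and the analyzer choice are illustrative assumptions, not taken from the snippet above.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SpellCheckerUsageSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical locations of the user index and the n-gram spell index.
    Directory userIndexDir = FSDirectory.open(Paths.get("/tmp/user-index"));
    Directory spellIndexDir = FSDirectory.open(Paths.get("/tmp/spell-index"));
    try (IndexReader reader = DirectoryReader.open(userIndexDir);
         SpellChecker spellChecker = new SpellChecker(spellIndexDir)) {
      // Build the n-gram spell index from the terms of an assumed "body" field.
      spellChecker.indexDictionary(new LuceneDictionary(reader, "body"),
          new IndexWriterConfig(new StandardAnalyzer()), true);
      // Ask for several suggestions (at least 5), per the javadoc advice above.
      String[] suggestions = spellChecker.suggestSimilar("lucenne", 5, reader, "body",
          SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, 0.5f);
      for (String suggestion : suggestions) {
        System.out.println(suggestion);
      }
    }
  }
}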
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class ThreadedIndexingAndSearchingTestCase, method runTest.
public void runTest(String testName) throws Exception {
failed.set(false);
addCount.set(0);
delCount.set(0);
packCount.set(0);
final long t0 = System.currentTimeMillis();
Random random = new Random(random().nextLong());
final LineFileDocs docs = new LineFileDocs(random);
final Path tempDir = createTempDir(testName);
// some subclasses rely on this being MDW
dir = getDirectory(newMockFSDirectory(tempDir));
if (dir instanceof BaseDirectoryWrapper) {
// don't double-checkIndex, we do it ourselves.
((BaseDirectoryWrapper) dir).setCheckIndexOnClose(false);
}
MockAnalyzer analyzer = new MockAnalyzer(random());
analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setCommitOnClose(false);
conf.setInfoStream(new FailOnNonBulkMergesInfoStream());
if (conf.getMergePolicy() instanceof MockRandomMergePolicy) {
((MockRandomMergePolicy) conf.getMergePolicy()).setDoNonBulkMerges(false);
}
if (LuceneTestCase.TEST_NIGHTLY) {
// newIWConfig makes smallish max seg size, which
// results in tons and tons of segments for this test
// when run nightly:
MergePolicy mp = conf.getMergePolicy();
if (mp instanceof TieredMergePolicy) {
((TieredMergePolicy) mp).setMaxMergedSegmentMB(5000.);
} else if (mp instanceof LogByteSizeMergePolicy) {
((LogByteSizeMergePolicy) mp).setMaxMergeMB(1000.);
} else if (mp instanceof LogMergePolicy) {
((LogMergePolicy) mp).setMaxMergeDocs(100000);
}
// when running nightly, merging can still have crazy parameters,
// and might use many per-field codecs. turn on CFS for IW flushes
// and ensure CFS ratio is reasonable to keep it contained.
conf.setUseCompoundFile(true);
mp.setNoCFSRatio(Math.max(0.25d, mp.getNoCFSRatio()));
}
conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
@Override
public void warm(LeafReader reader) throws IOException {
if (VERBOSE) {
System.out.println("TEST: now warm merged reader=" + reader);
}
warmed.put(((SegmentReader) reader).core, Boolean.TRUE);
final int maxDoc = reader.maxDoc();
final Bits liveDocs = reader.getLiveDocs();
int sum = 0;
final int inc = Math.max(1, maxDoc / 50);
for (int docID = 0; docID < maxDoc; docID += inc) {
if (liveDocs == null || liveDocs.get(docID)) {
final Document doc = reader.document(docID);
sum += doc.getFields().size();
}
}
IndexSearcher searcher = newSearcher(reader, false);
sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits;
if (VERBOSE) {
System.out.println("TEST: warm visited " + sum + " fields");
}
}
});
if (VERBOSE) {
conf.setInfoStream(new PrintStreamInfoStream(System.out) {
@Override
public void message(String component, String message) {
if ("TP".equals(component)) {
// ignore test points!
return;
}
super.message(component, message);
}
});
}
writer = new IndexWriter(dir, conf);
TestUtil.reduceOpenFiles(writer);
final ExecutorService es = random().nextBoolean() ? null : Executors.newCachedThreadPool(new NamedThreadFactory(testName));
doAfterWriter(es);
final int NUM_INDEX_THREADS = TestUtil.nextInt(random(), 2, 4);
final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : RANDOM_MULTIPLIER;
final Set<String> delIDs = Collections.synchronizedSet(new HashSet<String>());
final Set<String> delPackIDs = Collections.synchronizedSet(new HashSet<String>());
final List<SubDocs> allSubDocs = Collections.synchronizedList(new ArrayList<SubDocs>());
final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC * 1000;
final Thread[] indexThreads = launchIndexingThreads(docs, NUM_INDEX_THREADS, stopTime, delIDs, delPackIDs, allSubDocs);
if (VERBOSE) {
System.out.println("TEST: DONE start " + NUM_INDEX_THREADS + " indexing threads [" + (System.currentTimeMillis() - t0) + " ms]");
}
// Let index build up a bit
Thread.sleep(100);
doSearching(es, stopTime);
if (VERBOSE) {
System.out.println("TEST: all searching done [" + (System.currentTimeMillis() - t0) + " ms]");
}
for (Thread thread : indexThreads) {
thread.join();
}
if (VERBOSE) {
System.out.println("TEST: done join indexing threads [" + (System.currentTimeMillis() - t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
}
final IndexSearcher s = getFinalSearcher();
if (VERBOSE) {
System.out.println("TEST: finalSearcher=" + s);
}
assertFalse(failed.get());
boolean doFail = false;
// Verify: make sure delIDs are in fact deleted:
for (String id : delIDs) {
final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1);
if (hits.totalHits != 0) {
System.out.println("doc id=" + id + " is supposed to be deleted, but got " + hits.totalHits + " hits; first docID=" + hits.scoreDocs[0].doc);
doFail = true;
}
}
// Verify: make sure delPackIDs are in fact deleted:
for (String id : delPackIDs) {
final TopDocs hits = s.search(new TermQuery(new Term("packID", id)), 1);
if (hits.totalHits != 0) {
System.out.println("packID=" + id + " is supposed to be deleted, but got " + hits.totalHits + " matches");
doFail = true;
}
}
// Verify: make sure each group of sub-docs are still in docID order:
for (SubDocs subDocs : allSubDocs) {
TopDocs hits = s.search(new TermQuery(new Term("packID", subDocs.packID)), 20);
if (!subDocs.deleted) {
// We sort by relevance but the scores should be identical so sort falls back to by docID:
if (hits.totalHits != subDocs.subIDs.size()) {
System.out.println("packID=" + subDocs.packID + ": expected " + subDocs.subIDs.size() + " hits but got " + hits.totalHits);
doFail = true;
} else {
int lastDocID = -1;
int startDocID = -1;
for (ScoreDoc scoreDoc : hits.scoreDocs) {
final int docID = scoreDoc.doc;
if (lastDocID != -1) {
assertEquals(1 + lastDocID, docID);
} else {
startDocID = docID;
}
lastDocID = docID;
final Document doc = s.doc(docID);
assertEquals(subDocs.packID, doc.get("packID"));
}
lastDocID = startDocID - 1;
for (String subID : subDocs.subIDs) {
hits = s.search(new TermQuery(new Term("docid", subID)), 1);
assertEquals(1, hits.totalHits);
final int docID = hits.scoreDocs[0].doc;
if (lastDocID != -1) {
assertEquals(1 + lastDocID, docID);
}
lastDocID = docID;
}
}
} else {
// because we can re-use packID for update:
for (String subID : subDocs.subIDs) {
assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits);
}
}
}
// Verify: make sure all not-deleted docs are in fact
// not deleted:
final int endID = Integer.parseInt(docs.nextDoc().get("docid"));
docs.close();
for (int id = 0; id < endID; id++) {
String stringID = "" + id;
if (!delIDs.contains(stringID)) {
final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1);
if (hits.totalHits != 1) {
System.out.println("doc id=" + stringID + " is not supposed to be deleted, but got hitCount=" + hits.totalHits + "; delIDs=" + delIDs);
doFail = true;
}
}
}
assertFalse(doFail);
assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), s.getIndexReader().numDocs());
releaseSearcher(s);
writer.commit();
assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), writer.numDocs());
doClose();
try {
writer.commit();
} finally {
writer.close();
}
// Cannot shut down the executor until after the writer is closed, because the
// merged-segment warmer runs searches through an IndexSearcher that may still be using this es!
if (es != null) {
es.shutdown();
es.awaitTermination(1, TimeUnit.SECONDS);
}
TestUtil.checkIndex(dir);
dir.close();
if (VERBOSE) {
System.out.println("TEST: done [" + (System.currentTimeMillis() - t0) + " ms]");
}
}
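The verification loops above all follow the same ScoreDoc pattern: run a TermQuery with a small result count, then read the matching internal doc IDs out of TopDocs.scoreDocs. A standalone sketch of that pattern follows; the searcher setup and the "docid" field are assumptions carried over from the test, not a definitive API.

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

public final class DeletedDocCheck {
  private DeletedDocCheck() {}

  // Returns true if the id still matches; mirrors the "make sure delIDs are in fact deleted" loop above.
  public static boolean isStillPresent(IndexSearcher searcher, String id) throws IOException {
    TopDocs hits = searcher.search(new TermQuery(new Term("docid", id)), 1);
    if (hits.totalHits == 0) {
      return false;
    }
    // Each ScoreDoc carries the internal doc ID (and the score) of one hit.
    ScoreDoc first = hits.scoreDocs[0];
    Document stored = searcher.doc(first.doc);
    System.out.println("unexpected hit: docID=" + first.doc + " stored docid=" + stored.get("docid"));
    return true;
  }
}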
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class FormBasedXmlQueryDemo, method doPost.
@Override
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
//Take all completed form fields and add to a Properties object
Properties completedFormFields = new Properties();
Enumeration<?> pNames = request.getParameterNames();
while (pNames.hasMoreElements()) {
String propName = (String) pNames.nextElement();
String value = request.getParameter(propName);
if ((value != null) && (value.trim().length() > 0)) {
completedFormFields.setProperty(propName, value);
}
}
try {
//Create an XML query by populating template with given user criteria
org.w3c.dom.Document xmlQuery = queryTemplateManager.getQueryAsDOM(completedFormFields);
//Parse the XML to produce a Lucene query
Query query = xmlParser.getQuery(xmlQuery.getDocumentElement());
//Run the query
TopDocs topDocs = searcher.search(query, 10);
//and package the results and forward to JSP
if (topDocs != null) {
ScoreDoc[] sd = topDocs.scoreDocs;
Document[] results = new Document[sd.length];
for (int i = 0; i < results.length; i++) {
results[i] = searcher.doc(sd[i].doc);
request.setAttribute("results", results);
}
}
RequestDispatcher dispatcher = getServletContext().getRequestDispatcher("/index.jsp");
dispatcher.forward(request, response);
} catch (Exception e) {
throw new ServletException("Error processing query", e);
}
}
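The core of the result handling above is turning each ScoreDoc back into a stored Document before forwarding to the JSP. A small, self-contained sketch of just that step; the helper class and method names are hypothetical.

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public final class SearchResultsHelper {
  private SearchResultsHelper() {}

  // Run the query and load the stored fields for each hit, as doPost does above.
  public static Document[] loadStoredDocuments(IndexSearcher searcher, Query query, int n) throws IOException {
    TopDocs topDocs = searcher.search(query, n);
    ScoreDoc[] sd = topDocs.scoreDocs;
    Document[] results = new Document[sd.length];
    for (int i = 0; i < sd.length; i++) {
      // ScoreDoc.doc is the internal doc ID used to fetch the stored document.
      results[i] = searcher.doc(sd[i].doc);
    }
    return results;
  }
}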
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class TestGrouping, method slowGrouping.
private TopGroups<BytesRef> slowGrouping(GroupDoc[] groupDocs, String searchTerm, boolean fillFields, boolean getScores, boolean getMaxScores, boolean doAllGroups, Sort groupSort, Sort docSort, int topNGroups, int docsPerGroup, int groupOffset, int docOffset) {
final Comparator<GroupDoc> groupSortComp = getComparator(groupSort);
Arrays.sort(groupDocs, groupSortComp);
final HashMap<BytesRef, List<GroupDoc>> groups = new HashMap<>();
final List<BytesRef> sortedGroups = new ArrayList<>();
final List<Comparable<?>[]> sortedGroupFields = new ArrayList<>();
int totalHitCount = 0;
Set<BytesRef> knownGroups = new HashSet<>();
//System.out.println("TEST: slowGrouping");
for (GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
if (!d.content.startsWith(searchTerm)) {
continue;
}
totalHitCount++;
if (doAllGroups) {
if (!knownGroups.contains(d.group)) {
knownGroups.add(d.group);
//System.out.println(" add group=" + groupToString(d.group));
}
}
List<GroupDoc> l = groups.get(d.group);
if (l == null) {
//System.out.println(" add sortedGroup=" + groupToString(d.group));
sortedGroups.add(d.group);
if (fillFields) {
sortedGroupFields.add(fillFields(d, groupSort));
}
l = new ArrayList<>();
groups.put(d.group, l);
}
l.add(d);
}
if (groupOffset >= sortedGroups.size()) {
// slice is out of bounds
return null;
}
final int limit = Math.min(groupOffset + topNGroups, groups.size());
final Comparator<GroupDoc> docSortComp = getComparator(docSort);
@SuppressWarnings({ "unchecked", "rawtypes" }) final GroupDocs<BytesRef>[] result = new GroupDocs[limit - groupOffset];
int totalGroupedHitCount = 0;
for (int idx = groupOffset; idx < limit; idx++) {
final BytesRef group = sortedGroups.get(idx);
final List<GroupDoc> docs = groups.get(group);
totalGroupedHitCount += docs.size();
Collections.sort(docs, docSortComp);
final ScoreDoc[] hits;
if (docs.size() > docOffset) {
final int docIDXLimit = Math.min(docOffset + docsPerGroup, docs.size());
hits = new ScoreDoc[docIDXLimit - docOffset];
for (int docIDX = docOffset; docIDX < docIDXLimit; docIDX++) {
final GroupDoc d = docs.get(docIDX);
final FieldDoc fd;
if (fillFields) {
fd = new FieldDoc(d.id, getScores ? d.score : Float.NaN, fillFields(d, docSort));
} else {
fd = new FieldDoc(d.id, getScores ? d.score : Float.NaN);
}
hits[docIDX - docOffset] = fd;
}
} else {
hits = new ScoreDoc[0];
}
result[idx - groupOffset] = new GroupDocs<>(Float.NaN, 0.0f, docs.size(), hits, group, fillFields ? sortedGroupFields.get(idx) : null);
}
if (doAllGroups) {
return new TopGroups<>(new TopGroups<>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN), knownGroups.size());
} else {
return new TopGroups<>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN);
}
}
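slowGrouping builds its per-group hit arrays out of FieldDoc, a ScoreDoc subclass; note that the doc field here holds the GroupDoc id rather than a real Lucene doc ID. A hedged sketch of that construction with made-up values, using the same GroupDocs constructor shape the method uses above:

import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.util.BytesRef;

public class GroupDocsSketch {
  public static void main(String[] args) {
    // FieldDoc extends ScoreDoc: doc id, score, plus the per-hit sort values.
    ScoreDoc[] hits = new ScoreDoc[] {
        new FieldDoc(3, Float.NaN, new Object[] { new BytesRef("a"), 3 }),
        new FieldDoc(7, Float.NaN, new Object[] { new BytesRef("b"), 7 })
    };
    // Same argument order as in slowGrouping:
    // (score, maxScore, totalHits, scoreDocs, groupValue, groupSortValues).
    GroupDocs<BytesRef> group = new GroupDocs<>(Float.NaN, 0.0f, hits.length, hits,
        new BytesRef("group1"), null);
    System.out.println("group=" + group.groupValue.utf8ToString()
        + " hits=" + group.scoreDocs.length);
  }
}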
Use of org.apache.lucene.search.ScoreDoc in project lucene-solr by apache.
The class TestGrouping, method testRandom.
public void testRandom() throws Exception {
int numberOfRuns = TestUtil.nextInt(random(), 3, 6);
for (int iter = 0; iter < numberOfRuns; iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
}
final int numDocs = TestUtil.nextInt(random(), 100, 1000) * RANDOM_MULTIPLIER;
//final int numDocs = _TestUtil.nextInt(random, 5, 20);
final int numGroups = TestUtil.nextInt(random(), 1, numDocs);
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
final List<BytesRef> groups = new ArrayList<>();
for (int i = 0; i < numGroups; i++) {
String randomValue;
do {
// B/c of DV based impl we can't see the difference between an empty string and a null value.
// For that reason we don't generate empty string
// groups.
randomValue = TestUtil.randomRealisticUnicodeString(random());
//randomValue = TestUtil.randomSimpleString(random());
} while ("".equals(randomValue));
groups.add(new BytesRef(randomValue));
}
final String[] contentStrings = new String[TestUtil.nextInt(random(), 2, 20)];
if (VERBOSE) {
System.out.println("TEST: create fake content");
}
for (int contentIDX = 0; contentIDX < contentStrings.length; contentIDX++) {
final StringBuilder sb = new StringBuilder();
sb.append("real").append(random().nextInt(3)).append(' ');
final int fakeCount = random().nextInt(10);
for (int fakeIDX = 0; fakeIDX < fakeCount; fakeIDX++) {
sb.append("fake ");
}
contentStrings[contentIDX] = sb.toString();
if (VERBOSE) {
System.out.println(" content=" + sb.toString());
}
}
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
Document docNoGroup = new Document();
Field idvGroupField = new SortedDocValuesField("group", new BytesRef());
doc.add(idvGroupField);
docNoGroup.add(idvGroupField);
Field group = newStringField("group", "", Field.Store.NO);
doc.add(group);
Field sort1 = new SortedDocValuesField("sort1", new BytesRef());
doc.add(sort1);
docNoGroup.add(sort1);
Field sort2 = new SortedDocValuesField("sort2", new BytesRef());
doc.add(sort2);
docNoGroup.add(sort2);
Field content = newTextField("content", "", Field.Store.NO);
doc.add(content);
docNoGroup.add(content);
NumericDocValuesField idDV = new NumericDocValuesField("id", 0);
doc.add(idDV);
docNoGroup.add(idDV);
final GroupDoc[] groupDocs = new GroupDoc[numDocs];
for (int i = 0; i < numDocs; i++) {
final BytesRef groupValue;
if (random().nextInt(24) == 17) {
// So we test the "doc doesn't have the group'd
// field" case:
groupValue = null;
} else {
groupValue = groups.get(random().nextInt(groups.size()));
}
final GroupDoc groupDoc = new GroupDoc(i, groupValue, groups.get(random().nextInt(groups.size())), groups.get(random().nextInt(groups.size())), contentStrings[random().nextInt(contentStrings.length)]);
if (VERBOSE) {
System.out.println(" doc content=" + groupDoc.content + " id=" + i + " group=" + (groupDoc.group == null ? "null" : groupDoc.group.utf8ToString()) + " sort1=" + groupDoc.sort1.utf8ToString() + " sort2=" + groupDoc.sort2.utf8ToString());
}
groupDocs[i] = groupDoc;
if (groupDoc.group != null) {
group.setStringValue(groupDoc.group.utf8ToString());
idvGroupField.setBytesValue(BytesRef.deepCopyOf(groupDoc.group));
} else {
// TODO: not true
// Must explicitly set empty string, else eg if
// the segment has all docs missing the field then
// we get null back instead of empty BytesRef:
idvGroupField.setBytesValue(new BytesRef());
}
sort1.setBytesValue(BytesRef.deepCopyOf(groupDoc.sort1));
sort2.setBytesValue(BytesRef.deepCopyOf(groupDoc.sort2));
content.setStringValue(groupDoc.content);
idDV.setLongValue(groupDoc.id);
if (groupDoc.group == null) {
w.addDocument(docNoGroup);
} else {
w.addDocument(doc);
}
}
final GroupDoc[] groupDocsByID = new GroupDoc[groupDocs.length];
System.arraycopy(groupDocs, 0, groupDocsByID, 0, groupDocs.length);
final DirectoryReader r = w.getReader();
w.close();
NumericDocValues values = MultiDocValues.getNumericValues(r, "id");
int[] docIDToID = new int[r.maxDoc()];
for (int i = 0; i < r.maxDoc(); i++) {
assertEquals(i, values.nextDoc());
docIDToID[i] = (int) values.longValue();
}
DirectoryReader rBlocks = null;
Directory dirBlocks = null;
final IndexSearcher s = newSearcher(r);
if (VERBOSE) {
System.out.println("\nTEST: searcher=" + s);
}
final ShardState shards = new ShardState(s);
Set<Integer> seenIDs = new HashSet<>();
for (int contentID = 0; contentID < 3; contentID++) {
final ScoreDoc[] hits = s.search(new TermQuery(new Term("content", "real" + contentID)), numDocs).scoreDocs;
for (ScoreDoc hit : hits) {
int idValue = docIDToID[hit.doc];
final GroupDoc gd = groupDocs[idValue];
seenIDs.add(idValue);
assertTrue(gd.score == 0.0);
gd.score = hit.score;
assertEquals(gd.id, idValue);
}
}
// make sure all groups were seen across the hits
assertEquals(groupDocs.length, seenIDs.size());
for (GroupDoc gd : groupDocs) {
assertTrue(Float.isFinite(gd.score));
assertTrue(gd.score >= 0.0);
}
// Build 2nd index, where docs are added in blocks by
// group, so we can use single pass collector
dirBlocks = newDirectory();
rBlocks = getDocBlockReader(dirBlocks, groupDocs);
final Query lastDocInBlock = new TermQuery(new Term("groupend", "x"));
final IndexSearcher sBlocks = newSearcher(rBlocks);
final ShardState shardsBlocks = new ShardState(sBlocks);
// ReaderBlocks only increases maxDoc() vs reader, which
// means a monotonic shift in scores, so we can
// reliably remap them w/ Map:
final Map<String, Map<Float, Float>> scoreMap = new HashMap<>();
values = MultiDocValues.getNumericValues(rBlocks, "id");
assertNotNull(values);
int[] docIDToIDBlocks = new int[rBlocks.maxDoc()];
for (int i = 0; i < rBlocks.maxDoc(); i++) {
assertEquals(i, values.nextDoc());
docIDToIDBlocks[i] = (int) values.longValue();
}
//System.out.println("fixup score2");
for (int contentID = 0; contentID < 3; contentID++) {
//System.out.println(" term=real" + contentID);
final Map<Float, Float> termScoreMap = new HashMap<>();
scoreMap.put("real" + contentID, termScoreMap);
//System.out.println("term=real" + contentID + " dfold=" + s.docFreq(new Term("content", "real"+contentID)) +
//" dfnew=" + sBlocks.docFreq(new Term("content", "real"+contentID)));
final ScoreDoc[] hits = sBlocks.search(new TermQuery(new Term("content", "real" + contentID)), numDocs).scoreDocs;
for (ScoreDoc hit : hits) {
final GroupDoc gd = groupDocsByID[docIDToIDBlocks[hit.doc]];
assertTrue(gd.score2 == 0.0);
gd.score2 = hit.score;
assertEquals(gd.id, docIDToIDBlocks[hit.doc]);
//System.out.println(" score=" + gd.score + " score2=" + hit.score + " id=" + docIDToIDBlocks[hit.doc]);
termScoreMap.put(gd.score, gd.score2);
}
}
for (int searchIter = 0; searchIter < 100; searchIter++) {
if (VERBOSE) {
System.out.println("\nTEST: searchIter=" + searchIter);
}
final String searchTerm = "real" + random().nextInt(3);
final boolean fillFields = random().nextBoolean();
boolean getScores = random().nextBoolean();
final boolean getMaxScores = random().nextBoolean();
final Sort groupSort = getRandomSort();
//final Sort groupSort = new Sort(new SortField[] {new SortField("sort1", SortField.STRING), new SortField("id", SortField.INT)});
final Sort docSort = getRandomSort();
getScores |= (groupSort.needsScores() || docSort.needsScores());
final int topNGroups = TestUtil.nextInt(random(), 1, 30);
//final int topNGroups = 10;
final int docsPerGroup = TestUtil.nextInt(random(), 1, 50);
final int groupOffset = TestUtil.nextInt(random(), 0, (topNGroups - 1) / 2);
//final int groupOffset = 0;
final int docOffset = TestUtil.nextInt(random(), 0, docsPerGroup - 1);
//final int docOffset = 0;
final boolean doCache = random().nextBoolean();
final boolean doAllGroups = random().nextBoolean();
if (VERBOSE) {
System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq(new Term("content", searchTerm)) + " dFBlock=" + rBlocks.docFreq(new Term("content", searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
}
String groupField = "group";
if (VERBOSE) {
System.out.println(" groupField=" + groupField);
}
final FirstPassGroupingCollector<?> c1 = createRandomFirstPassCollector(groupField, groupSort, groupOffset + topNGroups);
final CachingCollector cCache;
final Collector c;
final AllGroupsCollector<?> allGroupsCollector;
if (doAllGroups) {
allGroupsCollector = createAllGroupsCollector(c1, groupField);
} else {
allGroupsCollector = null;
}
final boolean useWrappingCollector = random().nextBoolean();
if (doCache) {
final double maxCacheMB = random().nextDouble();
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
if (useWrappingCollector) {
if (doAllGroups) {
cCache = CachingCollector.create(c1, true, maxCacheMB);
c = MultiCollector.wrap(cCache, allGroupsCollector);
} else {
c = cCache = CachingCollector.create(c1, true, maxCacheMB);
}
} else {
// Collect only into cache, then replay multiple times:
c = cCache = CachingCollector.create(true, maxCacheMB);
}
} else {
cCache = null;
if (doAllGroups) {
c = MultiCollector.wrap(c1, allGroupsCollector);
} else {
c = c1;
}
}
// Search top reader:
final Query query = new TermQuery(new Term("content", searchTerm));
s.search(query, c);
if (doCache && !useWrappingCollector) {
if (cCache.isCached()) {
// Replay for first-pass grouping
cCache.replay(c1);
if (doAllGroups) {
// Replay for all groups:
cCache.replay(allGroupsCollector);
}
} else {
// Replay by re-running search:
s.search(query, c1);
if (doAllGroups) {
s.search(query, allGroupsCollector);
}
}
}
// Get 1st pass top groups
final Collection<SearchGroup<BytesRef>> topGroups = getSearchGroups(c1, groupOffset, fillFields);
final TopGroups<BytesRef> groupsResult;
if (VERBOSE) {
System.out.println("TEST: first pass topGroups");
if (topGroups == null) {
System.out.println(" null");
} else {
for (SearchGroup<BytesRef> searchGroup : topGroups) {
System.out.println(" " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue) + ": " + Arrays.deepToString(searchGroup.sortValues));
}
}
}
// Get 1st pass top groups using shards
final TopGroups<BytesRef> topGroupsShards = searchShards(s, shards.subSearchers, query, groupSort, docSort, groupOffset, topNGroups, docOffset, docsPerGroup, getScores, getMaxScores, true, true);
final TopGroupsCollector<?> c2;
if (topGroups != null) {
if (VERBOSE) {
System.out.println("TEST: topGroups");
for (SearchGroup<BytesRef> searchGroup : topGroups) {
System.out.println(" " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues));
}
}
c2 = createSecondPassCollector(c1, groupSort, docSort, groupOffset, docOffset + docsPerGroup, getScores, getMaxScores, fillFields);
if (doCache) {
if (cCache.isCached()) {
if (VERBOSE) {
System.out.println("TEST: cache is intact");
}
cCache.replay(c2);
} else {
if (VERBOSE) {
System.out.println("TEST: cache was too large");
}
s.search(query, c2);
}
} else {
s.search(query, c2);
}
if (doAllGroups) {
TopGroups<BytesRef> tempTopGroups = getTopGroups(c2, docOffset);
groupsResult = new TopGroups<>(tempTopGroups, allGroupsCollector.getGroupCount());
} else {
groupsResult = getTopGroups(c2, docOffset);
}
} else {
c2 = null;
groupsResult = null;
if (VERBOSE) {
System.out.println("TEST: no results");
}
}
final TopGroups<BytesRef> expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
if (VERBOSE) {
if (expectedGroups == null) {
System.out.println("TEST: no expected groups");
} else {
System.out.println("TEST: expected groups totalGroupedHitCount=" + expectedGroups.totalGroupedHitCount);
for (GroupDocs<BytesRef> gd : expectedGroups.groups) {
System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue) + " totalHits=" + gd.totalHits + " scoreDocs.len=" + gd.scoreDocs.length);
for (ScoreDoc sd : gd.scoreDocs) {
System.out.println(" id=" + sd.doc + " score=" + sd.score);
}
}
}
if (groupsResult == null) {
System.out.println("TEST: no matched groups");
} else {
System.out.println("TEST: matched groups totalGroupedHitCount=" + groupsResult.totalGroupedHitCount);
for (GroupDocs<BytesRef> gd : groupsResult.groups) {
System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue) + " totalHits=" + gd.totalHits);
for (ScoreDoc sd : gd.scoreDocs) {
System.out.println(" id=" + docIDToID[sd.doc] + " score=" + sd.score);
}
}
if (searchIter == 14) {
for (int docIDX = 0; docIDX < s.getIndexReader().maxDoc(); docIDX++) {
System.out.println("ID=" + docIDToID[docIDX] + " explain=" + s.explain(query, docIDX));
}
}
}
if (topGroupsShards == null) {
System.out.println("TEST: no matched-merged groups");
} else {
System.out.println("TEST: matched-merged groups totalGroupedHitCount=" + topGroupsShards.totalGroupedHitCount);
for (GroupDocs<BytesRef> gd : topGroupsShards.groups) {
System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue) + " totalHits=" + gd.totalHits);
for (ScoreDoc sd : gd.scoreDocs) {
System.out.println(" id=" + docIDToID[sd.doc] + " score=" + sd.score);
}
}
}
}
assertEquals(docIDToID, expectedGroups, groupsResult, true, true, true, getScores, true);
// Confirm merged shards match:
assertEquals(docIDToID, expectedGroups, topGroupsShards, true, false, fillFields, getScores, true);
if (topGroupsShards != null) {
verifyShards(shards.docStarts, topGroupsShards);
}
final boolean needsScores = getScores || getMaxScores || docSort == null;
final BlockGroupingCollector c3 = new BlockGroupingCollector(groupSort, groupOffset + topNGroups, needsScores, sBlocks.createNormalizedWeight(lastDocInBlock, false));
final AllGroupsCollector<BytesRef> allGroupsCollector2;
final Collector c4;
if (doAllGroups) {
// NOTE: must be "group" and not "group_dv"
// (groupField) because we didn't index doc
// values in the block index:
allGroupsCollector2 = new AllGroupsCollector<>(new TermGroupSelector("group"));
c4 = MultiCollector.wrap(c3, allGroupsCollector2);
} else {
allGroupsCollector2 = null;
c4 = c3;
}
// Get block grouping result:
sBlocks.search(query, c4);
@SuppressWarnings({ "unchecked", "rawtypes" }) final TopGroups<BytesRef> tempTopGroupsBlocks = (TopGroups<BytesRef>) c3.getTopGroups(docSort, groupOffset, docOffset, docOffset + docsPerGroup, fillFields);
final TopGroups<BytesRef> groupsResultBlocks;
if (doAllGroups && tempTopGroupsBlocks != null) {
assertEquals((int) tempTopGroupsBlocks.totalGroupCount, allGroupsCollector2.getGroupCount());
groupsResultBlocks = new TopGroups<>(tempTopGroupsBlocks, allGroupsCollector2.getGroupCount());
} else {
groupsResultBlocks = tempTopGroupsBlocks;
}
if (VERBOSE) {
if (groupsResultBlocks == null) {
System.out.println("TEST: no block groups");
} else {
System.out.println("TEST: block groups totalGroupedHitCount=" + groupsResultBlocks.totalGroupedHitCount);
boolean first = true;
for (GroupDocs<BytesRef> gd : groupsResultBlocks.groups) {
System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()) + " totalHits=" + gd.totalHits);
for (ScoreDoc sd : gd.scoreDocs) {
System.out.println(" id=" + docIDToIDBlocks[sd.doc] + " score=" + sd.score);
if (first) {
System.out.println("explain: " + sBlocks.explain(query, sd.doc));
first = false;
}
}
}
}
}
// Get shard'd block grouping result:
final TopGroups<BytesRef> topGroupsBlockShards = searchShards(sBlocks, shardsBlocks.subSearchers, query, groupSort, docSort, groupOffset, topNGroups, docOffset, docsPerGroup, getScores, getMaxScores, false, false);
if (expectedGroups != null) {
// Fixup scores for reader2
for (GroupDocs<?> groupDocsHits : expectedGroups.groups) {
for (ScoreDoc hit : groupDocsHits.scoreDocs) {
final GroupDoc gd = groupDocsByID[hit.doc];
assertEquals(gd.id, hit.doc);
//System.out.println("fixup score " + hit.score + " to " + gd.score2 + " vs " + gd.score);
hit.score = gd.score2;
}
}
final SortField[] sortFields = groupSort.getSort();
final Map<Float, Float> termScoreMap = scoreMap.get(searchTerm);
for (int groupSortIDX = 0; groupSortIDX < sortFields.length; groupSortIDX++) {
if (sortFields[groupSortIDX].getType() == SortField.Type.SCORE) {
for (GroupDocs<?> groupDocsHits : expectedGroups.groups) {
if (groupDocsHits.groupSortValues != null) {
//System.out.println("remap " + groupDocsHits.groupSortValues[groupSortIDX] + " to " + termScoreMap.get(groupDocsHits.groupSortValues[groupSortIDX]));
groupDocsHits.groupSortValues[groupSortIDX] = termScoreMap.get(groupDocsHits.groupSortValues[groupSortIDX]);
assertNotNull(groupDocsHits.groupSortValues[groupSortIDX]);
}
}
}
}
final SortField[] docSortFields = docSort.getSort();
for (int docSortIDX = 0; docSortIDX < docSortFields.length; docSortIDX++) {
if (docSortFields[docSortIDX].getType() == SortField.Type.SCORE) {
for (GroupDocs<?> groupDocsHits : expectedGroups.groups) {
for (ScoreDoc _hit : groupDocsHits.scoreDocs) {
FieldDoc hit = (FieldDoc) _hit;
if (hit.fields != null) {
hit.fields[docSortIDX] = termScoreMap.get(hit.fields[docSortIDX]);
assertNotNull(hit.fields[docSortIDX]);
}
}
}
}
}
}
assertEquals(docIDToIDBlocks, expectedGroups, groupsResultBlocks, false, true, true, getScores, false);
assertEquals(docIDToIDBlocks, expectedGroups, topGroupsBlockShards, false, false, fillFields, getScores, false);
}
r.close();
dir.close();
rBlocks.close();
dirBlocks.close();
}
}
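One detail the score-fixup loop near the end relies on is that ScoreDoc exposes doc and score as plain public, mutable fields, so expected hits can be remapped in place. A tiny illustration with made-up values:

import org.apache.lucene.search.ScoreDoc;

public class ScoreDocFixupSketch {
  public static void main(String[] args) {
    ScoreDoc hit = new ScoreDoc(42, 1.5f);
    // score is a public field, so tests can patch it in place,
    // as the "Fixup scores for reader2" loop does above.
    hit.score = 0.75f;
    System.out.println("doc=" + hit.doc + " score=" + hit.score);
  }
}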