Use of org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc in project lucene-solr by apache.
The class TopSuggestDocsCollector, method get():
/**
 * Returns at most <code>num</code> top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
 */
public TopSuggestDocs get() throws IOException {
  SuggestScoreDoc[] suggestScoreDocs;
  if (seenSurfaceForms != null) {
    // NOTE: this also clears the priorityQueue:
    for (SuggestScoreDoc hit : priorityQueue.getResults()) {
      pendingResults.add(hit);
    }

    // Deduplicate all hits: we already dedup'd efficiently within each segment by
    // truncating the FST top paths search, but across segments there may still be dups:
    seenSurfaceForms.clear();

    // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of
    // O(N * log(N)), where N = O(num * numSegments), but typically numSegments is smallish
    // and num is smallish so this won't matter much in practice:
    Collections.sort(pendingResults, new Comparator<SuggestScoreDoc>() {
      @Override
      public int compare(SuggestScoreDoc a, SuggestScoreDoc b) {
        // sort by higher score:
        int cmp = Float.compare(b.score, a.score);
        if (cmp == 0) {
          // tie-break by lower docID:
          cmp = Integer.compare(a.doc, b.doc);
        }
        return cmp;
      }
    });

    List<SuggestScoreDoc> hits = new ArrayList<>();
    for (SuggestScoreDoc hit : pendingResults) {
      if (seenSurfaceForms.contains(hit.key) == false) {
        seenSurfaceForms.add(hit.key);
        hits.add(hit);
        if (hits.size() == num) {
          break;
        }
      }
    }
    suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]);
  } else {
    suggestScoreDocs = priorityQueue.getResults();
  }

  if (suggestScoreDocs.length > 0) {
    return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
  } else {
    return TopSuggestDocs.EMPTY;
  }
}
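
For context, here is how this collector is typically driven; a minimal usage sketch that mirrors the pattern in testRandom further down this page, assuming an analyzer and a SuggestIndexSearcher already set up over an index containing SuggestFields:

// Minimal usage sketch (pattern taken from testRandom below); analyzer and
// suggestIndexSearcher are assumed to be set up as in the tests on this page:
PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "ab"));
TopSuggestDocsCollector collector = new TopSuggestDocsCollector(10, true); // num=10, skipDuplicates=true
suggestIndexSearcher.suggest(query, collector);
TopSuggestDocs top = collector.get(); // at most 10 score-sorted, deduplicated suggestions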
Use of org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc in project lucene-solr by apache.
The class TestSuggestField, method testReturnedDocID:
@Test
public void testReturnedDocID() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
  int num = Math.min(1000, atLeast(10));
  for (int i = 0; i < num; i++) {
    Document document = new Document();
    document.add(new SuggestField("suggest_field", "abc_" + i, num));
    document.add(new StoredField("int_field", i));
    iw.addDocument(document);
    if (random().nextBoolean()) {
      iw.commit();
    }
  }

  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
  PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc_"));
  TopSuggestDocs suggest = indexSearcher.suggest(query, num, false);
  assertEquals(num, suggest.totalHits);
  for (SuggestScoreDoc suggestScoreDoc : suggest.scoreLookupDocs()) {
    String key = suggestScoreDoc.key.toString();
    assertTrue(key.startsWith("abc_"));
    String substring = key.substring(4);
    int fieldValue = Integer.parseInt(substring);
    // the suggestion's docID must resolve back to the document that indexed it:
    Document doc = reader.document(suggestScoreDoc.doc);
    assertEquals(doc.getField("int_field").numericValue().intValue(), fieldValue);
  }
  reader.close();
  iw.close();
}
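
Every test on this page builds its writer config with iwcWithSuggestField, a TestSuggestField helper that is not reproduced here. Its job is to route the named suggest fields to a completion-aware postings format, which SuggestField requires. A plausible sketch of such a helper, assuming a Lucene 7.x codec (the concrete codec class varies by version):

// Sketch of an iwcWithSuggestField-style helper (assumed; the real helper lives in TestSuggestField):
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... suggestFields) {
  final Set<String> fields = new HashSet<>(Arrays.asList(suggestFields));
  IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
  iwc.setCodec(new Lucene70Codec() {
    final PostingsFormat completionFormat = new Completion50PostingsFormat();
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      // suggest fields get the completion postings format, everything else the default:
      return fields.contains(field) ? completionFormat : super.getPostingsFormatForField(field);
    }
  });
  return iwc;
}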
Use of org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc in project lucene-solr by apache.
The class TestSuggestField, method testRandom:
public void testRandom() throws Exception {
  int numDigits = TestUtil.nextInt(random(), 1, 6);
  Set<String> keys = new HashSet<>();
  int keyCount = TestUtil.nextInt(random(), 1, 20);
  if (numDigits == 1) {
    keyCount = Math.min(9, keyCount);
  }
  while (keys.size() < keyCount) {
    keys.add(randomSimpleString(numDigits, 10));
  }
  List<String> keysList = new ArrayList<>(keys);

  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = iwcWithSuggestField(analyzer, "suggest_field");
  // we rely on docID order:
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  int docCount = TestUtil.nextInt(random(), 1, 200);
  Entry[] docs = new Entry[docCount];
  for (int i = 0; i < docCount; i++) {
    int weight = random().nextInt(40);
    String key = keysList.get(random().nextInt(keyCount));
    //System.out.println("KEY: " + key);
    docs[i] = new Entry(key, null, weight, i);
    Document doc = new Document();
    doc.add(new SuggestField("suggest_field", key, weight));
    iw.addDocument(doc);
    if (usually()) {
      iw.commit();
    }
  }

  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);

  int iters = atLeast(200);
  for (int iter = 0; iter < iters; iter++) {
    String prefix = randomSimpleString(numDigits, 2);
    if (VERBOSE) {
      System.out.println("\nTEST: prefix=" + prefix);
    }

    // slow but hopefully correct suggester:
    List<Entry> expected = new ArrayList<>();
    for (Entry doc : docs) {
      if (doc.output.startsWith(prefix)) {
        expected.add(doc);
      }
    }

    Collections.sort(expected, new Comparator<Entry>() {
      @Override
      public int compare(Entry a, Entry b) {
        // sort by higher score:
        int cmp = Float.compare(b.value, a.value);
        if (cmp == 0) {
          // tie-break by smaller docID:
          cmp = Integer.compare(a.id, b.id);
        }
        return cmp;
      }
    });

    boolean dedup = random().nextBoolean();
    if (dedup) {
      List<Entry> deduped = new ArrayList<>();
      Set<String> seen = new HashSet<>();
      for (Entry entry : expected) {
        if (seen.contains(entry.output) == false) {
          seen.add(entry.output);
          deduped.add(entry);
        }
      }
      expected = deduped;
    }

    // TODO: re-enable this, except something is buggy about tie breaks at the topN threshold now:
    //int topN = TestUtil.nextInt(random(), 1, docCount+10);
    int topN = docCount;

    if (VERBOSE) {
      if (dedup) {
        System.out.println(" expected (dedup'd) topN=" + topN + ":");
      } else {
        System.out.println(" expected topN=" + topN + ":");
      }
      for (int i = 0; i < expected.size(); i++) {
        if (i >= topN) {
          System.out.println(" leftover: " + i + ": " + expected.get(i));
        } else {
          System.out.println(" " + i + ": " + expected.get(i));
        }
      }
    }
    expected = expected.subList(0, Math.min(topN, expected.size()));

    PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", prefix));
    TopSuggestDocsCollector collector = new TopSuggestDocsCollector(topN, dedup);
    searcher.suggest(query, collector);
    TopSuggestDocs actual = collector.get();
    if (VERBOSE) {
      System.out.println(" actual:");
      SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
      for (int i = 0; i < suggestScoreDocs.length; i++) {
        System.out.println(" " + i + ": " + suggestScoreDocs[i]);
      }
    }

    assertSuggestions(actual, expected.toArray(new Entry[expected.size()]));
  }

  reader.close();
  iw.close();
}
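
assertSuggestions is another TestSuggestField helper not shown on this page. Judging from its use above, it compares the returned SuggestScoreDocs against the expected entries by surface form and score, roughly:

// Plausible reconstruction of assertSuggestions (inferred from usage, not the verbatim helper):
static void assertSuggestions(TopSuggestDocs actual, Entry... expected) {
  SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
  assertEquals(expected.length, suggestScoreDocs.length);
  for (int i = 0; i < suggestScoreDocs.length; i++) {
    assertEquals(expected[i].output, suggestScoreDocs[i].key.toString());
    assertEquals(expected[i].value, suggestScoreDocs[i].score, 0f);
  }
}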
Use of org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc in project lucene-solr by apache.
The class TestSuggestField, method testScoring:
@Test
public void testScoring() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
  int num = Math.min(1000, atLeast(100));
  String[] prefixes = { "abc", "bac", "cab" };
  Map<String, Integer> mappings = new HashMap<>();
  for (int i = 0; i < num; i++) {
    Document document = new Document();
    String suggest = prefixes[i % 3] + TestUtil.randomSimpleString(random(), 10) + "_" + String.valueOf(i);
    int weight = random().nextInt(Integer.MAX_VALUE);
    document.add(new SuggestField("suggest_field", suggest, weight));
    mappings.put(suggest, weight);
    iw.addDocument(document);
    if (usually()) {
      iw.commit();
    }
  }

  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
  for (String prefix : prefixes) {
    PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", prefix));
    TopSuggestDocs suggest = indexSearcher.suggest(query, num, false);
    assertTrue(suggest.totalHits > 0);
    float topScore = -1;
    for (SuggestScoreDoc scoreDoc : suggest.scoreLookupDocs()) {
      if (topScore != -1) {
        assertTrue(topScore >= scoreDoc.score);
      }
      topScore = scoreDoc.score;
      assertThat((float) mappings.get(scoreDoc.key.toString()), equalTo(scoreDoc.score));
      assertNotNull(mappings.remove(scoreDoc.key.toString()));
    }
  }

  assertThat(mappings.size(), equalTo(0));
  reader.close();
  iw.close();
}
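
Entry, the expected-result holder used throughout these tests, is also not shown on this page; its shape can be inferred from calls like new Entry(key, null, weight, i) and new Entry("apple", 4) above. A hypothetical reconstruction:

// Hypothetical Entry test helper, inferred from its usage on this page:
static class Entry {
  final String output;  // expected surface form (matched against SuggestScoreDoc.key)
  final String context; // optional context, null in these tests
  final float value;    // indexed weight, which surfaces as the suggestion score
  final int id;         // expected docID; -1 when the test does not check it

  Entry(String output, float value) {
    this(output, null, value, -1);
  }

  Entry(String output, String context, float value, int id) {
    this.output = output;
    this.context = context;
    this.value = value;
    this.id = id;
  }
}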
Use of org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc in project lucene-solr by apache.
The class TestSuggestField, method testMultipleSuggestFieldsPerDoc:
@Test
public void testMultipleSuggestFieldsPerDoc() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "sug_field_1", "sug_field_2"));

  Document document = new Document();
  document.add(new SuggestField("sug_field_1", "apple", 4));
  document.add(new SuggestField("sug_field_2", "april", 3));
  iw.addDocument(document);
  document = new Document();
  document.add(new SuggestField("sug_field_1", "aples", 3));
  document.add(new SuggestField("sug_field_2", "apartment", 2));
  iw.addDocument(document);

  if (rarely()) {
    iw.commit();
  }

  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader);

  PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("sug_field_1", "ap"));
  TopSuggestDocs suggestDocs1 = suggestIndexSearcher.suggest(query, 4, false);
  assertSuggestions(suggestDocs1, new Entry("apple", 4), new Entry("aples", 3));

  query = new PrefixCompletionQuery(analyzer, new Term("sug_field_2", "ap"));
  TopSuggestDocs suggestDocs2 = suggestIndexSearcher.suggest(query, 4, false);
  assertSuggestions(suggestDocs2, new Entry("april", 3), new Entry("apartment", 2));

  // check that the doc ids are consistent
  for (int i = 0; i < suggestDocs1.scoreDocs.length; i++) {
    ScoreDoc suggestScoreDoc = suggestDocs1.scoreDocs[i];
    assertThat(suggestScoreDoc.doc, equalTo(suggestDocs2.scoreDocs[i].doc));
  }

  reader.close();
  iw.close();
}