use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.
the class TestFreeTextSuggester method testRandom.
/**
 * Randomized test: builds a {@code FreeTextSuggester} over randomly generated documents and,
 * for many random queries, compares its lookup results against an expected list computed here
 * from brute-force n-gram counts with backoff.
 *
 * @throws IOException declared by the method; thrown by calls made in the body
 */
public void testRandom() throws IOException {
// Pick between 2 and 10 distinct random terms to act as the vocabulary.
String[] terms = new String[TestUtil.nextInt(random(), 2, 10)];
Set<String> seen = new HashSet<>();
while (seen.size() < terms.length) {
String token = TestUtil.randomSimpleString(random(), 1, 5);
if (!seen.contains(token)) {
terms[seen.size()] = token;
seen.add(token);
}
}
Analyzer a = new MockAnalyzer(random());
int numDocs = atLeast(10);
// Total number of tokens over all docs; used below as the context count for unigrams.
long totTokens = 0;
// Each doc is an array of tokens drawn from terms via getZipfToken
// (helper defined elsewhere; presumably Zipf-distributed -- confirm).
final String[][] docs = new String[numDocs][];
for (int i = 0; i < numDocs; i++) {
docs[i] = new String[atLeast(100)];
if (VERBOSE) {
System.out.print(" doc " + i + ":");
}
for (int j = 0; j < docs[i].length; j++) {
docs[i][j] = getZipfToken(terms);
if (VERBOSE) {
System.out.print(" " + docs[i][j]);
}
}
if (VERBOSE) {
System.out.println();
}
totTokens += docs[i].length;
}
// Maximum n-gram order for this run (1..4).
int grams = TestUtil.nextInt(random(), 1, 4);
if (VERBOSE) {
System.out.println("TEST: " + terms.length + " terms; " + numDocs + " docs; " + grams + " grams");
}
// Build suggester model:
// NOTE(review): 0x20 is the ASCII space byte, presumably the n-gram separator -- confirm against FreeTextSuggester.
FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte) 0x20);
// Feed each doc to the suggester as one string of its tokens, each preceded by a space.
sug.build(new InputIterator() {
// Index of the next doc to return.
int upto;
@Override
public BytesRef next() {
if (upto == docs.length) {
// All docs consumed.
return null;
} else {
StringBuilder b = new StringBuilder();
for (String token : docs[upto]) {
b.append(' ');
b.append(token);
}
upto++;
return new BytesRef(b.toString());
}
}
@Override
public long weight() {
// Random weight; the expected scores computed below do not use input weights.
return random().nextLong();
}
@Override
public BytesRef payload() {
return null;
}
@Override
public boolean hasPayloads() {
return false;
}
@Override
public Set<BytesRef> contexts() {
return null;
}
@Override
public boolean hasContexts() {
return false;
}
});
// Build inefficient but hopefully correct model:
// gramCounts.get(g) maps every run of (g + 1) consecutive tokens (space-joined) to how often it occurs in docs.
List<Map<String, Integer>> gramCounts = new ArrayList<>(grams);
for (int gram = 0; gram < grams; gram++) {
if (VERBOSE) {
System.out.println("TEST: build model for gram=" + gram);
}
Map<String, Integer> model = new HashMap<>();
gramCounts.add(model);
for (String[] doc : docs) {
// Stop early enough that the window [i, i + gram] stays inside the doc.
for (int i = 0; i < doc.length - gram; i++) {
StringBuilder b = new StringBuilder();
for (int j = i; j <= i + gram; j++) {
if (j > i) {
b.append(' ');
}
b.append(doc[j]);
}
String token = b.toString();
Integer curCount = model.get(token);
if (curCount == null) {
model.put(token, 1);
} else {
model.put(token, 1 + curCount);
}
if (VERBOSE) {
System.out.println(" add '" + token + "' -> count=" + model.get(token));
}
}
}
}
int lookups = atLeast(100);
for (int iter = 0; iter < lookups; iter++) {
// Random query of 1..5 vocabulary tokens.
String[] tokens = new String[TestUtil.nextInt(random(), 1, 5)];
for (int i = 0; i < tokens.length; i++) {
tokens[i] = getZipfToken(terms);
}
// Maybe trim last token; be sure not to create the
// empty string:
int trimStart;
if (tokens.length == 1) {
// A single-token query must keep at least one character.
trimStart = 1;
} else {
trimStart = 0;
}
int trimAt = TestUtil.nextInt(random(), trimStart, tokens[tokens.length - 1].length());
tokens[tokens.length - 1] = tokens[tokens.length - 1].substring(0, trimAt);
// Number of suggestions to request.
int num = TestUtil.nextInt(random(), 1, 100);
StringBuilder b = new StringBuilder();
for (String token : tokens) {
b.append(' ');
b.append(token);
}
String query = b.toString();
// Drop the leading space added by the join loop above.
query = query.substring(1);
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
}
// Expected:
List<LookupResult> expected = new ArrayList<>();
// Multiplier applied to scores; scaled by FreeTextSuggester.ALPHA each time we drop to a lower-order model.
double backoff = 1.0;
// Reuse 'seen': from here on it holds last tokens already kept from a higher-order model.
seen = new HashSet<>();
if (VERBOSE) {
System.out.println(" compute expected");
}
// Walk the models from the highest order down to unigrams (i is the model index, i.e. order - 1).
for (int i = grams - 1; i >= 0; i--) {
if (VERBOSE) {
System.out.println(" grams=" + i);
}
if (tokens.length < i + 1) {
// Don't have enough tokens to use this model
if (VERBOSE) {
System.out.println(" skip");
}
continue;
}
if (i == 0 && tokens[tokens.length - 1].length() == 0) {
// Never suggest unigrams from empty string:
if (VERBOSE) {
System.out.println(" skip unigram priors only");
}
continue;
}
// Build up "context" ngram:
// The context is the i tokens immediately preceding the last (possibly trimmed) token.
b = new StringBuilder();
for (int j = tokens.length - i - 1; j < tokens.length - 1; j++) {
b.append(' ');
b.append(tokens[j]);
}
String context = b.toString();
if (context.length() > 0) {
// Strip the leading space from the join.
context = context.substring(1);
}
if (VERBOSE) {
System.out.println(" context='" + context + "'");
}
long contextCount;
if (context.length() == 0) {
// No context (unigram model): normalize by the total token count.
contextCount = totTokens;
} else {
// The context has i tokens, so its count lives in the model at index i - 1.
Integer count = gramCounts.get(i - 1).get(context);
if (count == null) {
// We never saw this context:
// Backoff is still applied before moving on to the next lower order.
backoff *= FreeTextSuggester.ALPHA;
if (VERBOSE) {
System.out.println(" skip: never saw context");
}
continue;
}
contextCount = count;
}
if (VERBOSE) {
System.out.println(" contextCount=" + contextCount);
}
Map<String, Integer> model = gramCounts.get(i);
// First pass, gather all predictions for this model:
if (VERBOSE) {
System.out.println(" find terms w/ prefix=" + tokens[tokens.length - 1]);
}
List<LookupResult> tmp = new ArrayList<>();
for (String term : terms) {
if (term.startsWith(tokens[tokens.length - 1])) {
if (VERBOSE) {
System.out.println(" term=" + term);
}
if (seen.contains(term)) {
if (VERBOSE) {
System.out.println(" skip seen");
}
continue;
}
// trim() removes the leading space when the context is empty.
String ngram = (context + " " + term).trim();
Integer count = model.get(ngram);
if (count != null) {
// Score = Long.MAX_VALUE * backoff * (ngram count / context count), truncated to long.
LookupResult lr = new LookupResult(ngram, (long) (Long.MAX_VALUE * (backoff * (double) count / contextCount)));
tmp.add(lr);
if (VERBOSE) {
System.out.println(" add tmp key='" + lr.key + "' score=" + lr.value);
}
}
}
}
// Second pass, trim to only top N, and fold those
// into overall suggestions:
// byScoreThenKey is a comparator defined outside this method.
Collections.sort(tmp, byScoreThenKey);
if (tmp.size() > num) {
tmp.subList(num, tmp.size()).clear();
}
for (LookupResult result : tmp) {
String key = result.key.toString();
// The predicted term is whatever follows the last space of the key (or the whole key).
int idx = key.lastIndexOf(' ');
String lastToken;
if (idx != -1) {
lastToken = key.substring(idx + 1);
} else {
lastToken = key;
}
if (!seen.contains(lastToken)) {
seen.add(lastToken);
expected.add(result);
if (VERBOSE) {
System.out.println(" keep key='" + result.key + "' score=" + result.value);
}
}
}
backoff *= FreeTextSuggester.ALPHA;
}
// Final ordering and truncation of the merged expectations to the requested size.
Collections.sort(expected, byScoreThenKey);
if (expected.size() > num) {
expected.subList(num, expected.size()).clear();
}
// Actual:
List<LookupResult> actual = sug.lookup(query, num);
if (VERBOSE) {
System.out.println(" expected: " + expected);
System.out.println(" actual: " + actual);
}
// Compare through the lists' string forms.
assertEquals(expected.toString(), actual.toString());
}
a.close();
}
use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.
the class SuggestComponent method toSuggesterResult.
/**
 * Rebuilds a {@link SuggesterResult} from the NamedList form of a suggester response.
 *
 * @param suggestionsMap suggester name mapped to its per-token suggestion properties; may be null
 * @return a new SuggesterResult; when {@code suggestionsMap} is null nothing is added to it
 */
private SuggesterResult toSuggesterResult(Map<String, SimpleOrderedMap<NamedList<Object>>> suggestionsMap) {
  SuggesterResult converted = new SuggesterResult();
  if (suggestionsMap != null) {
    // one outer entry per suggester
    for (Map.Entry<String, SimpleOrderedMap<NamedList<Object>>> perSuggester : suggestionsMap.entrySet()) {
      String suggesterName = perSuggester.getKey();
      Iterator<Map.Entry<String, NamedList<Object>>> tokenIter = perSuggester.getValue().iterator();
      // one inner entry per token
      while (tokenIter.hasNext()) {
        Map.Entry<String, NamedList<Object>> perToken = tokenIter.next();
        String tokenString = perToken.getKey();
        List<LookupResult> collected = new ArrayList<>();
        NamedList<Object> properties = perToken.getValue();
        // walk every property of this token; only the suggestions list contributes lookup results
        for (int idx = 0; idx < properties.size(); idx++) {
          String propertyName = properties.getName(idx);
          if (propertyName.equals(SuggesterResultLabels.SUGGESTIONS)) {
            @SuppressWarnings("unchecked")
            List<NamedList<Object>> entries = (List<NamedList<Object>>) properties.getVal(idx);
            for (NamedList<Object> suggestionEntry : entries) {
              String term = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_TERM);
              Long weight = (Long) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_WEIGHT);
              String payload = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_PAYLOAD);
              collected.add(new LookupResult(new CharsRef(term), weight, new BytesRef(payload)));
            }
          }
          // invoked once per property, whether or not it was the suggestions list
          converted.add(suggesterName, tokenString, collected);
        }
      }
    }
  }
  return converted;
}
use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.
the class WFSTCompletionTest method testExactFirst.
/**
 * Builds a WFSTCompletionLookup (third constructor argument {@code true}; presumably the
 * exact-first flag -- confirm) from "x y" (weight 20) and "x" (weight 2), then checks that
 * looking up "x" returns the exact match "x" first and "x y" second, for topN of 1 to 3.
 *
 * @throws Exception if building, looking up or closing the directory fails
 */
public void testExactFirst() throws Exception {
  // try-with-resources: the directory is closed even when an assertion below fails.
  try (Directory tempDir = getDirectory()) {
    WFSTCompletionLookup suggester = new WFSTCompletionLookup(tempDir, "wfst", true);
    suggester.build(new InputArrayIterator(new Input[] { new Input("x y", 20), new Input("x", 2) }));
    for (int topN = 1; topN < 4; topN++) {
      List<LookupResult> results = suggester.lookup("x", false, topN);
      // Only two entries exist, so at most two results come back.
      assertEquals(Math.min(topN, 2), results.size());
      // Compare keys through toString() so the check does not depend on the key's concrete type.
      assertEquals("x", results.get(0).key.toString());
      assertEquals(2, results.get(0).value);
      if (topN > 1) {
        assertEquals("x y", results.get(1).key.toString());
        assertEquals(20, results.get(1).value);
      }
    }
  }
}
use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.
the class WFSTCompletionTest method testNonExactFirst.
/**
 * Builds a WFSTCompletionLookup (third constructor argument {@code false}; presumably the
 * exact-first flag -- confirm) from "x y" (weight 20) and "x" (weight 2), then checks that
 * looking up "x" returns the higher-weighted "x y" first and "x" second, for topN of 1 to 3.
 *
 * @throws Exception if building, looking up or closing the directory fails
 */
public void testNonExactFirst() throws Exception {
  // try-with-resources: the directory is closed even when an assertion below fails.
  try (Directory tempDir = getDirectory()) {
    WFSTCompletionLookup suggester = new WFSTCompletionLookup(tempDir, "wfst", false);
    suggester.build(new InputArrayIterator(new Input[] { new Input("x y", 20), new Input("x", 2) }));
    for (int topN = 1; topN < 4; topN++) {
      List<LookupResult> results = suggester.lookup("x", false, topN);
      // Only two entries exist, so at most two results come back.
      assertEquals(Math.min(topN, 2), results.size());
      // Compare keys through toString() so the check does not depend on the key's concrete type.
      assertEquals("x y", results.get(0).key.toString());
      assertEquals(20, results.get(0).value);
      if (topN > 1) {
        assertEquals("x", results.get(1).key.toString());
        assertEquals(2, results.get(1).value);
      }
    }
  }
}
use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.
the class FSTCompletionTest method testMultilingualInput.
/**
 * Builds an FSTCompletionLookup from the input returned by
 * {@code LookupBenchmarkTest.readTop50KWiki()}, checks that every input term can be fetched
 * and is its own top suggestion, and that the prefix "wit" yields five results starting with
 * "wit" followed by "with".
 *
 * @throws Exception if reading the input, building, looking up or closing the directory fails
 */
public void testMultilingualInput() throws Exception {
  List<Input> input = LookupBenchmarkTest.readTop50KWiki();
  // try-with-resources: the directory is closed even when an assertion below fails.
  try (Directory tempDir = getDirectory()) {
    FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "fst");
    lookup.build(new InputArrayIterator(input));
    assertEquals(input.size(), lookup.getCount());
    for (Input tf : input) {
      assertNotNull("Not found: " + tf.term.toString(), lookup.get(TestUtil.bytesToCharSequence(tf.term, random())));
      assertEquals(tf.term.utf8ToString(), lookup.lookup(TestUtil.bytesToCharSequence(tf.term, random()), true, 1).get(0).key.toString());
    }
    List<LookupResult> result = lookup.lookup(stringToCharSequence("wit"), true, 5);
    assertEquals(5, result.size());
    // assertEquals (rather than assertTrue on equals) reports the actual key on failure.
    // exact match.
    assertEquals("wit", result.get(0).key.toString());
    // highest count.
    assertEquals("with", result.get(1).key.toString());
  }
}
Aggregations