Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.
From the class AnalyzingSuggesterTest, method testGraphDups.
public void testGraphDups() throws Exception {
  final Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer) {

        int tokenStreamCounter = 0;

        final TokenStream[] tokenStreams = new TokenStream[] {
            new CannedTokenStream(new Token[] {
                token("wifi", 1, 1),
                token("hotspot", 0, 2),
                token("network", 1, 1),
                token("is", 1, 1),
                token("slow", 1, 1) }),
            new CannedTokenStream(new Token[] {
                token("wi", 1, 1),
                token("hotspot", 0, 3),
                token("fi", 1, 1),
                token("network", 1, 1),
                token("is", 1, 1),
                token("fast", 1, 1) }),
            new CannedTokenStream(new Token[] {
                token("wifi", 1, 1),
                token("hotspot", 0, 2),
                token("network", 1, 1) }) };

        @Override
        public TokenStream getTokenStream() {
          TokenStream result = tokenStreams[tokenStreamCounter];
          tokenStreamCounter++;
          return result;
        }

        @Override
        protected void setReader(final Reader reader) {
        }
      };
    }
  };

  Input[] keys = new Input[] {
      new Input("wifi network is slow", 50),
      new Input("wi fi network is fast", 10) };
  //AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, AnalyzingSuggester.EXACT_FIRST, 256, -1);
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
  suggester.build(new InputArrayIterator(keys));
  List<LookupResult> results = suggester.lookup("wifi network", false, 10);
  if (VERBOSE) {
    System.out.println("Results: " + results);
  }
  assertEquals(2, results.size());
  assertEquals("wifi network is slow", results.get(0).key);
  assertEquals(50, results.get(0).value);
  assertEquals("wi fi network is fast", results.get(1).key);
  assertEquals(10, results.get(1).value);
  IOUtils.close(analyzer, tempDir);
}
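Note: the token(...) helper used in the canned token streams above is defined elsewhere in AnalyzingSuggesterTest and is not part of this excerpt. A sketch of what it looks like, assuming the usual Token API (the start/end offsets are not significant for these tests):

// Sketch of the token(...) helper (not shown in this excerpt): it
// builds a Token with an explicit position increment and position
// length, which is how the multi-path "graph" token streams above
// are constructed:
private static Token token(String term, int posInc, int posLength) {
  final Token t = new Token(term, 0, 0);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}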
Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.
From the class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults.
public void testDupSurfaceFormsMissingResults() throws Exception {
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer) {

        @Override
        public TokenStream getTokenStream() {
          return new CannedTokenStream(new Token[] {
              token("hairy", 1, 1),
              token("smelly", 0, 1),
              token("dog", 1, 1) });
        }

        @Override
        protected void setReader(final Reader reader) {
        }
      };
    }
  };

  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, -1, true);
  suggester.build(new InputArrayIterator(shuffle(new Input("hambone", 6), new Input("nellie", 5))));
  List<LookupResult> results = suggester.lookup("nellie", false, 2);
  assertEquals(2, results.size());
  assertEquals("hambone", results.get(0).key);
  assertEquals(6, results.get(0).value);
  assertEquals("nellie", results.get(1).key);
  assertEquals(5, results.get(1).value);

  // Try again after save/load:
  Path tmpDir = createTempDir("AnalyzingSuggesterTest");
  Path path = tmpDir.resolve("suggester");
  OutputStream os = Files.newOutputStream(path);
  suggester.store(os);
  os.close();
  InputStream is = Files.newInputStream(path);
  suggester.load(is);
  is.close();
  results = suggester.lookup("nellie", false, 2);
  assertEquals(2, results.size());
  assertEquals("hambone", results.get(0).key);
  assertEquals(6, results.get(0).value);
  assertEquals("nellie", results.get(1).key);
  assertEquals(5, results.get(1).value);
  IOUtils.close(a, tempDir);
}
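The shuffle(...) helper is likewise defined on the test class rather than shown here. A plausible sketch: it copies the inputs into a list and shuffles them with the test's random(), so that build order cannot affect the expected results. Note that InputArrayIterator accepts an Iterable<Input> as well as an Input[]:

// Sketch of the shuffle(...) helper (not shown in this excerpt):
@SafeVarargs
public final <T> Iterable<T> shuffle(T... values) {
  final List<T> asList = new ArrayList<>(values.length);
  for (T value : values) {
    asList.add(value);
  }
  // Randomize order using the test framework's seeded random():
  Collections.shuffle(asList, random());
  return asList;
}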
Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.
From the class AnalyzingSuggesterTest, method testRandom.
public void testRandom() throws Exception {
  int numQueries = atLeast(1000);

  final List<TermFreq2> slowCompletor = new ArrayList<>();
  final TreeSet<String> allPrefixes = new TreeSet<>();
  final Set<String> seen = new HashSet<>();

  boolean doPayloads = random().nextBoolean();

  Input[] keys = null;
  Input[] payloadKeys = null;
  if (doPayloads) {
    payloadKeys = new Input[numQueries];
  } else {
    keys = new Input[numQueries];
  }

  boolean preserveSep = random().nextBoolean();

  final int numStopChars = random().nextInt(10);
  final boolean preserveHoles = random().nextBoolean();

  if (VERBOSE) {
    System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
  }

  for (int i = 0; i < numQueries; i++) {
    int numTokens = TestUtil.nextInt(random(), 1, 4);
    String key;
    String analyzedKey;
    while (true) {
      key = "";
      analyzedKey = "";
      boolean lastRemoved = false;
      for (int token = 0; token < numTokens; token++) {
        String s;
        while (true) {
          // TODO: would be nice to fix this slowCompletor/comparator to
          // use full range, but we might lose some coverage too...
          s = TestUtil.randomSimpleString(random());
          if (s.length() > 0) {
            if (token > 0) {
              key += " ";
            }
            if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length() - 1) != SEP) {
              analyzedKey += SEP;
            }
            key += s;
            if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
              lastRemoved = true;
              if (preserveSep && preserveHoles) {
                analyzedKey += SEP;
              }
            } else {
              lastRemoved = false;
              analyzedKey += s;
            }
            break;
          }
        }
      }

      analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");

      if (preserveSep && lastRemoved) {
        analyzedKey += SEP;
      }

      // Don't add same surface form more than once:
      if (!seen.contains(key)) {
        seen.add(key);
        break;
      }
    }

    for (int j = 1; j < key.length(); j++) {
      allPrefixes.add(key.substring(0, j));
    }
    // we can probably do Integer.MAX_VALUE here, but why worry.
    int weight = random().nextInt(1 << 24);
    BytesRef payload;
    if (doPayloads) {
      byte[] bytes = new byte[random().nextInt(10)];
      random().nextBytes(bytes);
      payload = new BytesRef(bytes);
      payloadKeys[i] = new Input(key, weight, payload);
    } else {
      keys[i] = new Input(key, weight);
      payload = null;
    }

    slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));
  }

  if (VERBOSE) {
    // Don't just sort original list, to avoid VERBOSE
    // altering the test:
    List<TermFreq2> sorted = new ArrayList<>(slowCompletor);
    Collections.sort(sorted);
    for (TermFreq2 ent : sorted) {
      System.out.println("  surface='" + ent.surfaceForm + "' analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
    }
  }

  Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true);
  if (doPayloads) {
    suggester.build(new InputArrayIterator(shuffle(payloadKeys)));
  } else {
    suggester.build(new InputArrayIterator(shuffle(keys)));
  }

  for (String prefix : allPrefixes) {
    if (VERBOSE) {
      System.out.println("\nTEST: prefix=" + prefix);
    }

    final int topN = TestUtil.nextInt(random(), 1, 10);
    List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);

    // 2. go thru whole set to find suggestions:
    List<TermFreq2> matches = new ArrayList<>();

    // "Analyze" the key:
    String[] tokens = prefix.split(" ");
    StringBuilder builder = new StringBuilder();
    boolean lastRemoved = false;
    for (int i = 0; i < tokens.length; i++) {
      String token = tokens[i];
      if (preserveSep && builder.length() > 0 && !builder.toString().endsWith("" + SEP)) {
        builder.append(SEP);
      }

      if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
        if (preserveSep && preserveHoles) {
          builder.append(SEP);
        }
        lastRemoved = true;
      } else {
        builder.append(token);
        lastRemoved = false;
      }
    }

    String analyzedKey = builder.toString();
    // Strip any trailing separators from the analyzed form (there's an
    // issue open for this):
    while (true) {
      String s = analyzedKey.replaceAll(SEP + "$", "");
      if (s.equals(analyzedKey)) {
        break;
      }
      analyzedKey = s;
    }

    if (analyzedKey.length() == 0) {
      // Currently the suggester can't suggest from the empty
      // string!  You get no results, not all results...
      continue;
    }
    if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
      analyzedKey += SEP;
    }

    if (VERBOSE) {
      System.out.println("  analyzed: " + analyzedKey);
    }

    // TODO: could be faster... but it's slowCompletor for a reason
    for (TermFreq2 e : slowCompletor) {
      if (e.analyzedForm.startsWith(analyzedKey)) {
        matches.add(e);
      }
    }

    assertTrue(numStopChars > 0 || matches.size() > 0);

    if (matches.size() > 1) {
      Collections.sort(matches, new Comparator<TermFreq2>() {
        @Override
        public int compare(TermFreq2 left, TermFreq2 right) {
          int cmp = Float.compare(right.weight, left.weight);
          if (cmp == 0) {
            return left.analyzedForm.compareTo(right.analyzedForm);
          } else {
            return cmp;
          }
        }
      });
    }

    if (matches.size() > topN) {
      matches = matches.subList(0, topN);
    }

    if (VERBOSE) {
      System.out.println("  expected:");
      for (TermFreq2 lr : matches) {
        System.out.println("    key=" + lr.surfaceForm + " weight=" + lr.weight);
      }

      System.out.println("  actual:");
      for (LookupResult lr : r) {
        System.out.println("    key=" + lr.key + " weight=" + lr.value);
      }
    }

    assertEquals(matches.size(), r.size());

    for (int hit = 0; hit < r.size(); hit++) {
      //System.out.println("  check hit " + hit);
      assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString());
      assertEquals(matches.get(hit).weight, r.get(hit).value, 0f);
      if (doPayloads) {
        assertEquals(matches.get(hit).payload, r.get(hit).payload);
      }
    }
  }

  IOUtils.close(a, tempDir);
}
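testRandom leans on two helpers that this excerpt omits: TermFreq2, the "slow but obviously correct" record the suggester is checked against, and isStopChar(...). A sketch of both, inferred from how the test uses them (the exact tie-break in compareTo is an assumption):

// Sketch of TermFreq2 (not shown in this excerpt): one surface form
// plus its analyzed form, weight, and optional payload:
private static class TermFreq2 implements Comparable<TermFreq2> {

  public final String surfaceForm;
  public final String analyzedForm;
  public final long weight;
  public final BytesRef payload;

  public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) {
    this.surfaceForm = surfaceForm;
    this.analyzedForm = analyzedForm;
    this.weight = weight;
    this.payload = payload;
  }

  // Natural order used only for the VERBOSE dump above; the real
  // ranking comparison is the explicit weight-based Comparator in
  // testRandom:
  @Override
  public int compareTo(TermFreq2 other) {
    int cmp = analyzedForm.compareTo(other.analyzedForm);
    if (cmp != 0) {
      return cmp;
    }
    return surfaceForm.compareTo(other.surfaceForm);
  }
}

// Sketch of isStopChar(...): the first numStopChars letters of the
// lowercase alphabet count as stop characters (randomSimpleString
// only produces 'a'..'z'):
private static boolean isStopChar(char ch, int numStopChars) {
  return (ch - 'a') < numStopChars;
}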
Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.
From the class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults3.
public void testDupSurfaceFormsMissingResults3() throws Exception {
  Analyzer a = new MockAnalyzer(random());
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, true);
  suggester.build(new InputArrayIterator(new Input[] {
      new Input("a a", 7),
      new Input("a a", 7),
      new Input("a c", 6),
      new Input("a c", 3),
      new Input("a b", 5) }));
  assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
  IOUtils.close(tempDir, a);
}
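Taken together, these tests all follow one pattern: InputArrayIterator adapts an in-memory Input[] (term, weight, optional payload) to the InputIterator interface that Lookup.build(...) consumes. A minimal standalone sketch of that pattern outside the test harness; RAMDirectory and StandardAnalyzer are illustrative choices here, not what the tests use:

import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SuggestExample {

  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    // The Directory is only scratch space the suggester uses while
    // sorting inputs and building its FST:
    Directory tempDir = new RAMDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);

    // InputArrayIterator wraps a fixed Input[] as an InputIterator:
    Input[] keys = new Input[] {
        new Input("wifi network is slow", 50),
        new Input("wi fi network is fast", 10) };
    suggester.build(new InputArrayIterator(keys));

    // Lookup returns suggestions ordered by descending weight:
    List<LookupResult> results = suggester.lookup("wifi", false, 10);
    for (LookupResult result : results) {
      System.out.println(result.key + " / " + result.value);
    }

    analyzer.close();
    tempDir.close();
  }
}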
Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.
From the class AnalyzingInfixSuggesterTest, method testRandomNRT.
public void testRandomNRT() throws Exception {
  final Path tempDir = createTempDir("AnalyzingInfixSuggesterTest");
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  int minPrefixChars = random().nextInt(7);
  if (VERBOSE) {
    System.out.println("  minPrefixChars=" + minPrefixChars);
  }

  AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);

  // Initial suggester built with nothing:
  suggester.build(new InputArrayIterator(new Input[0]));

  LookupThread lookupThread = new LookupThread(suggester);
  lookupThread.start();

  int iters = atLeast(1000);
  int visibleUpto = 0;

  Set<Long> usedWeights = new HashSet<>();
  Set<String> usedKeys = new HashSet<>();

  List<Input> inputs = new ArrayList<>();
  List<Update> pendingUpdates = new ArrayList<>();

  for (int iter = 0; iter < iters; iter++) {
    String text;
    while (true) {
      text = randomText();
      if (usedKeys.contains(text) == false) {
        usedKeys.add(text);
        break;
      }
    }

    // Carefully pick a weight we never used, to sidestep
    // tie-break problems:
    long weight;
    while (true) {
      weight = random().nextInt(10 * iters);
      if (usedWeights.contains(weight) == false) {
        usedWeights.add(weight);
        break;
      }
    }

    if (inputs.size() > 0 && random().nextInt(4) == 1) {
      // Update an existing suggestion
      Update update = new Update();
      update.index = random().nextInt(inputs.size());
      update.weight = weight;
      Input input = inputs.get(update.index);
      pendingUpdates.add(update);
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter + " update input=" + input.term.utf8ToString() + "/" + weight);
      }
      suggester.update(input.term, null, weight, input.term);
    } else {
      // Add a new suggestion
      inputs.add(new Input(text, weight, new BytesRef(text)));
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter + " add input=" + text + "/" + weight);
      }
      BytesRef br = new BytesRef(text);
      suggester.add(br, null, weight, br);
    }

    if (random().nextInt(15) == 7) {
      if (VERBOSE) {
        System.out.println("TEST: now refresh suggester");
      }
      suggester.refresh();
      visibleUpto = inputs.size();
      for (Update update : pendingUpdates) {
        Input oldInput = inputs.get(update.index);
        Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
        inputs.set(update.index, newInput);
      }
      pendingUpdates.clear();
    }

    if (random().nextInt(50) == 7) {
      if (VERBOSE) {
        System.out.println("TEST: now close/reopen suggester");
      }
      lookupThread.finish();
      suggester.close();
      suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);
      lookupThread = new LookupThread(suggester);
      lookupThread.start();

      visibleUpto = inputs.size();
      for (Update update : pendingUpdates) {
        Input oldInput = inputs.get(update.index);
        Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
        inputs.set(update.index, newInput);
      }
      pendingUpdates.clear();
    }

    if (visibleUpto > 0) {
      String query = randomText();
      boolean lastPrefix = random().nextInt(5) != 1;
      if (lastPrefix == false) {
        query += " ";
      }

      String[] queryTerms = query.split("\\s");
      boolean allTermsRequired = random().nextInt(10) == 7;
      boolean doHilite = random().nextBoolean();
      if (VERBOSE) {
        System.out.println("TEST: lookup \"" + query + "\" allTermsRequired=" + allTermsRequired + " doHilite=" + doHilite);
      }

      // Stupid slow but hopefully correct matching:
      List<Input> expected = new ArrayList<>();
      for (int i = 0; i < visibleUpto; i++) {
        Input input = inputs.get(i);
        String[] inputTerms = input.term.utf8ToString().split("\\s");
        boolean match = false;
        for (int j = 0; j < queryTerms.length; j++) {
          if (j < queryTerms.length - 1 || lastPrefix == false) {
            // Exact match
            for (int k = 0; k < inputTerms.length; k++) {
              if (inputTerms[k].equals(queryTerms[j])) {
                match = true;
                break;
              }
            }
          } else {
            // Prefix match
            for (int k = 0; k < inputTerms.length; k++) {
              if (inputTerms[k].startsWith(queryTerms[j])) {
                match = true;
                break;
              }
            }
          }
          if (match) {
            if (allTermsRequired == false) {
              // At least one query term does match:
              break;
            }
            match = false;
          } else if (allTermsRequired) {
            // At least one query term does not match:
            break;
          }
        }

        if (match) {
          if (doHilite) {
            expected.add(new Input(hilite(lastPrefix, inputTerms, queryTerms), input.v, input.term));
          } else {
            expected.add(input);
          }
        }
      }

      Collections.sort(expected, (a1, b) -> {
        if (a1.v > b.v) {
          return -1;
        } else if (a1.v < b.v) {
          return 1;
        } else {
          return 0;
        }
      });

      if (expected.isEmpty() == false) {
        int topN = TestUtil.nextInt(random(), 1, expected.size());
        List<LookupResult> actual = suggester.lookup(TestUtil.stringToCharSequence(query, random()), topN, allTermsRequired, doHilite);
        int expectedCount = Math.min(topN, expected.size());

        if (VERBOSE) {
          System.out.println("  expected:");
          for (int i = 0; i < expectedCount; i++) {
            Input x = expected.get(i);
            System.out.println("    " + x.term.utf8ToString() + "/" + x.v);
          }
          System.out.println("  actual:");
          for (LookupResult result : actual) {
            System.out.println("    " + result);
          }
        }

        assertEquals(expectedCount, actual.size());
        for (int i = 0; i < expectedCount; i++) {
          if (doHilite) {
            assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).highlightKey);
          } else {
            assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).key);
          }
          assertEquals(expected.get(i).v, actual.get(i).value);
          assertEquals(expected.get(i).payload, actual.get(i).payload);
        }
      } else {
        if (VERBOSE) {
          System.out.println("  no expected matches");
        }
      }
    }
  }

  lookupThread.finish();
  suggester.close();
  a.close();
}
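LookupThread and Update are private helpers of AnalyzingInfixSuggesterTest and are not shown above. A sketch based on how testRandomNRT uses them: Update records a pending weight change that becomes visible at the next refresh, and LookupThread issues concurrent lookups whose results are never checked; it exists only to surface thread hazards:

// Sketch of Update (not shown in this excerpt): a pending weight
// change for the input at the given index:
private static class Update {
  long weight;
  int index;
}

// Sketch of LookupThread: runs lookups in parallel with the main
// thread's add/update/refresh cycle.  randomText() is another test
// helper producing a short random query string:
private static class LookupThread extends Thread {

  private final AnalyzingInfixSuggester suggester;
  private volatile boolean stop;

  public LookupThread(AnalyzingInfixSuggester suggester) {
    this.suggester = suggester;
  }

  public void finish() throws InterruptedException {
    stop = true;
    this.join();
  }

  @Override
  public void run() {
    while (stop == false) {
      String query = randomText();
      int topN = TestUtil.nextInt(random(), 1, 100);
      boolean allTermsRequired = random().nextBoolean();
      boolean doHilite = random().nextBoolean();
      // We don't verify the results; we only do simultaneous
      // lookups while adding/updating to expose thread hazards:
      try {
        suggester.lookup(TestUtil.stringToCharSequence(query, random()), topN, allTermsRequired, doHilite);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
  }
}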