Search in sources :

Example 51 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class AnalyzingSuggesterTest method testDupSurfaceFormsMissingResults.

/**
 * Builds an {@link AnalyzingSuggester} over two inputs with an analyzer that ignores the
 * text it is given and always emits the same three canned tokens, then verifies that a
 * lookup returns both surface forms, highest weight first. The same checks are repeated
 * after the suggester has been stored to a file and loaded back.
 *
 * @throws Exception if building, storing, loading or looking up fails
 */
public void testDupSurfaceFormsMissingResults() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                @Override
                public TokenStream getTokenStream() {
                    // Always the same canned tokens, whatever text was supplied.
                    return new CannedTokenStream(new Token[] { token("hairy", 1, 1), token("smelly", 0, 1), token("dog", 1, 1) });
                }

                @Override
                protected void setReader(final Reader reader) {
                    // Deliberately a no-op: the canned stream never reads the input.
                }
            };
        }
    };
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, -1, true);
    suggester.build(new InputArrayIterator(shuffle(new Input("hambone", 6), new Input("nellie", 5))));
    List<LookupResult> results = suggester.lookup("nellie", false, 2);
    assertEquals(2, results.size());
    assertEquals("hambone", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("nellie", results.get(1).key);
    assertEquals(5, results.get(1).value);
    // Try again after save/load:
    Path tmpDir = createTempDir("AnalyzingSuggesterTest");
    Path path = tmpDir.resolve("suggester");
    // try-with-resources so the streams are released even if store/load throws.
    try (OutputStream os = Files.newOutputStream(path)) {
        suggester.store(os);
    }
    try (InputStream is = Files.newInputStream(path)) {
        suggester.load(is);
    }
    results = suggester.lookup("nellie", false, 2);
    assertEquals(2, results.size());
    assertEquals("hambone", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("nellie", results.get(1).key);
    assertEquals(5, results.get(1).value);
    IOUtils.close(a, tempDir);
}
Also used : Path(java.nio.file.Path) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) Reader(java.io.Reader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Directory(org.apache.lucene.store.Directory)

Example 52 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class AnalyzingSuggesterTest method testRandom.

/**
 * Randomized test that checks {@link AnalyzingSuggester} lookups against a brute-force model.
 *
 * <p>Random multi-token keys are generated together with a hand-computed "analyzed" form and
 * a random weight (plus a random payload when {@code doPayloads} is chosen). The suggester is
 * built from those keys; then, for every proper prefix of every key, the suggester's top-N
 * results are compared with the results obtained by scanning the model linearly.
 *
 * <p>The statement order matters: every {@code random()} call consumes from the test's random
 * source, so reordering statements changes which scenario a given seed produces.
 *
 * @throws Exception if building or querying the suggester fails
 */
public void testRandom() throws Exception {
    int numQueries = atLeast(1000);
    // Brute-force model: one entry per generated key, scanned linearly for each prefix below.
    final List<TermFreq2> slowCompletor = new ArrayList<>();
    // Every proper prefix of every key; sorted, so lookups happen in a deterministic order.
    final TreeSet<String> allPrefixes = new TreeSet<>();
    // Surface forms already generated, used to avoid duplicates.
    final Set<String> seen = new HashSet<>();
    boolean doPayloads = random().nextBoolean();
    // Exactly one of these two arrays is allocated, depending on doPayloads.
    Input[] keys = null;
    Input[] payloadKeys = null;
    if (doPayloads) {
        payloadKeys = new Input[numQueries];
    } else {
        keys = new Input[numQueries];
    }
    boolean preserveSep = random().nextBoolean();
    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }
    for (int i = 0; i < numQueries; i++) {
        int numTokens = TestUtil.nextInt(random(), 1, 4);
        String key;
        String analyzedKey;
        // Regenerate until we get a surface form that has not been used yet.
        while (true) {
            key = "";
            analyzedKey = "";
            // True when the most recent token was dropped as a stop char.
            boolean lastRemoved = false;
            for (int token = 0; token < numTokens; token++) {
                String s;
                // Retry until a non-empty random token is produced.
                while (true) {
                    // TODO: would be nice to fix this slowCompletor/comparator to
                    // use full range, but we might lose some coverage too...
                    s = TestUtil.randomSimpleString(random());
                    if (s.length() > 0) {
                        if (token > 0) {
                            key += " ";
                        }
                        // With separators preserved, put a SEP between analyzed tokens,
                        // but never directly after an existing SEP.
                        if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length() - 1) != SEP) {
                            analyzedKey += SEP;
                        }
                        key += s;
                        // A single-character token for which isStopChar (defined elsewhere)
                        // returns true is left out of the analyzed form; when both
                        // preserveSep and preserveHoles are set, a SEP is appended instead.
                        if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                            lastRemoved = true;
                            if (preserveSep && preserveHoles) {
                                analyzedKey += SEP;
                            }
                        } else {
                            lastRemoved = false;
                            analyzedKey += s;
                        }
                        break;
                    }
                }
            }
            // Removes a trailing pair of SEPs, or an analyzed key that is a single SEP
            // (assumes SEP has no special meaning inside a regex -- TODO confirm).
            analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
            if (preserveSep && lastRemoved) {
                analyzedKey += SEP;
            }
            // Don't add same surface form more than once:
            if (!seen.contains(key)) {
                seen.add(key);
                break;
            }
        }
        // Record every proper prefix (lengths 1 .. length-1; the full key is not included).
        for (int j = 1; j < key.length(); j++) {
            allPrefixes.add(key.substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = random().nextInt(1 << 24);
        BytesRef payload;
        if (doPayloads) {
            // Random payload of 0..9 bytes.
            byte[] bytes = new byte[random().nextInt(10)];
            random().nextBytes(bytes);
            payload = new BytesRef(bytes);
            payloadKeys[i] = new Input(key, weight, payload);
        } else {
            keys[i] = new Input(key, weight);
            payload = null;
        }
        slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));
    }
    if (VERBOSE) {
        // Don't just sort original list, to avoid VERBOSE
        // altering the test:
        List<TermFreq2> sorted = new ArrayList<>(slowCompletor);
        Collections.sort(sorted);
        for (TermFreq2 ent : sorted) {
            System.out.println("  surface='" + ent.surfaceForm + "' analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
        }
    }
    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true);
    if (doPayloads) {
        suggester.build(new InputArrayIterator(shuffle(payloadKeys)));
    } else {
        suggester.build(new InputArrayIterator(shuffle(keys)));
    }
    for (String prefix : allPrefixes) {
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        final int topN = TestUtil.nextInt(random(), 1, 10);
        // Actual results from the suggester under test.
        List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
        // Expected results: go through the whole model to find suggestions.
        List<TermFreq2> matches = new ArrayList<>();
        // "Analyze" the key:
        String[] tokens = prefix.split(" ");
        StringBuilder builder = new StringBuilder();
        boolean lastRemoved = false;
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            // Same separator rule as used when the keys were generated above.
            if (preserveSep && builder.length() > 0 && !builder.toString().endsWith("" + SEP)) {
                builder.append(SEP);
            }
            if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
                if (preserveSep && preserveHoles) {
                    builder.append(SEP);
                }
                lastRemoved = true;
            } else {
                builder.append(token);
                lastRemoved = false;
            }
        }
        String analyzedKey = builder.toString();
        // Strip trailing SEPs, repeating until nothing more is removed. (The original
        // comment is truncated here; the surviving fragment mentioned an open issue --
        // TODO recover the reference.)
        while (true) {
            String s = analyzedKey.replaceAll(SEP + "$", "");
            if (s.equals(analyzedKey)) {
                break;
            }
            analyzedKey = s;
        }
        if (analyzedKey.length() == 0) {
            // Skip prefixes whose analyzed form is empty. Per the original (truncated)
            // comment: for the empty string you get no results, not all results.
            continue;
        }
        if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
            analyzedKey += SEP;
        }
        if (VERBOSE) {
            System.out.println("  analyzed: " + analyzedKey);
        }
        // TODO: could be faster... but it's slowCompletor for a reason
        for (TermFreq2 e : slowCompletor) {
            if (e.analyzedForm.startsWith(analyzedKey)) {
                matches.add(e);
            }
        }
        // When there are no stop chars, every prefix must match at least one entry.
        assertTrue(numStopChars > 0 || matches.size() > 0);
        if (matches.size() > 1) {
            // Order expected hits by weight descending, ties broken by analyzed form ascending.
            Collections.sort(matches, new Comparator<TermFreq2>() {

                @Override
                public int compare(TermFreq2 left, TermFreq2 right) {
                    int cmp = Float.compare(right.weight, left.weight);
                    if (cmp == 0) {
                        return left.analyzedForm.compareTo(right.analyzedForm);
                    } else {
                        return cmp;
                    }
                }
            });
        }
        // Keep only the best topN expected hits.
        if (matches.size() > topN) {
            matches = matches.subList(0, topN);
        }
        if (VERBOSE) {
            System.out.println("  expected:");
            for (TermFreq2 lr : matches) {
                System.out.println("    key=" + lr.surfaceForm + " weight=" + lr.weight);
            }
            System.out.println("  actual:");
            for (LookupResult lr : r) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
        }
        // Expected and actual must agree on count, order, weight and (if used) payload.
        assertEquals(matches.size(), r.size());
        for (int hit = 0; hit < r.size(); hit++) {
            //System.out.println("  check hit " + hit);
            assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString());
            assertEquals(matches.get(hit).weight, r.get(hit).value, 0f);
            if (doPayloads) {
                assertEquals(matches.get(hit).payload, r.get(hit).payload);
            }
        }
    }
    IOUtils.close(a, tempDir);
}
Also used : ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Input(org.apache.lucene.search.suggest.Input) TreeSet(java.util.TreeSet) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) Directory(org.apache.lucene.store.Directory) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult)

Example 53 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class AnalyzingInfixSuggesterTest method testRandomNRT.

/**
 * Randomized test of incremental add/update/refresh on {@link AnalyzingInfixSuggester}.
 *
 * <p>Each iteration either adds a new suggestion or re-weights an existing one, occasionally
 * refreshes the suggester or closes and reopens it on the same directory, and then compares
 * a lookup against a brute-force match over the inputs that were present at the last
 * refresh/reopen. A {@code LookupThread} (defined elsewhere) is started on the suggester
 * for the duration of the test.
 *
 * <p>The statement order matters: every {@code random()} call consumes from the test's random
 * source, so reordering statements changes which scenario a given seed produces.
 *
 * @throws Exception if any suggester operation fails
 */
public void testRandomNRT() throws Exception {
    final Path tempDir = createTempDir("AnalyzingInfixSuggesterTest");
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    int minPrefixChars = random().nextInt(7);
    if (VERBOSE) {
        System.out.println("  minPrefixChars=" + minPrefixChars);
    }
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);
    // Initial suggester built with nothing:
    suggester.build(new InputArrayIterator(new Input[0]));
    LookupThread lookupThread = new LookupThread(suggester);
    lookupThread.start();
    int iters = atLeast(1000);
    // Number of leading entries of 'inputs' that existed at the last refresh/reopen;
    // only those are considered when computing expected results.
    int visibleUpto = 0;
    Set<Long> usedWeights = new HashSet<>();
    Set<String> usedKeys = new HashSet<>();
    // Model of what has been added to the suggester.
    List<Input> inputs = new ArrayList<>();
    // Weight updates sent to the suggester but not yet applied to the model;
    // they are applied at the next refresh or reopen.
    List<Update> pendingUpdates = new ArrayList<>();
    for (int iter = 0; iter < iters; iter++) {
        String text;
        // Pick a text that has never been used as a key before.
        while (true) {
            text = randomText();
            if (usedKeys.contains(text) == false) {
                usedKeys.add(text);
                break;
            }
        }
        // Carefully pick a weight we never used, to sidestep
        // tie-break problems:
        long weight;
        while (true) {
            weight = random().nextInt(10 * iters);
            if (usedWeights.contains(weight) == false) {
                usedWeights.add(weight);
                break;
            }
        }
        if (inputs.size() > 0 && random().nextInt(4) == 1) {
            // Update an existing suggestion
            Update update = new Update();
            update.index = random().nextInt(inputs.size());
            update.weight = weight;
            Input input = inputs.get(update.index);
            pendingUpdates.add(update);
            if (VERBOSE) {
                System.out.println("TEST: iter=" + iter + " update input=" + input.term.utf8ToString() + "/" + weight);
            }
            suggester.update(input.term, null, weight, input.term);
        } else {
            // Add a new suggestion
            inputs.add(new Input(text, weight, new BytesRef(text)));
            if (VERBOSE) {
                System.out.println("TEST: iter=" + iter + " add input=" + text + "/" + weight);
            }
            BytesRef br = new BytesRef(text);
            suggester.add(br, null, weight, br);
        }
        if (random().nextInt(15) == 7) {
            if (VERBOSE) {
                System.out.println("TEST: now refresh suggester");
            }
            suggester.refresh();
            visibleUpto = inputs.size();
            // Bring the model in line with the refreshed suggester by applying the
            // queued weight updates.
            for (Update update : pendingUpdates) {
                Input oldInput = inputs.get(update.index);
                Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
                inputs.set(update.index, newInput);
            }
            pendingUpdates.clear();
        }
        if (random().nextInt(50) == 7) {
            if (VERBOSE) {
                System.out.println("TEST: now close/reopen suggester");
            }
            // Stop the lookup thread before closing the suggester it is using, then
            // start a fresh thread on the reopened suggester.
            lookupThread.finish();
            suggester.close();
            suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);
            lookupThread = new LookupThread(suggester);
            lookupThread.start();
            visibleUpto = inputs.size();
            // Same model catch-up as after a refresh.
            for (Update update : pendingUpdates) {
                Input oldInput = inputs.get(update.index);
                Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
                inputs.set(update.index, newInput);
            }
            pendingUpdates.clear();
        }
        if (visibleUpto > 0) {
            String query = randomText();
            // When lastPrefix is false a trailing space is appended and the final query
            // term is matched exactly below instead of as a prefix.
            boolean lastPrefix = random().nextInt(5) != 1;
            if (lastPrefix == false) {
                query += " ";
            }
            String[] queryTerms = query.split("\\s");
            boolean allTermsRequired = random().nextInt(10) == 7;
            boolean doHilite = random().nextBoolean();
            if (VERBOSE) {
                System.out.println("TEST: lookup \"" + query + "\" allTermsRequired=" + allTermsRequired + " doHilite=" + doHilite);
            }
            // Stupid slow but hopefully correct matching:
            List<Input> expected = new ArrayList<>();
            for (int i = 0; i < visibleUpto; i++) {
                Input input = inputs.get(i);
                String[] inputTerms = input.term.utf8ToString().split("\\s");
                boolean match = false;
                for (int j = 0; j < queryTerms.length; j++) {
                    if (j < queryTerms.length - 1 || lastPrefix == false) {
                        // Exact match
                        for (int k = 0; k < inputTerms.length; k++) {
                            if (inputTerms[k].equals(queryTerms[j])) {
                                match = true;
                                break;
                            }
                        }
                    } else {
                        // Prefix match
                        for (int k = 0; k < inputTerms.length; k++) {
                            if (inputTerms[k].startsWith(queryTerms[j])) {
                                match = true;
                                break;
                            }
                        }
                    }
                    if (match) {
                        if (allTermsRequired == false) {
                            // At least one query term does match:
                            break;
                        }
                        // NOTE(review): this reset also runs after the LAST query term, so
                        // with allTermsRequired == true 'match' looks to be false whenever
                        // the loop ends, leaving 'expected' empty and the lookup below
                        // unchecked for that case -- confirm whether this is intended.
                        match = false;
                    } else if (allTermsRequired) {
                        // At least one query term does not match:
                        break;
                    }
                }
                if (match) {
                    if (doHilite) {
                        // Expected key is the highlighted text; the plain term is kept as payload.
                        expected.add(new Input(hilite(lastPrefix, inputTerms, queryTerms), input.v, input.term));
                    } else {
                        expected.add(input);
                    }
                }
            }
            // Order expected hits by weight, highest first.
            Collections.sort(expected, (a1, b) -> {
                if (a1.v > b.v) {
                    return -1;
                } else if (a1.v < b.v) {
                    return 1;
                } else {
                    return 0;
                }
            });
            if (expected.isEmpty() == false) {
                int topN = TestUtil.nextInt(random(), 1, expected.size());
                List<LookupResult> actual = suggester.lookup(TestUtil.stringToCharSequence(query, random()), topN, allTermsRequired, doHilite);
                int expectedCount = Math.min(topN, expected.size());
                if (VERBOSE) {
                    System.out.println("  expected:");
                    for (int i = 0; i < expectedCount; i++) {
                        Input x = expected.get(i);
                        System.out.println("    " + x.term.utf8ToString() + "/" + x.v);
                    }
                    System.out.println("  actual:");
                    for (LookupResult result : actual) {
                        System.out.println("    " + result);
                    }
                }
                assertEquals(expectedCount, actual.size());
                for (int i = 0; i < expectedCount; i++) {
                    // With highlighting the expected term holds the highlighted text, so it
                    // is compared against highlightKey rather than key.
                    if (doHilite) {
                        assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).highlightKey);
                    } else {
                        assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).key);
                    }
                    assertEquals(expected.get(i).v, actual.get(i).value);
                    assertEquals(expected.get(i).payload, actual.get(i).payload);
                }
            } else {
                if (VERBOSE) {
                    System.out.println("  no expected matches");
                }
            }
        }
    }
    lookupThread.finish();
    suggester.close();
    a.close();
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 54 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class AnalyzingInfixSuggesterTest method testRandomMinPrefixLength.

/**
 * Runs a fixed set of infix lookups against an {@link AnalyzingInfixSuggester} created with
 * a randomly chosen minimum prefix length, with and without highlighting, and repeats the
 * whole set after the suggester has been closed and reopened on the same directory.
 *
 * @throws Exception if building or querying the suggester fails
 */
public void testRandomMinPrefixLength() throws Exception {
    Input lendInput = new Input("lend me your ear", 8, new BytesRef("foobar"));
    Input pennyInput = new Input("a penny saved is a penny earned", 10, new BytesRef("foobaz"));
    Input[] keys = new Input[] { lendInput, pennyInput };
    Path tempDir = createTempDir("AnalyzingInfixSuggesterTest");
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    int minPrefixLength = random().nextInt(10);
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixLength, false);
    suggester.build(new InputArrayIterator(keys));
    for (int round = 0; round < 2; round++) {
        // First pass checks highlighted output, second pass plain output.
        for (boolean doHighlight : new boolean[] { true, false }) {
            // "ear" as a prefix matches both entries, higher weight first.
            CharSequence earQuery = TestUtil.stringToCharSequence("ear", random());
            List<LookupResult> results = suggester.lookup(earQuery, 10, true, doHighlight);
            assertEquals(2, results.size());
            LookupResult first = results.get(0);
            LookupResult second = results.get(1);
            assertEquals("a penny saved is a penny earned", first.key);
            if (doHighlight) {
                assertEquals("a penny saved is a penny <b>ear</b>ned", first.highlightKey);
            }
            assertEquals(10, first.value);
            assertEquals("lend me your ear", second.key);
            if (doHighlight) {
                assertEquals("lend me your <b>ear</b>", second.highlightKey);
            }
            assertEquals(new BytesRef("foobaz"), first.payload);
            assertEquals(8, second.value);
            assertEquals(new BytesRef("foobar"), second.payload);

            // "ear " (trailing space) only matches the entry containing the whole word.
            CharSequence earWordQuery = TestUtil.stringToCharSequence("ear ", random());
            results = suggester.lookup(earWordQuery, 10, true, doHighlight);
            assertEquals(1, results.size());
            LookupResult only = results.get(0);
            assertEquals("lend me your ear", only.key);
            if (doHighlight) {
                assertEquals("lend me your <b>ear</b>", only.highlightKey);
            }
            assertEquals(8, only.value);
            assertEquals(new BytesRef("foobar"), only.payload);

            // "pen" matches both occurrences of "penny" in a single entry.
            CharSequence penQuery = TestUtil.stringToCharSequence("pen", random());
            results = suggester.lookup(penQuery, 10, true, doHighlight);
            assertEquals(1, results.size());
            only = results.get(0);
            assertEquals("a penny saved is a penny earned", only.key);
            if (doHighlight) {
                assertEquals("a <b>pen</b>ny saved is a <b>pen</b>ny earned", only.highlightKey);
            }
            assertEquals(10, only.value);
            assertEquals(new BytesRef("foobaz"), only.payload);

            // Single-character prefix "p".
            CharSequence pQuery = TestUtil.stringToCharSequence("p", random());
            results = suggester.lookup(pQuery, 10, true, doHighlight);
            assertEquals(1, results.size());
            only = results.get(0);
            assertEquals("a penny saved is a penny earned", only.key);
            if (doHighlight) {
                assertEquals("a <b>p</b>enny saved is a <b>p</b>enny earned", only.highlightKey);
            }
            assertEquals(10, only.value);
            assertEquals(new BytesRef("foobaz"), only.payload);
        }
        // Make sure things still work after close and reopen:
        suggester.close();
        suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixLength, false);
    }
    suggester.close();
    a.close();
}
Also used : Path(java.nio.file.Path) Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef)

Example 55 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class AnalyzingInfixSuggesterTest method testBothExactAndPrefix.

/**
 * Adds a single suggestion to an initially empty {@link AnalyzingInfixSuggester} and checks
 * that the two-term query "pen p" finds it, with both the first term and the trailing
 * prefix term highlighted.
 *
 * @throws Exception if building or querying the suggester fails
 */
public void testBothExactAndPrefix() throws Exception {
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), a, a, 3, false);
    // Start from an empty index, then add the one entry and make it visible.
    Input[] noInputs = new Input[0];
    suggester.build(new InputArrayIterator(noInputs));
    BytesRef text = new BytesRef("the pen is pretty");
    BytesRef payload = new BytesRef("foobaz");
    suggester.add(text, null, 10, payload);
    suggester.refresh();

    CharSequence query = TestUtil.stringToCharSequence("pen p", random());
    List<LookupResult> results = suggester.lookup(query, 10, true, true);
    assertEquals(1, results.size());
    LookupResult hit = results.get(0);
    assertEquals("the pen is pretty", hit.key);
    assertEquals("the <b>pen</b> is <b>p</b>retty", hit.highlightKey);
    assertEquals(10, hit.value);
    assertEquals(new BytesRef("foobaz"), hit.payload);

    suggester.close();
    a.close();
}
Also used : Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult)62 Input (org.apache.lucene.search.suggest.Input)48 InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator)48 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)45 Analyzer (org.apache.lucene.analysis.Analyzer)43 Directory (org.apache.lucene.store.Directory)36 BytesRef (org.apache.lucene.util.BytesRef)21 ArrayList (java.util.ArrayList)12 Path (java.nio.file.Path)11 HashSet (java.util.HashSet)8 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)7 Reader (java.io.Reader)6 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)6 Tokenizer (org.apache.lucene.analysis.Tokenizer)6 Token (org.apache.lucene.analysis.Token)5 TokenStream (org.apache.lucene.analysis.TokenStream)5 HashMap (java.util.HashMap)4 CharArraySet (org.apache.lucene.analysis.CharArraySet)4 InputStream (java.io.InputStream)3 OutputStream (java.io.OutputStream)3