Search in sources :

Example 36 with InputArrayIterator

use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

In class AnalyzingSuggesterTest, method testTooLongSuggestion:

// TODO: we need BaseSuggesterTestCase?
/**
 * Checks that building an {@link AnalyzingSuggester} from a single 30000-character
 * suggestion does not complete normally.
 *
 * <p>The build is accepted as "rejected" when it throws either an
 * {@link IllegalArgumentException} or a {@link StackOverflowError}; if it returns
 * normally, {@code fail(...)} is called.
 *
 * @throws Exception if building the suggester or closing the resources fails unexpectedly
 */
public void testTooLongSuggestion() throws Exception {
    Analyzer a = new MockAnalyzer(random());
    Directory tempDir = getDirectory();
    try {
        AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a);
        // Fixed-length random key: min and max length are both 30000.
        String bigString = TestUtil.randomSimpleString(random(), 30000, 30000);
        try {
            suggester.build(new InputArrayIterator(new Input[] { new Input(bigString, 7) }));
            fail("did not hit expected exception");
        } catch (StackOverflowError soe) {
            // OK
        } catch (IllegalArgumentException iae) {
            // expected
        }
    } finally {
        // Close in a finally block so the analyzer and directory are released even
        // when fail(...) or an unexpected exception propagates out of the test.
        IOUtils.close(a, tempDir);
    }
}
Also used : Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Directory(org.apache.lucene.store.Directory)

Example 37 with InputArrayIterator

use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

In class AnalyzingInfixSuggesterTest, method testEmptyAtStart:

/**
 * Builds an {@link AnalyzingInfixSuggester} from an empty input, adds two entries
 * afterwards, refreshes, and verifies key, highlighted key, weight and payload of
 * the lookup results for the queries {@code "ear"}, {@code "ear "}, {@code "pen"}
 * and {@code "p"}.
 *
 * @throws Exception if the suggester fails while building, adding, refreshing or looking up
 */
public void testEmptyAtStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), analyzer, analyzer, 3, false);

    // Build from zero inputs first; the two entries below are added afterwards.
    suggester.build(new InputArrayIterator(new Input[0]));
    suggester.add(new BytesRef("a penny saved is a penny earned"), null, 10, new BytesRef("foobaz"));
    suggester.add(new BytesRef("lend me your ear"), null, 8, new BytesRef("foobar"));
    suggester.refresh();

    // "ear" is expected to hit both entries, the weight-10 one first.
    List<LookupResult> hits = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), 10, true, true);
    assertEquals(2, hits.size());
    LookupResult first = hits.get(0);
    assertEquals("a penny saved is a penny earned", first.key);
    assertEquals("a penny saved is a penny <b>ear</b>ned", first.highlightKey);
    assertEquals(10, first.value);
    assertEquals(new BytesRef("foobaz"), first.payload);
    LookupResult second = hits.get(1);
    assertEquals("lend me your ear", second.key);
    assertEquals("lend me your <b>ear</b>", second.highlightKey);
    assertEquals(8, second.value);
    assertEquals(new BytesRef("foobar"), second.payload);

    // "ear " (with trailing space) is expected to hit only the entry ending in "ear".
    hits = suggester.lookup(TestUtil.stringToCharSequence("ear ", random()), 10, true, true);
    assertEquals(1, hits.size());
    LookupResult only = hits.get(0);
    assertEquals("lend me your ear", only.key);
    assertEquals("lend me your <b>ear</b>", only.highlightKey);
    assertEquals(8, only.value);
    assertEquals(new BytesRef("foobar"), only.payload);

    // "pen" is expected to be highlighted at both occurrences in the penny entry.
    hits = suggester.lookup(TestUtil.stringToCharSequence("pen", random()), 10, true, true);
    assertEquals(1, hits.size());
    only = hits.get(0);
    assertEquals("a penny saved is a penny earned", only.key);
    assertEquals("a <b>pen</b>ny saved is a <b>pen</b>ny earned", only.highlightKey);
    assertEquals(10, only.value);
    assertEquals(new BytesRef("foobaz"), only.payload);

    // Single-character prefix "p".
    hits = suggester.lookup(TestUtil.stringToCharSequence("p", random()), 10, true, true);
    assertEquals(1, hits.size());
    only = hits.get(0);
    assertEquals("a penny saved is a penny earned", only.key);
    assertEquals("a <b>p</b>enny saved is a <b>p</b>enny earned", only.highlightKey);
    assertEquals(10, only.value);
    assertEquals(new BytesRef("foobaz"), only.payload);

    suggester.close();
    analyzer.close();
}
Also used : Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef)

Example 38 with InputArrayIterator

use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

In class FuzzySuggesterTest, method testEmpty:

/**
 * Builds a {@link FuzzySuggester} from zero inputs and verifies that looking up
 * {@code "a"} returns an empty result list.
 *
 * @throws Exception if building, looking up or closing the resources fails
 */
public void testEmpty() throws Exception {
    Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    Directory dir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(dir, "fuzzy", keywordAnalyzer);

    // No suggestions at all are fed into the build.
    InputArrayIterator noInputs = new InputArrayIterator(new Input[0]);
    suggester.build(noInputs);

    List<LookupResult> hits = suggester.lookup("a", false, 20);
    assertTrue(hits.isEmpty());

    IOUtils.close(keywordAnalyzer, dir);
}
Also used : Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Directory(org.apache.lucene.store.Directory)

Example 39 with InputArrayIterator

use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

In class FuzzySuggesterTest, method testRandom:

public void testRandom() throws Exception {
    int numQueries = atLeast(100);
    final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
    final TreeSet<String> allPrefixes = new TreeSet<>();
    final Set<String> seen = new HashSet<>();
    Input[] keys = new Input[numQueries];
    boolean preserveSep = random().nextBoolean();
    boolean unicodeAware = random().nextBoolean();
    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }
    for (int i = 0; i < numQueries; i++) {
        int numTokens = TestUtil.nextInt(random(), 1, 4);
        String key;
        String analyzedKey;
        while (true) {
            key = "";
            analyzedKey = "";
            boolean lastRemoved = false;
            for (int token = 0; token < numTokens; token++) {
                String s;
                while (true) {
                    // TODO: would be nice to fix this slowCompletor/comparator to
                    // use full range, but we might lose some coverage too...
                    s = TestUtil.randomSimpleString(random());
                    if (s.length() > 0) {
                        if (token > 0) {
                            key += " ";
                        }
                        if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()) - 1) != ' ' : analyzedKey.charAt(analyzedKey.length() - 1) != ' ')) {
                            analyzedKey += " ";
                        }
                        key += s;
                        if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                            if (preserveSep && preserveHoles) {
                                analyzedKey += '';
                            }
                            lastRemoved = true;
                        } else {
                            analyzedKey += s;
                            lastRemoved = false;
                        }
                        break;
                    }
                }
            }
            analyzedKey = analyzedKey.replaceAll("(^| )$", "");
            if (preserveSep && lastRemoved) {
                analyzedKey += " ";
            }
            // Don't add same surface form more than once:
            if (!seen.contains(key)) {
                seen.add(key);
                break;
            }
        }
        for (int j = 1; j < key.length(); j++) {
            allPrefixes.add(key.substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = random().nextInt(1 << 24);
        keys[i] = new Input(key, weight);
        slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
    }
    if (VERBOSE) {
        // Don't just sort original list, to avoid VERBOSE
        // altering the test:
        List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
        Collections.sort(sorted);
        for (TermFreqPayload2 ent : sorted) {
            System.out.println("  surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
        }
    }
    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
    suggester.build(new InputArrayIterator(keys));
    for (String prefix : allPrefixes) {
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        final int topN = TestUtil.nextInt(random(), 1, 10);
        List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
        // 2. go thru whole set to find suggestions:
        List<LookupResult> matches = new ArrayList<>();
        // "Analyze" the key:
        String[] tokens = prefix.split(" ");
        StringBuilder builder = new StringBuilder();
        boolean lastRemoved = false;
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
                builder.append(' ');
            }
            if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
                if (preserveSep && preserveHoles) {
                    builder.append("");
                }
                lastRemoved = true;
            } else {
                builder.append(token);
                lastRemoved = false;
            }
        }
        String analyzedKey = builder.toString();
        // issue open for this):
        while (true) {
            String s = analyzedKey.replaceAll("(^| )$", "");
            s = s.replaceAll("\\s+$", "");
            if (s.equals(analyzedKey)) {
                break;
            }
            analyzedKey = s;
        }
        if (analyzedKey.length() == 0) {
            // string!  You get no results, not all results...
            continue;
        }
        if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
            analyzedKey += " ";
        }
        if (VERBOSE) {
            System.out.println("  analyzed: " + analyzedKey);
        }
        TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
        // NOTE: not great that we ask the suggester to give
        // us the "answer key" (ie maybe we have a bug in
        // suggester.toLevA ...) ... but testRandom2() fixes
        // this:
        Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
        assertTrue(automaton.isDeterministic());
        // TODO: could be faster... but it's slowCompletor for a reason
        BytesRefBuilder spare = new BytesRefBuilder();
        for (TermFreqPayload2 e : slowCompletor) {
            spare.copyChars(e.analyzedForm);
            FiniteStringsIterator finiteStrings = new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
            for (IntsRef string; (string = finiteStrings.next()) != null; ) {
                int p = 0;
                BytesRef ref = Util.toBytesRef(string, spare);
                boolean added = false;
                for (int i = ref.offset; i < ref.length; i++) {
                    int q = automaton.step(p, ref.bytes[i] & 0xff);
                    if (q == -1) {
                        break;
                    } else if (automaton.isAccept(q)) {
                        matches.add(new LookupResult(e.surfaceForm, e.weight));
                        added = true;
                        break;
                    }
                    p = q;
                }
                if (!added && automaton.isAccept(p)) {
                    matches.add(new LookupResult(e.surfaceForm, e.weight));
                }
            }
        }
        assertTrue(numStopChars > 0 || matches.size() > 0);
        if (matches.size() > 1) {
            Collections.sort(matches, new Comparator<LookupResult>() {

                @Override
                public int compare(LookupResult left, LookupResult right) {
                    int cmp = Float.compare(right.value, left.value);
                    if (cmp == 0) {
                        return left.compareTo(right);
                    } else {
                        return cmp;
                    }
                }
            });
        }
        if (matches.size() > topN) {
            matches = matches.subList(0, topN);
        }
        if (VERBOSE) {
            System.out.println("  expected:");
            for (LookupResult lr : matches) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
            System.out.println("  actual:");
            for (LookupResult lr : r) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
        }
        assertEquals(prefix + "  " + topN, matches.size(), r.size());
        for (int hit = 0; hit < r.size(); hit++) {
            //System.out.println("  check hit " + hit);
            assertEquals(prefix + "  " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
            assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
        }
    }
    IOUtils.close(a, tempDir);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Input(org.apache.lucene.search.suggest.Input) TreeSet(java.util.TreeSet) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) Directory(org.apache.lucene.store.Directory) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 40 with InputArrayIterator

use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

In class FuzzySuggesterTest, method testRandomEdits:

/**
 * Indexes at least 100 random keys starting with {@code "boo"} plus the fixed key
 * {@code "foo bar boo far"} (weight 12), then repeatedly looks up a randomly edited
 * variant of {@code "foo bar boo"} and verifies that the fixed key is the only result.
 *
 * @throws IOException if building the suggester, a lookup, or closing resources fails
 */
public void testRandomEdits() throws IOException {
    List<Input> inputs = new ArrayList<>();
    int termCount = atLeast(100);
    for (int n = 0; n < termCount; n++) {
        // Key first, then weight, so random() is consumed in the same order as before.
        String term = "boo" + TestUtil.randomSimpleString(random());
        int weight = 1 + random().nextInt(100);
        inputs.add(new Input(term, weight));
    }
    inputs.add(new Input("foo bar boo far", 12));

    MockAnalyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    Directory dir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(dir, "fuzzy", keywordAnalyzer, keywordAnalyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
    suggester.build(new InputArrayIterator(inputs));

    int rounds = atLeast(10);
    for (int round = 0; round < rounds; round++) {
        String edited = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);
        CharSequence query = TestUtil.stringToCharSequence(edited, random());
        List<LookupResult> hits = suggester.lookup(query, false, 2);
        // The edited query string doubles as the failure message.
        assertEquals(edited, 1, hits.size());
        LookupResult hit = hits.get(0);
        assertEquals("foo bar boo far", hit.key.toString());
        assertEquals(12, hit.value, 0.01F);
    }
    IOUtils.close(keywordAnalyzer, dir);
}
Also used : Input(org.apache.lucene.search.suggest.Input) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) ArrayList(java.util.ArrayList) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Directory(org.apache.lucene.store.Directory)

Aggregations

InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator)76 Input (org.apache.lucene.search.suggest.Input)71 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)67 Analyzer (org.apache.lucene.analysis.Analyzer)65 LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult)48 Directory (org.apache.lucene.store.Directory)43 BytesRef (org.apache.lucene.util.BytesRef)26 Path (java.nio.file.Path)17 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)11 Tokenizer (org.apache.lucene.analysis.Tokenizer)10 Reader (java.io.Reader)8 ArrayList (java.util.ArrayList)8 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)8 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)7 TokenStream (org.apache.lucene.analysis.TokenStream)6 HashSet (java.util.HashSet)5 CharArraySet (org.apache.lucene.analysis.CharArraySet)5 Token (org.apache.lucene.analysis.Token)5 InputStream (java.io.InputStream)4 OutputStream (java.io.OutputStream)4