Search in sources:

Example 61 with InputArrayIterator

Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

From class AnalyzingSuggesterTest, method testGraphDups.

public void testGraphDups() throws Exception {
    final Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                int tokenStreamCounter = 0;

                final TokenStream[] tokenStreams = new TokenStream[] {
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1),
                        token("is", 1, 1), token("slow", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("wi", 1, 1), token("hotspot", 0, 3), token("fi", 1, 1),
                        token("network", 1, 1), token("is", 1, 1), token("fast", 1, 1) }),
                    new CannedTokenStream(new Token[] {
                        token("wifi", 1, 1), token("hotspot", 0, 2), token("network", 1, 1) })
                };
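                // getTokenStream() below hands these canned streams out in order:
                // the first two are meant for the two inputs analyzed during build(),
                // the third for the lookup of "wifi network".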

                @Override
                public TokenStream getTokenStream() {
                    TokenStream result = tokenStreams[tokenStreamCounter];
                    tokenStreamCounter++;
                    return result;
                }

                @Override
                protected void setReader(final Reader reader) {
                }
            };
        }
    };
    Input[] keys = new Input[] { new Input("wifi network is slow", 50), new Input("wi fi network is fast", 10) };
    //AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, AnalyzingSuggester.EXACT_FIRST, 256, -1);
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup("wifi network", false, 10);
    if (VERBOSE) {
        System.out.println("Results: " + results);
    }
    assertEquals(2, results.size());
    assertEquals("wifi network is slow", results.get(0).key);
    assertEquals(50, results.get(0).value);
    assertEquals("wi fi network is fast", results.get(1).key);
    assertEquals(10, results.get(1).value);
    IOUtils.close(analyzer, tempDir);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), CannedBinaryTokenStream (org.apache.lucene.analysis.CannedBinaryTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Reader (java.io.Reader), Token (org.apache.lucene.analysis.Token), BinaryToken (org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), Tokenizer (org.apache.lucene.analysis.Tokenizer), Directory (org.apache.lucene.store.Directory)
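
The token(term, posInc, posLength) calls above create tokens whose position increment and position length encode a small token graph in which "hotspot" is stacked over several underlying tokens. The helper itself is defined elsewhere in AnalyzingSuggesterTest; a minimal sketch of such a helper, with the exact body assumed, is:

private static Token token(String term, int posInc, int posLength) {
    // Token carries the term text plus the position attributes the graph relies on.
    final Token t = new Token(term, 0, term.length());
    t.setPositionIncrement(posInc); // 0 = stacked on the same position as the previous token
    t.setPositionLength(posLength); // number of positions the token spans
    return t;
}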

Example 62 with InputArrayIterator

Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

From class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults.

public void testDupSurfaceFormsMissingResults() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                @Override
                public TokenStream getTokenStream() {
                    return new CannedTokenStream(new Token[] { token("hairy", 1, 1), token("smelly", 0, 1), token("dog", 1, 1) });
                }

                @Override
                protected void setReader(final Reader reader) {
                }
            };
        }
    };
    Directory tempDir = getDirectory();
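    // Constructor arguments (as read from the call, meanings assumed from
    // AnalyzingSuggester's long-form constructor): indexAnalyzer=a, queryAnalyzer=a,
    // options=0 (neither EXACT_FIRST nor PRESERVE_SEP), maxSurfaceFormsPerAnalyzedForm=256,
    // maxGraphExpansions=-1 (unlimited), preservePositionIncrements=true.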
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, -1, true);
    suggester.build(new InputArrayIterator(shuffle(new Input("hambone", 6), new Input("nellie", 5))));
    List<LookupResult> results = suggester.lookup("nellie", false, 2);
    assertEquals(2, results.size());
    assertEquals("hambone", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("nellie", results.get(1).key);
    assertEquals(5, results.get(1).value);
    // Try again after save/load:
    Path tmpDir = createTempDir("AnalyzingSuggesterTest");
    Path path = tmpDir.resolve("suggester");
    OutputStream os = Files.newOutputStream(path);
    suggester.store(os);
    os.close();
    InputStream is = Files.newInputStream(path);
    suggester.load(is);
    is.close();
    results = suggester.lookup("nellie", false, 2);
    assertEquals(2, results.size());
    assertEquals("hambone", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("nellie", results.get(1).key);
    assertEquals(5, results.get(1).value);
    IOUtils.close(a, tempDir);
}
Also used: Path (java.nio.file.Path), InputStream (java.io.InputStream), OutputStream (java.io.OutputStream), Reader (java.io.Reader), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), Tokenizer (org.apache.lucene.analysis.Tokenizer), Directory (org.apache.lucene.store.Directory)
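
getDirectory() is a helper on the test class, not a Lucene API. A plausible minimal sketch (the real body may differ) simply delegates to LuceneTestCase's randomized Directory factory:

private Directory getDirectory() {
    // newDirectory() returns a randomized test Directory that LuceneTestCase
    // checks for unclosed resources at the end of the test.
    return newDirectory();
}

The store/load round trip in the middle of the test then asserts that the same results come back after the suggester is persisted with store(OutputStream) and read back with load(InputStream).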

Example 63 with InputArrayIterator

Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

From class AnalyzingSuggesterTest, method testRandom.

public void testRandom() throws Exception {
    int numQueries = atLeast(1000);
    final List<TermFreq2> slowCompletor = new ArrayList<>();
    final TreeSet<String> allPrefixes = new TreeSet<>();
    final Set<String> seen = new HashSet<>();
    boolean doPayloads = random().nextBoolean();
    Input[] keys = null;
    Input[] payloadKeys = null;
    if (doPayloads) {
        payloadKeys = new Input[numQueries];
    } else {
        keys = new Input[numQueries];
    }
    boolean preserveSep = random().nextBoolean();
    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }
    for (int i = 0; i < numQueries; i++) {
        int numTokens = TestUtil.nextInt(random(), 1, 4);
        String key;
        String analyzedKey;
        while (true) {
            key = "";
            analyzedKey = "";
            boolean lastRemoved = false;
            for (int token = 0; token < numTokens; token++) {
                String s;
                while (true) {
                    // TODO: would be nice to fix this slowCompletor/comparator to
                    // use full range, but we might lose some coverage too...
                    s = TestUtil.randomSimpleString(random());
                    if (s.length() > 0) {
                        if (token > 0) {
                            key += " ";
                        }
                        if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length() - 1) != SEP) {
                            analyzedKey += SEP;
                        }
                        key += s;
                        if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                            lastRemoved = true;
                            if (preserveSep && preserveHoles) {
                                analyzedKey += SEP;
                            }
                        } else {
                            lastRemoved = false;
                            analyzedKey += s;
                        }
                        break;
                    }
                }
            }
            analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
            if (preserveSep && lastRemoved) {
                analyzedKey += SEP;
            }
            // Don't add same surface form more than once:
            if (!seen.contains(key)) {
                seen.add(key);
                break;
            }
        }
        for (int j = 1; j < key.length(); j++) {
            allPrefixes.add(key.substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = random().nextInt(1 << 24);
        BytesRef payload;
        if (doPayloads) {
            byte[] bytes = new byte[random().nextInt(10)];
            random().nextBytes(bytes);
            payload = new BytesRef(bytes);
            payloadKeys[i] = new Input(key, weight, payload);
        } else {
            keys[i] = new Input(key, weight);
            payload = null;
        }
        slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload));
    }
    if (VERBOSE) {
        // Don't just sort original list, to avoid VERBOSE
        // altering the test:
        List<TermFreq2> sorted = new ArrayList<>(slowCompletor);
        Collections.sort(sorted);
        for (TermFreq2 ent : sorted) {
            System.out.println("  surface='" + ent.surfaceForm + "' analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
        }
    }
    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true);
    if (doPayloads) {
        suggester.build(new InputArrayIterator(shuffle(payloadKeys)));
    } else {
        suggester.build(new InputArrayIterator(shuffle(keys)));
    }
    for (String prefix : allPrefixes) {
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        final int topN = TestUtil.nextInt(random(), 1, 10);
        List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
        // 2. go thru whole set to find suggestions:
        List<TermFreq2> matches = new ArrayList<>();
        // "Analyze" the key:
        String[] tokens = prefix.split(" ");
        StringBuilder builder = new StringBuilder();
        boolean lastRemoved = false;
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (preserveSep && builder.length() > 0 && !builder.toString().endsWith("" + SEP)) {
                builder.append(SEP);
            }
            if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
                if (preserveSep && preserveHoles) {
                    builder.append(SEP);
                }
                lastRemoved = true;
            } else {
                builder.append(token);
                lastRemoved = false;
            }
        }
        String analyzedKey = builder.toString();
        // Strip trailing separators from the analyzed key (there is an issue open for this):
        while (true) {
            String s = analyzedKey.replaceAll(SEP + "$", "");
            if (s.equals(analyzedKey)) {
                break;
            }
            analyzedKey = s;
        }
        if (analyzedKey.length() == 0) {
            // Skip the empty analyzed string!  You get no results, not all results...
            continue;
        }
        if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
            analyzedKey += SEP;
        }
        if (VERBOSE) {
            System.out.println("  analyzed: " + analyzedKey);
        }
        // TODO: could be faster... but it's slowCompletor for a reason
        for (TermFreq2 e : slowCompletor) {
            if (e.analyzedForm.startsWith(analyzedKey)) {
                matches.add(e);
            }
        }
        assertTrue(numStopChars > 0 || matches.size() > 0);
        if (matches.size() > 1) {
            Collections.sort(matches, new Comparator<TermFreq2>() {

                @Override
                public int compare(TermFreq2 left, TermFreq2 right) {
                    int cmp = Float.compare(right.weight, left.weight);
                    if (cmp == 0) {
                        return left.analyzedForm.compareTo(right.analyzedForm);
                    } else {
                        return cmp;
                    }
                }
            });
        }
        if (matches.size() > topN) {
            matches = matches.subList(0, topN);
        }
        if (VERBOSE) {
            System.out.println("  expected:");
            for (TermFreq2 lr : matches) {
                System.out.println("    key=" + lr.surfaceForm + " weight=" + lr.weight);
            }
            System.out.println("  actual:");
            for (LookupResult lr : r) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
        }
        assertEquals(matches.size(), r.size());
        for (int hit = 0; hit < r.size(); hit++) {
            //System.out.println("  check hit " + hit);
            assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString());
            assertEquals(matches.get(hit).weight, r.get(hit).value, 0f);
            if (doPayloads) {
                assertEquals(matches.get(hit).payload, r.get(hit).payload);
            }
        }
    }
    IOUtils.close(a, tempDir);
}
Also used: ArrayList (java.util.ArrayList), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Input (org.apache.lucene.search.suggest.Input), TreeSet (java.util.TreeSet), BytesRef (org.apache.lucene.util.BytesRef), HashSet (java.util.HashSet), Directory (org.apache.lucene.store.Directory), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult)
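
TermFreq2 is a holder class local to AnalyzingSuggesterTest that pairs each surface form with its analyzed form, weight, and optional payload; it implements Comparable so the VERBOSE dump above can sort it. A hedged sketch, with the field layout and natural ordering assumed:

private static class TermFreq2 implements Comparable<TermFreq2> {
    final String surfaceForm;  // what gets suggested back to the user
    final String analyzedForm; // the key after analysis, used for prefix matching
    final long weight;
    final BytesRef payload;    // null when the test runs without payloads

    TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) {
        this.surfaceForm = surfaceForm;
        this.analyzedForm = analyzedForm;
        this.weight = weight;
        this.payload = payload;
    }

    @Override
    public int compareTo(TermFreq2 other) {
        // Assumed ordering: by analyzed form, then surface form (the hit comparison
        // in the test sorts by weight separately, with its own Comparator).
        int cmp = analyzedForm.compareTo(other.analyzedForm);
        return cmp != 0 ? cmp : surfaceForm.compareTo(other.surfaceForm);
    }
}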

Example 64 with InputArrayIterator

Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

From class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults3.

public void testDupSurfaceFormsMissingResults3() throws Exception {
    Analyzer a = new MockAnalyzer(random());
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, true);
    suggester.build(new InputArrayIterator(new Input[] {
        new Input("a a", 7), new Input("a a", 7), new Input("a c", 6),
        new Input("a c", 3), new Input("a b", 5) }));
    assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
    IOUtils.close(tempDir, a);
}
Also used: Input (org.apache.lucene.search.suggest.Input), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), Analyzer (org.apache.lucene.analysis.Analyzer), Directory (org.apache.lucene.store.Directory)
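
Outside the test harness, the same build-then-lookup pattern needs only a few lines. A minimal standalone sketch, assuming the same Lucene APIs used in the tests above; the directory path and input strings are made up for illustration:

Analyzer analyzer = new StandardAnalyzer();
Directory dir = FSDirectory.open(Paths.get("/tmp/suggest-index")); // hypothetical location
AnalyzingSuggester suggester = new AnalyzingSuggester(dir, "suggest", analyzer);

// Build from an in-memory array of weighted inputs:
suggester.build(new InputArrayIterator(new Input[] {
    new Input("lucene in action", 10),
    new Input("lucene for dummies", 5)
}));

// Higher-weight suggestions are returned first:
List<LookupResult> hits = suggester.lookup("luc", false, 5);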

Example 65 with InputArrayIterator

Use of org.apache.lucene.search.suggest.InputArrayIterator in project lucene-solr by apache.

From class AnalyzingInfixSuggesterTest, method testRandomNRT.

public void testRandomNRT() throws Exception {
    final Path tempDir = createTempDir("AnalyzingInfixSuggesterTest");
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    int minPrefixChars = random().nextInt(7);
    if (VERBOSE) {
        System.out.println("  minPrefixChars=" + minPrefixChars);
    }
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);
    // Initial suggester built with nothing:
    suggester.build(new InputArrayIterator(new Input[0]));
    LookupThread lookupThread = new LookupThread(suggester);
    lookupThread.start();
    int iters = atLeast(1000);
    int visibleUpto = 0;
    Set<Long> usedWeights = new HashSet<>();
    Set<String> usedKeys = new HashSet<>();
    List<Input> inputs = new ArrayList<>();
    List<Update> pendingUpdates = new ArrayList<>();
    for (int iter = 0; iter < iters; iter++) {
        String text;
        while (true) {
            text = randomText();
            if (usedKeys.contains(text) == false) {
                usedKeys.add(text);
                break;
            }
        }
        // Carefully pick a weight we never used, to sidestep
        // tie-break problems:
        long weight;
        while (true) {
            weight = random().nextInt(10 * iters);
            if (usedWeights.contains(weight) == false) {
                usedWeights.add(weight);
                break;
            }
        }
        if (inputs.size() > 0 && random().nextInt(4) == 1) {
            // Update an existing suggestion
            Update update = new Update();
            update.index = random().nextInt(inputs.size());
            update.weight = weight;
            Input input = inputs.get(update.index);
            pendingUpdates.add(update);
            if (VERBOSE) {
                System.out.println("TEST: iter=" + iter + " update input=" + input.term.utf8ToString() + "/" + weight);
            }
            suggester.update(input.term, null, weight, input.term);
        } else {
            // Add a new suggestion
            inputs.add(new Input(text, weight, new BytesRef(text)));
            if (VERBOSE) {
                System.out.println("TEST: iter=" + iter + " add input=" + text + "/" + weight);
            }
            BytesRef br = new BytesRef(text);
            suggester.add(br, null, weight, br);
        }
        if (random().nextInt(15) == 7) {
            if (VERBOSE) {
                System.out.println("TEST: now refresh suggester");
            }
            suggester.refresh();
            visibleUpto = inputs.size();
            for (Update update : pendingUpdates) {
                Input oldInput = inputs.get(update.index);
                Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
                inputs.set(update.index, newInput);
            }
            pendingUpdates.clear();
        }
        if (random().nextInt(50) == 7) {
            if (VERBOSE) {
                System.out.println("TEST: now close/reopen suggester");
            }
            lookupThread.finish();
            suggester.close();
            suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, minPrefixChars, false);
            lookupThread = new LookupThread(suggester);
            lookupThread.start();
            visibleUpto = inputs.size();
            for (Update update : pendingUpdates) {
                Input oldInput = inputs.get(update.index);
                Input newInput = new Input(oldInput.term, update.weight, oldInput.payload);
                inputs.set(update.index, newInput);
            }
            pendingUpdates.clear();
        }
        if (visibleUpto > 0) {
            String query = randomText();
            boolean lastPrefix = random().nextInt(5) != 1;
            if (lastPrefix == false) {
                query += " ";
            }
            String[] queryTerms = query.split("\\s");
            boolean allTermsRequired = random().nextInt(10) == 7;
            boolean doHilite = random().nextBoolean();
            if (VERBOSE) {
                System.out.println("TEST: lookup \"" + query + "\" allTermsRequired=" + allTermsRequired + " doHilite=" + doHilite);
            }
            // Stupid slow but hopefully correct matching:
            List<Input> expected = new ArrayList<>();
            for (int i = 0; i < visibleUpto; i++) {
                Input input = inputs.get(i);
                String[] inputTerms = input.term.utf8ToString().split("\\s");
                boolean match = false;
                for (int j = 0; j < queryTerms.length; j++) {
                    if (j < queryTerms.length - 1 || lastPrefix == false) {
                        // Exact match
                        for (int k = 0; k < inputTerms.length; k++) {
                            if (inputTerms[k].equals(queryTerms[j])) {
                                match = true;
                                break;
                            }
                        }
                    } else {
                        // Prefix match
                        for (int k = 0; k < inputTerms.length; k++) {
                            if (inputTerms[k].startsWith(queryTerms[j])) {
                                match = true;
                                break;
                            }
                        }
                    }
                    if (match) {
                        if (allTermsRequired == false) {
                            // At least one query term does match:
                            break;
                        }
                        match = false;
                    } else if (allTermsRequired) {
                        // At least one query term does not match:
                        break;
                    }
                }
                if (match) {
                    if (doHilite) {
                        expected.add(new Input(hilite(lastPrefix, inputTerms, queryTerms), input.v, input.term));
                    } else {
                        expected.add(input);
                    }
                }
            }
            Collections.sort(expected, (a1, b) -> {
                if (a1.v > b.v) {
                    return -1;
                } else if (a1.v < b.v) {
                    return 1;
                } else {
                    return 0;
                }
            });
            if (expected.isEmpty() == false) {
                int topN = TestUtil.nextInt(random(), 1, expected.size());
                List<LookupResult> actual = suggester.lookup(TestUtil.stringToCharSequence(query, random()), topN, allTermsRequired, doHilite);
                int expectedCount = Math.min(topN, expected.size());
                if (VERBOSE) {
                    System.out.println("  expected:");
                    for (int i = 0; i < expectedCount; i++) {
                        Input x = expected.get(i);
                        System.out.println("    " + x.term.utf8ToString() + "/" + x.v);
                    }
                    System.out.println("  actual:");
                    for (LookupResult result : actual) {
                        System.out.println("    " + result);
                    }
                }
                assertEquals(expectedCount, actual.size());
                for (int i = 0; i < expectedCount; i++) {
                    if (doHilite) {
                        assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).highlightKey);
                    } else {
                        assertEquals(expected.get(i).term.utf8ToString(), actual.get(i).key);
                    }
                    assertEquals(expected.get(i).v, actual.get(i).value);
                    assertEquals(expected.get(i).payload, actual.get(i).payload);
                }
            } else {
                if (VERBOSE) {
                    System.out.println("  no expected matches");
                }
            }
        }
    }
    lookupThread.finish();
    suggester.close();
    a.close();
}
Also used: Path (java.nio.file.Path), ArrayList (java.util.ArrayList), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), BytesRef (org.apache.lucene.util.BytesRef), HashSet (java.util.HashSet)
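
LookupThread is a helper defined in AnalyzingInfixSuggesterTest: a background thread that keeps issuing lookups against the live suggester while the main thread adds, updates, refreshes, and reopens it, so concurrent reads are exercised. A hedged sketch of the idea; the real class's lookup arguments and error handling may differ:

private static class LookupThread extends Thread {
    private final AnalyzingInfixSuggester suggester;
    private volatile boolean stop;

    LookupThread(AnalyzingInfixSuggester suggester) {
        this.suggester = suggester;
    }

    public void finish() throws InterruptedException {
        stop = true;
        join();
    }

    @Override
    public void run() {
        while (!stop) {
            try {
                // Results are discarded; the point is that lookups running
                // concurrently with add/update/refresh must not throw.
                suggester.lookup("a", 10, true, true);
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
    }
}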

Aggregations

InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 76
Input (org.apache.lucene.search.suggest.Input): 71
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 67
Analyzer (org.apache.lucene.analysis.Analyzer): 65
LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult): 48
Directory (org.apache.lucene.store.Directory): 43
BytesRef (org.apache.lucene.util.BytesRef): 26
Path (java.nio.file.Path): 17
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 11
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10
Reader (java.io.Reader): 8
ArrayList (java.util.ArrayList): 8
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 8
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 7
TokenStream (org.apache.lucene.analysis.TokenStream): 6
HashSet (java.util.HashSet): 5
CharArraySet (org.apache.lucene.analysis.CharArraySet): 5
Token (org.apache.lucene.analysis.Token): 5
InputStream (java.io.InputStream): 4
OutputStream (java.io.OutputStream): 4