Search in sources:

Example 36 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class TestFreeTextSuggester method testRandom.

/**
 * Randomized test that compares {@link FreeTextSuggester} lookups against a
 * simple in-memory n-gram count model built from the same random documents.
 *
 * <p>Steps: (1) pick a small set of unique random terms, (2) generate random
 * documents from those terms via {@code getZipfToken}, (3) build the suggester
 * from the documents, (4) build per-order n-gram count maps by brute force,
 * (5) run random queries and assert that the suggester's results, rendered
 * with {@code toString()}, equal the results predicted by the count maps.
 *
 * @throws IOException declared for the suggester/analyzer calls made here
 */
public void testRandom() throws IOException {
    // Vocabulary: fill the array with distinct random tokens.
    String[] terms = new String[TestUtil.nextInt(random(), 2, 10)];
    Set<String> seen = new HashSet<>();
    while (seen.size() < terms.length) {
        String token = TestUtil.randomSimpleString(random(), 1, 5);
        if (!seen.contains(token)) {
            // seen.size() is the next free slot because it only grows on a new token
            terms[seen.size()] = token;
            seen.add(token);
        }
    }
    Analyzer a = new MockAnalyzer(random());
    int numDocs = atLeast(10);
    // Total number of tokens across all docs; used below as the unigram denominator.
    long totTokens = 0;
    final String[][] docs = new String[numDocs][];
    for (int i = 0; i < numDocs; i++) {
        docs[i] = new String[atLeast(100)];
        if (VERBOSE) {
            System.out.print("  doc " + i + ":");
        }
        for (int j = 0; j < docs[i].length; j++) {
            docs[i][j] = getZipfToken(terms);
            if (VERBOSE) {
                System.out.print(" " + docs[i][j]);
            }
        }
        if (VERBOSE) {
            System.out.println();
        }
        totTokens += docs[i].length;
    }
    // Maximum n-gram order for both the suggester and the reference model.
    int grams = TestUtil.nextInt(random(), 1, 4);
    if (VERBOSE) {
        System.out.println("TEST: " + terms.length + " terms; " + numDocs + " docs; " + grams + " grams");
    }
    // Build suggester model:
    // NOTE(review): 0x20 is ASCII space; presumably the n-gram separator byte -- confirm against FreeTextSuggester.
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte) 0x20);
    sug.build(new InputIterator() {

        // Index of the next document to hand to the suggester.
        int upto;

        @Override
        public BytesRef next() {
            if (upto == docs.length) {
                // All documents consumed.
                return null;
            } else {
                // Join the doc's tokens with spaces; note the text starts with a leading space.
                StringBuilder b = new StringBuilder();
                for (String token : docs[upto]) {
                    b.append(' ');
                    b.append(token);
                }
                upto++;
                return new BytesRef(b.toString());
            }
        }

        @Override
        public long weight() {
            // Weights are random; the expected scores below are computed from counts only.
            return random().nextLong();
        }

        @Override
        public BytesRef payload() {
            return null;
        }

        @Override
        public boolean hasPayloads() {
            return false;
        }

        @Override
        public Set<BytesRef> contexts() {
            return null;
        }

        @Override
        public boolean hasContexts() {
            return false;
        }
    });
    // Build inefficient but hopefully correct model:
    // gramCounts.get(g) maps each (g+1)-token sequence, joined by single spaces,
    // to the number of times it occurs across all docs.
    List<Map<String, Integer>> gramCounts = new ArrayList<>(grams);
    for (int gram = 0; gram < grams; gram++) {
        if (VERBOSE) {
            System.out.println("TEST: build model for gram=" + gram);
        }
        Map<String, Integer> model = new HashMap<>();
        gramCounts.add(model);
        for (String[] doc : docs) {
            for (int i = 0; i < doc.length - gram; i++) {
                StringBuilder b = new StringBuilder();
                // Tokens i..i+gram inclusive form one n-gram of length gram+1.
                for (int j = i; j <= i + gram; j++) {
                    if (j > i) {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                String token = b.toString();
                Integer curCount = model.get(token);
                if (curCount == null) {
                    model.put(token, 1);
                } else {
                    model.put(token, 1 + curCount);
                }
                if (VERBOSE) {
                    System.out.println("  add '" + token + "' -> count=" + model.get(token));
                }
            }
        }
    }
    int lookups = atLeast(100);
    for (int iter = 0; iter < lookups; iter++) {
        // Random query made of known terms.
        String[] tokens = new String[TestUtil.nextInt(random(), 1, 5)];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = getZipfToken(terms);
        }
        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.length == 1) {
            trimStart = 1;
        } else {
            trimStart = 0;
        }
        int trimAt = TestUtil.nextInt(random(), trimStart, tokens[tokens.length - 1].length());
        tokens[tokens.length - 1] = tokens[tokens.length - 1].substring(0, trimAt);
        // Maximum number of suggestions requested for this query.
        int num = TestUtil.nextInt(random(), 1, 100);
        StringBuilder b = new StringBuilder();
        for (String token : tokens) {
            b.append(' ');
            b.append(token);
        }
        String query = b.toString();
        // Drop the leading space added by the join loop above.
        query = query.substring(1);
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }
        // Expected:
        List<LookupResult> expected = new ArrayList<>();
        // Multiplied by FreeTextSuggester.ALPHA each time we fall to a lower-order model.
        double backoff = 1.0;
        // Reused here to track last tokens already suggested by a higher-order model.
        seen = new HashSet<>();
        if (VERBOSE) {
            System.out.println("  compute expected");
        }
        // Walk from the highest-order model (index grams-1) down to unigrams (index 0).
        for (int i = grams - 1; i >= 0; i--) {
            if (VERBOSE) {
                System.out.println("    grams=" + i);
            }
            if (tokens.length < i + 1) {
                // Don't have enough tokens to use this model
                if (VERBOSE) {
                    System.out.println("      skip");
                }
                continue;
            }
            if (i == 0 && tokens[tokens.length - 1].length() == 0) {
                // Never suggest unigrams from empty string:
                if (VERBOSE) {
                    System.out.println("      skip unigram priors only");
                }
                continue;
            }
            // Build up "context" ngram:
            // The context is the i tokens preceding the last (possibly partial) token.
            b = new StringBuilder();
            for (int j = tokens.length - i - 1; j < tokens.length - 1; j++) {
                b.append(' ');
                b.append(tokens[j]);
            }
            String context = b.toString();
            if (context.length() > 0) {
                context = context.substring(1);
            }
            if (VERBOSE) {
                System.out.println("      context='" + context + "'");
            }
            long contextCount;
            if (context.length() == 0) {
                // No context (unigram model): normalize by the total token count.
                contextCount = totTokens;
            } else {
                // The i-token context is looked up in the model one order lower.
                Integer count = gramCounts.get(i - 1).get(context);
                if (count == null) {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (VERBOSE) {
                        System.out.println("      skip: never saw context");
                    }
                    continue;
                }
                contextCount = count;
            }
            if (VERBOSE) {
                System.out.println("      contextCount=" + contextCount);
            }
            Map<String, Integer> model = gramCounts.get(i);
            // First pass, gather all predictions for this model:
            if (VERBOSE) {
                System.out.println("      find terms w/ prefix=" + tokens[tokens.length - 1]);
            }
            List<LookupResult> tmp = new ArrayList<>();
            for (String term : terms) {
                if (term.startsWith(tokens[tokens.length - 1])) {
                    if (VERBOSE) {
                        System.out.println("        term=" + term);
                    }
                    if (seen.contains(term)) {
                        if (VERBOSE) {
                            System.out.println("          skip seen");
                        }
                        continue;
                    }
                    // trim() removes the leading space when the context is empty.
                    String ngram = (context + " " + term).trim();
                    Integer count = model.get(ngram);
                    if (count != null) {
                        // Score = backoff * count / contextCount, scaled into the long range.
                        LookupResult lr = new LookupResult(ngram, (long) (Long.MAX_VALUE * (backoff * (double) count / contextCount)));
                        tmp.add(lr);
                        if (VERBOSE) {
                            System.out.println("      add tmp key='" + lr.key + "' score=" + lr.value);
                        }
                    }
                }
            }
            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            Collections.sort(tmp, byScoreThenKey);
            if (tmp.size() > num) {
                tmp.subList(num, tmp.size()).clear();
            }
            for (LookupResult result : tmp) {
                String key = result.key.toString();
                // The last space-separated token of the key is the suggested term.
                int idx = key.lastIndexOf(' ');
                String lastToken;
                if (idx != -1) {
                    lastToken = key.substring(idx + 1);
                } else {
                    lastToken = key;
                }
                if (!seen.contains(lastToken)) {
                    seen.add(lastToken);
                    expected.add(result);
                    if (VERBOSE) {
                        System.out.println("      keep key='" + result.key + "' score=" + result.value);
                    }
                }
            }
            backoff *= FreeTextSuggester.ALPHA;
        }
        // Final ordering and truncation across all model orders.
        Collections.sort(expected, byScoreThenKey);
        if (expected.size() > num) {
            expected.subList(num, expected.size()).clear();
        }
        // Actual:
        List<LookupResult> actual = sug.lookup(query, num);
        if (VERBOSE) {
            System.out.println("  expected: " + expected);
            System.out.println("    actual: " + actual);
        }
        // Compared via toString() rather than element-wise equality.
        assertEquals(expected.toString(), actual.toString());
    }
    a.close();
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) InputIterator(org.apache.lucene.search.suggest.InputIterator) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) HashMap(java.util.HashMap) Map(java.util.Map)

Example 37 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class SuggestComponent method toSuggesterResult.

/**
 * Rebuilds a {@link SuggesterResult} from the NamedList form of a suggester response.
 *
 * <p>The outer map is keyed by suggester name; each value is keyed by token and
 * holds that token's properties. Entries found under the
 * {@code SuggesterResultLabels.SUGGESTIONS} property are converted to
 * {@link LookupResult}s (term, weight, payload).
 *
 * @param suggestionsMap suggester name -&gt; token -&gt; properties; may be {@code null}
 * @return the populated result, or an empty one when {@code suggestionsMap} is {@code null}
 */
private SuggesterResult toSuggesterResult(Map<String, SimpleOrderedMap<NamedList<Object>>> suggestionsMap) {
    SuggesterResult result = new SuggesterResult();
    if (suggestionsMap != null) {
        // one outer entry per suggester
        for (Map.Entry<String, SimpleOrderedMap<NamedList<Object>>> perSuggester : suggestionsMap.entrySet()) {
            String suggesterName = perSuggester.getKey();
            // one inner entry per token
            for (Map.Entry<String, NamedList<Object>> perToken : perSuggester.getValue()) {
                String tokenString = perToken.getKey();
                List<LookupResult> lookupResults = new ArrayList<>();
                NamedList<Object> tokenProps = perToken.getValue();
                // scan every property recorded for this token
                for (int propIdx = 0; propIdx < tokenProps.size(); propIdx++) {
                    String property = tokenProps.getName(propIdx);
                    if (property.equals(SuggesterResultLabels.SUGGESTIONS)) {
                        @SuppressWarnings("unchecked") List<NamedList<Object>> rawEntries = (List<NamedList<Object>>) tokenProps.getVal(propIdx);
                        for (NamedList<Object> rawEntry : rawEntries) {
                            String term = (String) rawEntry.get(SuggesterResultLabels.SUGGESTION_TERM);
                            Long weight = (Long) rawEntry.get(SuggesterResultLabels.SUGGESTION_WEIGHT);
                            String payload = (String) rawEntry.get(SuggesterResultLabels.SUGGESTION_PAYLOAD);
                            lookupResults.add(new LookupResult(new CharsRef(term), weight, new BytesRef(payload)));
                        }
                    }
                    // Registered once per property, with the same list instance each time.
                    result.add(suggesterName, tokenString, lookupResults);
                }
            }
        }
    }
    return result;
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) CharsRef(org.apache.lucene.util.CharsRef) SuggesterResult(org.apache.solr.spelling.suggest.SuggesterResult) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MetricsMap(org.apache.solr.metrics.MetricsMap) BytesRef(org.apache.lucene.util.BytesRef)

Example 38 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class WFSTCompletionTest method testExactFirst.

/**
 * Builds a {@link WFSTCompletionLookup} with its boolean constructor flag set to
 * {@code true} (presumably "exactFirst", per the test name) and checks that a
 * lookup for "x" returns the exact match "x" (weight 2) ahead of "x y"
 * (weight 20), for topN values 1 through 3.
 *
 * @throws Exception if building or querying the suggester fails
 */
public void testExactFirst() throws Exception {
    // try-with-resources: the directory is closed even when an assertion fails.
    try (Directory tempDir = getDirectory()) {
        WFSTCompletionLookup suggester = new WFSTCompletionLookup(tempDir, "wfst", true);
        suggester.build(new InputArrayIterator(new Input[] { new Input("x y", 20), new Input("x", 2) }));
        for (int topN = 1; topN < 4; topN++) {
            List<LookupResult> results = suggester.lookup("x", false, topN);
            // Only two inputs exist, so at most two results come back.
            assertEquals(Math.min(topN, 2), results.size());
            assertEquals("x", results.get(0).key);
            assertEquals(2, results.get(0).value);
            if (topN > 1) {
                assertEquals("x y", results.get(1).key);
                assertEquals(20, results.get(1).value);
            }
        }
    }
}
Also used : Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Directory(org.apache.lucene.store.Directory)

Example 39 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class WFSTCompletionTest method testNonExactFirst.

/**
 * Builds a {@link WFSTCompletionLookup} with its boolean constructor flag set to
 * {@code false} (presumably "exactFirst" disabled, per the test name) and checks
 * that a lookup for "x" returns results ordered by weight: "x y" (weight 20)
 * ahead of "x" (weight 2), for topN values 1 through 3.
 *
 * @throws Exception if building or querying the suggester fails
 */
public void testNonExactFirst() throws Exception {
    // try-with-resources: the directory is closed even when an assertion fails.
    try (Directory tempDir = getDirectory()) {
        WFSTCompletionLookup suggester = new WFSTCompletionLookup(tempDir, "wfst", false);
        suggester.build(new InputArrayIterator(new Input[] { new Input("x y", 20), new Input("x", 2) }));
        for (int topN = 1; topN < 4; topN++) {
            List<LookupResult> results = suggester.lookup("x", false, topN);
            // Only two inputs exist, so at most two results come back.
            assertEquals(Math.min(topN, 2), results.size());
            assertEquals("x y", results.get(0).key);
            assertEquals(20, results.get(0).value);
            if (topN > 1) {
                assertEquals("x", results.get(1).key);
                assertEquals(2, results.get(1).value);
            }
        }
    }
}
Also used : Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Directory(org.apache.lucene.store.Directory)

Example 40 with LookupResult

use of org.apache.lucene.search.suggest.Lookup.LookupResult in project lucene-solr by apache.

the class FSTCompletionTest method testMultilingualInput.

/**
 * Builds an {@link FSTCompletionLookup} from the input returned by
 * {@code LookupBenchmarkTest.readTop50KWiki()} and checks that every input term
 * can be fetched and is its own top suggestion, then checks the first two
 * suggestions for the prefix "wit".
 *
 * @throws Exception if reading the input or building/querying the lookup fails
 */
public void testMultilingualInput() throws Exception {
    List<Input> input = LookupBenchmarkTest.readTop50KWiki();
    // try-with-resources: the directory is closed even when an assertion fails.
    try (Directory tempDir = getDirectory()) {
        FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "fst");
        lookup.build(new InputArrayIterator(input));
        assertEquals(input.size(), lookup.getCount());
        for (Input tf : input) {
            assertNotNull("Not found: " + tf.term.toString(), lookup.get(TestUtil.bytesToCharSequence(tf.term, random())));
            assertEquals(tf.term.utf8ToString(), lookup.lookup(TestUtil.bytesToCharSequence(tf.term, random()), true, 1).get(0).key.toString());
        }
        List<LookupResult> result = lookup.lookup(stringToCharSequence("wit"), true, 5);
        assertEquals(5, result.size());
        // exact match.
        assertEquals("wit", result.get(0).key.toString());
        // highest count.
        assertEquals("with", result.get(1).key.toString());
    }
}
Also used : LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) Directory(org.apache.lucene.store.Directory)

Aggregations

LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult): 65; Input (org.apache.lucene.search.suggest.Input): 48; InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 48; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 45; Analyzer (org.apache.lucene.analysis.Analyzer): 43; Directory (org.apache.lucene.store.Directory): 36; BytesRef (org.apache.lucene.util.BytesRef): 22; ArrayList (java.util.ArrayList): 14; Path (java.nio.file.Path): 11; HashSet (java.util.HashSet): 9; MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 7; Reader (java.io.Reader): 6; CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 6; Tokenizer (org.apache.lucene.analysis.Tokenizer): 6; HashMap (java.util.HashMap): 5; Token (org.apache.lucene.analysis.Token): 5; TokenStream (org.apache.lucene.analysis.TokenStream): 5; LinkedList (java.util.LinkedList): 4; CharArraySet (org.apache.lucene.analysis.CharArraySet): 4; SuggesterResult (org.apache.solr.spelling.suggest.SuggesterResult): 4