
Example 1 with InputIterator

Use of org.apache.lucene.search.suggest.InputIterator in the apache lucene-solr project.

From the class TestFreeTextSuggester, method testWiki:

@Ignore
public void testWiki() throws Exception {
    final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt");
    // Skip header:
    lfd.nextDoc();
    Analyzer analyzer = new MockAnalyzer(random());
    FreeTextSuggester sug = new FreeTextSuggester(analyzer);
    sug.build(new InputIterator() {

        private int count;

        @Override
        public long weight() {
            return 1;
        }

        @Override
        public BytesRef next() {
            Document doc;
            try {
                doc = lfd.nextDoc();
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
            if (doc == null) {
                return null;
            }
            if (count++ == 10000) {
                // Cap the build at the first 10,000 documents:
                return null;
            }
            return new BytesRef(doc.get("body"));
        }

        @Override
        public BytesRef payload() {
            return null;
        }

        @Override
        public boolean hasPayloads() {
            return false;
        }

        @Override
        public Set<BytesRef> contexts() {
            return null;
        }

        @Override
        public boolean hasContexts() {
            return false;
        }
    });
    if (VERBOSE) {
        System.out.println(sug.ramBytesUsed() + " bytes");
        List<LookupResult> results = sug.lookup("general r", 10);
        System.out.println("results:");
        for (LookupResult result : results) {
            System.out.println("  " + result);
        }
    }
    analyzer.close();
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet), HashSet (java.util.HashSet), Set (java.util.Set), IOException (java.io.IOException), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Document (org.apache.lucene.document.Document), InputIterator (org.apache.lucene.search.suggest.InputIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), BytesRef (org.apache.lucene.util.BytesRef), LineFileDocs (org.apache.lucene.util.LineFileDocs), Ignore (org.junit.Ignore)
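
For comparison, here is a minimal in-memory InputIterator sketched from the same interface methods the example above overrides. The class name ListInputIterator is hypothetical and not part of lucene-solr; it only shows the smallest conforming implementation: each suggestion is returned once with a fixed weight, with no payloads and no contexts.

import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.util.BytesRef;

// Hypothetical minimal InputIterator over an in-memory list of suggestions.
final class ListInputIterator implements InputIterator {

    private final Iterator<String> it;

    ListInputIterator(List<String> suggestions) {
        this.it = suggestions.iterator();
    }

    @Override
    public BytesRef next() {
        // Returning null signals the end of the input, as in the test above.
        return it.hasNext() ? new BytesRef(it.next()) : null;
    }

    @Override
    public long weight() {
        // Every entry gets the same weight, mirroring testWiki's weight() of 1.
        return 1;
    }

    @Override
    public BytesRef payload() {
        return null;
    }

    @Override
    public boolean hasPayloads() {
        return false;
    }

    @Override
    public Set<BytesRef> contexts() {
        return null;
    }

    @Override
    public boolean hasContexts() {
        return false;
    }
}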

Example 2 with InputIterator

Use of org.apache.lucene.search.suggest.InputIterator in the apache lucene-solr project.

From the class WFSTCompletionLookup, method build:

@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    BytesRef scratch = new BytesRef();
    InputIterator iter = new WFSTInputIterator(tempDir, tempFileNamePrefix, iterator);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    BytesRefBuilder previous = null;
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.next()) != null) {
        long cost = iter.weight();
        if (previous == null) {
            previous = new BytesRefBuilder();
        } else if (scratch.equals(previous.get())) {
            // for duplicate suggestions, the best weight is actually added
            continue;
        }
        Util.toIntsRef(scratch, scratchInts);
        builder.add(scratchInts.get(), cost);
        previous.copyBytes(scratch);
        count++;
    }
    fst = builder.finish();
}
Also used: InputIterator (org.apache.lucene.search.suggest.InputIterator), SortedInputIterator (org.apache.lucene.search.suggest.SortedInputIterator), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), Builder (org.apache.lucene.util.fst.Builder), IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder), PositiveIntOutputs (org.apache.lucene.util.fst.PositiveIntOutputs), BytesRef (org.apache.lucene.util.BytesRef)
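
The heart of this method is the standard Lucene pattern for building a weighted FST from sorted byte keys. Below is a stripped-down sketch of just that pattern, assuming the keys arrive already sorted and de-duplicated (Builder.add requires sorted input); the helper name buildWeightedFst is hypothetical:

import java.io.IOException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// Hypothetical helper: maps each sorted key to a non-negative weight in an FST.
static FST<Long> buildWeightedFst(Iterable<BytesRef> sortedKeys, long weight) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (BytesRef key : sortedKeys) {
        // Convert the byte key to ints, since Builder consumes IntsRef input:
        Util.toIntsRef(key, scratchInts);
        builder.add(scratchInts.get(), weight);
    }
    // finish() freezes the FST; it returns null if no entries were added.
    return builder.finish();
}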

Example 3 with InputIterator

Use of org.apache.lucene.search.suggest.InputIterator in the apache lucene-solr project.

From the class TestFreeTextSuggester, method testRandom:

public void testRandom() throws IOException {
    String[] terms = new String[TestUtil.nextInt(random(), 2, 10)];
    Set<String> seen = new HashSet<>();
    while (seen.size() < terms.length) {
        String token = TestUtil.randomSimpleString(random(), 1, 5);
        if (!seen.contains(token)) {
            terms[seen.size()] = token;
            seen.add(token);
        }
    }
    Analyzer a = new MockAnalyzer(random());
    int numDocs = atLeast(10);
    long totTokens = 0;
    final String[][] docs = new String[numDocs][];
    for (int i = 0; i < numDocs; i++) {
        docs[i] = new String[atLeast(100)];
        if (VERBOSE) {
            System.out.print("  doc " + i + ":");
        }
        for (int j = 0; j < docs[i].length; j++) {
            docs[i][j] = getZipfToken(terms);
            if (VERBOSE) {
                System.out.print(" " + docs[i][j]);
            }
        }
        if (VERBOSE) {
            System.out.println();
        }
        totTokens += docs[i].length;
    }
    int grams = TestUtil.nextInt(random(), 1, 4);
    if (VERBOSE) {
        System.out.println("TEST: " + terms.length + " terms; " + numDocs + " docs; " + grams + " grams");
    }
    // Build suggester model:
    FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte) 0x20);
    sug.build(new InputIterator() {

        int upto;

        @Override
        public BytesRef next() {
            if (upto == docs.length) {
                return null;
            } else {
                StringBuilder b = new StringBuilder();
                for (String token : docs[upto]) {
                    b.append(' ');
                    b.append(token);
                }
                upto++;
                return new BytesRef(b.toString());
            }
        }

        @Override
        public long weight() {
            // FreeTextSuggester appears to score from ngram counts rather than
            // these weights, so a random value exercises that they are ignored:
            return random().nextLong();
        }

        @Override
        public BytesRef payload() {
            return null;
        }

        @Override
        public boolean hasPayloads() {
            return false;
        }

        @Override
        public Set<BytesRef> contexts() {
            return null;
        }

        @Override
        public boolean hasContexts() {
            return false;
        }
    });
    // Build inefficient but hopefully correct model:
    List<Map<String, Integer>> gramCounts = new ArrayList<>(grams);
    for (int gram = 0; gram < grams; gram++) {
        if (VERBOSE) {
            System.out.println("TEST: build model for gram=" + gram);
        }
        Map<String, Integer> model = new HashMap<>();
        gramCounts.add(model);
        for (String[] doc : docs) {
            for (int i = 0; i < doc.length - gram; i++) {
                StringBuilder b = new StringBuilder();
                for (int j = i; j <= i + gram; j++) {
                    if (j > i) {
                        b.append(' ');
                    }
                    b.append(doc[j]);
                }
                String token = b.toString();
                Integer curCount = model.get(token);
                if (curCount == null) {
                    model.put(token, 1);
                } else {
                    model.put(token, 1 + curCount);
                }
                if (VERBOSE) {
                    System.out.println("  add '" + token + "' -> count=" + model.get(token));
                }
            }
        }
    }
    int lookups = atLeast(100);
    for (int iter = 0; iter < lookups; iter++) {
        String[] tokens = new String[TestUtil.nextInt(random(), 1, 5)];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = getZipfToken(terms);
        }
        // Maybe trim last token; be sure not to create the
        // empty string:
        int trimStart;
        if (tokens.length == 1) {
            trimStart = 1;
        } else {
            trimStart = 0;
        }
        int trimAt = TestUtil.nextInt(random(), trimStart, tokens[tokens.length - 1].length());
        tokens[tokens.length - 1] = tokens[tokens.length - 1].substring(0, trimAt);
        int num = TestUtil.nextInt(random(), 1, 100);
        StringBuilder b = new StringBuilder();
        for (String token : tokens) {
            b.append(' ');
            b.append(token);
        }
        String query = b.toString();
        query = query.substring(1);
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
        }
        // Expected:
        List<LookupResult> expected = new ArrayList<>();
        double backoff = 1.0;
        seen = new HashSet<>();
        if (VERBOSE) {
            System.out.println("  compute expected");
        }
        for (int i = grams - 1; i >= 0; i--) {
            if (VERBOSE) {
                System.out.println("    grams=" + i);
            }
            if (tokens.length < i + 1) {
                // Don't have enough tokens to use this model
                if (VERBOSE) {
                    System.out.println("      skip");
                }
                continue;
            }
            if (i == 0 && tokens[tokens.length - 1].length() == 0) {
                // Never suggest unigrams from empty string:
                if (VERBOSE) {
                    System.out.println("      skip unigram priors only");
                }
                continue;
            }
            // Build up "context" ngram:
            b = new StringBuilder();
            for (int j = tokens.length - i - 1; j < tokens.length - 1; j++) {
                b.append(' ');
                b.append(tokens[j]);
            }
            String context = b.toString();
            if (context.length() > 0) {
                context = context.substring(1);
            }
            if (VERBOSE) {
                System.out.println("      context='" + context + "'");
            }
            long contextCount;
            if (context.length() == 0) {
                contextCount = totTokens;
            } else {
                Integer count = gramCounts.get(i - 1).get(context);
                if (count == null) {
                    // We never saw this context:
                    backoff *= FreeTextSuggester.ALPHA;
                    if (VERBOSE) {
                        System.out.println("      skip: never saw context");
                    }
                    continue;
                }
                contextCount = count;
            }
            if (VERBOSE) {
                System.out.println("      contextCount=" + contextCount);
            }
            Map<String, Integer> model = gramCounts.get(i);
            // First pass, gather all predictions for this model:
            if (VERBOSE) {
                System.out.println("      find terms w/ prefix=" + tokens[tokens.length - 1]);
            }
            List<LookupResult> tmp = new ArrayList<>();
            for (String term : terms) {
                if (term.startsWith(tokens[tokens.length - 1])) {
                    if (VERBOSE) {
                        System.out.println("        term=" + term);
                    }
                    if (seen.contains(term)) {
                        if (VERBOSE) {
                            System.out.println("          skip seen");
                        }
                        continue;
                    }
                    String ngram = (context + " " + term).trim();
                    Integer count = model.get(ngram);
                    if (count != null) {
                        LookupResult lr = new LookupResult(ngram, (long) (Long.MAX_VALUE * (backoff * (double) count / contextCount)));
                        tmp.add(lr);
                        if (VERBOSE) {
                            System.out.println("      add tmp key='" + lr.key + "' score=" + lr.value);
                        }
                    }
                }
            }
            // Second pass, trim to only top N, and fold those
            // into overall suggestions:
            Collections.sort(tmp, byScoreThenKey);
            if (tmp.size() > num) {
                tmp.subList(num, tmp.size()).clear();
            }
            for (LookupResult result : tmp) {
                String key = result.key.toString();
                int idx = key.lastIndexOf(' ');
                String lastToken;
                if (idx != -1) {
                    lastToken = key.substring(idx + 1);
                } else {
                    lastToken = key;
                }
                if (!seen.contains(lastToken)) {
                    seen.add(lastToken);
                    expected.add(result);
                    if (VERBOSE) {
                        System.out.println("      keep key='" + result.key + "' score=" + result.value);
                    }
                }
            }
            backoff *= FreeTextSuggester.ALPHA;
        }
        Collections.sort(expected, byScoreThenKey);
        if (expected.size() > num) {
            expected.subList(num, expected.size()).clear();
        }
        // Actual:
        List<LookupResult> actual = sug.lookup(query, num);
        if (VERBOSE) {
            System.out.println("  expected: " + expected);
            System.out.println("    actual: " + actual);
        }
        assertEquals(expected.toString(), actual.toString());
    }
    a.close();
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet), HashSet (java.util.HashSet), Set (java.util.Set), HashMap (java.util.HashMap), Map (java.util.Map), ArrayList (java.util.ArrayList), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), InputIterator (org.apache.lucene.search.suggest.InputIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), BytesRef (org.apache.lucene.util.BytesRef)
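
The expected-score arithmetic in this test is a stupid-backoff style formula: the conditional probability count/contextCount is discounted by FreeTextSuggester.ALPHA once per fallback to a shorter context, then scaled into the long range. Restated as a standalone helper (the name backoffScore is hypothetical; the arithmetic is copied from the test):

// Mirrors the score the test computes for each candidate ngram:
//   score = Long.MAX_VALUE * backoff * (count / contextCount)
// where backoff starts at 1.0 and is multiplied by FreeTextSuggester.ALPHA
// each time the model backs off to a shorter context.
static long backoffScore(double backoff, long count, long contextCount) {
    return (long) (Long.MAX_VALUE * (backoff * (double) count / contextCount));
}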

Aggregations

InputIterator (org.apache.lucene.search.suggest.InputIterator): 3 uses
BytesRef (org.apache.lucene.util.BytesRef): 3 uses
HashSet (java.util.HashSet): 2 uses
Set (java.util.Set): 2 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 2 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 2 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2 uses
LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult): 2 uses
IOException (java.io.IOException): 1 use
ArrayList (java.util.ArrayList): 1 use
HashMap (java.util.HashMap): 1 use
Map (java.util.Map): 1 use
Document (org.apache.lucene.document.Document): 1 use
SortedInputIterator (org.apache.lucene.search.suggest.SortedInputIterator): 1 use
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 1 use
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 1 use
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 1 use
LineFileDocs (org.apache.lucene.util.LineFileDocs): 1 use
Builder (org.apache.lucene.util.fst.Builder): 1 use
PositiveIntOutputs (org.apache.lucene.util.fst.PositiveIntOutputs): 1 use