Search in sources :

Example 1 with FiniteStringsIterator

use of org.apache.lucene.util.automaton.FiniteStringsIterator in project lucene-solr by apache.

the class FuzzySuggester method toLevenshteinAutomata.

/**
 * Converts every finite string accepted by {@code automaton} into either an
 * exact-match automaton or a Levenshtein automaton, and combines the results.
 *
 * <p>A string whose length is at most {@code nonFuzzyPrefix}, or is below
 * {@code minFuzzyLength}, is kept as an exact string. For any other string the
 * first {@code nonFuzzyPrefix} ints are passed on as a fixed prefix and the
 * remainder is handed to {@link LevenshteinAutomata} with {@code maxEdits}.
 *
 * @param automaton the automaton whose finite strings are enumerated
 * @return an empty automaton when there are no strings, the single
 *     sub-automaton when there is exactly one, otherwise the determinized
 *     union of all sub-automata
 */
Automaton toLevenshteinAutomata(Automaton automaton) {
    final List<Automaton> perPath = new ArrayList<>();
    final FiniteStringsIterator paths = new FiniteStringsIterator(automaton);
    IntsRef path = paths.next();
    while (path != null) {
        final boolean fuzzyEligible = path.length > nonFuzzyPrefix && path.length >= minFuzzyLength;
        if (fuzzyEligible) {
            // Only the part after the non-fuzzy prefix may be edited.
            final int[] suffix = new int[path.length - nonFuzzyPrefix];
            System.arraycopy(path.ints, path.offset + nonFuzzyPrefix, suffix, 0, suffix.length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            final int alphaMax = unicodeAware ? Character.MAX_CODE_POINT : 255;
            final LevenshteinAutomata lev = new LevenshteinAutomata(suffix, alphaMax, transpositions);
            final String exactPrefix = UnicodeUtil.newString(path.ints, path.offset, nonFuzzyPrefix);
            perPath.add(lev.toAutomaton(maxEdits, exactPrefix));
        } else {
            // Too short to fuzz: match the string exactly.
            perPath.add(Automata.makeString(path.ints, path.offset, path.length));
        }
        path = paths.next();
    }
    switch (perPath.size()) {
        case 0:
            // matches nothing
            return Automata.makeEmpty();
        case 1:
            // no synonyms or anything: just a single path through the tokenstream
            return perPath.get(0);
        default:
            // multiple paths: this is really scary! is it slow?
            // maybe we should not do this and throw UOE?
            // this only happens if you have multiple paths anyway (e.g. synonyms)
            final Automaton union = Operations.union(perPath);
            return Operations.determinize(union, DEFAULT_MAX_DETERMINIZED_STATES);
    }
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 2 with FiniteStringsIterator

use of org.apache.lucene.util.automaton.FiniteStringsIterator in project lucene-solr by apache.

the class FuzzyCompletionQuery method toLevenshteinAutomata.

/**
 * Builds the fuzzy counterpart of {@code automaton}, recording a deep copy of
 * every enumerated finite string in {@code refs} along the way.
 *
 * <p>Strings with length at most {@code nonFuzzyPrefix}, or shorter than
 * {@code minFuzzyLength}, become exact-match automata. For longer strings the
 * leading {@code nonFuzzyPrefix} ints are kept as a literal prefix and the
 * rest is turned into a {@link LevenshteinAutomata} allowing {@code maxEdits}.
 *
 * @param automaton the automaton whose finite strings are enumerated
 * @param refs receives a deep copy of each finite string that is visited
 * @return an empty automaton when no string was found, the only sub-automaton
 *     when there is a single one, otherwise the determinized union
 */
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
    final List<Automaton> alternatives = new ArrayList<>();
    final FiniteStringsIterator iterator = new FiniteStringsIterator(automaton);
    IntsRef current;
    while ((current = iterator.next()) != null) {
        // The iterator's IntsRef is copied before being retained.
        refs.add(IntsRef.deepCopyOf(current));
        final boolean tooShortToFuzz = current.length <= nonFuzzyPrefix || current.length < minFuzzyLength;
        if (tooShortToFuzz) {
            alternatives.add(Automata.makeString(current.ints, current.offset, current.length));
            continue;
        }
        final int editableLength = current.length - nonFuzzyPrefix;
        final int[] editable = new int[editableLength];
        System.arraycopy(current.ints, current.offset + nonFuzzyPrefix, editable, 0, editableLength);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        final int alphaMax = unicodeAware ? Character.MAX_CODE_POINT : 255;
        final LevenshteinAutomata lev = new LevenshteinAutomata(editable, alphaMax, transpositions);
        final String fixedPrefix = UnicodeUtil.newString(current.ints, current.offset, nonFuzzyPrefix);
        alternatives.add(lev.toAutomaton(maxEdits, fixedPrefix));
    }
    if (alternatives.isEmpty()) {
        // matches nothing
        return Automata.makeEmpty();
    }
    if (alternatives.size() == 1) {
        // no synonyms or anything: just a single path through the tokenstream
        return alternatives.get(0);
    }
    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    final Automaton merged = Operations.union(alternatives);
    return Operations.determinize(merged, maxDeterminizedStates);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 3 with FiniteStringsIterator

use of org.apache.lucene.util.automaton.FiniteStringsIterator in project elasticsearch by elastic.

the class XFuzzySuggester method toLevenshteinAutomata.

/**
 * Enumerates the finite strings of {@code automaton} and produces one
 * automaton per string: an exact match for short strings, a Levenshtein
 * automaton (with a literal prefix) for the others.
 *
 * <p>"Short" means the length is at most {@code nonFuzzyPrefix} or below
 * {@code minFuzzyLength}. For the remaining strings, the ints after the first
 * {@code nonFuzzyPrefix} positions are passed to {@link LevenshteinAutomata},
 * which is asked for an automaton allowing {@code maxEdits}.
 *
 * @param automaton the automaton whose finite strings are enumerated
 * @return {@code Automata.makeEmpty()} for no strings, the lone automaton for
 *     one string, otherwise the determinized union of all of them
 */
Automaton toLevenshteinAutomata(Automaton automaton) {
    final List<Automaton> built = new ArrayList<>();
    final FiniteStringsIterator source = new FiniteStringsIterator(automaton);
    while (true) {
        final IntsRef term = source.next();
        if (term == null) {
            break;
        }
        if (term.length > nonFuzzyPrefix && term.length >= minFuzzyLength) {
            // Copy out the tail that is allowed to be edited.
            final int tailStart = term.offset + nonFuzzyPrefix;
            final int[] tail = new int[term.length - nonFuzzyPrefix];
            System.arraycopy(term.ints, tailStart, tail, 0, tail.length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            final int alphaMax;
            if (unicodeAware) {
                alphaMax = Character.MAX_CODE_POINT;
            } else {
                alphaMax = 255;
            }
            final LevenshteinAutomata lev = new LevenshteinAutomata(tail, alphaMax, transpositions);
            final String head = UnicodeUtil.newString(term.ints, term.offset, nonFuzzyPrefix);
            built.add(lev.toAutomaton(maxEdits, head));
        } else {
            built.add(Automata.makeString(term.ints, term.offset, term.length));
        }
    }
    final int count = built.size();
    if (count == 0) {
        // matches nothing
        return Automata.makeEmpty();
    }
    if (count == 1) {
        // no synonyms or anything: just a single path through the tokenstream
        return built.get(0);
    }
    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    return Operations.determinize(Operations.union(built), DEFAULT_MAX_DETERMINIZED_STATES);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 4 with FiniteStringsIterator

use of org.apache.lucene.util.automaton.FiniteStringsIterator in project lucene-solr by apache.

the class FuzzySuggesterTest method testRandom.

public void testRandom() throws Exception {
    int numQueries = atLeast(100);
    final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
    final TreeSet<String> allPrefixes = new TreeSet<>();
    final Set<String> seen = new HashSet<>();
    Input[] keys = new Input[numQueries];
    boolean preserveSep = random().nextBoolean();
    boolean unicodeAware = random().nextBoolean();
    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }
    for (int i = 0; i < numQueries; i++) {
        int numTokens = TestUtil.nextInt(random(), 1, 4);
        String key;
        String analyzedKey;
        while (true) {
            key = "";
            analyzedKey = "";
            boolean lastRemoved = false;
            for (int token = 0; token < numTokens; token++) {
                String s;
                while (true) {
                    // TODO: would be nice to fix this slowCompletor/comparator to
                    // use full range, but we might lose some coverage too...
                    s = TestUtil.randomSimpleString(random());
                    if (s.length() > 0) {
                        if (token > 0) {
                            key += " ";
                        }
                        if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()) - 1) != ' ' : analyzedKey.charAt(analyzedKey.length() - 1) != ' ')) {
                            analyzedKey += " ";
                        }
                        key += s;
                        if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                            if (preserveSep && preserveHoles) {
                                analyzedKey += '';
                            }
                            lastRemoved = true;
                        } else {
                            analyzedKey += s;
                            lastRemoved = false;
                        }
                        break;
                    }
                }
            }
            analyzedKey = analyzedKey.replaceAll("(^| )$", "");
            if (preserveSep && lastRemoved) {
                analyzedKey += " ";
            }
            // Don't add same surface form more than once:
            if (!seen.contains(key)) {
                seen.add(key);
                break;
            }
        }
        for (int j = 1; j < key.length(); j++) {
            allPrefixes.add(key.substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = random().nextInt(1 << 24);
        keys[i] = new Input(key, weight);
        slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
    }
    if (VERBOSE) {
        // Don't just sort original list, to avoid VERBOSE
        // altering the test:
        List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
        Collections.sort(sorted);
        for (TermFreqPayload2 ent : sorted) {
            System.out.println("  surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
        }
    }
    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
    suggester.build(new InputArrayIterator(keys));
    for (String prefix : allPrefixes) {
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        final int topN = TestUtil.nextInt(random(), 1, 10);
        List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
        // 2. go thru whole set to find suggestions:
        List<LookupResult> matches = new ArrayList<>();
        // "Analyze" the key:
        String[] tokens = prefix.split(" ");
        StringBuilder builder = new StringBuilder();
        boolean lastRemoved = false;
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
                builder.append(' ');
            }
            if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
                if (preserveSep && preserveHoles) {
                    builder.append("");
                }
                lastRemoved = true;
            } else {
                builder.append(token);
                lastRemoved = false;
            }
        }
        String analyzedKey = builder.toString();
        // issue open for this):
        while (true) {
            String s = analyzedKey.replaceAll("(^| )$", "");
            s = s.replaceAll("\\s+$", "");
            if (s.equals(analyzedKey)) {
                break;
            }
            analyzedKey = s;
        }
        if (analyzedKey.length() == 0) {
            // string!  You get no results, not all results...
            continue;
        }
        if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
            analyzedKey += " ";
        }
        if (VERBOSE) {
            System.out.println("  analyzed: " + analyzedKey);
        }
        TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
        // NOTE: not great that we ask the suggester to give
        // us the "answer key" (ie maybe we have a bug in
        // suggester.toLevA ...) ... but testRandom2() fixes
        // this:
        Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
        assertTrue(automaton.isDeterministic());
        // TODO: could be faster... but it's slowCompletor for a reason
        BytesRefBuilder spare = new BytesRefBuilder();
        for (TermFreqPayload2 e : slowCompletor) {
            spare.copyChars(e.analyzedForm);
            FiniteStringsIterator finiteStrings = new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
            for (IntsRef string; (string = finiteStrings.next()) != null; ) {
                int p = 0;
                BytesRef ref = Util.toBytesRef(string, spare);
                boolean added = false;
                for (int i = ref.offset; i < ref.length; i++) {
                    int q = automaton.step(p, ref.bytes[i] & 0xff);
                    if (q == -1) {
                        break;
                    } else if (automaton.isAccept(q)) {
                        matches.add(new LookupResult(e.surfaceForm, e.weight));
                        added = true;
                        break;
                    }
                    p = q;
                }
                if (!added && automaton.isAccept(p)) {
                    matches.add(new LookupResult(e.surfaceForm, e.weight));
                }
            }
        }
        assertTrue(numStopChars > 0 || matches.size() > 0);
        if (matches.size() > 1) {
            Collections.sort(matches, new Comparator<LookupResult>() {

                @Override
                public int compare(LookupResult left, LookupResult right) {
                    int cmp = Float.compare(right.value, left.value);
                    if (cmp == 0) {
                        return left.compareTo(right);
                    } else {
                        return cmp;
                    }
                }
            });
        }
        if (matches.size() > topN) {
            matches = matches.subList(0, topN);
        }
        if (VERBOSE) {
            System.out.println("  expected:");
            for (LookupResult lr : matches) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
            System.out.println("  actual:");
            for (LookupResult lr : r) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
        }
        assertEquals(prefix + "  " + topN, matches.size(), r.size());
        for (int hit = 0; hit < r.size(); hit++) {
            //System.out.println("  check hit " + hit);
            assertEquals(prefix + "  " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
            assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
        }
    }
    IOUtils.close(a, tempDir);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Input(org.apache.lucene.search.suggest.Input) TreeSet(java.util.TreeSet) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) Directory(org.apache.lucene.store.Directory) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult)

Example 5 with FiniteStringsIterator

use of org.apache.lucene.util.automaton.FiniteStringsIterator in project lucene-solr by apache.

the class GraphTokenStreamFiniteStrings method getFiniteStrings.

/**
 * Get all finite strings that start at {@code startState} and end at {@code endState}.
 *
 * <p>The returned iterator is lazy: it pulls the next {@code IntsRef} from the
 * underlying {@code FiniteStringsIterator} only when {@code hasNext()} or
 * {@code next()} is called, and wraps each one in a new
 * {@code FiniteStringsTokenStream}.
 *
 * @param startState state at which enumerated strings begin
 * @param endState state at which enumerated strings end
 * @return an iterator over one token stream per finite string; its
 *     {@code next()} throws {@link java.util.NoSuchElementException} once
 *     the strings are exhausted
 */
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) throws IOException {
    final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
    return new Iterator<TokenStream>() {

        // Look-ahead element fetched by hasNext() and consumed by next().
        IntsRef current;

        // Set once the underlying iterator has returned null.
        boolean finished = false;

        @Override
        public boolean hasNext() {
            if (finished == false && current == null) {
                current = it.next();
                if (current == null) {
                    finished = true;
                }
            }
            return current != null;
        }

        @Override
        public TokenStream next() {
            // Honor the Iterator contract instead of wrapping a null IntsRef.
            if (hasNext() == false) {
                throw new java.util.NoSuchElementException();
            }
            TokenStream next = new FiniteStringsTokenStream(current);
            current = null;
            return next;
        }
    };
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) TokenStream(org.apache.lucene.analysis.TokenStream) Iterator(java.util.Iterator) IntsRef(org.apache.lucene.util.IntsRef)

Aggregations

IntsRef (org.apache.lucene.util.IntsRef) 5, FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator) 5, ArrayList (java.util.ArrayList) 4, Automaton (org.apache.lucene.util.automaton.Automaton) 4, TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton) 3, LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata) 3, HashSet (java.util.HashSet) 1, Iterator (java.util.Iterator) 1, TreeSet (java.util.TreeSet) 1, Analyzer (org.apache.lucene.analysis.Analyzer) 1, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) 1, TokenStream (org.apache.lucene.analysis.TokenStream) 1, Input (org.apache.lucene.search.suggest.Input) 1, InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator) 1, LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult) 1, Directory (org.apache.lucene.store.Directory) 1, BytesRef (org.apache.lucene.util.BytesRef) 1, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) 1