Search in sources :

Example 26 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XAnalyzingSuggester method lookup.

/**
 * Looks up at most {@code num} suggestions for {@code key} in the suggester's FST.
 *
 * <p>The key is turned into a lookup automaton, which is intersected with the FST to find
 * the prefix paths to start searching from. When {@code exactFirst} is set, a first search
 * looks for one completion whose surface form is the same as the key and places it at the
 * head of the result list; a second search then collects the remaining completions while
 * skipping duplicate surface forms.
 *
 * @param key the text to complete; must not contain {@code holeCharacter} or {@code sepLabel}
 * @param contexts not referenced anywhere in this method body
 * @param onlyMorePopular must be {@code false}
 * @param num maximum number of results to return; asserted to be positive
 * @return the collected results, or an empty list when no FST is available ({@code fst == null})
 * @throws IllegalArgumentException if {@code onlyMorePopular} is {@code true} or the key
 *         contains one of the two reserved characters
 * @throws RuntimeException wrapping any {@link IOException} raised while searching
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    // No FST available: nothing can match.
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    // Reject keys that contain either of the two reserved characters.
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == holeCharacter) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == sepLabel) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    // UTF-8 form of the key, compared against surface forms via sameSurfaceForm below.
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        // Scratch buffer handed to getLookupResult for every result we build.
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            // First pass: count the prefix paths that have an "exact" match so the
            // searcher below can be sized accordingly.
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Second pass: seed the searcher from every exact-match node (not just the
            // first one), starting at the END_BYTE arc that leaves it:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    // (scratchArc is read right after the call, so findTargetArc presumably fills it in)
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            // Linear scan over the completions for the first one whose surface form is
            // the same as the key; the searcher was sized to
            // count * maxSurfaceFormsPerAnalyzedForm above:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        // Main search: collects the remaining (num - results.size()) completions.
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            // Surface forms already offered to acceptResult by this searcher.
            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Dedup: different paths through the FST
                // can get duplicate surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // In exactFirst mode, don't accept any path whose
                    // surface form matches the key, since that would
                    // create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // We found an exact match here, which means we should
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        // Let getFullPrefixPaths adjust the start paths before seeding the main search.
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Never return more than num results.
                // NOTE(review): presumably in the exactFirst case the search may
                // produce one extra path - confirm
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        // Surface the checked IOException as an unchecked exception, keeping the cause.
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 27 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XAnalyzingSuggester method toLookupAutomaton.

/**
 * Builds the automaton used to look up {@code key}.
 *
 * <p>The key is analyzed with {@code queryAnalyzer}, the resulting token stream is converted
 * to an automaton, the result is passed through {@code replaceSep}, and the outcome is
 * determinized.
 *
 * @param key the text being looked up
 * @return the determinized lookup automaton
 * @throws IOException if analyzing the key or converting the token stream fails
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    final Automaton analyzed;
    // Turn the analyzed token stream into an automaton; the stream is closed on exit.
    try (TokenStream tokens = queryAnalyzer.tokenStream("", key.toString())) {
        analyzed = getTokenStreamToAutomaton().toAutomaton(tokens);
    }
    // TODO: we can optimize this somewhat by determinizing while we convert.
    // This automaton should not blow up during determinize, hence no practical state limit.
    return Operations.determinize(replaceSep(analyzed), Integer.MAX_VALUE);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 28 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XFuzzySuggester method toLevenshteinAutomata.

/**
 * Builds a fuzzy automaton from every finite string accepted by {@code automaton}.
 *
 * <p>A string whose length is at most {@code nonFuzzyPrefix}, or below {@code minFuzzyLength},
 * is matched exactly. For any other string the first {@code nonFuzzyPrefix} symbols are kept
 * as an exact prefix and the remainder is matched within {@code maxEdits} edits.
 *
 * @param automaton the automaton whose finite strings are enumerated
 * @return an empty automaton when there are no strings, the single sub-automaton when there
 *         is exactly one, otherwise the determinized union of all sub-automata
 */
Automaton toLevenshteinAutomata(Automaton automaton) {
    final List<Automaton> perPath = new ArrayList<>();
    final FiniteStringsIterator paths = new FiniteStringsIterator(automaton);
    IntsRef path = paths.next();
    while (path != null) {
        final boolean matchExactly = path.length <= nonFuzzyPrefix || path.length < minFuzzyLength;
        if (matchExactly) {
            perPath.add(Automata.makeString(path.ints, path.offset, path.length));
        } else {
            // Only the symbols after the non-fuzzy prefix may be edited.
            final int[] fuzzyTail = new int[path.length - nonFuzzyPrefix];
            System.arraycopy(path.ints, path.offset + nonFuzzyPrefix, fuzzyTail, 0, fuzzyTail.length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            final int alphaMax = unicodeAware ? Character.MAX_CODE_POINT : 255;
            final LevenshteinAutomata lev = new LevenshteinAutomata(fuzzyTail, alphaMax, transpositions);
            final String exactPrefix = UnicodeUtil.newString(path.ints, path.offset, nonFuzzyPrefix);
            perPath.add(lev.toAutomaton(maxEdits, exactPrefix));
        }
        path = paths.next();
    }

    if (perPath.isEmpty()) {
        // Nothing was enumerated: match nothing.
        return Automata.makeEmpty();
    }
    if (perPath.size() == 1) {
        // No synonyms or anything: just a single path through the tokenstream.
        return perPath.get(0);
    }
    // Multiple paths (e.g. synonyms): this is really scary! is it slow?
    // Maybe we should not do this and throw UOE?
    final Automaton combined = Operations.union(perPath);
    return Operations.determinize(combined, DEFAULT_MAX_DETERMINIZED_STATES);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 29 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XFuzzySuggester method convertAutomaton.

/**
 * Converts the lookup automaton via the superclass and, when {@code unicodeAware} is set,
 * additionally converts it from UTF-32 to UTF-8 and determinizes the result.
 *
 * @param a the automaton to convert
 * @return the superclass conversion, further converted to UTF-8 and determinized when
 *         {@code unicodeAware} is set
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
    if (!unicodeAware) {
        return super.convertAutomaton(a);
    }
    // FLORIAN EDIT: get converted Automaton from superclass
    final UTF32ToUTF8 toUtf8 = new UTF32ToUTF8();
    final Automaton asUtf8 = toUtf8.convert(super.convertAutomaton(a));
    // This automaton should not blow up during determinize:
    return Operations.determinize(asUtf8, Integer.MAX_VALUE);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) UTF32ToUTF8(org.apache.lucene.util.automaton.UTF32ToUTF8)

Example 30 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XContentMapValues method filter.

/**
 * Builds a reusable function that filters a document map according to the given include
 * and exclude patterns.
 *
 * <p>When {@code includes} is {@code null} or empty, a match-everything automaton is used
 * for inclusion; when {@code excludes} is {@code null} or empty, an empty automaton is used
 * for exclusion.
 *
 * @param includes patterns of fields to keep; may be {@code null} or empty
 * @param excludes patterns of fields to drop; may be {@code null} or empty
 * @return a function applying the compiled include/exclude automata to a map
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    final CharacterRunAutomaton matchAll = new CharacterRunAutomaton(Automata.makeAnyString());

    final boolean includeEverything = includes == null || includes.length == 0;
    final CharacterRunAutomaton includeMatcher = includeEverything
        ? matchAll
        : new CharacterRunAutomaton(makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(includes)));

    final boolean excludeNothing = excludes == null || excludes.length == 0;
    final Automaton excludePatterns = excludeNothing
        ? Automata.makeEmpty()
        : makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(excludes));
    final CharacterRunAutomaton excludeMatcher = new CharacterRunAutomaton(excludePatterns);

    // Both automata start in state 0 when filtering the top-level map.
    return document -> filter(document, includeMatcher, 0, excludeMatcher, 0, matchAll);
}
Also used : Arrays(java.util.Arrays) Numbers(org.elasticsearch.common.Numbers) Automaton(org.apache.lucene.util.automaton.Automaton) Booleans(org.elasticsearch.common.Booleans) HashMap(java.util.HashMap) Function(java.util.function.Function) Strings(org.elasticsearch.common.Strings) ArrayList(java.util.ArrayList) List(java.util.List) Operations(org.apache.lucene.util.automaton.Operations) TimeValue(org.elasticsearch.common.unit.TimeValue) Map(java.util.Map) Regex(org.elasticsearch.common.regex.Regex) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) Automata(org.apache.lucene.util.automaton.Automata) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton)57 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)17 IntsRef (org.apache.lucene.util.IntsRef)13 BytesRef (org.apache.lucene.util.BytesRef)12 ArrayList (java.util.ArrayList)11 Directory (org.apache.lucene.store.Directory)8 HashSet (java.util.HashSet)7 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)7 Document (org.apache.lucene.document.Document)6 CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton)6 Transition (org.apache.lucene.util.automaton.Transition)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)5 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)4 LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata)4 RegExp (org.apache.lucene.util.automaton.RegExp)4