Search in sources :

Example 6 with FST

use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class AnalyzingSuggester, method lookup.

/**
 * Looks up suggestions matching {@code key}, highest-weight first.
 *
 * @param key query prefix; must not contain the reserved characters
 *        U+001E (hole) or U+001F (unit separator)
 * @param contexts unsupported by this suggester; must be null
 * @param onlyMorePopular unsupported; must be false
 * @param num maximum number of results to return; must be &gt; 0
 * @return up to {@code num} results, or an empty list if the suggest FST
 *         has not been built
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        // Suggester was never built (or build saw no input):
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    // Reject keys containing the sentinel characters that the suggester
    // reserves internally when building the FST:
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == 0x1E) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == 0x1F) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            // First pass: count prefix paths that end in an exact match so
            // we can size the searcher:
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Second pass: seed the searcher with each exact-match start node:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            assert completions.isComplete;
            // Keep only the completion whose surface form (output2) equals
            // the key exactly; output1 carries the weight:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        // Main search: top-N over all prefix paths, deduplicating surface
        // forms and (when exactFirst) skipping the exact match found above:
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Multiple analyzed paths can lead to the same surface form,
                // so we can get duplicate surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // The exactFirst phase above already returned the exact
                    // match; reject it here so we don't create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        assert completions.isComplete;
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Collected the requested number of results; the searcher may
                // produce one extra path, so stop explicitly:
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        // The FST is in memory, so IOException is not expected here:
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 7 with FST

use of org.apache.lucene.util.fst.FST in project elasticsearch by elastic.

From the class XAnalyzingSuggester, method lookup.

/**
 * Looks up suggestions matching {@code key}, highest-weight first.
 *
 * @param key query prefix; must not contain the configured hole or
 *        separator characters
 * @param contexts unused by this suggester (never read)
 * @param onlyMorePopular unsupported; must be false
 * @param num maximum number of results to return; must be &gt; 0
 * @return up to {@code num} results, or an empty list if the suggest FST
 *         has not been built
 */
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null) {
        // Suggester was never built (or build saw no input):
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    // Reject keys containing the reserved sentinel characters.
    // NOTE(review): the messages hardcode U+001E/U+001F but the checks use
    // the configurable holeCharacter/sepLabel fields — message may be stale
    // if non-default values are configured; confirm against configuration.
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == holeCharacter) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == sepLabel) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            // First pass: count prefix paths that end in an exact match so
            // we can size the searcher:
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Second pass: seed the searcher with each exact-match start node:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            // Keep only the completion whose surface form (output2) equals
            // the key exactly; output1 carries the weight:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        // Main search: top-N over all prefix paths, deduplicating surface
        // forms and (when exactFirst) skipping the exact match found above:
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Multiple analyzed paths can lead to the same surface form,
                // so we can get duplicate surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // The exactFirst phase above already returned the exact
                    // match; reject it here so we don't create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Collected the requested number of results; the searcher may
                // produce one extra path, so stop explicitly:
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        // The FST is in memory, so IOException is not expected here:
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)

Example 8 with FST

use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class FSTTermsReader, method walk.

/**
 * Breadth-first traversal of every node reachable from the FST's start arc.
 * A bit set keyed by node address prevents expanding the same node twice.
 */
static <T> void walk(FST<T> fst) throws IOException {
    final ArrayList<FST.Arc<T>> pending = new ArrayList<>();
    final BitSet visited = new BitSet();
    final FST.BytesReader in = fst.getBytesReader();
    // Seed the queue with the root arc:
    pending.add(fst.getFirstArc(new FST.Arc<T>()));
    while (pending.isEmpty() == false) {
        final FST.Arc<T> current = pending.remove(0);
        final long target = current.target;
        //System.out.println(current);
        if (FST.targetHasArcs(current) && visited.get((int) target) == false) {
            visited.set((int) target);
            // Enqueue a copy of every outgoing arc of this node:
            fst.readFirstRealTargetArc(target, current, in);
            boolean more = true;
            while (more) {
                pending.add(new FST.Arc<T>().copyFrom(current));
                if (current.isLast()) {
                    more = false;
                } else {
                    fst.readNextRealArc(current, in);
                }
            }
        }
    }
}
Also used : FST(org.apache.lucene.util.fst.FST) FST(org.apache.lucene.util.fst.FST) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)

Example 9 with FST

use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class Dictionary, method lookup.

/**
 * Walks {@code fst} over the code points of {@code word[offset..offset+length)}
 * and returns the accumulated output for an exact match, or {@code null} if
 * the word is not present.
 */
IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
    if (fst == null) {
        return null;
    }
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
    // Outputs accumulate along the path; skip the sentinel no-output value
    // (compared by identity, matching how the outputs object hands it out):
    final IntsRef noOutput = fst.outputs.getNoOutput();
    IntsRef accumulated = noOutput;
    final int end = offset + length;
    try {
        int cp = 0;
        for (int pos = offset; pos < end; pos += Character.charCount(cp)) {
            cp = Character.codePointAt(word, pos, end);
            if (fst.findTargetArc(cp, arc, arc, reader) == null) {
                // Dead end: word is not in the FST.
                return null;
            }
            if (arc.output != noOutput) {
                accumulated = fst.outputs.add(accumulated, arc.output);
            }
        }
        // Require the end-of-input label so prefixes don't match:
        if (fst.findTargetArc(FST.END_LABEL, arc, arc, reader) == null) {
            return null;
        }
        if (arc.output != noOutput) {
            return fst.outputs.add(accumulated, arc.output);
        }
        return accumulated;
    } catch (IOException bogus) {
        // The FST is in memory, so IOException is not expected here:
        throw new RuntimeException(bogus);
    }
}
Also used : FST(org.apache.lucene.util.fst.FST) IntsRef(org.apache.lucene.util.IntsRef) IOException(java.io.IOException)

Example 10 with FST

use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class Stemmer, method stem.

/**
   * Generates a list of stems for the provided word
   *
   * @param word Word to generate the stems for
   * @param previous previous affix that was removed (so we don't remove the same one twice)
   * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
   * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
   * @param recursionDepth current recursion depth
   * @param doPrefix true if we should remove prefixes
   * @param doSuffix true if we should remove suffixes
   * @param previousWasPrefix true if the previous removal was a prefix:
   *        if we are removing a suffix, and it has no continuation requirements, it's ok.
   *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
   * @param circumfix true if the previous prefix removal was signed as a circumfix
   *        this means the innermost suffix must also contain the circumfix flag.
   * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed.
   * @return List of stems, or empty list if no stems are found
   */
private List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();
    if (doPrefix && dictionary.prefixes != null) {
        // Walk the prefix FST over the leading characters of the word,
        // collecting candidate prefix affix ids at each final state:
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Per-depth scratch reader/arc, presumably to keep recursion re-entrant
        // without allocating — TODO confirm against field declarations:
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // Unless fullStrip is allowed, at least one character must remain:
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++) {
            if (i > 0) {
                // Consume the next character of the candidate prefix
                // (i == 0 is the zero-length prefix, nothing to consume):
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    // Don't remove the same affix twice:
                    continue;
                }
                // Each affix record is 8 bytes: flag, stripOrd, condition,
                // append — four shorts read below:
                affixReader.setPosition(8 * prefix);
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Low bit of the condition word encodes cross-product:
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    compatible = false;
                }
                if (compatible) {
                    // Strip the prefix, re-prepend the affix's strip data,
                    // and recurse via applyAffix:
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
                        continue;
                    }
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    if (doSuffix && dictionary.suffixes != null) {
        // Mirror of the prefix pass: walk the suffix FST over the trailing
        // characters of the word (right to left):
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // Unless fullStrip is allowed, at least one character must remain:
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--) {
            if (i < length) {
                // Consume the next character of the candidate suffix
                // (i == length is the zero-length suffix, nothing to consume):
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    // Don't remove the same affix twice:
                    continue;
                }
                // Each affix record is 8 bytes: flag, stripOrd, condition,
                // append — four shorts read below:
                affixReader.setPosition(8 * suffix);
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Low bit of the condition word encodes cross-product:
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    compatible = false;
                }
                if (compatible) {
                    // Strip the suffix, re-append the affix's strip data,
                    // and recurse via applyAffix:
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
                        continue;
                    }
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    return stems;
}
Also used : FST(org.apache.lucene.util.fst.FST) ArrayList(java.util.ArrayList) CharsRef(org.apache.lucene.util.CharsRef) IntsRef(org.apache.lucene.util.IntsRef)

Aggregations

FST (org.apache.lucene.util.fst.FST)10 ArrayList (java.util.ArrayList)7 IntsRef (org.apache.lucene.util.IntsRef)4 IOException (java.io.IOException)3 CharsRef (org.apache.lucene.util.CharsRef)3 BitSet (java.util.BitSet)2 HashSet (java.util.HashSet)2 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)2 ArrayUtil (org.apache.lucene.util.ArrayUtil)2 BytesRef (org.apache.lucene.util.BytesRef)2 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)2 Automaton (org.apache.lucene.util.automaton.Automaton)2 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)2 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)2 Util (org.apache.lucene.util.fst.Util)2 InputStreamReader (java.io.InputStreamReader)1 LineNumberReader (java.io.LineNumberReader)1 ParseException (java.text.ParseException)1 HashMap (java.util.HashMap)1 LinkedHashMap (java.util.LinkedHashMap)1