Example 1 with FST

Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class FSTUtil, the method intersectPrefixPaths:

/**
   * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
   * accumulating the FST end node and output for each path.
   */
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
    assert a.isDeterministic();
    final List<Path<T>> queue = new ArrayList<>();
    final List<Path<T>> endNodes = new ArrayList<>();
    if (a.getNumStates() == 0) {
        return endNodes;
    }
    queue.add(new Path<>(0, fst.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(), new IntsRefBuilder()));
    final FST.Arc<T> scratchArc = new FST.Arc<>();
    final FST.BytesReader fstReader = fst.getBytesReader();
    Transition t = new Transition();
    while (queue.size() != 0) {
        final Path<T> path = queue.remove(queue.size() - 1);
        if (a.isAccept(path.state)) {
            endNodes.add(path);
            // we accept all further paths too
            continue;
        }
        IntsRefBuilder currentInput = path.input;
        int count = a.initTransition(path.state, t);
        for (int i = 0; i < count; i++) {
            a.getNextTransition(t);
            final int min = t.min;
            final int max = t.max;
            if (min == max) {
                final FST.Arc<T> nextArc = fst.findTargetArc(t.min, path.fstNode, scratchArc, fstReader);
                if (nextArc != null) {
                    final IntsRefBuilder newInput = new IntsRefBuilder();
                    newInput.copyInts(currentInput.get());
                    newInput.append(t.min);
                    queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
                }
            } else {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.label <= max) {
                    assert nextArc.label <= max;
                    assert nextArc.label >= min : nextArc.label + " " + min;
                    final IntsRefBuilder newInput = new IntsRefBuilder();
                    newInput.copyInts(currentInput.get());
                    newInput.append(nextArc.label);
                    queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
                    // used in assert
                    final int label = nextArc.label;
                    nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc, fstReader);
                    assert nextArc == null || label < nextArc.label : "last: " + label + " next: " + nextArc.label;
                }
            }
        }
    }
    return endNodes;
}
Also used: FST (org.apache.lucene.util.fst.FST), ArrayList (java.util.ArrayList), IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder), Transition (org.apache.lucene.util.automaton.Transition)
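
Below is a minimal usage sketch, not taken from the original sources: it builds a tiny FST with PositiveIntOutputs, builds a deterministic automaton for the single string "c", and intersects the two with the method above. The demo class name is hypothetical; the Lucene calls (Builder, Automata.makeString, Util.toIntsRef) match the lucene-solr API of this era.

import java.util.List;
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// Hypothetical demo class, not part of lucene-solr:
public class IntersectPrefixPathsDemo {

    public static void main(String[] args) throws Exception {
        // Build a tiny FST mapping "cat" -> 5 and "cow" -> 7; inputs
        // must be added in sorted order:
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
        builder.add(Util.toIntsRef(new BytesRef("cow"), scratch), 7L);
        FST<Long> fst = builder.finish();

        // Deterministic automaton accepting exactly "c", i.e. the
        // prefix to intersect with the FST:
        Automaton a = Automata.makeString("c");

        // Each Path carries the FST node where the prefix ended plus
        // the output accumulated along the way:
        List<FSTUtil.Path<Long>> paths = FSTUtil.intersectPrefixPaths(a, fst);
        for (FSTUtil.Path<Long> path : paths) {
            System.out.println("end state=" + path.state + " output=" + path.output);
        }
    }
}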

Example 2 with FST

Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class AnalyzingSuggester, the method lookup:

@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == 0x1E) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == 0x1F) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // Add a start path for every exact-match prefix node; the
            // linear scan below then picks out the one whose surface
            // form equals the input:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            assert completions.isComplete;
            // Linear scan for the exact surface-form match; this is
            // bounded by the number of prefix start nodes times
            // maxSurfaceFormsPerAnalyzedForm:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Dedup: the analyzer may produce several analyzed
                // forms for one surface form, so we can get duplicate
                // surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // In exactFirst mode, don't accept the surface
                    // form already returned by the first search, as
                    // that would create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // This is the exact match, which we should
                        // have already found in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        assert completions.isComplete;
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // Done; in the exactFirst case the search may
                // produce one extra path, so stop at num results:
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}
Also used: ArrayList (java.util.ArrayList), Util (org.apache.lucene.util.fst.Util), CodecUtil (org.apache.lucene.codecs.CodecUtil), ArrayUtil (org.apache.lucene.util.ArrayUtil), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), IntsRef (org.apache.lucene.util.IntsRef), BytesRef (org.apache.lucene.util.BytesRef), Pair (org.apache.lucene.util.fst.PairOutputs.Pair), HashSet (java.util.HashSet), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton), Automaton (org.apache.lucene.util.automaton.Automaton), FST (org.apache.lucene.util.fst.FST), IOException (java.io.IOException), BytesReader (org.apache.lucene.util.fst.FST.BytesReader)
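
A hedged usage sketch, not taken from the original sources: it builds an AnalyzingSuggester over a tiny weighted FileDictionary and calls lookup, which is where the method above runs. The three-argument constructor taking a temp Directory matches the lucene-solr API of this era but has changed between versions; the /tmp path and demo class name are placeholders.

import java.io.StringReader;
import java.nio.file.Paths;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.FSDirectory;

// Hypothetical demo class, not part of lucene-solr:
public class AnalyzingSuggesterDemo {

    public static void main(String[] args) throws Exception {
        // Weighted entries, one "term<TAB>weight" pair per line:
        FileDictionary dict = new FileDictionary(
                new StringReader("cheddar\t5\ncheese\t10\n"));

        AnalyzingSuggester suggester = new AnalyzingSuggester(
                FSDirectory.open(Paths.get("/tmp/suggest")), "suggest",
                new StandardAnalyzer());
        suggester.build(dict);

        // lookup() analyzes "che" into an automaton, intersects it
        // with the suggest FST (via intersectPrefixPaths above), and
        // returns the top-weighted completions:
        List<LookupResult> hits = suggester.lookup("che", false, 2);
        for (LookupResult hit : hits) {
            System.out.println(hit.key + " weight=" + hit.value);
        }
    }
}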

Example 3 with FST

Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class Dictionary, the method readAffixFile:

/**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
    TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
    TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
    Map<String, Integer> seenPatterns = new HashMap<>();
    // zero condition -> 0 ord
    seenPatterns.put(".*", 0);
    patterns.add(null);
    // zero strip -> 0 ord
    Map<String, Integer> seenStrips = new LinkedHashMap<>();
    seenStrips.put("", 0);
    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
        // ignore any BOM marker on first line
        if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
            line = line.substring(1);
        }
        if (line.startsWith(ALIAS_KEY)) {
            parseAlias(line);
        } else if (line.startsWith(MORPH_ALIAS_KEY)) {
            parseMorphAlias(line);
        } else if (line.startsWith(PREFIX_KEY)) {
            parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
        } else if (line.startsWith(SUFFIX_KEY)) {
            parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
        } else if (line.startsWith(FLAG_KEY)) {
            // Assume that the FLAG line comes before any prefix or suffixes
            // Store the strategy so it can be used when parsing the dic file
            flagParsingStrategy = getFlagParsingStrategy(line);
        } else if (line.equals(COMPLEXPREFIXES_KEY)) {
            // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
            complexPrefixes = true;
        } else if (line.startsWith(CIRCUMFIX_KEY)) {
            String[] parts = line.split("\\s+");
            if (parts.length != 2) {
                throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
            }
            circumfix = flagParsingStrategy.parseFlag(parts[1]);
        } else if (line.startsWith(KEEPCASE_KEY)) {
            String[] parts = line.split("\\s+");
            if (parts.length != 2) {
                throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
            }
            keepcase = flagParsingStrategy.parseFlag(parts[1]);
        } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
            String[] parts = line.split("\\s+");
            if (parts.length != 2) {
                throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
            }
            needaffix = flagParsingStrategy.parseFlag(parts[1]);
        } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
            String[] parts = line.split("\\s+");
            if (parts.length != 2) {
                throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
            }
            onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
        } else if (line.startsWith(IGNORE_KEY)) {
            String[] parts = line.split("\\s+");
            if (parts.length != 2) {
                throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
            }
            ignore = parts[1].toCharArray();
            Arrays.sort(ignore);
            needsInputCleaning = true;
        } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
            String[] parts = line.split("\\s+");
            String type = parts[0];
            if (parts.length != 2) {
                throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
            }
            int num = Integer.parseInt(parts[1]);
            FST<CharsRef> res = parseConversions(reader, num);
            if (type.equals("ICONV")) {
                iconv = res;
                needsInputCleaning |= iconv != null;
            } else {
                oconv = res;
                needsOutputCleaning |= oconv != null;
            }
        } else if (line.startsWith(FULLSTRIP_KEY)) {
            fullStrip = true;
        } else if (line.startsWith(LANG_KEY)) {
            language = line.substring(LANG_KEY.length()).trim();
            alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
        }
    }
    this.prefixes = affixFST(prefixes);
    this.suffixes = affixFST(suffixes);
    int totalChars = 0;
    for (String strip : seenStrips.keySet()) {
        totalChars += strip.length();
    }
    stripData = new char[totalChars];
    stripOffsets = new int[seenStrips.size() + 1];
    int currentOffset = 0;
    int currentIndex = 0;
    for (String strip : seenStrips.keySet()) {
        stripOffsets[currentIndex++] = currentOffset;
        strip.getChars(0, strip.length(), stripData, currentOffset);
        currentOffset += strip.length();
    }
    assert currentIndex == seenStrips.size();
    stripOffsets[currentIndex] = currentOffset;
}
Also used: InputStreamReader (java.io.InputStreamReader), HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap), FST (org.apache.lucene.util.fst.FST), TreeMap (java.util.TreeMap), LineNumberReader (java.io.LineNumberReader), List (java.util.List), ArrayList (java.util.ArrayList), ParseException (java.text.ParseException)
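
For context, a hedged sketch of the inputs this parser consumes, not taken from the original sources: a minimal Hunspell affix/dictionary pair fed to the Dictionary constructor, which calls readAffixFile internally. The four-argument constructor matches the lucene-solr master of this era but varies across versions; the temp path and demo class name are placeholders.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.store.FSDirectory;

// Hypothetical demo class, not part of lucene-solr:
public class AffixFileDemo {

    public static void main(String[] args) throws Exception {
        // Minimal affix file: one SFX rule "A" that appends "s":
        String aff = "SET UTF-8\n"
                   + "SFX A Y 1\n"
                   + "SFX A 0 s .\n";
        // Minimal dic file: entry count, then "cat" carrying flag A:
        String dic = "1\ncat/A\n";

        InputStream affix = new ByteArrayInputStream(aff.getBytes(StandardCharsets.UTF_8));
        InputStream words = new ByteArrayInputStream(dic.getBytes(StandardCharsets.UTF_8));

        // readAffixFile() above runs inside this constructor, filling
        // the prefix and suffix FSTs:
        Dictionary dictionary = new Dictionary(
                FSDirectory.open(Paths.get("/tmp/hunspell")), "hunspell", affix, words);
    }
}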

Example 4 with FST

Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class Dictionary, the method applyMappings:

// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
    // temporary stuff
    final FST.Arc<CharsRef> arc = new FST.Arc<>();
    int longestMatch;
    CharsRef longestOutput;
    for (int i = 0; i < sb.length(); i++) {
        arc.copyFrom(firstArc);
        CharsRef output = NO_OUTPUT;
        longestMatch = -1;
        longestOutput = null;
        for (int j = i; j < sb.length(); j++) {
            char ch = sb.charAt(j);
            if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                break;
            } else {
                output = fst.outputs.add(output, arc.output);
            }
            if (arc.isFinal()) {
                longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
                longestMatch = j;
            }
        }
        if (longestMatch >= 0) {
            sb.delete(i, longestMatch + 1);
            sb.insert(i, longestOutput);
            i += (longestOutput.length - 1);
        }
    }
}
Also used: FST (org.apache.lucene.util.fst.FST), CharsRef (org.apache.lucene.util.CharsRef)
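
A hedged sketch, not taken from the original sources, of how such a conversion FST is built and applied. It mirrors what Dictionary.parseConversions does (BYTE2 inputs via Util.toUTF16, CharSequenceOutputs). Note that applyMappings is package-private, so real callers must live in org.apache.lucene.analysis.hunspell; the demo class name is hypothetical.

package org.apache.lucene.analysis.hunspell;

import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

// Hypothetical demo class, not part of lucene-solr:
public class ApplyMappingsDemo {

    public static void main(String[] args) throws Exception {
        // FST inputs must be added in sorted order; TreeMap gives that:
        Map<String, String> mappings = new TreeMap<>();
        mappings.put("ß", "ss");

        Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        for (Map.Entry<String, String> e : mappings.entrySet()) {
            Util.toUTF16(e.getKey(), scratch);
            builder.add(scratch.get(), new CharsRef(e.getValue()));
        }
        FST<CharsRef> fst = builder.finish();

        // Longest-match replacement, in place:
        StringBuilder sb = new StringBuilder("straße");
        Dictionary.applyMappings(fst, sb);
        System.out.println(sb);  // prints "strasse"
    }
}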

Example 5 with FST

Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.

From the class MappingCharFilter, the method read:

@Override
public int read() throws IOException {
    //System.out.println("\nread");
    while (true) {
        if (replacement != null && replacementPointer < replacement.length) {
            //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
            return replacement.chars[replacement.offset + replacementPointer++];
        }
        // TODO: a more efficient approach would be Aho/Corasick's
        // algorithm
        // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
        // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
        //
        // I think this would be (almost?) equivalent to 1) adding
        // epsilon arcs from all final nodes back to the init
        // node in the FST, 2) adding a .* (skip any char)
        // loop on the initial node, and 3) determinizing
        // that.  Then we would not have to restart matching
        // at each position.
        int lastMatchLen = -1;
        CharsRef lastMatch = null;
        final int firstCH = buffer.get(inputOff);
        if (firstCH != -1) {
            FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
            if (arc != null) {
                if (!FST.targetHasArcs(arc)) {
                    // Fast pass for single character match:
                    assert arc.isFinal();
                    lastMatchLen = 1;
                    lastMatch = arc.output;
                } else {
                    int lookahead = 0;
                    CharsRef output = arc.output;
                    while (true) {
                        lookahead++;
                        if (arc.isFinal()) {
                            // Match! (to node is final)
                            lastMatchLen = lookahead;
                            lastMatch = outputs.add(output, arc.nextFinalOutput);
                        // Greedy: keep searching to see if there's a
                        // longer match...
                        }
                        if (!FST.targetHasArcs(arc)) {
                            break;
                        }
                        int ch = buffer.get(inputOff + lookahead);
                        if (ch == -1) {
                            break;
                        }
                        if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
                            // Dead end
                            break;
                        }
                        output = outputs.add(output, arc.output);
                    }
                }
            }
        }
        if (lastMatch != null) {
            inputOff += lastMatchLen;
            //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);
            final int diff = lastMatchLen - lastMatch.length;
            if (diff != 0) {
                final int prevCumulativeDiff = getLastCumulativeDiff();
                if (diff > 0) {
                    // Replacement is shorter than matched input:
                    addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                } else {
                    // Replacement is longer than matched input: remap
                    // the "extra" chars all back to the same input
                    // offset:
                    final int outputStart = inputOff - prevCumulativeDiff;
                    for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
                        addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                    }
                }
            }
            replacement = lastMatch;
            replacementPointer = 0;
        } else {
            final int ret = buffer.get(inputOff);
            if (ret != -1) {
                inputOff++;
                buffer.freeBefore(inputOff);
            }
            return ret;
        }
    }
}
Also used: FST (org.apache.lucene.util.fst.FST), CharsRef (org.apache.lucene.util.CharsRef)
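
A hedged usage sketch, not taken from the original sources: MappingCharFilter is normally configured through NormalizeCharMap, whose builder compiles the rules into the FST that the read() loop above walks greedily (longest match wins). The demo class name is hypothetical.

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

// Hypothetical demo class, not part of lucene-solr:
public class MappingCharFilterDemo {

    public static void main(String[] args) throws Exception {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("ph", "f");   // replacement shorter than the match
        builder.add("i", "ee");   // replacement longer than the match
        NormalizeCharMap map = builder.build();

        Reader filtered = new MappingCharFilter(map, new StringReader("philip"));
        StringBuilder out = new StringBuilder();
        int ch;
        while ((ch = filtered.read()) != -1) {
            out.append((char) ch);
        }
        System.out.println(out);  // prints "feeleep"
    }
}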

Aggregations

FST (org.apache.lucene.util.fst.FST): 10
ArrayList (java.util.ArrayList): 7
IntsRef (org.apache.lucene.util.IntsRef): 4
IOException (java.io.IOException): 3
CharsRef (org.apache.lucene.util.CharsRef): 3
BitSet (java.util.BitSet): 2
HashSet (java.util.HashSet): 2
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 2
ArrayUtil (org.apache.lucene.util.ArrayUtil): 2
BytesRef (org.apache.lucene.util.BytesRef): 2
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 2
Automaton (org.apache.lucene.util.automaton.Automaton): 2
BytesReader (org.apache.lucene.util.fst.FST.BytesReader): 2
Pair (org.apache.lucene.util.fst.PairOutputs.Pair): 2
Util (org.apache.lucene.util.fst.Util): 2
InputStreamReader (java.io.InputStreamReader): 1
LineNumberReader (java.io.LineNumberReader): 1
ParseException (java.text.ParseException): 1
HashMap (java.util.HashMap): 1
LinkedHashMap (java.util.LinkedHashMap): 1