Example 36 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class GraphTokenStreamFiniteStrings method getFiniteStrings.

/**
   * Get all finite strings that start at {@code startState} and end at {@code endState}.
   */
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) throws IOException {
    final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
    return new Iterator<TokenStream>() {

        IntsRef current;

        boolean finished = false;

        @Override
        public boolean hasNext() {
            if (finished == false && current == null) {
                current = it.next();
                if (current == null) {
                    finished = true;
                }
            }
            return current != null;
        }

        @Override
        public TokenStream next() {
            if (current == null) {
                hasNext();
            }
            TokenStream next = new FiniteStringsTokenStream(current);
            current = null;
            return next;
        }
    };
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) TokenStream(org.apache.lucene.analysis.TokenStream) Iterator(java.util.Iterator) IntsRef(org.apache.lucene.util.IntsRef)
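
For reference, a minimal sketch of how this iterator might be consumed, using the class's no-argument getFiniteStrings() overload (which walks the whole graph). The analyzer, field name, and input text are placeholders; note that callers are expected to call hasNext() before next(), since next() does not guard against exhaustion.

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

public class FiniteStringsDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream in = analyzer.tokenStream("field", "fast wifi network")) {
            // The constructor consumes the input stream and builds the token graph automaton.
            GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(in);
            // Each finite string is one path through the graph, exposed as its own TokenStream.
            Iterator<TokenStream> it = graph.getFiniteStrings();
            while (it.hasNext()) {
                TokenStream path = it.next();
                CharTermAttribute term = path.addAttribute(CharTermAttribute.class);
                path.reset();
                StringBuilder sb = new StringBuilder();
                while (path.incrementToken()) {
                    sb.append(term).append(' ');
                }
                path.end();
                System.out.println(sb.toString().trim());
            }
        }
    }
}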

Example 37 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestFuzzyQuery method getDistance.

// Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2)
private static int getDistance(String target, String other) {
    IntsRef targetPoints;
    IntsRef otherPoints;
    int n;
    // cost array
    int[][] d;
    // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to 
    // what LevenshteinDistance does, except cycling through a ring of three 
    // horizontal cost arrays... but this comparator is never actually used by 
    // DirectSpellChecker, it's only used for merging results from multiple shards 
    // in "distributed spellcheck", and it's inefficient in other ways too...
    // cheaper to do this up front once
    targetPoints = toIntsRef(target);
    otherPoints = toIntsRef(other);
    n = targetPoints.length;
    final int m = otherPoints.length;
    d = new int[n + 1][m + 1];
    if (n == 0 || m == 0) {
        if (n == m) {
            return 0;
        } else {
            return Math.max(n, m);
        }
    }
    // indexes into strings s and t
    // iterates through s
    int i;
    // iterates through t
    int j;
    // jth character of t
    int t_j;
    // cost
    int cost;
    for (i = 0; i <= n; i++) {
        d[i][0] = i;
    }
    for (j = 0; j <= m; j++) {
        d[0][j] = j;
    }
    for (j = 1; j <= m; j++) {
        t_j = otherPoints.ints[j - 1];
        for (i = 1; i <= n; i++) {
            cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
            // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
            d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
            // transposition
            if (i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2] && targetPoints.ints[i - 2] == otherPoints.ints[j - 1]) {
                d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
            }
        }
    }
    return d[n][m];
}
Also used : IntsRef(org.apache.lucene.util.IntsRef)
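
Because getDistance is a private test helper that leans on the test's toIntsRef conversion, here is a self-contained sketch of the same optimal-string-alignment recurrence working directly on code points. The class and method names are illustrative, not Lucene API; it exists only to show the transposition rule in action.

public class OsaDistanceDemo {
    static int distance(String a, String b) {
        int[] s = a.codePoints().toArray();
        int[] t = b.codePoints().toArray();
        int n = s.length, m = t.length;
        if (n == 0 || m == 0) {
            return Math.max(n, m);
        }
        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) d[i][0] = i;
        for (int j = 0; j <= m; j++) d[0][j] = j;
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= m; j++) {
                int cost = s[i - 1] == t[j - 1] ? 0 : 1;
                // minimum of deletion, insertion, and substitution
                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
                // count an adjacent transposition as a single edit
                if (i > 1 && j > 1 && s[i - 1] == t[j - 2] && s[i - 2] == t[j - 1]) {
                    d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
                }
            }
        }
        return d[n][m];
    }

    public static void main(String[] args) {
        System.out.println(distance("ab", "ba")); // 1 (one transposition), not 2
    }
}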

Example 38 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class Stemmer method doStem.

private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
    List<CharsRef> stems = new ArrayList<>();
    IntsRef forms = dictionary.lookupWord(word, 0, length);
    if (forms != null) {
        for (int i = 0; i < forms.length; i += formStep) {
            boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
            boolean checkNeedAffix = dictionary.needaffix != -1;
            boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
            if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
                dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
                char[] wordFlags = Dictionary.decodeFlags(scratch);
                // we are looking for a case variant, but this word does not allow it
                if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
                    continue;
                }
                // we can't add this form, it's a pseudostem requiring an affix
                if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
                    continue;
                }
                // we can't add this form, it only belongs inside a compound word
                if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
                    continue;
                }
            }
            stems.add(newStem(word, length, forms, i));
        }
    }
    try {
        stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
    return stems;
}
Also used : ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef) IOException(java.io.IOException) CharsRef(org.apache.lucene.util.CharsRef)
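
doStem itself is package-internal; the usual entry point is Stemmer.stem(String), which funnels into it. A hedged sketch, assuming a Hunspell dictionary at placeholder paths (en_US.aff / en_US.dic) and the Dictionary constructor in this codebase that takes a temporary Directory plus the two input streams:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CharsRef;

public class HunspellStemDemo {
    public static void main(String[] args) throws Exception {
        // en_US.aff / en_US.dic are placeholder paths; supply a real Hunspell dictionary.
        try (InputStream affix = Files.newInputStream(Paths.get("en_US.aff"));
             InputStream words = Files.newInputStream(Paths.get("en_US.dic"));
             Directory tempDir = new RAMDirectory()) {
            Dictionary dictionary = new Dictionary(tempDir, "hunspell", affix, words);
            Stemmer stemmer = new Stemmer(dictionary);
            // stem() ends up in doStem(), applying the KEEPCASE/NEEDAFFIX/ONLYINCOMPOUND filters above.
            for (CharsRef stem : stemmer.stem("books")) {
                System.out.println(stem); // with a typical en_US dictionary: "book" (and possibly "books")
            }
        }
    }
}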

Example 39 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class Stemmer method stem.

/**
   * Generates a list of stems for the provided word.
   *
   * @param word Word to generate the stems for
   * @param length length of the word
   * @param previous previous affix that was removed (so we don't remove the same one twice)
   * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
   * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
   * @param recursionDepth current recursion depth
   * @param doPrefix true if we should remove prefixes
   * @param doSuffix true if we should remove suffixes
   * @param previousWasPrefix true if the previous removal was a prefix:
   *        if we are removing a suffix and it has no continuation requirements, it's ok,
   *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. 
   * @param circumfix true if the previous prefix removal was flagged as a circumfix;
   *        this means the innermost suffix must also contain the circumfix flag.
   * @param caseVariant true if we are searching for a case variant; if the word has the KEEPCASE flag it cannot succeed.
   * @return List of stems, or an empty list if no stems are found
   */
private List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++) {
            if (i > 0) {
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    continue;
                }
                affixReader.setPosition(8 * prefix);
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    compatible = false;
                }
                if (compatible) {
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
                        continue;
                    }
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--) {
            if (i < length) {
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    continue;
                }
                affixReader.setPosition(8 * suffix);
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    compatible = false;
                }
                if (compatible) {
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
                        continue;
                    }
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    return stems;
}
Also used : FST(org.apache.lucene.util.fst.FST) ArrayList(java.util.ArrayList) CharsRef(org.apache.lucene.util.CharsRef) IntsRef(org.apache.lucene.util.IntsRef)
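
The prefix and suffix blocks above are ordinary FST walks whose IntsRef outputs carry lists of affix ids. The following standalone sketch reproduces that traversal pattern in miniature; it assumes the Builder and IntSequenceOutputs API of the same codebase, and the affix strings and ids are a toy two-entry map, not real dictionary data.

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Util;

public class AffixFstDemo {
    public static void main(String[] args) throws Exception {
        IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        // Toy affix map: affix string -> list of affix ids. Inputs must be added in sorted order.
        builder.add(Util.toUTF32("re", scratch), new IntsRef(new int[] { 3, 7 }, 0, 2));
        builder.add(Util.toUTF32("un", scratch), new IntsRef(new int[] { 1 }, 0, 1));
        FST<IntsRef> fst = builder.finish();

        // Walk the FST one code point at a time, accumulating outputs, as stem() does.
        FST.BytesReader reader = fst.getBytesReader();
        FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
        IntsRef output = outputs.getNoOutput();
        for (int ch : "un".codePoints().toArray()) {
            if (fst.findTargetArc(ch, arc, arc, reader) == null) {
                System.out.println("no affix along this path");
                return;
            }
            output = outputs.add(output, arc.output);
        }
        if (arc.isFinal()) {
            IntsRef affixIds = outputs.add(output, arc.nextFinalOutput);
            for (int j = 0; j < affixIds.length; j++) {
                System.out.println("affix id: " + affixIds.ints[affixIds.offset + j]); // prints 1
            }
        }
    }
}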

Example 40 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestAutomaton method assertSame.

private void assertSame(Collection<BytesRef> terms, Automaton a) {
    try {
        assertTrue(Operations.isFinite(a));
        assertFalse(Operations.isTotal(a));
        Automaton detA = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
        // Make sure all terms are accepted:
        IntsRefBuilder scratch = new IntsRefBuilder();
        for (BytesRef term : terms) {
            Util.toIntsRef(term, scratch);
            assertTrue("failed to accept term=" + term.utf8ToString(), Operations.run(detA, term.utf8ToString()));
        }
        // Use getFiniteStrings:
        Set<IntsRef> expected = new HashSet<>();
        for (BytesRef term : terms) {
            IntsRefBuilder intsRef = new IntsRefBuilder();
            Util.toUTF32(term.utf8ToString(), intsRef);
            expected.add(intsRef.toIntsRef());
        }
        Set<IntsRef> actual = TestOperations.getFiniteStrings(a);
        if (expected.equals(actual) == false) {
            System.out.println("FAILED:");
            for (IntsRef term : expected) {
                if (actual.contains(term) == false) {
                    System.out.println("  term=" + term + " should be accepted but isn't");
                }
            }
            for (IntsRef term : actual) {
                if (expected.contains(term) == false) {
                    System.out.println("  term=" + term + " is accepted but should not be");
                }
            }
            throw new AssertionError("mismatch");
        }
        // Use sameLanguage:
        Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE));
        assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE))));
        // Do same check, in UTF8 space
        Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a));
        Set<IntsRef> expected2 = new HashSet<>();
        for (BytesRef term : terms) {
            IntsRefBuilder intsRef = new IntsRefBuilder();
            Util.toIntsRef(term, intsRef);
            expected2.add(intsRef.toIntsRef());
        }
        assertEquals(expected2, TestOperations.getFiniteStrings(utf8));
    } catch (AssertionError ae) {
        System.out.println("TEST: FAILED: not same");
        System.out.println("  terms (count=" + terms.size() + "):");
        for (BytesRef term : terms) {
            System.out.println("    " + term);
        }
        System.out.println("  automaton:");
        System.out.println(a.toDot());
        //a.writeDot("fail");
        throw ae;
    }
}
Also used : IntsRef(org.apache.lucene.util.IntsRef) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)
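
The two expected sets in this test differ only in alphabet: Util.toUTF32 emits one int per Unicode code point, while Util.toIntsRef over the raw BytesRef emits one int per UTF-8 byte. A small sketch of the distinction (class name illustrative):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Util;

public class IntsRefAlphabetsDemo {
    public static void main(String[] args) {
        BytesRef term = new BytesRef("héllo");
        // UTF-8 space: one int per byte, so 'é' contributes two ints.
        IntsRef utf8 = Util.toIntsRef(term, new IntsRefBuilder());
        // UTF-32 space: one int per code point.
        IntsRef utf32 = Util.toUTF32(term.utf8ToString(), new IntsRefBuilder());
        System.out.println("utf8 length = " + utf8.length);   // 6
        System.out.println("utf32 length = " + utf32.length); // 5
    }
}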

Aggregations

IntsRef (org.apache.lucene.util.IntsRef) 63
BytesRef (org.apache.lucene.util.BytesRef) 19
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder) 19
HashSet (java.util.HashSet) 16
ArrayList (java.util.ArrayList) 13
Automaton (org.apache.lucene.util.automaton.Automaton) 13
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder) 12
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton) 9
IOException (java.io.IOException) 7
Directory (org.apache.lucene.store.Directory) 7
HashMap (java.util.HashMap) 5
FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator) 5
BytesReader (org.apache.lucene.util.fst.FST.BytesReader) 5
Pair (org.apache.lucene.util.fst.PairOutputs.Pair) 5
ByteArrayInputStream (java.io.ByteArrayInputStream) 4
FilterInputStream (java.io.FilterInputStream) 4
InputStream (java.io.InputStream) 4
Map (java.util.Map) 4
Random (java.util.Random) 4
TokenStream (org.apache.lucene.analysis.TokenStream) 4