Search in sources :

Example 26 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class Stemmer method stem.

/**
   * Generates a list of stems for the provided word by removing, in turn, every
   * matching prefix (if {@code doPrefix}) and every matching suffix (if {@code doSuffix}),
   * and delegating each de-affixed candidate to {@code applyAffix}.
   *
   * @param word Word to generate the stems for
   * @param length number of valid chars in {@code word}, starting at index 0
   * @param previous previous affix that was removed (so we don't remove the same one twice)
   * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
   * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
   * @param recursionDepth current recursion depth; also selects the per-depth FST reader and arc used for traversal
   * @param doPrefix true if we should remove prefixes
   * @param doSuffix true if we should remove suffixes
   * @param previousWasPrefix true if the previous removal was a prefix:
   *        if we are removing a suffix, and it has no continuation requirements, it's ok.
   *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
   * @param circumfix true if the previous prefix removal was signed as a circumfix;
   *        this means the innermost suffix must also contain the circumfix flag.
   * @param caseVariant true if we are searching for a case variant. If the word has the KEEPCASE flag it cannot succeed.
   * @return List of stems, or empty list if no stems are found
   * @throws IOException declared by the signature; presumably propagated from the FST traversal,
   *         affix-data reads or the recursive {@code applyAffix} call (not visible from this block)
   */
private List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();
    // ---- Prefix removal: walk the prefix FST forwards over the start of the word ----
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // reader and arc are taken from per-recursion-depth arrays
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // i is the length of the candidate prefix; without fullStrip the longest candidate is one char shorter
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++) {
            if (i > 0) {
                // consume the next char of the word; stop as soon as no prefix continues with it
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                // no prefix ends at this position
                continue;
            } else {
                // ids of every prefix whose text is exactly word[0..i)
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    // never remove the same affix twice in a row
                    continue;
                }
                // each affix record is 8 bytes = four 16-bit fields, read in order below
                affixReader.setPosition(8 * prefix);
                // NOTE: 'flag' is not used in this method, but the read advances the reader to the next field
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                // lowest bit of the third field is the cross-product marker; the remaining bits are the condition
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                // ordinal into flagLookup for the flags attached to this affix
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    // nested removal of an affix that does not allow cross products
                    compatible = false;
                }
                if (compatible) {
                    // the word without its first i chars
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;
                    // strip chars for this affix live in stripData[stripStart, stripEnd)
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
                        continue;
                    }
                    // rebuild the candidate: strip chars followed by the remainder of the word
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
                    // NOTE(review): -1 is passed in the prefixFlag position and 'true' presumably marks a prefix removal - confirm against applyAffix
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    // ---- Suffix removal: walk the suffix FST over the word's chars from the end backwards ----
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // reader and arc are taken from per-recursion-depth arrays
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // i is the index where the candidate suffix starts; without fullStrip at least one char must remain
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--) {
            if (i < length) {
                // consume the next char (moving right to left); stop as soon as no suffix continues with it
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                // no suffix starts at this position
                continue;
            } else {
                // ids of every suffix whose text is exactly word[i..length)
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }
            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    // never remove the same affix twice in a row
                    continue;
                }
                // each affix record is 8 bytes = four 16-bit fields, read in order below
                affixReader.setPosition(8 * suffix);
                // NOTE: 'flag' is not used in this method, but the read advances the reader to the next field
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                // lowest bit of the third field is the cross-product marker; the remaining bits are the condition
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                // ordinal into flagLookup for the flags attached to this affix
                char append = (char) (affixReader.readShort() & 0xffff);
                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char[] appendFlags = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    // unlike the prefix branch, the cross check here takes previousWasPrefix into account
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    // nested removal of an affix that does not allow cross products
                    compatible = false;
                }
                if (compatible) {
                    // number of chars removed from the end, and number of chars kept from the start
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;
                    // strip chars for this affix live in stripData[stripStart, stripEnd)
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;
                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
                        continue;
                    }
                    // rebuild the candidate: kept part of the word followed by the strip chars
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
                    // NOTE(review): prefixFlag is forwarded and 'false' presumably marks a suffix removal - confirm against applyAffix
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
                    stems.addAll(stemList);
                }
            }
        }
    }
    return stems;
}
Also used : FST(org.apache.lucene.util.fst.FST) ArrayList(java.util.ArrayList) CharsRef(org.apache.lucene.util.CharsRef) IntsRef(org.apache.lucene.util.IntsRef)

Example 27 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

the class WordnetSynonymParser method parse.

/**
 * Reads synonym lines from {@code in}, groups consecutive lines that share the same
 * synset id (characters 2..10 of each line) and hands each completed group to
 * {@code addInternal}. The reader is always closed before this method returns.
 *
 * @param in source of synonym lines; closed by this method
 * @throws IOException if reading from or closing {@code in} fails
 * @throws ParseException if a line is too short to contain a synset id, or if
 *         processing a line raises an {@link IllegalArgumentException}; the message
 *         carries the current line number and the original exception is the cause
 */
@Override
public void parse(Reader in) throws IOException, ParseException {
    // try-with-resources: a failure in close() no longer masks an exception thrown while parsing
    try (LineNumberReader br = new LineNumberReader(in)) {
        // nested try so that 'br' is still in scope for the line number in the error message
        try {
            String lastSynSetID = "";
            CharsRef[] synset = new CharsRef[8];
            int synsetSize = 0;
            String line;
            while ((line = br.readLine()) != null) {
                // a short line would otherwise fail in substring() with an unchecked
                // StringIndexOutOfBoundsException that bypasses the ParseException conversion below
                if (line.length() < 11) {
                    throw new IllegalArgumentException("line is too short to contain a synset id: \"" + line + "\"");
                }
                String synSetID = line.substring(2, 11);
                if (!synSetID.equals(lastSynSetID)) {
                    // id changed: flush the group collected so far and start a new one
                    addInternal(synset, synsetSize);
                    synsetSize = 0;
                }
                if (synset.length <= synsetSize + 1) {
                    synset = Arrays.copyOf(synset, synset.length * 2);
                }
                synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
                synsetSize++;
                lastSynSetID = synSetID;
            }
            // final synset in the file
            addInternal(synset, synsetSize);
        } catch (IllegalArgumentException e) {
            // ParseException has no (message, cause) constructor, hence initCause
            ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
            ex.initCause(e);
            throw ex;
        }
    }
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) ParseException(java.text.ParseException) CharsRef(org.apache.lucene.util.CharsRef) LineNumberReader(java.io.LineNumberReader)

Aggregations

CharsRef (org.apache.lucene.util.CharsRef)27 BytesRef (org.apache.lucene.util.BytesRef)8 ArrayList (java.util.ArrayList)6 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)6 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)6 StringReader (java.io.StringReader)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)4 SynonymFilter (org.apache.lucene.analysis.synonym.SynonymFilter)4 SynonymMap (org.apache.lucene.analysis.synonym.SynonymMap)4 HashMap (java.util.HashMap)3 TokenStream (org.apache.lucene.analysis.TokenStream)3 IntsRef (org.apache.lucene.util.IntsRef)3 Test (org.junit.Test)3 ParseException (java.text.ParseException)2 HashSet (java.util.HashSet)2 Map (java.util.Map)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 Analyzer (org.apache.lucene.analysis.Analyzer)2 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 Tokenizer (org.apache.lucene.analysis.Tokenizer)2