Search in sources :

Example 1 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

In the class BaseSynonymParserTestCase, the method assertEntryAbsent:

/**
   * Validates that there are no synonyms for the given word.
   *
   * @param synonymMap  the generated synonym map after parsing
   * @param word        word (phrase) we are validating the synonyms for. Should be the value that comes out of the analyzer.
   *                    All spaces will be replaced by word separators.
   * @throws IOException if reading the synonym FST fails
   */
public static void assertEntryAbsent(SynonymMap synonymMap, String word) throws IOException {
    // The map stores multi-word entries with WORD_SEPARATOR between tokens, so normalize first.
    word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
    // A null FST lookup result means the word has no entry (and therefore no synonyms).
    BytesRef value = Util.get(synonymMap.fst, Util.toUTF32(new CharsRef(word), new IntsRefBuilder()));
    assertNull("There should be no synonyms for: " + word, value);
}
Also used : IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) CharsRef(org.apache.lucene.util.CharsRef)

Example 2 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

In the class Dictionary, the method applyMappings:

// TODO: this could be more efficient!
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<CharsRef> startArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef noOutput = fst.outputs.getNoOutput();
    // scratch arc, reused while walking the FST from each start position
    final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
    for (int pos = 0; pos < sb.length(); pos++) {
        scratchArc.copyFrom(startArc);
        CharsRef pending = noOutput;
        // end index (inclusive) and output of the longest match found at pos
        int matchEnd = -1;
        CharsRef matchOutput = null;
        for (int end = pos; end < sb.length(); end++) {
            // stop as soon as the FST has no transition for the next character
            if (fst.findTargetArc(sb.charAt(end), scratchArc, scratchArc, reader) == null) {
                break;
            }
            pending = fst.outputs.add(pending, scratchArc.output);
            if (scratchArc.isFinal()) {
                // remember the longest accepted prefix seen so far
                matchOutput = fst.outputs.add(pending, scratchArc.nextFinalOutput);
                matchEnd = end;
            }
        }
        if (matchEnd >= 0) {
            // splice the mapping's output in place of the matched span
            sb.replace(pos, matchEnd + 1, matchOutput.toString());
            // skip past the inserted replacement text
            pos += matchOutput.length - 1;
        }
    }
}
Also used : FST(org.apache.lucene.util.fst.FST) CharsRef(org.apache.lucene.util.CharsRef)

Example 3 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

In the class Stemmer, the method applyAffix:

/**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
   * @param strippedWord Word the affix has been removed and the strip added
   * @param length valid length of stripped word
   * @param affix HunspellAffix representing the affix rule itself
   * @param prefixFlag when we already stripped a prefix, we can't simply recurse and check the suffix, unless both are compatible
   *                   so we must check dictionary form against both to add it as a stem!
   * @param recursionDepth current recursion depth
   * @param prefix true if we are removing a prefix (false if it's a suffix)
   * @param circumfix true if the affix stripped so far carried the circumfix flag (must be paired on the other side)
   * @param caseVariant true if we are stemming a case-folded variant of the original word
   * @return List of stems for the word, or an empty list if none are found
   */
List<CharsRef> applyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
    // TODO: just pass this in from before, no need to decode it twice
    // Each affix is encoded as four 16-bit fields at offset 8*affix: flag, strip, condition, append.
    affixReader.setPosition(8 * affix);
    char flag = (char) (affixReader.readShort() & 0xffff);
    // strip
    affixReader.skipBytes(2);
    int condition = (char) (affixReader.readShort() & 0xffff);
    // low bit of the condition field is the cross-product marker; the rest is the condition index
    boolean crossProduct = (condition & 1) == 1;
    condition >>>= 1;
    char append = (char) (affixReader.readShort() & 0xffff);
    List<CharsRef> stems = new ArrayList<>();
    IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
    if (forms != null) {
        // each dictionary entry for the stripped word may be a valid stem; check its flags
        for (int i = 0; i < forms.length; i += formStep) {
            dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
            char[] wordFlags = Dictionary.decodeFlags(scratch);
            if (Dictionary.hasFlag(wordFlags, flag)) {
                // confusing: in this one exception, we already chained the first prefix against the second,
                // so it doesnt need to be checked against the word
                boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
                if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
                    // see if we can chain prefix thru the suffix continuation class (only if it has any!)
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    if (!hasCrossCheckedFlag((char) prefixFlag, appendFlags, false)) {
                        continue;
                    }
                }
                // if the affix carries the circumfix flag, the other side must too,
                // to ensure it has it, and vice versa
                if (dictionary.circumfix != -1) {
                    dictionary.flagLookup.get(append, scratch);
                    char[] appendFlags = Dictionary.decodeFlags(scratch);
                    boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
                    if (circumfix != suffixCircumfix) {
                        continue;
                    }
                }
                // we are looking for a case variant, but this word does not allow it
                if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
                    continue;
                }
                // we aren't decompounding (yet)
                if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
                    continue;
                }
                stems.add(newStem(strippedWord, length, forms, i));
            }
        }
    }
    // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
    if (dictionary.circumfix != -1 && !circumfix && prefix) {
        dictionary.flagLookup.get(append, scratch);
        char[] appendFlags = Dictionary.decodeFlags(scratch);
        circumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
    }
    // cross-product affixes may combine with further affixes; recurse at most twice
    if (crossProduct) {
        if (recursionDepth == 0) {
            if (prefix) {
                // we took away the first prefix.
                // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix 
                // COMPLEXPREFIXES = false: combine with a suffix
                stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
            } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
                // we took away a suffix.
                // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
                // COMPLEXPREFIXES = false: combine with another suffix
                stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
            }
        } else if (recursionDepth == 1) {
            if (prefix && dictionary.complexPrefixes) {
                // we took away the second prefix: go look for another suffix
                stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
            } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
                // we took away a prefix, then a suffix: go look for another suffix
                stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
            }
        }
    }
    return stems;
}
Also used : ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef) CharsRef(org.apache.lucene.util.CharsRef)

Example 4 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

In the class Stemmer, the method uniqueStems:

/**
   * Find the unique stem(s) of the provided word
   * 
   * @param word Word to find the stems for
   * @return List of stems for the word, with duplicates removed
   */
public List<CharsRef> uniqueStems(char[] word, int length) {
    List<CharsRef> allStems = stem(word, length);
    // zero or one stem cannot contain duplicates
    if (allStems.size() < 2) {
        return allStems;
    }
    // case-sensitivity of the dedupe follows the dictionary's ignoreCase setting
    CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
    List<CharsRef> unique = new ArrayList<>();
    for (CharsRef stem : allStems) {
        if (seen.contains(stem)) {
            continue;
        }
        unique.add(stem);
        seen.add(stem);
    }
    return unique;
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) ArrayList(java.util.ArrayList) CharsRef(org.apache.lucene.util.CharsRef)

Example 5 with CharsRef

use of org.apache.lucene.util.CharsRef in project lucene-solr by apache.

In the class Test64kAffixes, the method test:

public void test() throws Exception {
    Path tempDir = createTempDir("64kaffixes");
    Path affix = tempDir.resolve("64kaffixes.aff");
    Path dict = tempDir.resolve("64kaffixes.dic");
    // 65k affixes with flag 1, then an affix with flag 2
    try (BufferedWriter affixWriter = Files.newBufferedWriter(affix, StandardCharsets.UTF_8)) {
        affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
        for (int i = 0; i < 65536; i++) {
            affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
        }
        affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
    }
    // drink signed with affix 2 (takes -s)
    try (BufferedWriter dictWriter = Files.newBufferedWriter(dict, StandardCharsets.UTF_8)) {
        dictWriter.write("1\ndrink/2\n");
    }
    try (InputStream affStream = Files.newInputStream(affix);
        InputStream dictStream = Files.newInputStream(dict);
        Directory tempDir2 = newDirectory()) {
        Dictionary dictionary = new Dictionary(tempDir2, "dictionary", affStream, dictStream);
        Stemmer stemmer = new Stemmer(dictionary);
        // drinks should still stem to drink
        List<CharsRef> stems = stemmer.stem("drinks");
        assertEquals(1, stems.size());
        assertEquals("drink", stems.get(0).toString());
    }
}
Also used : Path(java.nio.file.Path) InputStream(java.io.InputStream) CharsRef(org.apache.lucene.util.CharsRef) BufferedWriter(java.io.BufferedWriter) Directory(org.apache.lucene.store.Directory)

Aggregations

CharsRef (org.apache.lucene.util.CharsRef)30 BytesRef (org.apache.lucene.util.BytesRef)9 ArrayList (java.util.ArrayList)7 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)6 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)6 HashMap (java.util.HashMap)5 StringReader (java.io.StringReader)4 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)4 SynonymFilter (org.apache.lucene.analysis.synonym.SynonymFilter)4 SynonymMap (org.apache.lucene.analysis.synonym.SynonymMap)4 SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap)4 SuggesterResult (org.apache.solr.spelling.suggest.SuggesterResult)4 HashSet (java.util.HashSet)3 Map (java.util.Map)3 TokenStream (org.apache.lucene.analysis.TokenStream)3 LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult)3 IntsRef (org.apache.lucene.util.IntsRef)3 Test (org.junit.Test)3 ParseException (java.text.ParseException)2