Use of org.apache.lucene.util.CharsRef in project lucene-solr by Apache: class BaseSynonymParserTestCase, method assertEntryAbsent.
/**
 * Validates that there are no synonyms for the given word.
 *
 * @param synonymMap the generated synonym map after parsing
 * @param word word (phrase) we are validating the synonyms for. Should be the value that comes
 *     out of the analyzer. All spaces will be replaced by word separators.
 * @throws IOException if reading the underlying FST fails
 */
public static void assertEntryAbsent(SynonymMap synonymMap, String word) throws IOException {
  // Multi-word phrases are stored in the FST with WORD_SEPARATOR between tokens.
  word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  // A null lookup result means the phrase has no entry in the synonym FST at all.
  BytesRef value = Util.get(synonymMap.fst, Util.toUTF32(new CharsRef(word), new IntsRefBuilder()));
  assertNull("There should be no synonyms for: " + word, value);
}
Use of org.apache.lucene.util.CharsRef in project lucene-solr by Apache: class Dictionary, method applyMappings.
// TODO: this could be more efficient!
/**
 * Applies the character-mapping FST to {@code sb} in place, greedily replacing the
 * longest matching input sequence at each position with its mapped output.
 */
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<CharsRef> rootArc = fst.getFirstArc(new FST.Arc<CharsRef>());
  final CharsRef noOutput = fst.outputs.getNoOutput();
  // reused for every arc traversal
  final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
  for (int start = 0; start < sb.length(); start++) {
    scratchArc.copyFrom(rootArc);
    CharsRef pending = noOutput;
    int matchEnd = -1;          // index of last char of the longest match, -1 if none
    CharsRef matchOutput = null; // output for that longest match
    // Walk the FST as far as the input allows, remembering the last final state seen.
    for (int pos = start; pos < sb.length(); pos++) {
      if (fst.findTargetArc(sb.charAt(pos), scratchArc, scratchArc, reader) == null) {
        break;
      }
      pending = fst.outputs.add(pending, scratchArc.output);
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pending, scratchArc.nextFinalOutput);
        matchEnd = pos;
      }
    }
    if (matchEnd >= 0) {
      // Splice the mapped output over the matched span, then skip past it.
      sb.replace(start, matchEnd + 1, matchOutput.toString());
      start += matchOutput.length - 1;
    }
  }
}
Use of org.apache.lucene.util.CharsRef in project lucene-solr by Apache: class Stemmer, method applyAffix.
/**
 * Applies the affix rule to the given word, producing a list of stems if any are found.
 *
 * @param strippedWord word with the affix removed and the strip string added back
 * @param length valid length of the stripped word
 * @param affix index of the HunspellAffix rule itself (used to seek into {@code affixReader})
 * @param prefixFlag when we already stripped a prefix, we can't simply recurse and check the
 *        suffix, unless both are compatible, so we must check the dictionary form against both
 *        to add it as a stem! ({@code -1} if no prefix was stripped)
 * @param recursionDepth current recursion depth (0 on the first affix strip)
 * @param prefix true if we are removing a prefix (false if it's a suffix)
 * @param circumfix true if a circumfix flag has already been seen on a stripped prefix
 * @param caseVariant true if we are analyzing a case-folded variant of the original word
 * @return list of stems for the word, or an empty list if none are found
 */
List<CharsRef> applyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
// TODO: just pass this in from before, no need to decode it twice
// Each encoded affix entry is 8 bytes: flag, strip, condition, append (2 bytes each).
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
// strip (not needed here, already applied by the caller)
affixReader.skipBytes(2);
// Low bit of the condition short carries the cross-product flag; the rest is the condition id.
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
// NOTE(review): `condition` is not read again after this point in the visible code.
char append = (char) (affixReader.readShort() & 0xffff);
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
if (forms != null) {
// Check every dictionary entry matching the stripped word.
for (int i = 0; i < forms.length; i += formStep) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
// The entry must carry this affix rule's flag to be a candidate stem.
if (Dictionary.hasFlag(wordFlags, flag)) {
// confusing: in this one exception, we already chained the first prefix against the second,
// so it doesnt need to be checked against the word
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
// see if we can chain prefix thru the suffix continuation class (only if it has any!)
dictionary.flagLookup.get(append, scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
if (!hasCrossCheckedFlag((char) prefixFlag, appendFlags, false)) {
continue;
}
}
// if the dictionary defines a circumfix flag, a circumfixed prefix requires a suffix that
// also carries the flag — and vice versa — so reject entries where the two sides disagree.
if (dictionary.circumfix != -1) {
dictionary.flagLookup.get(append, scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
if (circumfix != suffixCircumfix) {
continue;
}
}
// we are looking for a case variant, but this word does not allow it (KEEPCASE)
if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
continue;
}
// we aren't decompounding (yet): ONLYINCOMPOUND entries never stand alone
if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(strippedWord, length, forms, i));
}
}
}
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
if (dictionary.circumfix != -1 && !circumfix && prefix) {
dictionary.flagLookup.get(append, scratch);
char[] appendFlags = Dictionary.decodeFlags(scratch);
circumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
}
// Cross-product rules may combine with further affixes; which combinations are legal
// depends on how deep we already are and on the COMPLEXPREFIXES setting.
if (crossProduct) {
if (recursionDepth == 0) {
if (prefix) {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
} else if (recursionDepth == 1) {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
}
}
return stems;
}
Use of org.apache.lucene.util.CharsRef in project lucene-solr by Apache: class Stemmer, method uniqueStems.
/**
 * Finds the unique stem(s) of the provided word.
 *
 * @param word word to find the stems for
 * @param length valid length of {@code word}
 * @return deduplicated list of stems, in first-seen order
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  List<CharsRef> stems = stem(word, length);
  // Zero or one stem cannot contain duplicates; return as-is.
  if (stems.size() < 2) {
    return stems;
  }
  // Track stems we've already emitted; honors the dictionary's case sensitivity.
  CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> unique = new ArrayList<>();
  for (CharsRef candidate : stems) {
    if (seen.contains(candidate)) {
      continue;
    }
    seen.add(candidate);
    unique.add(candidate);
  }
  return unique;
}
Use of org.apache.lucene.util.CharsRef in project lucene-solr by Apache: class Test64kAffixes, method test.
/** Verifies stemming still works when a dictionary defines more than 64k affix rules. */
public void test() throws Exception {
  Path tempDir = createTempDir("64kaffixes");
  Path affixFile = tempDir.resolve("64kaffixes.aff");
  Path dictFile = tempDir.resolve("64kaffixes.dic");
  // 65k affixes with flag 1, then an affix with flag 2
  try (BufferedWriter writer = Files.newBufferedWriter(affixFile, StandardCharsets.UTF_8)) {
    writer.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
    for (int suffix = 0; suffix < 65536; suffix++) {
      writer.write("SFX 1 0 " + Integer.toHexString(suffix) + " .\n");
    }
    writer.write("SFX 2 Y 1\nSFX 2 0 s\n");
  }
  // drink signed with affix 2 (takes -s)
  try (BufferedWriter writer = Files.newBufferedWriter(dictFile, StandardCharsets.UTF_8)) {
    writer.write("1\ndrink/2\n");
  }
  try (InputStream affStream = Files.newInputStream(affixFile);
      InputStream dictStream = Files.newInputStream(dictFile);
      Directory tempDir2 = newDirectory()) {
    Dictionary dictionary = new Dictionary(tempDir2, "dictionary", affStream, dictStream);
    Stemmer stemmer = new Stemmer(dictionary);
    // drinks should still stem to drink
    List<CharsRef> stems = stemmer.stem("drinks");
    assertEquals(1, stems.size());
    assertEquals("drink", stems.get(0).toString());
  }
}
Aggregations