Use of org.apache.lucene.util.IntsRef in the lucene-solr project (Apache).
Class TestFuzzyQuery, method toIntsRef:
/**
 * Converts the given string to an {@link IntsRef} holding its Unicode code points.
 * The backing array is sized for the worst case (one int per UTF-16 unit).
 */
private static IntsRef toIntsRef(String s) {
  // worst case: every UTF-16 unit is its own code point
  IntsRef result = new IntsRef(s.length());
  int charIdx = 0;
  while (charIdx < s.length()) {
    int codePoint = Character.codePointAt(s, charIdx);
    result.ints[result.length++] = codePoint;
    // surrogate pairs occupy two UTF-16 units
    charIdx += Character.charCount(codePoint);
  }
  return result;
}
Use of org.apache.lucene.util.IntsRef in the lucene-solr project (Apache).
Class Stemmer, method applyAffix:
/**
 * Applies the affix rule to the given word, producing a list of stems if any are found.
 *
 * @param strippedWord word with the affix removed and the strip string added back
 * @param length valid length of strippedWord
 * @param affix id of the affix rule; used as an index into the packed affix data read via affixReader
 * @param prefixFlag when we already stripped a prefix, we can't simply recurse and check the suffix, unless both are compatible,
 * so we must check the dictionary form against both to add it as a stem!
 * @param recursionDepth current recursion depth
 * @param prefix true if we are removing a prefix (false if it's a suffix)
 * @param circumfix true if a circumfix flag was seen on an already-stripped affix; matched against this affix below
 * @param caseVariant true if we are stemming a case-folded variant of the original word — NOTE(review): inferred from the keepcase check below; confirm against caller
 * @return List of stems for the word, or an empty list if none are found
 * @throws IOException if reading affix/dictionary data fails
 */
List<CharsRef> applyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
  // TODO: just pass this in from before, no need to decode it twice
  // each encoded affix record is 8 bytes; seek to this rule's record
  affixReader.setPosition(8 * affix);
  char flag = (char) (affixReader.readShort() & 0xffff);
  // strip (unused here — skip over it)
  affixReader.skipBytes(2);
  int condition = (char) (affixReader.readShort() & 0xffff);
  // the low bit of the condition short encodes the cross-product flag
  boolean crossProduct = (condition & 1) == 1;
  condition >>>= 1;
  char append = (char) (affixReader.readShort() & 0xffff);
  List<CharsRef> stems = new ArrayList<>();
  IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
  if (forms != null) {
    // each dictionary entry may have several forms; step through them
    for (int i = 0; i < forms.length; i += formStep) {
      dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
      char[] wordFlags = Dictionary.decodeFlags(scratch);
      if (Dictionary.hasFlag(wordFlags, flag)) {
        // confusing: in this one exception, we already chained the first prefix against the second,
        // so it doesn't need to be checked against the word
        boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
        if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
          // see if we can chain prefix thru the suffix continuation class (only if it has any!)
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          if (!hasCrossCheckedFlag((char) prefixFlag, appendFlags, false)) {
            continue;
          }
        }
        // if a circumfix flag was seen on a previously stripped affix, this affix must carry it too,
        // to ensure it has it, and vice versa
        if (dictionary.circumfix != -1) {
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
          if (circumfix != suffixCircumfix) {
            continue;
          }
        }
        // we are looking for a case variant, but this word does not allow it (KEEPCASE)
        if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
          continue;
        }
        // we aren't decompounding (yet): skip entries only valid inside compounds
        if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
          continue;
        }
        stems.add(newStem(strippedWord, length, forms, i));
      }
    }
  }
  // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
  if (dictionary.circumfix != -1 && !circumfix && prefix) {
    dictionary.flagLookup.get(append, scratch);
    char[] appendFlags = Dictionary.decodeFlags(scratch);
    circumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
  }
  // cross-product affixes may combine with further affixes — recurse up to depth 2
  if (crossProduct) {
    if (recursionDepth == 0) {
      if (prefix) {
        // we took away the first prefix.
        // COMPLEXPREFIXES = true: combine with a second prefix and another suffix
        // COMPLEXPREFIXES = false: combine with a suffix
        stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
      } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
        // we took away a suffix.
        // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
        // COMPLEXPREFIXES = false: combine with another suffix
        stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
      }
    } else if (recursionDepth == 1) {
      if (prefix && dictionary.complexPrefixes) {
        // we took away the second prefix: go look for another suffix
        stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
      } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
        // we took away a prefix, then a suffix: go look for another suffix
        stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
      }
    }
  }
  return stems;
}
Use of org.apache.lucene.util.IntsRef in the lucene-solr project (Apache).
Class TestDictionary, method testCompressedEmptyAliasDictionary:
/**
 * Loads the compressed-empty-alias affix file with the compressed .dic file and
 * verifies suffix/prefix lookup counts and that the word "olr" decodes to a single flag.
 */
public void testCompressedEmptyAliasDictionary() throws Exception {
  // try-with-resources: the original closed these only on the success path,
  // leaking the streams and directory whenever an assertion failed
  try (InputStream affixStream = getClass().getResourceAsStream("compressed-empty-alias.aff");
       InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
       Directory tempDir = getDirectory()) {
    Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[] { 'e' }, 0, 1).length);
    assertEquals(1, dictionary.lookupPrefix(new char[] { 's' }, 0, 1).length);
    IntsRef ordList = dictionary.lookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
    // guard the ints[0] dereference below (the sibling simple-dictionary test does the same)
    assertNotNull(ordList);
    BytesRef ref = new BytesRef();
    dictionary.flagLookup.get(ordList.ints[0], ref);
    char[] flags = Dictionary.decodeFlags(ref);
    assertEquals(1, flags.length);
  }
}
Use of org.apache.lucene.util.IntsRef in the lucene-solr project (Apache).
Class TestDictionary, method testSimpleDictionary:
/**
 * Loads the simple affix/dictionary pair and verifies suffix/prefix lookup counts,
 * plus single-flag decoding for the words "olr" and "lucen".
 */
public void testSimpleDictionary() throws Exception {
  // try-with-resources: the original closed these only on the success path,
  // leaking the streams and directory whenever an assertion failed
  try (InputStream affixStream = getClass().getResourceAsStream("simple.aff");
       InputStream dictStream = getClass().getResourceAsStream("simple.dic");
       Directory tempDir = getDirectory()) {
    Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[] { 'e' }, 0, 1).length);
    assertEquals(1, dictionary.lookupPrefix(new char[] { 's' }, 0, 1).length);
    IntsRef ordList = dictionary.lookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
    assertNotNull(ordList);
    assertEquals(1, ordList.length);
    BytesRef ref = new BytesRef();
    dictionary.flagLookup.get(ordList.ints[0], ref);
    char[] flags = Dictionary.decodeFlags(ref);
    assertEquals(1, flags.length);
    ordList = dictionary.lookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5);
    assertNotNull(ordList);
    assertEquals(1, ordList.length);
    dictionary.flagLookup.get(ordList.ints[0], ref);
    flags = Dictionary.decodeFlags(ref);
    assertEquals(1, flags.length);
  }
}
Use of org.apache.lucene.util.IntsRef in the lucene-solr project (Apache).
Class TaxonomyFacetCounts, method count:
/**
 * Counts the facet ordinals of every matching document into {@code values},
 * then rolls the per-ordinal counts up the taxonomy via {@code rollup()}.
 */
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
  // reused across documents to avoid per-doc allocation
  IntsRef ordinals = new IntsRef();
  for (MatchingDocs hits : matchingDocs) {
    OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.getReader(hits.context);
    DocIdSetIterator it = hits.bits.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      segmentOrds.get(doc, ordinals);
      int end = ordinals.offset + ordinals.length;
      for (int pos = ordinals.offset; pos < end; pos++) {
        values[ordinals.ints[pos]]++;
      }
    }
  }
  rollup();
}
Aggregations