Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
The class GraphTokenStreamFiniteStrings, method getFiniteStrings:
/**
 * Get all finite strings that start at {@code startState} and end at {@code endState}.
 */
public Iterator<TokenStream> getFiniteStrings(int startState, int endState) throws IOException {
  final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
  return new Iterator<TokenStream>() {
    IntsRef current;
    boolean finished = false;

    @Override
    public boolean hasNext() {
      if (finished == false && current == null) {
        current = it.next();
        if (current == null) {
          finished = true;
        }
      }
      return current != null;
    }

    @Override
    public TokenStream next() {
      if (current == null) {
        hasNext();
      }
      TokenStream next = new FiniteStringsTokenStream(current);
      current = null;
      return next;
    }
  };
}
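A minimal usage sketch, hedged: it assumes GraphTokenStreamFiniteStrings has a TokenStream constructor and a no-argument getFiniteStrings() overload spanning the whole automaton, and `analyzer`, "field", and the input text are placeholders.

// Hedged usage sketch: enumerate every path through a token graph, printing
// the terms along each path. Names marked as placeholders are assumptions.
try (TokenStream in = analyzer.tokenStream("field", "wi fi network")) {
  Iterator<TokenStream> paths = new GraphTokenStreamFiniteStrings(in).getFiniteStrings();
  while (paths.hasNext()) {
    try (TokenStream path = paths.next()) {
      path.reset();
      CharTermAttribute term = path.addAttribute(CharTermAttribute.class);
      while (path.incrementToken()) {
        System.out.print(term + " ");
      }
      path.end();
      System.out.println();
    }
  }
}

Note that the iterator caches the next string in hasNext(), so calling hasNext() repeatedly is safe; next() consumes the cached value.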
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
The class TestFuzzyQuery, method getDistance:
// Poached from LuceneLevenshteinDistance.java (from the suggest module): it supports transpositions (treats them as ed=1, not ed=2)
private static int getDistance(String target, String other) {
  IntsRef targetPoints;
  IntsRef otherPoints;
  int n;
  // cost array
  int[][] d;
  // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to
  // what LevenshteinDistance does, except cycling through a ring of three
  // horizontal cost arrays... but this comparator is never actually used by
  // DirectSpellChecker, it's only used for merging results from multiple shards
  // in "distributed spellcheck", and it's inefficient in other ways too...
  // cheaper to do this up front once
  targetPoints = toIntsRef(target);
  otherPoints = toIntsRef(other);
  n = targetPoints.length;
  final int m = otherPoints.length;
  d = new int[n + 1][m + 1];
  if (n == 0 || m == 0) {
    if (n == m) {
      return 0;
    } else {
      return Math.max(n, m);
    }
  }
  // i iterates through target, j iterates through other
  int i;
  int j;
  // jth code point of other
  int t_j;
  // cost
  int cost;
  for (i = 0; i <= n; i++) {
    d[i][0] = i;
  }
  for (j = 0; j <= m; j++) {
    d[0][j] = j;
  }
  for (j = 1; j <= m; j++) {
    t_j = otherPoints.ints[j - 1];
    for (i = 1; i <= n; i++) {
      cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
      // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
      d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
      // transposition
      if (i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2] && targetPoints.ints[i - 2] == otherPoints.ints[j - 1]) {
        d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
      }
    }
  }
  return d[n][m];
}
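The toIntsRef helper referenced above is not shown; it converts a String into its Unicode code points so a surrogate pair counts as one edit unit. A minimal sketch of such a helper (the actual private method in TestFuzzyQuery may differ in detail):

// Hedged sketch of a toIntsRef helper: one int per Unicode code point.
private static IntsRef toIntsRef(String s) {
  IntsRef ref = new IntsRef(s.length()); // worst case: every char is a BMP code point
  int utf16Len = s.length();
  for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
    cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
  }
  return ref;
}

Because transpositions count as a single edit, getDistance("ab", "ba") returns 1 where classic Levenshtein distance would return 2.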
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
The class Stemmer, method doStem:
private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
  List<CharsRef> stems = new ArrayList<>();
  IntsRef forms = dictionary.lookupWord(word, 0, length);
  if (forms != null) {
    for (int i = 0; i < forms.length; i += formStep) {
      boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
      boolean checkNeedAffix = dictionary.needaffix != -1;
      boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
      if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
        dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
        char[] wordFlags = Dictionary.decodeFlags(scratch);
        // we are looking for a case variant, but this word does not allow it
        if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
          continue;
        }
        // we can't add this form, it's a pseudostem requiring an affix
        if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
          continue;
        }
        // we can't add this form, it only belongs inside a compound word
        if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
          continue;
        }
      }
      stems.add(newStem(word, length, forms, i));
    }
  }
  try {
    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
  return stems;
}
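The loop above steps through lookupWord's result in strides of formStep: each dictionary entry for the word (there can be several homonyms) occupies formStep ints, and the int at forms.offset + i is the ord of that entry's flag set in flagLookup. An illustrative sketch of the stride (the helper name is hypothetical, not part of Stemmer):

// Hypothetical helper, for illustration only: visit the leading int (the
// flag-set ord) of each entry in a forms IntsRef, formStep ints per entry.
static void forEachFlagOrd(IntsRef forms, int formStep, java.util.function.IntConsumer action) {
  for (int i = 0; i < forms.length; i += formStep) {
    action.accept(forms.ints[forms.offset + i]);
  }
}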
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
The class Stemmer, method stem:
/**
 * Generates a list of stems for the provided word.
 *
 * @param word word to generate the stems for
 * @param length length of the word
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix it is also checked against the word
 * @param recursionDepth current recursion depth
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix and it has no continuation requirements, that's ok,
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
 * @param circumfix true if the previous prefix removal was flagged as a circumfix;
 *        this means the innermost suffix must also carry the circumfix flag.
 * @param caseVariant true if we are searching for a case variant; if the word has the KEEPCASE flag it cannot succeed.
 * @return list of stems, or an empty list if no stems are found
 */
private List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
  // TODO: allow this stuff to be reused by tokenfilter
  List<CharsRef> stems = new ArrayList<>();
  if (doPrefix && dictionary.prefixes != null) {
    FST<IntsRef> fst = dictionary.prefixes;
    Outputs<IntsRef> outputs = fst.outputs;
    FST.BytesReader bytesReader = prefixReaders[recursionDepth];
    FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
    fst.getFirstArc(arc);
    IntsRef NO_OUTPUT = outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;
    int limit = dictionary.fullStrip ? length : length - 1;
    for (int i = 0; i < limit; i++) {
      if (i > 0) {
        int ch = word[i - 1];
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else if (arc.output != NO_OUTPUT) {
          output = fst.outputs.add(output, arc.output);
        }
      }
      IntsRef prefixes = null;
      if (!arc.isFinal()) {
        continue;
      } else {
        prefixes = fst.outputs.add(output, arc.nextFinalOutput);
      }
      for (int j = 0; j < prefixes.length; j++) {
        int prefix = prefixes.ints[prefixes.offset + j];
        if (prefix == previous) {
          continue;
        }
        affixReader.setPosition(8 * prefix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        char stripOrd = (char) (affixReader.readShort() & 0xffff);
        int condition = (char) (affixReader.readShort() & 0xffff);
        boolean crossProduct = (condition & 1) == 1;
        condition >>>= 1;
        char append = (char) (affixReader.readShort() & 0xffff);
        final boolean compatible;
        if (recursionDepth == 0) {
          if (dictionary.onlyincompound == -1) {
            compatible = true;
          } else {
            // check if affix is allowed in a non-compound word
            dictionary.flagLookup.get(append, scratch);
            char[] appendFlags = Dictionary.decodeFlags(scratch);
            compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          }
        } else if (crossProduct) {
          // cross check incoming continuation class (flag of previous affix) against list.
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          assert prevFlag >= 0;
          boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
        } else {
          compatible = false;
        }
        if (compatible) {
          int deAffixedStart = i;
          int deAffixedLength = length - deAffixedStart;
          int stripStart = dictionary.stripOffsets[stripOrd];
          int stripEnd = dictionary.stripOffsets[stripOrd + 1];
          int stripLength = stripEnd - stripStart;
          if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
            continue;
          }
          char[] strippedWord = new char[stripLength + deAffixedLength];
          System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
          System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
          List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
          stems.addAll(stemList);
        }
      }
    }
  }
  if (doSuffix && dictionary.suffixes != null) {
    FST<IntsRef> fst = dictionary.suffixes;
    Outputs<IntsRef> outputs = fst.outputs;
    FST.BytesReader bytesReader = suffixReaders[recursionDepth];
    FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
    fst.getFirstArc(arc);
    IntsRef NO_OUTPUT = outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;
    int limit = dictionary.fullStrip ? 0 : 1;
    for (int i = length; i >= limit; i--) {
      if (i < length) {
        int ch = word[i];
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else if (arc.output != NO_OUTPUT) {
          output = fst.outputs.add(output, arc.output);
        }
      }
      IntsRef suffixes = null;
      if (!arc.isFinal()) {
        continue;
      } else {
        suffixes = fst.outputs.add(output, arc.nextFinalOutput);
      }
      for (int j = 0; j < suffixes.length; j++) {
        int suffix = suffixes.ints[suffixes.offset + j];
        if (suffix == previous) {
          continue;
        }
        affixReader.setPosition(8 * suffix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        char stripOrd = (char) (affixReader.readShort() & 0xffff);
        int condition = (char) (affixReader.readShort() & 0xffff);
        boolean crossProduct = (condition & 1) == 1;
        condition >>>= 1;
        char append = (char) (affixReader.readShort() & 0xffff);
        final boolean compatible;
        if (recursionDepth == 0) {
          if (dictionary.onlyincompound == -1) {
            compatible = true;
          } else {
            // check if affix is allowed in a non-compound word
            dictionary.flagLookup.get(append, scratch);
            char[] appendFlags = Dictionary.decodeFlags(scratch);
            compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          }
        } else if (crossProduct) {
          // cross check incoming continuation class (flag of previous affix) against list.
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          assert prevFlag >= 0;
          boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
        } else {
          compatible = false;
        }
        if (compatible) {
          int appendLength = length - i;
          int deAffixedLength = length - appendLength;
          int stripStart = dictionary.stripOffsets[stripOrd];
          int stripEnd = dictionary.stripOffsets[stripOrd + 1];
          int stripLength = stripEnd - stripStart;
          if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
            continue;
          }
          char[] strippedWord = new char[stripLength + deAffixedLength];
          System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
          System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
          List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
          stems.addAll(stemList);
        }
      }
    }
  }
  return stems;
}
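The affixReader.setPosition(8 * prefix) and setPosition(8 * suffix) calls work because each affix is encoded as a fixed 8-byte record of four shorts. A minimal decoding sketch, mirroring the reads above; it assumes affixReader is a Lucene ByteArrayDataInput over the dictionary's affix bytes, and affixData and affixOrd are placeholder names:

// Hedged sketch: decode one 8-byte affix record. Field meanings are taken
// from the reads in stem() above, not from a separate spec.
ByteArrayDataInput affixReader = new ByteArrayDataInput(affixData); // affixData: assumed byte[]
affixReader.setPosition(8 * affixOrd);                              // records are 8 bytes apart
char flag = (char) (affixReader.readShort() & 0xffff);              // the affix's own flag
char stripOrd = (char) (affixReader.readShort() & 0xffff);          // index into stripOffsets
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;                        // low bit: cross-product allowed
condition >>>= 1;                                                   // remaining bits: condition pattern ord
char append = (char) (affixReader.readShort() & 0xffff);            // ord of the append flag set in flagLookup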
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.
The class TestAutomaton, method assertSame:
private void assertSame(Collection<BytesRef> terms, Automaton a) {
  try {
    assertTrue(Operations.isFinite(a));
    assertFalse(Operations.isTotal(a));
    Automaton detA = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
    // Make sure all terms are accepted:
    IntsRefBuilder scratch = new IntsRefBuilder();
    for (BytesRef term : terms) {
      Util.toIntsRef(term, scratch);
      assertTrue("failed to accept term=" + term.utf8ToString(), Operations.run(detA, term.utf8ToString()));
    }
    // Use getFiniteStrings:
    Set<IntsRef> expected = new HashSet<>();
    for (BytesRef term : terms) {
      IntsRefBuilder intsRef = new IntsRefBuilder();
      Util.toUTF32(term.utf8ToString(), intsRef);
      expected.add(intsRef.toIntsRef());
    }
    Set<IntsRef> actual = TestOperations.getFiniteStrings(a);
    if (expected.equals(actual) == false) {
      System.out.println("FAILED:");
      for (IntsRef term : expected) {
        if (actual.contains(term) == false) {
          System.out.println("  term=" + term + " should be accepted but isn't");
        }
      }
      for (IntsRef term : actual) {
        if (expected.contains(term) == false) {
          System.out.println("  term=" + term + " is accepted but should not be");
        }
      }
      throw new AssertionError("mismatch");
    }
    // Use sameLanguage:
    Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE));
    assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE))));
    // Do same check, in UTF8 space
    Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a));
    Set<IntsRef> expected2 = new HashSet<>();
    for (BytesRef term : terms) {
      IntsRefBuilder intsRef = new IntsRefBuilder();
      Util.toIntsRef(term, intsRef);
      expected2.add(intsRef.toIntsRef());
    }
    assertEquals(expected2, TestOperations.getFiniteStrings(utf8));
  } catch (AssertionError ae) {
    System.out.println("TEST: FAILED: not same");
    System.out.println("  terms (count=" + terms.size() + "):");
    for (BytesRef term : terms) {
      System.out.println("    " + term);
    }
    System.out.println("  automaton:");
    System.out.println(a.toDot());
    //a.writeDot("fail");
    throw ae;
  }
}
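The test builds two expected sets because the two conversions used above encode a term differently: Util.toUTF32 produces one int per Unicode code point, while Util.toIntsRef widens each UTF-8 byte to an int. A minimal sketch of the difference (the term value is arbitrary):

// Hedged sketch: compare the UTF-32 and UTF-8 int encodings of one term.
BytesRef term = new BytesRef("héllo");
IntsRefBuilder utf32 = new IntsRefBuilder();
Util.toUTF32(term.utf8ToString(), utf32); // 5 ints: one per code point
IntsRefBuilder utf8 = new IntsRefBuilder();
Util.toIntsRef(term, utf8);               // 6 ints: 'é' takes two UTF-8 bytes
System.out.println(utf32.length() + " code points vs " + utf8.length() + " bytes");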