Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
The class AnalyzingSuggester, method lookup.
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
  assert num > 0;
  if (onlyMorePopular) {
    throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
  }
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  if (fst == null) {
    return Collections.emptyList();
  }
  //System.out.println("lookup key=" + key + " num=" + num);
  for (int i = 0; i < key.length(); i++) {
    if (key.charAt(i) == 0x1E) {
      throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
    }
    if (key.charAt(i) == 0x1F) {
      throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
    }
  }
  final BytesRef utf8Key = new BytesRef(key);
  try {
    Automaton lookupAutomaton = toLookupAutomaton(key);
    final CharsRefBuilder spare = new CharsRefBuilder();
    //System.out.println(" now intersect exactFirst=" + exactFirst);
    // Intersect automaton w/ suggest wFST and get all
    // prefix starting nodes & their outputs:
    //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
    //System.out.println(" prefixPaths: " + prefixPaths.size());
    BytesReader bytesReader = fst.getBytesReader();
    FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
    final List<LookupResult> results = new ArrayList<>();
    List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
    if (exactFirst) {
      int count = 0;
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has an END_BYTE arc leaving, meaning it's an
          // "exact" match:
          count++;
        }
      }
      // Searcher just to find the single exact only
      // match, if present:
      Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
      searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
      // ...:
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has an END_BYTE arc leaving, meaning it's an
          // "exact" match:
          searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
        }
      }
      TopResults<Pair<Long, BytesRef>> completions = searcher.search();
      assert completions.isComplete;
      // maxSurfaceFormsPerAnalyzedForm:
      for (Result<Pair<Long, BytesRef>> completion : completions) {
        BytesRef output2 = completion.output.output2;
        if (sameSurfaceForm(utf8Key, output2)) {
          results.add(getLookupResult(completion.output.output1, output2, spare));
          break;
        }
      }
      if (results.size() == num) {
        // That was quick:
        return results;
      }
    }
    Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
    searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

      private final Set<BytesRef> seen = new HashSet<>();

      @Override
      protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
        // Dedup: the analysis can produce duplicate surface forms:
        if (seen.contains(output.output2)) {
          return false;
        }
        seen.add(output.output2);
        if (!exactFirst) {
          return true;
        } else {
          // In exactFirst mode, don't create duplicate results:
          if (sameSurfaceForm(utf8Key, output.output2)) {
            // We should have already found it in the first search:
            assert results.size() == 1;
            return false;
          } else {
            return true;
          }
        }
      }
    };
    prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
    for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
      searcher.addStartPaths(path.fstNode, path.output, true, path.input);
    }
    TopResults<Pair<Long, BytesRef>> completions = searcher.search();
    assert completions.isComplete;
    for (Result<Pair<Long, BytesRef>> completion : completions) {
      LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
      // TODO: for fuzzy case would be nice to return
      // how many edits were required
      //System.out.println(" result=" + result);
      results.add(result);
      if (results.size() == num) {
        // In the exactFirst=true case the search may produce one extra path:
        break;
      }
    }
    return results;
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
}
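A minimal usage sketch, not taken from the project above: the suggester is built from an InputIterator of weighted surface forms, then lookup is called with onlyMorePopular=false and null contexts, matching the argument checks at the top of the method. The single-Analyzer constructor shown is the older form (newer Lucene versions also require a temp Directory and file-name prefix), and weightedTerms is an assumed, caller-supplied iterator.

import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;

List<LookupResult> suggest(InputIterator weightedTerms) throws IOException {
  Analyzer analyzer = new StandardAnalyzer();
  AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer); // older single-Analyzer constructor
  suggester.build(weightedTerms); // (surface form, weight) pairs
  // onlyMorePopular must be false and contexts must be null, per the checks above:
  return suggester.lookup("lucen", null, false, 5);
}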
Use of org.apache.lucene.util.fst.FST in project elasticsearch by elastic.
The class XAnalyzingSuggester, method lookup.
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
  assert num > 0;
  if (onlyMorePopular) {
    throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
  }
  if (fst == null) {
    return Collections.emptyList();
  }
  //System.out.println("lookup key=" + key + " num=" + num);
  for (int i = 0; i < key.length(); i++) {
    if (key.charAt(i) == holeCharacter) {
      throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
    }
    if (key.charAt(i) == sepLabel) {
      throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
    }
  }
  final BytesRef utf8Key = new BytesRef(key);
  try {
    Automaton lookupAutomaton = toLookupAutomaton(key);
    final CharsRefBuilder spare = new CharsRefBuilder();
    //System.out.println(" now intersect exactFirst=" + exactFirst);
    // Intersect automaton w/ suggest wFST and get all
    // prefix starting nodes & their outputs:
    //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
    //System.out.println(" prefixPaths: " + prefixPaths.size());
    BytesReader bytesReader = fst.getBytesReader();
    FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
    final List<LookupResult> results = new ArrayList<>();
    List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
    if (exactFirst) {
      int count = 0;
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has an endByte arc leaving, meaning it's an
          // "exact" match:
          count++;
        }
      }
      // Searcher just to find the single exact only
      // match, if present:
      Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
      searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
      // ...:
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has an endByte arc leaving, meaning it's an
          // "exact" match:
          searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
        }
      }
      Util.TopResults<Pair<Long, BytesRef>> completions = searcher.search();
      // maxSurfaceFormsPerAnalyzedForm:
      for (Result<Pair<Long, BytesRef>> completion : completions) {
        BytesRef output2 = completion.output.output2;
        if (sameSurfaceForm(utf8Key, output2)) {
          results.add(getLookupResult(completion.output.output1, output2, spare));
          break;
        }
      }
      if (results.size() == num) {
        // That was quick:
        return results;
      }
    }
    Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
    searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

      private final Set<BytesRef> seen = new HashSet<>();

      @Override
      protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
        // Dedup: the analysis can produce duplicate surface forms:
        if (seen.contains(output.output2)) {
          return false;
        }
        seen.add(output.output2);
        if (!exactFirst) {
          return true;
        } else {
          // In exactFirst mode, don't create duplicate results:
          if (sameSurfaceForm(utf8Key, output.output2)) {
            // We should have already found it in the first search:
            assert results.size() == 1;
            return false;
          } else {
            return true;
          }
        }
      }
    };
    prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
    for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
      searcher.addStartPaths(path.fstNode, path.output, true, path.input);
    }
    TopResults<Pair<Long, BytesRef>> completions = searcher.search();
    for (Result<Pair<Long, BytesRef>> completion : completions) {
      LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
      // TODO: for fuzzy case would be nice to return
      // how many edits were required
      //System.out.println(" result=" + result);
      results.add(result);
      if (results.size() == num) {
        // In the exactFirst=true case the search may produce one extra path:
        break;
      }
    }
    return results;
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
}
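This Elasticsearch fork follows the Lucene implementation above almost line for line. The visible differences: the reserved labels are instance fields (holeCharacter, sepLabel, endByte) rather than the hard-coded 0x1E/0x1F/END_BYTE constants, the contexts argument is not rejected, and the isComplete assertions after the two searches are dropped. Note that the exception messages still name U+001E and U+001F even though the labels are configurable.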
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
The class FSTTermsReader, method walk.
static <T> void walk(FST<T> fst) throws IOException {
  final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
  queue.add(startArc);
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.remove(0);
    final long node = arc.target;
    //System.out.println(arc);
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        } else {
          fst.readNextRealArc(arc, reader);
        }
      }
    }
  }
}
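A hedged sketch of driving walk: build a tiny FST with the pre-9.0 Builder API (renamed FSTCompiler in later Lucene versions) and traverse it. The breadth-first loop above dedups target nodes through the BitSet, so nodes shared by common suffixes are expanded only once.

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

static FST<Long> buildTinyFst() throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);  // inputs must be added in sorted order
  builder.add(Util.toIntsRef(new BytesRef("cats"), scratch), 7L);
  return builder.finish();
}
// then: walk(buildTinyFst()); // prints/visits every real arc breadth-first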
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
The class Dictionary, method lookup.
IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
  if (fst == null) {
    return null;
  }
  final FST.BytesReader bytesReader = fst.getBytesReader();
  final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
  // Accumulate output as we go
  final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
  IntsRef output = NO_OUTPUT;
  int l = offset + length;
  try {
    for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
      cp = Character.codePointAt(word, i, l);
      if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        output = fst.outputs.add(output, arc.output);
      }
    }
    if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
      return null;
    } else if (arc.output != NO_OUTPUT) {
      return fst.outputs.add(output, arc.output);
    } else {
      return output;
    }
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
}
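This hand-rolls, over code points, the same exact-match traversal that the generic helper org.apache.lucene.util.fst.Util.get performs: follow one arc per input label, sum the outputs, and finish with the END_LABEL arc. A hedged equivalent for comparison (assuming the same FST<IntsRef>):

import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;

static IntsRef lookupViaUtil(FST<IntsRef> fst, char[] word, int offset, int length) throws IOException {
  IntsRefBuilder codePoints = new IntsRefBuilder();
  int end = offset + length;
  for (int i = offset, cp = 0; i < end; i += Character.charCount(cp)) {
    cp = Character.codePointAt(word, i, end);
    codePoints.append(cp);
  }
  return Util.get(fst, codePoints.get()); // null if the word is absent
}

The hand-rolled version avoids materializing an IntsRef per lookup and reuses a cached BytesReader, which matters on Hunspell's hot path.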
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
The class Stemmer, method stem.
/**
 * Generates a list of stems for the provided word.
 *
 * @param word word to generate the stems for
 * @param length length of the word
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix it is also checked against the word
 * @param recursionDepth current recursion depth
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix and it has no continuation requirements, that's fine,
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse
 * @param circumfix true if the previous prefix removal was flagged as a circumfix;
 *        this means the innermost suffix must also carry the circumfix flag
 * @param caseVariant true if we are searching for a case variant; if the word has the KEEPCASE flag it cannot succeed
 * @return list of stems, or an empty list if no stems are found
 */
private List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
  // TODO: allow this stuff to be reused by tokenfilter
  List<CharsRef> stems = new ArrayList<>();
  if (doPrefix && dictionary.prefixes != null) {
    FST<IntsRef> fst = dictionary.prefixes;
    Outputs<IntsRef> outputs = fst.outputs;
    FST.BytesReader bytesReader = prefixReaders[recursionDepth];
    FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
    fst.getFirstArc(arc);
    IntsRef NO_OUTPUT = outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;
    int limit = dictionary.fullStrip ? length : length - 1;
    for (int i = 0; i < limit; i++) {
      if (i > 0) {
        int ch = word[i - 1];
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else if (arc.output != NO_OUTPUT) {
          output = fst.outputs.add(output, arc.output);
        }
      }
      IntsRef prefixes = null;
      if (!arc.isFinal()) {
        continue;
      } else {
        prefixes = fst.outputs.add(output, arc.nextFinalOutput);
      }
      for (int j = 0; j < prefixes.length; j++) {
        int prefix = prefixes.ints[prefixes.offset + j];
        if (prefix == previous) {
          continue;
        }
        affixReader.setPosition(8 * prefix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        char stripOrd = (char) (affixReader.readShort() & 0xffff);
        int condition = (char) (affixReader.readShort() & 0xffff);
        boolean crossProduct = (condition & 1) == 1;
        condition >>>= 1;
        char append = (char) (affixReader.readShort() & 0xffff);
        final boolean compatible;
        if (recursionDepth == 0) {
          if (dictionary.onlyincompound == -1) {
            compatible = true;
          } else {
            // check if affix is allowed in a non-compound word
            dictionary.flagLookup.get(append, scratch);
            char[] appendFlags = Dictionary.decodeFlags(scratch);
            compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          }
        } else if (crossProduct) {
          // cross-check incoming continuation class (flag of previous affix) against list.
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          assert prevFlag >= 0;
          boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
        } else {
          compatible = false;
        }
        if (compatible) {
          int deAffixedStart = i;
          int deAffixedLength = length - deAffixedStart;
          int stripStart = dictionary.stripOffsets[stripOrd];
          int stripEnd = dictionary.stripOffsets[stripOrd + 1];
          int stripLength = stripEnd - stripStart;
          if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
            continue;
          }
          char[] strippedWord = new char[stripLength + deAffixedLength];
          System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
          System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
          List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
          stems.addAll(stemList);
        }
      }
    }
  }
  if (doSuffix && dictionary.suffixes != null) {
    FST<IntsRef> fst = dictionary.suffixes;
    Outputs<IntsRef> outputs = fst.outputs;
    FST.BytesReader bytesReader = suffixReaders[recursionDepth];
    FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
    fst.getFirstArc(arc);
    IntsRef NO_OUTPUT = outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;
    int limit = dictionary.fullStrip ? 0 : 1;
    for (int i = length; i >= limit; i--) {
      if (i < length) {
        int ch = word[i];
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else if (arc.output != NO_OUTPUT) {
          output = fst.outputs.add(output, arc.output);
        }
      }
      IntsRef suffixes = null;
      if (!arc.isFinal()) {
        continue;
      } else {
        suffixes = fst.outputs.add(output, arc.nextFinalOutput);
      }
      for (int j = 0; j < suffixes.length; j++) {
        int suffix = suffixes.ints[suffixes.offset + j];
        if (suffix == previous) {
          continue;
        }
        affixReader.setPosition(8 * suffix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        char stripOrd = (char) (affixReader.readShort() & 0xffff);
        int condition = (char) (affixReader.readShort() & 0xffff);
        boolean crossProduct = (condition & 1) == 1;
        condition >>>= 1;
        char append = (char) (affixReader.readShort() & 0xffff);
        final boolean compatible;
        if (recursionDepth == 0) {
          if (dictionary.onlyincompound == -1) {
            compatible = true;
          } else {
            // check if affix is allowed in a non-compound word
            dictionary.flagLookup.get(append, scratch);
            char[] appendFlags = Dictionary.decodeFlags(scratch);
            compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          }
        } else if (crossProduct) {
          // cross-check incoming continuation class (flag of previous affix) against list.
          dictionary.flagLookup.get(append, scratch);
          char[] appendFlags = Dictionary.decodeFlags(scratch);
          assert prevFlag >= 0;
          boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
          compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
        } else {
          compatible = false;
        }
        if (compatible) {
          int appendLength = length - i;
          int deAffixedLength = length - appendLength;
          int stripStart = dictionary.stripOffsets[stripOrd];
          int stripEnd = dictionary.stripOffsets[stripOrd + 1];
          int stripLength = stripEnd - stripStart;
          if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
            continue;
          }
          char[] strippedWord = new char[stripLength + deAffixedLength];
          System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
          System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
          List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
          stems.addAll(stemList);
        }
      }
    }
  }
  return stems;
}
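The affixReader.setPosition(8 * prefix) and setPosition(8 * suffix) calls above rely on the affix encoding that both branches decode identically: each affix is a fixed 8-byte record of four 16-bit fields, so the affix ordinal times 8 is its byte offset. A commented restatement of the decode (same reads as above; affixOrdinal and the labels are descriptive only):

affixReader.setPosition(8 * affixOrdinal);                  // fixed-width 8-byte records
char flag      = (char) (affixReader.readShort() & 0xffff); // affix flag, cross-checked during recursion
char stripOrd  = (char) (affixReader.readShort() & 0xffff); // index into dictionary.stripOffsets
int  condition = (char) (affixReader.readShort() & 0xffff); // low bit = crossProduct, rest = condition ordinal
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append    = (char) (affixReader.readShort() & 0xffff); // ord of the continuation flags in flagLookup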