Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
From the class FSTUtil, method intersectPrefixPaths:
/**
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
* accumulating the FST end node and output for each path.
*/
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
assert a.isDeterministic();
final List<Path<T>> queue = new ArrayList<>();
final List<Path<T>> endNodes = new ArrayList<>();
if (a.getNumStates() == 0) {
return endNodes;
}
queue.add(new Path<>(0, fst.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(), new IntsRefBuilder()));
final FST.Arc<T> scratchArc = new FST.Arc<>();
final FST.BytesReader fstReader = fst.getBytesReader();
Transition t = new Transition();
while (queue.size() != 0) {
final Path<T> path = queue.remove(queue.size() - 1);
if (a.isAccept(path.state)) {
endNodes.add(path);
// we accept all further paths too
continue;
}
IntsRefBuilder currentInput = path.input;
int count = a.initTransition(path.state, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
final int min = t.min;
final int max = t.max;
if (min == max) {
final FST.Arc<T> nextArc = fst.findTargetArc(t.min, path.fstNode, scratchArc, fstReader);
if (nextArc != null) {
final IntsRefBuilder newInput = new IntsRefBuilder();
newInput.copyInts(currentInput.get());
newInput.append(t.min);
queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
}
} else {
// TODO: if this transition's TO state is accepting, and
// it accepts the entire range possible in the FST (ie. 0 to 255),
// we can simply use the prefix as the accepted state instead of
// looking up all the ranges and terminate early
// here. This just shifts the work from one queue
// (this one) to another (the completion search
// done in AnalyzingSuggester).
FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
while (nextArc != null && nextArc.label <= max) {
assert nextArc.label <= max;
assert nextArc.label >= min : nextArc.label + " " + min;
final IntsRefBuilder newInput = new IntsRefBuilder();
newInput.copyInts(currentInput.get());
newInput.append(nextArc.label);
queue.add(new Path<>(t.dest, new FST.Arc<T>().copyFrom(nextArc), fst.outputs.add(path.output, nextArc.output), newInput));
// used in assert
final int label = nextArc.label;
nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc, fstReader);
assert nextArc == null || label < nextArc.label : "last: " + label + " next: " + nextArc.label;
}
}
}
}
return endNodes;
}
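For context, here is a minimal, hypothetical driver showing how intersectPrefixPaths can be exercised outside the suggester. It builds a tiny two-key FST with the lucene-solr-era Builder API, intersects it with a deterministic prefix automaton, and prints the single prefix path found. The IntersectPrefixPathsDemo class, keys, and weights are invented for illustration; the sketch assumes ASCII input (so automaton code points and FST byte labels coincide) and that FSTUtil and its Path class are accessible from the calling code (they live in org.apache.lucene.search.suggest.analyzing).

import java.util.List;

import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class IntersectPrefixPathsDemo {
  public static void main(String[] args) throws Exception {
    // Build a tiny FST mapping two keys to long weights; keys must be added in sorted order.
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    builder.add(Util.toIntsRef(new BytesRef("apple"), scratch), 10L);
    builder.add(Util.toIntsRef(new BytesRef("apply"), scratch), 5L);
    FST<Long> fst = builder.finish();

    // Deterministic automaton accepting exactly the prefix "app".
    Automaton prefix = Automata.makeString("app");

    // Each Path holds the FST node reached after the prefix plus the output accumulated so far.
    List<FSTUtil.Path<Long>> paths = FSTUtil.intersectPrefixPaths(prefix, fst);
    for (FSTUtil.Path<Long> path : paths) {
      BytesRefBuilder spare = new BytesRefBuilder();
      System.out.println(Util.toBytesRef(path.input.get(), spare).utf8ToString()
          + " -> partial output " + path.output);
    }
  }
}

Each returned Path carries the FST node reached after the matched prefix, so a TopNSearcher can continue the completion search from that node, exactly as AnalyzingSuggester does below.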
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
From the class AnalyzingSuggester, method lookup:
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
final List<LookupResult> results = new ArrayList<>();
List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// Seed the searcher with every exact-match start node found above:
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
// Scan the completions for one whose surface form exactly matches the input key:
for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// Dedup: multiple analyzed paths can produce the same surface form:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, reject the key's own surface form so it cannot create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// The exact match was already added by the first (exactFirst) search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// Enough results collected; in the exactFirst case the search can produce one extra path:
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
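A hedged usage sketch for the lookup method above. It assumes the lucene-solr-era constructor AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer); the SimpleInput iterator and the sample terms and weights are hypothetical helpers written only for this example. Note that lookup rejects a non-null contexts set and onlyMorePopular=true, as enforced at the top of the method, so the call passes null and false.

import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class AnalyzingSuggesterDemo {

  // Minimal in-memory InputIterator over (term, weight) pairs; no payloads or contexts.
  static final class SimpleInput implements InputIterator {
    private final Iterator<String> terms;
    private final Iterator<Long> weights;
    private long currentWeight;

    SimpleInput(List<String> terms, List<Long> weights) {
      this.terms = terms.iterator();
      this.weights = weights.iterator();
    }

    @Override public BytesRef next() {
      if (!terms.hasNext()) {
        return null;
      }
      currentWeight = weights.next();
      return new BytesRef(terms.next());
    }

    @Override public long weight() { return currentWeight; }
    @Override public BytesRef payload() { return null; }
    @Override public boolean hasPayloads() { return false; }
    @Override public Set<BytesRef> contexts() { return null; }
    @Override public boolean hasContexts() { return false; }
  }

  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    Directory tempDir = FSDirectory.open(Paths.get(System.getProperty("java.io.tmpdir")));
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);

    suggester.build(new SimpleInput(
        Arrays.asList("apple pie", "application", "apply"),
        Arrays.asList(5L, 10L, 2L)));

    // contexts must be null and onlyMorePopular must be false, per the checks in lookup():
    List<LookupResult> results = suggester.lookup("app", null, false, 2);
    for (LookupResult result : results) {
      System.out.println(result.key + " weight=" + result.value);
    }
  }
}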
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
From the class Dictionary, method readAffixFile:
/**
* Reads the affix file through the provided InputStream, building up the prefix and suffix maps
*
* @param affixStream InputStream to read the content of the affix file from
* @param decoder CharsetDecoder to decode the content of the file
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
Map<String, Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
seenPatterns.put(".*", 0);
patterns.add(null);
// zero strip -> 0 ord
Map<String, Integer> seenStrips = new LinkedHashMap<>();
seenStrips.put("", 0);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
while ((line = reader.readLine()) != null) {
// ignore any BOM marker on first line
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
line = line.substring(1);
}
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(MORPH_ALIAS_KEY)) {
parseMorphAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = getFlagParsingStrategy(line);
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
// 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
complexPrefixes = true;
} else if (line.startsWith(CIRCUMFIX_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
}
circumfix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(KEEPCASE_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
}
keepcase = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
}
needaffix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
}
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(IGNORE_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
}
ignore = parts[1].toCharArray();
Arrays.sort(ignore);
needsInputCleaning = true;
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
String[] parts = line.split("\\s+");
String type = parts[0];
if (parts.length != 2) {
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
}
int num = Integer.parseInt(parts[1]);
FST<CharsRef> res = parseConversions(reader, num);
if (type.equals("ICONV")) {
iconv = res;
needsInputCleaning |= iconv != null;
} else {
oconv = res;
needsOutputCleaning |= oconv != null;
}
} else if (line.startsWith(FULLSTRIP_KEY)) {
fullStrip = true;
} else if (line.startsWith(LANG_KEY)) {
language = line.substring(LANG_KEY.length()).trim();
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
}
}
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
int totalChars = 0;
for (String strip : seenStrips.keySet()) {
totalChars += strip.length();
}
stripData = new char[totalChars];
stripOffsets = new int[seenStrips.size() + 1];
int currentOffset = 0;
int currentIndex = 0;
for (String strip : seenStrips.keySet()) {
stripOffsets[currentIndex++] = currentOffset;
strip.getChars(0, strip.length(), stripData, currentOffset);
currentOffset += strip.length();
}
assert currentIndex == seenStrips.size();
stripOffsets[currentIndex] = currentOffset;
}
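readAffixFile is private and runs from the Dictionary constructor while a Hunspell .aff/.dic pair is being parsed, so the natural entry point is the constructor itself. The sketch below is only a guess at typical usage: the file names are placeholders, and it assumes the constructor variant from this era that takes a temporary Directory and file-name prefix used for sorting the dictionary entries.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class HunspellDictionaryDemo {
  public static void main(String[] args) throws Exception {
    // Placeholder paths to a Hunspell affix/dictionary pair.
    try (InputStream affix = Files.newInputStream(Paths.get("en_US.aff"));
         InputStream words = Files.newInputStream(Paths.get("en_US.dic"));
         Directory tempDir = FSDirectory.open(Paths.get(System.getProperty("java.io.tmpdir")))) {
      // The constructor parses the affix file (readAffixFile above), including any
      // ICONV/OCONV conversion FSTs, and then reads the .dic word list.
      Dictionary dictionary = new Dictionary(tempDir, "hunspell", affix, words);
      // The parsed dictionary is normally handed to a HunspellStemFilter in an analysis chain.
      System.out.println("dictionary loaded: " + dictionary);
    }
  }
}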
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
From the class Dictionary, method applyMappings:
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
// temporary stuff
final FST.Arc<CharsRef> arc = new FST.Arc<>();
int longestMatch;
CharsRef longestOutput;
for (int i = 0; i < sb.length(); i++) {
arc.copyFrom(firstArc);
CharsRef output = NO_OUTPUT;
longestMatch = -1;
longestOutput = null;
for (int j = i; j < sb.length(); j++) {
char ch = sb.charAt(j);
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
break;
} else {
output = fst.outputs.add(output, arc.output);
}
if (arc.isFinal()) {
longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
longestMatch = j;
}
}
if (longestMatch >= 0) {
sb.delete(i, longestMatch + 1);
sb.insert(i, longestOutput);
i += (longestOutput.length - 1);
}
}
}
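applyMappings consumes a conversion FST of the kind readAffixFile builds for ICONV/OCONV lines: UTF-16 (BYTE2) labels with CharSequenceOutputs. The sketch below builds such an FST by hand and runs the greedy longest-match replacement on a StringBuilder. Because applyMappings is package-private, the demo assumes it is compiled into the org.apache.lucene.analysis.hunspell package; the ApplyMappingsDemo class and the sample mappings are invented for illustration.

package org.apache.lucene.analysis.hunspell;

import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;

public class ApplyMappingsDemo {
  public static void main(String[] args) throws Exception {
    // Input sequences (UTF-16 labels) mapped to replacement strings, sorted as the Builder requires.
    TreeMap<String, String> mappings = new TreeMap<>();
    mappings.put("oe", "ö");
    mappings.put("ue", "ü");

    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.getSingleton());
    IntsRefBuilder scratch = new IntsRefBuilder();
    for (Map.Entry<String, String> entry : mappings.entrySet()) {
      Util.toUTF16(entry.getKey(), scratch);
      builder.add(scratch.get(), new CharsRef(entry.getValue()));
    }
    FST<CharsRef> conversions = builder.finish();

    // applyMappings greedily replaces the longest FST match at each position, in place.
    StringBuilder sb = new StringBuilder("fuer goethe");
    Dictionary.applyMappings(conversions, sb);
    System.out.println(sb); // expected: "für göthe"
  }
}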
Use of org.apache.lucene.util.fst.FST in project lucene-solr by apache.
From the class MappingCharFilter, method read:
@Override
public int read() throws IOException {
//System.out.println("\nread");
while (true) {
if (replacement != null && replacementPointer < replacement.length) {
//System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
return replacement.chars[replacement.offset + replacementPointer++];
}
// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
// or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
//
// I think this would be (almost?) equivalent to 1) adding
// epsilon arcs from all final nodes back to the init
// node in the FST, 2) adding a .* (skip any char)
// loop on the initial node, and 3) determinizing
// that. Then we would not have to restart matching
// at each position.
int lastMatchLen = -1;
CharsRef lastMatch = null;
final int firstCH = buffer.get(inputOff);
if (firstCH != -1) {
FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
if (arc != null) {
if (!FST.targetHasArcs(arc)) {
// Fast pass for single character match:
assert arc.isFinal();
lastMatchLen = 1;
lastMatch = arc.output;
} else {
int lookahead = 0;
CharsRef output = arc.output;
while (true) {
lookahead++;
if (arc.isFinal()) {
// Match! (to node is final)
lastMatchLen = lookahead;
lastMatch = outputs.add(output, arc.nextFinalOutput);
// Greedy: keep searching to see if there's a
// longer match...
}
if (!FST.targetHasArcs(arc)) {
break;
}
int ch = buffer.get(inputOff + lookahead);
if (ch == -1) {
break;
}
if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
// Dead end
break;
}
output = outputs.add(output, arc.output);
}
}
}
}
if (lastMatch != null) {
inputOff += lastMatchLen;
//System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
final int diff = lastMatchLen - lastMatch.length;
if (diff != 0) {
final int prevCumulativeDiff = getLastCumulativeDiff();
if (diff > 0) {
// Replacement is shorter than matched input:
addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
} else {
// Replacement is longer than matched input: remap
// the "extra" chars all back to the same input
// offset:
final int outputStart = inputOff - prevCumulativeDiff;
for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
}
}
}
replacement = lastMatch;
replacementPointer = 0;
} else {
final int ret = buffer.get(inputOff);
if (ret != -1) {
inputOff++;
buffer.freeBefore(inputOff);
}
return ret;
}
}
}
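A short usage sketch for the read() loop above. MappingCharFilter's FST and cached root arcs come from a NormalizeCharMap, so the demo builds one with two mappings, wraps a StringReader, and drains the filter character by character; the MappingCharFilterDemo class and the sample mappings are hypothetical. One replacement is longer than its match and one is shorter, which exercises both branches of the offset correction in the code above.

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class MappingCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // NormalizeCharMap builds the FST and cached root arcs that read() walks.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&", "and"); // replacement longer than the matched input
    builder.add("ss", "ß");  // replacement shorter than the matched input
    NormalizeCharMap map = builder.build();

    try (Reader filtered = new MappingCharFilter(map, new StringReader("fussball & fries"))) {
      StringBuilder out = new StringBuilder();
      int ch;
      while ((ch = filtered.read()) != -1) {
        out.append((char) ch);
      }
      System.out.println(out); // expected: "fußball and fries"
    }
  }
}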