use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.
the class FreeTextSuggester method lookup.
/** Retrieve suggestions. */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
throw new IllegalStateException("Lookup not supported at this time");
}
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
//System.out.println("lookup: key='" + key + "'");
// Run full analysis, but save only the
// last 1gram, last 2gram, etc.:
int maxEndOffset = -1;
boolean sawRealToken = false;
while (ts.incrementToken()) {
BytesRef tokenBytes = termBytesAtt.getBytesRef();
sawRealToken |= tokenBytes.length > 0;
// TODO: this is somewhat iffy; today, ShingleFilter
// sets posLen to the gram count; maybe we should make
// a separate dedicated att for this?
int gramCount = posLenAtt.getPositionLength();
assert gramCount <= grams;
// Safety: make sure the recalculated count "agrees":
if (countGrams(tokenBytes) != gramCount) {
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
}
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount - 1] = b;
}
ts.end();
if (!sawRealToken) {
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
}
// Carefully fill last tokens with _ tokens;
// ShingleFilter appraently won't emit "only hole"
// tokens:
int endPosInc = posIncAtt.getPositionIncrement();
// Note this will also be true if input is the empty
// string (in which case we saw no tokens and
// maxEndOffset is still -1), which in fact works out OK
// because we fill the unigram with an empty BytesRef
// below:
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
if (lastTokenEnded) {
// starting with "foo":
for (int i = grams - 1; i > 0; i--) {
BytesRefBuilder token = lastTokens[i - 1];
if (token == null) {
continue;
}
token.append(separator);
lastTokens[i] = token;
}
lastTokens[0] = new BytesRefBuilder();
}
Arc<Long> arc = new Arc<>();
BytesReader bytesReader = fst.getBytesReader();
// Try highest order models first, and if they return
// results, return that; else, fallback:
double backoff = 1.0;
List<LookupResult> results = new ArrayList<>(num);
// We only add a given suffix once, from the highest
// order model that saw it; for subsequent lower order
// models we skip it:
final Set<BytesRef> seen = new HashSet<>();
for (int gram = grams - 1; gram >= 0; gram--) {
BytesRefBuilder token = lastTokens[gram];
// Don't make unigram predictions from empty string:
if (token == null || (token.length() == 0 && key.length() > 0)) {
//System.out.println(" gram=" + gram + ": skip: not enough input");
continue;
}
if (endPosInc > 0 && gram <= endPosInc) {
//System.out.println(" break: only holes now");
break;
}
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
// TODO: we could add fuzziness here
// match the prefix portion exactly
//Pair<Long,BytesRef> prefixOutput = null;
Long prefixOutput = null;
try {
prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (prefixOutput == null) {
// This model never saw this prefix, e.g. the
// trigram model never saw context "purple mushroom"
backoff *= ALPHA;
continue;
}
// TODO: we could do this division at build time, and
// bake it into the FST?
// Denominator for computing scores from current
// model's predictions:
long contextCount = totTokens;
BytesRef lastTokenFragment = null;
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
BytesRef context = new BytesRef(token.bytes(), 0, i);
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
assert output != null;
contextCount = decodeWeight(output);
lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
final BytesRefBuilder finalLastToken = new BytesRefBuilder();
if (lastTokenFragment == null) {
finalLastToken.copyBytes(token.get());
} else {
finalLastToken.copyBytes(lastTokenFragment);
}
CharsRefBuilder spare = new CharsRefBuilder();
// complete top-N
TopResults<Long> completions = null;
try {
// Because we store multiple models in one FST
// (1gram, 2gram, 3gram), we must restrict the
// search so that it only considers the current
// model. For highest order model, this is not
// necessary since all completions in the FST
// must be from this model, but for lower order
// models we have to filter out the higher order
// ones:
// Must do num+seen.size() for queue depth because we may
// reject up to seen.size() paths in acceptResult():
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {
BytesRefBuilder scratchBytes = new BytesRefBuilder();
@Override
protected void addIfCompetitive(Util.FSTPath<Long> path) {
if (path.arc.label != separator) {
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
super.addIfCompetitive(path);
} else {
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
}
}
@Override
protected boolean acceptResult(IntsRef input, Long output) {
Util.toBytesRef(input, scratchBytes);
finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
int lenSav = finalLastToken.length();
finalLastToken.append(scratchBytes);
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
boolean ret = seen.contains(finalLastToken.get()) == false;
finalLastToken.setLength(lenSav);
return ret;
}
};
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
completions = searcher.search();
assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
int prefixLength = token.length();
BytesRefBuilder suffix = new BytesRefBuilder();
nextCompletion: for (Result<Long> completion : completions) {
token.setLength(prefixLength);
// append suffix
Util.toBytesRef(completion.input, suffix);
token.append(suffix);
//System.out.println(" completion " + token.utf8ToString());
// Skip this path if a higher-order model already
// saw/predicted its last token:
BytesRef lastToken = token.get();
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
assert token.length() - i - 1 > 0;
lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
if (seen.contains(lastToken)) {
//System.out.println(" skip dup " + lastToken.utf8ToString());
continue nextCompletion;
}
seen.add(BytesRef.deepCopyOf(lastToken));
spare.copyUTF8Bytes(token.get());
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
results.add(result);
assert results.size() == seen.size();
//System.out.println(" add result=" + result);
}
backoff *= ALPHA;
}
Collections.sort(results, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult a, LookupResult b) {
if (a.value > b.value) {
return -1;
} else if (a.value < b.value) {
return 1;
} else {
// Tie break by UTF16 sort order:
return ((String) a.key).compareTo((String) b.key);
}
}
});
if (results.size() > num) {
results.subList(num, results.size()).clear();
}
return results;
}
}
use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.
the class WFSTCompletionLookup method lookup.
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (fst == null) {
return Collections.emptyList();
}
BytesRefBuilder scratch = new BytesRefBuilder();
scratch.copyChars(key);
int prefixLength = scratch.length();
Arc<Long> arc = new Arc<>();
// match the prefix portion exactly
Long prefixOutput = null;
try {
prefixOutput = lookupPrefix(scratch.get(), arc);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (prefixOutput == null) {
return Collections.emptyList();
}
List<LookupResult> results = new ArrayList<>(num);
CharsRefBuilder spare = new CharsRefBuilder();
if (exactFirst && arc.isFinal()) {
spare.copyUTF8Bytes(scratch.get());
results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
if (--num == 0) {
// that was quick
return results;
}
}
// complete top-N
TopResults<Long> completions = null;
try {
completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
BytesRefBuilder suffix = new BytesRefBuilder();
for (Result<Long> completion : completions) {
scratch.setLength(prefixLength);
// append suffix
Util.toBytesRef(completion.input, suffix);
scratch.append(suffix);
spare.copyUTF8Bytes(scratch.get());
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output)));
}
return results;
}
use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.
the class TestFSTs method testExpandedCloseToRoot.
/**
* Test state expansion (array format) on close-to-root states. Creates
* synthetic input that has one expanded state on each level.
*
* @see <a href="https://issues.apache.org/jira/browse/LUCENE-2933">LUCENE-2933</a>
*/
public void testExpandedCloseToRoot() throws Exception {
class SyntheticData {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0;
final BytesRefBuilder term = new BytesRefBuilder();
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
while (line < lines.length) {
String w = lines[line++];
if (w == null) {
break;
}
term.copyChars(w);
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
}
return b.finish();
}
void generate(ArrayList<String> out, StringBuilder b, char from, char to, int depth) {
if (depth == 0 || from == to) {
String seq = b.toString() + "_" + out.size() + "_end";
out.add(seq);
} else {
for (char c = from; c <= to; c++) {
b.append(c);
generate(out, b, from, c == to ? to : from, depth - 1);
b.deleteCharAt(b.length() - 1);
}
}
}
public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) throws IOException {
if (FST.targetHasArcs(arc)) {
int childCount = 0;
BytesReader fstReader = fst.getBytesReader();
for (arc = fst.readFirstTargetArc(arc, arc, fstReader); ; arc = fst.readNextArc(arc, fstReader), childCount++) {
boolean expanded = fst.isExpandedTarget(arc, fstReader);
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
assertEquals(expanded, (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
if (arc.isLast())
break;
}
return childCount;
}
return 0;
}
}
// Sanity check.
assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
SyntheticData s = new SyntheticData();
ArrayList<String> out = new ArrayList<>();
StringBuilder b = new StringBuilder();
s.generate(out, b, 'a', 'i', 10);
String[] input = out.toArray(new String[out.size()]);
Arrays.sort(input);
FST<Object> fst = s.compile(input);
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
s.verifyStateAndBelow(fst, arc, 1);
}
use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.
the class TestFSTs method testIllegallyModifyRootArc.
public void testIllegallyModifyRootArc() throws Exception {
assumeTrue("test relies on assertions", assertsAreEnabled);
Set<BytesRef> terms = new HashSet<>();
for (int i = 0; i < 100; i++) {
String prefix = Character.toString((char) ('a' + i));
terms.add(new BytesRef(prefix));
if (prefix.equals("m") == false) {
for (int j = 0; j < 20; j++) {
// Make a big enough FST that the root cache will be created:
String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
terms.add(new BytesRef(prefix + suffix));
}
}
}
List<BytesRef> termsList = new ArrayList<>(terms);
Collections.sort(termsList);
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder input = new IntsRefBuilder();
for (BytesRef term : termsList) {
Util.toIntsRef(term, input);
builder.add(input.get(), term);
}
FST<BytesRef> fst = builder.finish();
Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc);
FST.BytesReader reader = fst.getBytesReader();
arc = fst.findTargetArc((int) 'm', arc, arc, reader);
assertNotNull(arc);
assertEquals(new BytesRef("m"), arc.output);
// NOTE: illegal:
arc.output.length = 0;
fst.getFirstArc(arc);
try {
arc = fst.findTargetArc((int) 'm', arc, arc, reader);
} catch (AssertionError ae) {
// expected
}
}
use of org.apache.lucene.util.fst.FST.Arc in project lucene-solr by apache.
the class Util method getByOutput.
/** Reverse lookup (lookup by output instead of by input),
* in the special case when your FSTs outputs are
* strictly ascending. This locates the input/output
* pair where the output is equal to the target, and will
* return null if that output does not exist.
*
* <p>NOTE: this only works with {@code FST<Long>}, only
* works when the outputs are ascending in order with
* the inputs.
* For example, simple ordinals (0, 1,
* 2, ...), or file offets (when appending to a file)
* fit this. */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
final BytesReader in = fst.getBytesReader();
// TODO: would be nice not to alloc this on every lookup
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
FST.Arc<Long> scratchArc = new FST.Arc<>();
final IntsRefBuilder result = new IntsRefBuilder();
return getByOutput(fst, targetOutput, in, arc, scratchArc, result);
}
Aggregations