Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
The class FuzzySuggester, method convertAutomaton:
@Override
protected Automaton convertAutomaton(Automaton a) {
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(a);
    utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
    return utf8automaton;
  } else {
    return a;
  }
}
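The same conversion can be exercised standalone: UTF32ToUTF8 rewrites an automaton over Unicode code points into an equivalent automaton over UTF-8 bytes, so it can be intersected with a byte-based FST. A minimal sketch against the org.apache.lucene.util.automaton API (the demo class and sample string are illustrative, not part of FuzzySuggester):

import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

public class Utf8ConvertDemo {
  public static void main(String[] args) {
    // Code-point automaton accepting exactly "héllo":
    Automaton codePoints = Automata.makeString("héllo");
    // The same conversion convertAutomaton performs for unicodeAware suggesters:
    Automaton utf8 = new UTF32ToUTF8().convert(codePoints);
    utf8 = Operations.determinize(utf8, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    // The result runs on UTF-8 bytes rather than code points:
    byte[] bytes = "héllo".getBytes(StandardCharsets.UTF_8);
    System.out.println(new ByteRunAutomaton(utf8).run(bytes, 0, bytes.length)); // prints true
  }
}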
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
The class FuzzySuggester, method toLevenshteinAutomata:
Automaton toLevenshteinAutomata(Automaton automaton) {
  List<Automaton> subs = new ArrayList<>();
  FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
  for (IntsRef string; (string = finiteStrings.next()) != null;) {
    if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
      subs.add(Automata.makeString(string.ints, string.offset, string.length));
    } else {
      int[] ints = new int[string.length - nonFuzzyPrefix];
      System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
      // TODO: maybe add alphaMin to LevenshteinAutomata,
      // and pass 1 instead of 0? We probably don't want
      // to allow the trailing dedup bytes to be
      // edited... but then 0 byte is "in general" allowed
      // on input (but not in UTF8).
      LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
      subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
    }
  }

  if (subs.isEmpty()) {
    // matches nothing
    return Automata.makeEmpty();
  } else if (subs.size() == 1) {
    // no synonyms or anything: just a single path through the tokenstream
    return subs.get(0);
  } else {
    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    Automaton a = Operations.union(subs);
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
  }
}
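The method keeps the first nonFuzzyPrefix code points exact and permits up to maxEdits edits in the remainder. LevenshteinAutomata supports this directly through its prefix-taking toAutomaton overload; a small sketch (class name and sample strings are illustrative):

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

public class LevDemo {
  public static void main(String[] args) {
    // Allow 1 edit (with transpositions) in "cene", keeping "lu" as an exact prefix:
    LevenshteinAutomata lev = new LevenshteinAutomata("cene".codePoints().toArray(), Character.MAX_CODE_POINT, true);
    Automaton a = lev.toAutomaton(1, "lu");
    CharacterRunAutomaton run = new CharacterRunAutomaton(a);
    System.out.println(run.run("lucene")); // true: zero edits
    System.out.println(run.run("lucena")); // true: one substitution
    System.out.println(run.run("lXcene")); // false: the exact prefix "lu" must match
  }
}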
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
The class AnalyzingSuggester, method lookup:
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
  assert num > 0;

  if (onlyMorePopular) {
    throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
  }
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  if (fst == null) {
    return Collections.emptyList();
  }

  //System.out.println("lookup key=" + key + " num=" + num);
  for (int i = 0; i < key.length(); i++) {
    if (key.charAt(i) == 0x1E) {
      throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
    }
    if (key.charAt(i) == 0x1F) {
      throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
    }
  }

  final BytesRef utf8Key = new BytesRef(key);
  try {
    Automaton lookupAutomaton = toLookupAutomaton(key);
    final CharsRefBuilder spare = new CharsRefBuilder();

    //System.out.println("  now intersect exactFirst=" + exactFirst);

    // Intersect automaton w/ suggest wFST and get all
    // prefix starting nodes & their outputs:
    //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
    //System.out.println("  prefixPaths: " + prefixPaths.size());

    BytesReader bytesReader = fst.getBytesReader();

    FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();

    final List<LookupResult> results = new ArrayList<>();

    List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

    if (exactFirst) {
      int count = 0;
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has END_BYTE arc leaving, meaning it's an
          // "exact" match:
          count++;
        }
      }

      // Searcher just to find the single exact only
      // match, if present:
      Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
      searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

      // NOTE: we could almost get away with only using
      // the first start node. The only catch is if
      // maxSurfaceFormsPerAnalyzedForm had kicked in and
      // pruned our exact match from one of these nodes
      // ...:
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
          // This node has END_BYTE arc leaving, meaning it's an
          // "exact" match:
          searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
        }
      }

      TopResults<Pair<Long, BytesRef>> completions = searcher.search();
      assert completions.isComplete;

      // Linear scan over the matched exact paths; this is bounded
      // by the number of prefix start nodes times
      // maxSurfaceFormsPerAnalyzedForm:
      for (Result<Pair<Long, BytesRef>> completion : completions) {
        BytesRef output2 = completion.output.output2;
        if (sameSurfaceForm(utf8Key, output2)) {
          results.add(getLookupResult(completion.output.output1, output2, spare));
          break;
        }
      }

      if (results.size() == num) {
        // That was quick:
        return results;
      }
    }

    Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
    searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

      private final Set<BytesRef> seen = new HashSet<>();

      @Override
      protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
        // Dedup: when the input analyzes to a graph we
        // can get duplicate surface forms:
        if (seen.contains(output.output2)) {
          return false;
        }
        seen.add(output.output2);

        if (!exactFirst) {
          return true;
        } else {
          // In exactFirst mode, don't accept any paths
          // matching the surface form, since that will
          // create duplicate results:
          if (sameSurfaceForm(utf8Key, output.output2)) {
            // We found the exact match, which means we should
            // have already found it in the first search:
            assert results.size() == 1;
            return false;
          } else {
            return true;
          }
        }
      }
    };

    prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

    for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
      searcher.addStartPaths(path.fstNode, path.output, true, path.input);
    }

    TopResults<Pair<Long, BytesRef>> completions = searcher.search();
    assert completions.isComplete;

    for (Result<Pair<Long, BytesRef>> completion : completions) {
      LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

      // TODO: for fuzzy case would be nice to return
      // how many edits were required
      //System.out.println("  result=" + result);

      results.add(result);

      if (results.size() == num) {
        // In the exactFirst=true case the search above can
        // produce one extra path, so stop once we have num:
        break;
      }
    }

    return results;
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
}
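From the caller's side this machinery sits behind the Lookup API. A hedged usage sketch, assuming a suggester whose build(...) has already run over a dictionary (the key and variable names are illustrative):

import java.util.List;
import org.apache.lucene.search.suggest.Lookup;

// 'suggester' is an already-built AnalyzingSuggester:
List<Lookup.LookupResult> hits = suggester.lookup("sea", false, 5); // onlyMorePopular must be false
for (Lookup.LookupResult hit : hits) {
  System.out.println(hit.key + " weight=" + hit.value);
}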
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
The class RandomPostingsTester, method testTermsOneThread:
private void testTermsOneThread(Random random, Fields fieldsSource, EnumSet<Option> options, IndexOptions maxTestOptions, IndexOptions maxIndexOptions, boolean alwaysTestMax) throws IOException {
  ThreadState threadState = new ThreadState();

  // Test random terms/fields:
  List<TermState> termStates = new ArrayList<>();
  List<FieldAndTerm> termStateTerms = new ArrayList<>();

  boolean supportsOrds = true;

  Collections.shuffle(allTerms, random);
  int upto = 0;
  while (upto < allTerms.size()) {
    boolean useTermState = termStates.size() != 0 && random.nextInt(5) == 1;
    boolean useTermOrd = supportsOrds && useTermState == false && random.nextInt(5) == 1;
    FieldAndTerm fieldAndTerm;
    TermsEnum termsEnum;

    TermState termState = null;

    if (!useTermState) {
      // Seek by random field+term:
      fieldAndTerm = allTerms.get(upto++);
      if (LuceneTestCase.VERBOSE) {
        if (useTermOrd) {
          System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
        } else {
          System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
        }
      }
    } else {
      // Seek by previous saved TermState
      int idx = random.nextInt(termStates.size());
      fieldAndTerm = termStateTerms.get(idx);
      if (LuceneTestCase.VERBOSE) {
        System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
      }
      termState = termStates.get(idx);
    }

    Terms terms = fieldsSource.terms(fieldAndTerm.field);
    assertNotNull(terms);
    termsEnum = terms.iterator();

    if (!useTermState) {
      if (useTermOrd) {
        // Try seek by ord sometimes:
        try {
          termsEnum.seekExact(fieldAndTerm.ord);
        } catch (UnsupportedOperationException uoe) {
          supportsOrds = false;
          assertTrue(termsEnum.seekExact(fieldAndTerm.term));
        }
      } else {
        assertTrue(termsEnum.seekExact(fieldAndTerm.term));
      }
    } else {
      termsEnum.seekExact(fieldAndTerm.term, termState);
    }

    // check we really seeked to the right place
    assertEquals(fieldAndTerm.term, termsEnum.term());

    long termOrd;
    if (supportsOrds) {
      try {
        termOrd = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        termOrd = -1;
      }
    } else {
      termOrd = -1;
    }

    if (termOrd != -1) {
      // PostingsFormat supports ords
      assertEquals(fieldAndTerm.ord, termsEnum.ord());
    }

    boolean savedTermState = false;

    if (options.contains(Option.TERM_STATE) && !useTermState && random.nextInt(5) == 1) {
      // Save away this TermState:
      termStates.add(termsEnum.termState());
      termStateTerms.add(fieldAndTerm);
      savedTermState = true;
    }

    verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);

    // Sometimes save term state after pulling the enum:
    if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random.nextInt(5) == 1) {
      // Save away this TermState:
      termStates.add(termsEnum.termState());
      termStateTerms.add(fieldAndTerm);
      useTermState = true;
    }

    // 10% of the time, make sure we can pull another enum
    // from the same term:
    if (alwaysTestMax || random.nextInt(10) == 7) {
      // Try same term again
      if (LuceneTestCase.VERBOSE) {
        System.out.println("TEST: try enum again on same term");
      }
      verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
    }
  }

  // Test Terms.intersect:
  for (String field : fields.keySet()) {
    while (true) {
      Automaton a = AutomatonTestUtil.randomAutomaton(random);
      CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
      if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        // Keep retrying until we get an A that will really "use" the PF's intersect code:
        continue;
      }
      // System.out.println("A:\n" + a.toDot());

      BytesRef startTerm = null;
      if (random.nextBoolean()) {
        RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
        for (int iter = 0; iter < 100; iter++) {
          int[] codePoints = ras.getRandomAcceptedString(random);
          if (codePoints.length == 0) {
            continue;
          }
          startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
          break;
        }
        // Don't allow empty string startTerm:
        if (startTerm == null) {
          continue;
        }
      }

      TermsEnum intersected = fieldsSource.terms(field).intersect(ca, startTerm);

      Set<BytesRef> intersectedTerms = new HashSet<BytesRef>();
      BytesRef term;
      while ((term = intersected.next()) != null) {
        if (startTerm != null) {
          // NOTE: not <=
          assertTrue(startTerm.compareTo(term) < 0);
        }
        intersectedTerms.add(BytesRef.deepCopyOf(term));
        verifyEnum(random, threadState, field, term, intersected, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
      }

      if (ca.runAutomaton == null) {
        assertTrue(intersectedTerms.isEmpty());
      } else {
        for (BytesRef term2 : fields.get(field).keySet()) {
          boolean expected;
          if (startTerm != null && startTerm.compareTo(term2) >= 0) {
            expected = false;
          } else {
            expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
          }
          assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
        }
      }

      break;
    }
  }
}
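Outside of tests, the same Terms.intersect contract lets a caller enumerate only the terms of a field accepted by an automaton. A sketch, assuming a LeafReader over an existing index (method and field names are illustrative); note that the default Terms.intersect accepts only NORMAL-type compiled automata, which is exactly why the test above retries until it gets one:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

static void printMatchingTerms(LeafReader reader, String field, String regexp) throws IOException {
  // Compile the regexp to a byte-level matcher:
  CompiledAutomaton ca = new CompiledAutomaton(new RegExp(regexp).toAutomaton());
  Terms terms = reader.terms(field);
  if (terms == null) {
    return; // field has no indexed terms
  }
  TermsEnum te = terms.intersect(ca, null); // null startTerm: enumerate from the beginning
  for (BytesRef term = te.next(); term != null; term = te.next()) {
    System.out.println(term.utf8ToString());
  }
}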
Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
The class FuzzyCompletionQuery, method createWeight:
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO: better to iterate over the automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
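To run the resulting query, pair it with SuggestIndexSearcher. A sketch assuming an index that was built with a SuggestField named "suggest_field" ('dir', the field name, and the three-argument suggest overload reflect the 6.x suggest API; adjust for other versions):

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.suggest.document.FuzzyCompletionQuery;
import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.store.Directory;

static void suggestFuzzy(Directory dir) throws IOException {
  try (DirectoryReader reader = DirectoryReader.open(dir)) {
    SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
    // The misspelled input "lucne" can still reach suggestions for "lucene":
    FuzzyCompletionQuery query = new FuzzyCompletionQuery(new StandardAnalyzer(), new Term("suggest_field", "lucne"));
    TopSuggestDocs top = searcher.suggest(query, 5, true); // top 5, skipping duplicate surface forms
    for (TopSuggestDocs.SuggestScoreDoc hit : top.scoreLookupDocs()) {
      System.out.println(hit.key + " score=" + hit.score);
    }
  }
}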