Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache: class FuzzyCompletionQuery, method createWeight.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
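The unicodeAware branch above rewrites a code-point automaton into an equivalent UTF-8 byte automaton before handing it to the weight. Below is a minimal, self-contained sketch of that conversion pattern; the input string and the use of Operations.DEFAULT_MAX_DETERMINIZED_STATES are illustrative choices, not values taken from FuzzyCompletionQuery.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

public class Utf8ConversionSketch {
  public static void main(String[] args) {
    // code-point automaton accepting a single string (illustrative input)
    Automaton codePoints = Automata.makeString("caf\u00e9");
    // rewrite transitions from UTF-32 code points to UTF-8 bytes
    Automaton utf8 = new UTF32ToUTF8().convert(codePoints);
    // determinize with the library's default state limit (assumed stand-in for maxDeterminizedStates)
    utf8 = Operations.determinize(utf8, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    System.out.println("deterministic: " + utf8.isDeterministic());
  }
}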
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache: class FuzzyCompletionQuery, method toLevenshteinAutomata.
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
  List<Automaton> subs = new ArrayList<>();
  FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
  for (IntsRef string; (string = finiteStrings.next()) != null; ) {
    refs.add(IntsRef.deepCopyOf(string));
    if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
      subs.add(Automata.makeString(string.ints, string.offset, string.length));
    } else {
      int[] ints = new int[string.length - nonFuzzyPrefix];
      System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
      // TODO: maybe add alphaMin to LevenshteinAutomata,
      // and pass 1 instead of 0? We probably don't want
      // to allow the trailing dedup bytes to be
      // edited... but then 0 byte is "in general" allowed
      // on input (but not in UTF8).
      LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
      subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
    }
  }
  if (subs.isEmpty()) {
    // matches nothing
    return Automata.makeEmpty();
  } else if (subs.size() == 1) {
    // no synonyms or anything: just a single path through the tokenstream
    return subs.get(0);
  } else {
    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    Automaton a = Operations.union(subs);
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    return Operations.determinize(a, maxDeterminizedStates);
  }
}
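The deep copy on each iteration above matters because FiniteStringsIterator reuses its IntsRef between calls to next(). The following stand-alone sketch (an illustrative two-string union, not project code) shows the same collect-and-copy pattern in isolation.

import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;

public class CollectFiniteStringsSketch {
  public static void main(String[] args) {
    Automaton a = Operations.union(Automata.makeString("foo"), Automata.makeString("bar"));
    Set<IntsRef> refs = new HashSet<>();
    FiniteStringsIterator it = new FiniteStringsIterator(a);
    for (IntsRef string; (string = it.next()) != null; ) {
      refs.add(IntsRef.deepCopyOf(string)); // copy: the iterator reuses its scratch IntsRef
    }
    System.out.println(refs.size()); // 2 accepted strings
  }
}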
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache: class TestFSTs, method testRandomWords.
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
  Random random = new Random(random().nextLong());
  for (int iter = 0; iter < numIter; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter " + iter);
    }
    for (int inputMode = 0; inputMode < 2; inputMode++) {
      final int numWords = random.nextInt(maxNumWords + 1);
      Set<IntsRef> termsSet = new HashSet<>();
      IntsRef[] terms = new IntsRef[numWords];
      while (termsSet.size() < numWords) {
        final String term = getRandomString(random);
        termsSet.add(toIntsRef(term, inputMode));
      }
      doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
    }
  }
}
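getRandomString and toIntsRef are helpers local to TestFSTs; the two input modes correspond to feeding the FST UTF-8 bytes versus UTF-32 code points. The sketch below is an assumed, simplified reconstruction of that kind of conversion, not the test's actual helper, and the class and method names are illustrative.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Util;

public class ToIntsRefSketch {
  static IntsRef toIntsRefSketch(String term, int inputMode) {
    IntsRefBuilder scratch = new IntsRefBuilder();
    if (inputMode == 0) {
      // UTF-8 bytes, one int per byte
      return IntsRef.deepCopyOf(Util.toIntsRef(new BytesRef(term), scratch));
    } else {
      // UTF-32 code points, one int per code point
      return IntsRef.deepCopyOf(Util.toUTF32(term.toCharArray(), 0, term.length(), scratch));
    }
  }

  public static void main(String[] args) {
    System.out.println(toIntsRefSketch("abc", 0).length); // 3 bytes
    System.out.println(toIntsRefSketch("abc", 1).length); // 3 code points
  }
}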
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache: class TestAutomaton, method testGetSingletonEmptyString.
public void testGetSingletonEmptyString() {
  Automaton a = new Automaton();
  int s = a.createState();
  a.setAccept(s, true);
  a.finishState();
  assertEquals(new IntsRef(), Operations.getSingleton(a));
}
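For context on the assertion above: Operations.getSingleton returns the unique accepted string as an IntsRef when the automaton accepts exactly one input, and null otherwise, so an empty IntsRef represents the empty string. A short sketch, using only Automata/Operations calls already seen in this section plus Automata.makeCharRange:

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class GetSingletonSketch {
  public static void main(String[] args) {
    // exactly one accepted string: getSingleton returns its code points
    Automaton single = Automata.makeString("ab");
    IntsRef ref = Operations.getSingleton(single);
    System.out.println(ref.length); // 2
    // more than one accepted string ("a" and "b"): not a singleton, so null
    Automaton range = Automata.makeCharRange('a', 'b');
    System.out.println(Operations.getSingleton(range)); // null
  }
}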
Use of org.apache.lucene.util.IntsRef in project lucene-solr by apache: class FiniteStringsIteratorTest, method testFiniteStringsEatsStack.
public void testFiniteStringsEatsStack() {
  char[] chars = new char[50000];
  TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
  String bigString1 = new String(chars);
  TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
  String bigString2 = new String(chars);
  Automaton a = Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2));
  FiniteStringsIterator iterator = new FiniteStringsIterator(a);
  List<IntsRef> actual = getFiniteStrings(iterator);
  assertEquals(2, actual.size());
  IntsRefBuilder scratch = new IntsRefBuilder();
  Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch);
  assertTrue(actual.contains(scratch.get()));
  Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch);
  assertTrue(actual.contains(scratch.get()));
}
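getFiniteStrings above is a helper defined in FiniteStringsIteratorTest. A plausible stand-alone version (a sketch, not the test's exact code) drains the iterator into a list, deep-copying each IntsRef because the iterator reuses its scratch instance:

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.FiniteStringsIterator;

public class GetFiniteStringsSketch {
  static List<IntsRef> getFiniteStrings(FiniteStringsIterator iterator) {
    List<IntsRef> result = new ArrayList<>();
    for (IntsRef string; (string = iterator.next()) != null; ) {
      result.add(IntsRef.deepCopyOf(string)); // copy before storing
    }
    return result;
  }
}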