Search in sources:

Example 16 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FuzzyCompletionQuery method createWeight.

/**
 * Creates the weight for this fuzzy completion query.
 *
 * <p>The query term is analyzed into a {@link CompletionTokenStream}, the stream's automaton is
 * expanded via {@link #toLevenshteinAutomata(Automaton, Set)}, and, when {@code unicodeAware} is
 * set, the result is run through {@link UTF32ToUTF8} and determinized again before being handed
 * to the weight.
 *
 * @param searcher    not read by this method
 * @param needsScores not read by this method
 * @param boost       not read by this method
 * @return a {@code FuzzyCompletionWeight} built from the fuzzy automaton and the set of analyzed
 *         paths collected while building it
 * @throws IOException declared by the overridden method
 */
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    final CompletionTokenStream tokens = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
    // Filled in by toLevenshteinAutomata with a copy of every finite string of the analyzed automaton.
    final Set<IntsRef> analyzedPaths = new HashSet<>();
    final Automaton fuzzy = toLevenshteinAutomata(tokens.toAutomaton(unicodeAware), analyzedPaths);
    // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
    if (!unicodeAware) {
        return new FuzzyCompletionWeight(this, fuzzy, analyzedPaths);
    }
    // Unicode-aware automata are converted with UTF32ToUTF8 and determinized under the state limit.
    final Automaton utf8 = Operations.determinize(new UTF32ToUTF8().convert(fuzzy), maxDeterminizedStates);
    return new FuzzyCompletionWeight(this, utf8, analyzedPaths);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef) HashSet(java.util.HashSet) UTF32ToUTF8(org.apache.lucene.util.automaton.UTF32ToUTF8)

Example 17 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FuzzyCompletionQuery method toLevenshteinAutomata.

/**
 * Builds an automaton that fuzzily matches every finite string accepted by {@code automaton}.
 *
 * <p>Each finite string is copied into {@code refs}. Strings no longer than
 * {@code nonFuzzyPrefix}, or shorter than {@code minFuzzyLength}, are matched exactly; for all
 * others the first {@code nonFuzzyPrefix} ints are kept verbatim and the remainder is matched
 * within {@code maxEdits} edits.
 *
 * @param automaton automaton whose finite strings are enumerated
 * @param refs      receives a deep copy of every enumerated string
 * @return the empty automaton when there are no strings, the single sub-automaton when there is
 *         exactly one, otherwise the determinized union of all sub-automata
 */
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
    final List<Automaton> perPath = new ArrayList<>();
    final FiniteStringsIterator paths = new FiniteStringsIterator(automaton);
    for (IntsRef path = paths.next(); path != null; path = paths.next()) {
        // Store a deep copy rather than the instance handed out by the iterator.
        refs.add(IntsRef.deepCopyOf(path));
        final boolean fuzzy = path.length > nonFuzzyPrefix && path.length >= minFuzzyLength;
        if (fuzzy) {
            // Only the part after the non-fuzzy prefix is subject to edits.
            final int[] suffix = new int[path.length - nonFuzzyPrefix];
            System.arraycopy(path.ints, path.offset + nonFuzzyPrefix, suffix, 0, suffix.length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            final int alphaMax = unicodeAware ? Character.MAX_CODE_POINT : 255;
            final LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, alphaMax, transpositions);
            final String exactPrefix = UnicodeUtil.newString(path.ints, path.offset, nonFuzzyPrefix);
            perPath.add(levenshtein.toAutomaton(maxEdits, exactPrefix));
        } else {
            // Too short for fuzzy matching: accept exactly this string.
            perPath.add(Automata.makeString(path.ints, path.offset, path.length));
        }
    }
    switch (perPath.size()) {
        case 0:
            // matches nothing
            return Automata.makeEmpty();
        case 1:
            // no synonyms or anything: just a single path through the tokenstream
            return perPath.get(0);
        default:
            // multiple paths: this is really scary! is it slow?
            // maybe we should not do this and throw UOE?
            // this only happens if you have multiple paths anyway (e.g. synonyms)
            return Operations.determinize(Operations.union(perPath), maxDeterminizedStates);
    }
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 18 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestFSTs method testRandomWords.

/**
 * Runs {@code doTest} repeatedly over randomly generated sets of distinct terms.
 *
 * <p>For each iteration and for each input mode (0 and 1), a random number of words in
 * {@code [0, maxNumWords]} is chosen, that many distinct random terms are generated and
 * converted with {@code toIntsRef}, and the resulting terms are passed to {@code doTest}.
 *
 * @param maxNumWords inclusive upper bound on the number of terms per run
 * @param numIter     number of outer iterations
 * @throws IOException if {@code doTest} fails with an I/O error
 */
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
    Random random = new Random(random().nextLong());
    for (int iter = 0; iter < numIter; iter++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter " + iter);
        }
        for (int inputMode = 0; inputMode < 2; inputMode++) {
            final int numWords = random.nextInt(maxNumWords + 1);
            // A set is used so that duplicate random terms are dropped and the loop keeps
            // generating until exactly numWords distinct terms exist.
            Set<IntsRef> termsSet = new HashSet<>();
            while (termsSet.size() < numWords) {
                final String term = getRandomString(random);
                termsSet.add(toIntsRef(term, inputMode));
            }
            doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
        }
    }
}
Also used : Random(java.util.Random) IntsRef(org.apache.lucene.util.IntsRef) FSTTester.toIntsRef(org.apache.lucene.util.fst.FSTTester.toIntsRef) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) HashSet(java.util.HashSet)

Example 19 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class TestAutomaton method testGetSingletonEmptyString.

/**
 * Checks that {@code Operations.getSingleton} returns an empty {@link IntsRef} for an automaton
 * consisting of a single accepting state with no transitions.
 */
public void testGetSingletonEmptyString() {
    final Automaton emptyStringOnly = new Automaton();
    final int onlyState = emptyStringOnly.createState();
    emptyStringOnly.setAccept(onlyState, true);
    emptyStringOnly.finishState();

    final IntsRef expected = new IntsRef();
    assertEquals(expected, Operations.getSingleton(emptyStringOnly));
}
Also used : IntsRef(org.apache.lucene.util.IntsRef)

Example 20 with IntsRef

use of org.apache.lucene.util.IntsRef in project lucene-solr by apache.

the class FiniteStringsIteratorTest method testFiniteStringsEatsStack.

/**
 * Enumerates the finite strings of the union of two automata, each accepting one random
 * 50000-char string, and checks that exactly those two strings come back.
 *
 * <p>NOTE(review): judging by the test name, the long inputs are presumably meant to catch
 * stack exhaustion during enumeration; confirm against the iterator implementation.
 */
public void testFiniteStringsEatsStack() {
    final char[] buffer = new char[50000];
    final String[] inputs = new String[2];
    for (int i = 0; i < inputs.length; i++) {
        // The buffer is refilled in place; each String takes its own copy of the contents.
        TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
        inputs[i] = new String(buffer);
    }

    final Automaton either = Operations.union(Automata.makeString(inputs[0]), Automata.makeString(inputs[1]));
    final List<IntsRef> found = getFiniteStrings(new FiniteStringsIterator(either));
    assertEquals(2, found.size());

    // One scratch builder is reused for both expected values.
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (String input : inputs) {
        Util.toUTF32(input.toCharArray(), 0, input.length(), scratch);
        assertTrue(found.contains(scratch.get()));
    }
}
Also used : IntsRef(org.apache.lucene.util.IntsRef) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder)

Aggregations

IntsRef (org.apache.lucene.util.IntsRef)63 BytesRef (org.apache.lucene.util.BytesRef)19 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)19 HashSet (java.util.HashSet)16 ArrayList (java.util.ArrayList)13 Automaton (org.apache.lucene.util.automaton.Automaton)13 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)12 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)9 IOException (java.io.IOException)7 Directory (org.apache.lucene.store.Directory)7 HashMap (java.util.HashMap)5 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)5 BytesReader (org.apache.lucene.util.fst.FST.BytesReader)5 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 FilterInputStream (java.io.FilterInputStream)4 InputStream (java.io.InputStream)4 Map (java.util.Map)4 Random (java.util.Random)4 TokenStream (org.apache.lucene.analysis.TokenStream)4