Search in sources :

Example 21 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class FuzzyCompletionQuery method toLevenshteinAutomata.

/**
 * Builds a single automaton covering every finite string accepted by {@code automaton},
 * turning each sufficiently long string into a Levenshtein automaton.
 *
 * <p>A deep copy of every enumerated string is added to {@code refs}. A string whose length is
 * at most {@code nonFuzzyPrefix}, or below {@code minFuzzyLength}, is matched exactly; any other
 * string keeps its first {@code nonFuzzyPrefix} ints as a literal prefix and allows up to
 * {@code maxEdits} edits on the remainder.
 *
 * @param automaton source automaton whose finite strings are enumerated
 * @param refs      receives a deep copy of each enumerated string
 * @return an empty automaton when there are no strings, the sole sub-automaton when there is
 *         exactly one, otherwise the determinized union of all sub-automata
 */
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
    final List<Automaton> perPath = new ArrayList<>();
    final FiniteStringsIterator paths = new FiniteStringsIterator(automaton);
    IntsRef path = paths.next();
    while (path != null) {
        refs.add(IntsRef.deepCopyOf(path));
        final boolean fuzzy = path.length > nonFuzzyPrefix && path.length >= minFuzzyLength;
        if (fuzzy) {
            // Only the part after the exact-match prefix is subject to edits.
            final int suffixLength = path.length - nonFuzzyPrefix;
            final int[] suffix = new int[suffixLength];
            System.arraycopy(path.ints, path.offset + nonFuzzyPrefix, suffix, 0, suffixLength);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            final int alphaMax = unicodeAware ? Character.MAX_CODE_POINT : 255;
            final LevenshteinAutomata lev = new LevenshteinAutomata(suffix, alphaMax, transpositions);
            final String exactPrefix = UnicodeUtil.newString(path.ints, path.offset, nonFuzzyPrefix);
            perPath.add(lev.toAutomaton(maxEdits, exactPrefix));
        } else {
            // Too short to fuzz: accept the string verbatim.
            perPath.add(Automata.makeString(path.ints, path.offset, path.length));
        }
        path = paths.next();
    }
    if (perPath.isEmpty()) {
        // matches nothing
        return Automata.makeEmpty();
    }
    if (perPath.size() == 1) {
        // no synonyms or anything: just a single path through the tokenstream
        return perPath.get(0);
    }
    // multiple paths: this is really scary! is it slow?
    // maybe we should not do this and throw UOE?
    // this only happens if you have multiple paths anyway (e.g. synonyms)
    final Automaton merged = Operations.union(perPath);
    return Operations.determinize(merged, maxDeterminizedStates);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) Automaton(org.apache.lucene.util.automaton.Automaton) LevenshteinAutomata(org.apache.lucene.util.automaton.LevenshteinAutomata) ArrayList(java.util.ArrayList) IntsRef(org.apache.lucene.util.IntsRef)

Example 22 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class CompletionTokenStream method replaceSep.

/**
 * Copies {@code a} into a new automaton while rewriting separator and hole transitions.
 *
 * <p>A transition labeled {@code TokenStreamToAutomaton.POS_SEP} becomes a transition labeled
 * {@code sepLabel} when {@code preserveSep} is true, and an epsilon otherwise. A transition
 * labeled {@code TokenStreamToAutomaton.HOLE} always becomes an epsilon. Every other transition
 * is copied with its original {@code min}/{@code max} range.
 *
 * @param a           automaton to rewrite; it is only read
 * @param preserveSep whether separators are kept (remapped) rather than dropped
 * @param sepLabel    label used for preserved separators
 * @return the rewritten automaton, after {@code finishState()} has been called on it
 */
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
    final Automaton rewritten = new Automaton();
    // Mirror every state (and its accept flag) before adding any transition.
    final int stateCount = a.getNumStates();
    for (int id = 0; id < stateCount; id++) {
        rewritten.createState();
        rewritten.setAccept(id, a.isAccept(id));
    }
    // Walk the topological order backwards so that a single pass suffices
    // (presumably because an epsilon needs its destination already populated).
    final Transition cursor = new Transition();
    final int[] order = Operations.topoSortStates(a);
    for (int idx = order.length - 1; idx >= 0; idx--) {
        final int source = order[idx];
        int remaining = a.initTransition(source, cursor);
        while (remaining-- > 0) {
            a.getNextTransition(cursor);
            if (cursor.min == TokenStreamToAutomaton.POS_SEP) {
                assert cursor.max == TokenStreamToAutomaton.POS_SEP;
                if (!preserveSep) {
                    rewritten.addEpsilon(source, cursor.dest);
                } else {
                    // Remap to SEP_LABEL:
                    rewritten.addTransition(source, cursor.dest, sepLabel);
                }
                continue;
            }
            if (cursor.min == TokenStreamToAutomaton.HOLE) {
                assert cursor.max == TokenStreamToAutomaton.HOLE;
                // Just remove the hole: there will then be two
                // SEP tokens next to each other, which will only
                // match another hole at search time.  Note that
                // it will also match an empty-string token ... if
                // that's somehow a problem we can always map HOLE
                // to a dedicated byte (and escape it in the
                // input).
                rewritten.addEpsilon(source, cursor.dest);
                continue;
            }
            // Ordinary label range: copy unchanged.
            rewritten.addTransition(source, cursor.dest, cursor.min, cursor.max);
        }
    }
    rewritten.finishState();
    return rewritten;
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Transition(org.apache.lucene.util.automaton.Transition)

Example 23 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class TestFSTs method testRealTerms.

// Build FST for all unique terms in the test line docs
// file, up until a doc limit.
//
// The test indexes documents from LineFileDocs, adds every term of the
// "body" field to an FST (mapped to either its ordinal or its docFreq),
// and then checks that seeking/stepping a BytesRefFSTEnum over that FST
// stays in agreement with the index's TermsEnum.
public void testRealTerms() throws Exception {
    final LineFileDocs docs = new LineFileDocs(random());
    final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final Path tempDir = createTempDir("fstlines");
    final Directory dir = newFSDirectory(tempDir);
    final IndexWriter writer = new IndexWriter(dir, conf);
    Document doc;
    int docCount = 0;
    // Index documents until the source is exhausted or numDocs have been added.
    while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
        writer.addDocument(doc);
        docCount++;
    }
    // Open the reader from the writer before closing the writer.
    IndexReader r = DirectoryReader.open(writer);
    writer.close();
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
    // Randomly choose whether each term maps to its ordinal or to its docFreq.
    boolean storeOrd = random().nextBoolean();
    if (VERBOSE) {
        if (storeOrd) {
            System.out.println("FST stores ord");
        } else {
            System.out.println("FST stores docFreq");
        }
    }
    Terms terms = MultiFields.getTerms(r, "body");
    // Everything below is skipped when the "body" field has no terms.
    if (terms != null) {
        final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
        final TermsEnum termsEnum = terms.iterator();
        if (VERBOSE) {
            System.out.println("TEST: got termsEnum=" + termsEnum);
        }
        BytesRef term;
        int ord = 0;
        // A match-everything automaton: the intersected enum is expected to
        // yield the same terms and statistics as the plain iterator.
        Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
        final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
        while ((term = termsEnum.next()) != null) {
            BytesRef term2 = termsEnum2.next();
            assertNotNull(term2);
            assertEquals(term, term2);
            assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
            assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
            // On the first term only, probe whether ord() is supported; if it
            // throws, fall back to storing docFreq for the whole FST.
            if (ord == 0) {
                try {
                    termsEnum.ord();
                } catch (UnsupportedOperationException uoe) {
                    if (VERBOSE) {
                        System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
                    }
                    storeOrd = false;
                }
            }
            final int output;
            if (storeOrd) {
                output = ord;
            } else {
                output = termsEnum.docFreq();
            }
            builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
            ord++;
            if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
                System.out.println(ord + " terms...");
            }
        }
        FST<Long> fst = builder.finish();
        if (VERBOSE) {
            System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
        }
        // The seek checks only run when at least one term was added.
        if (ord > 0) {
            // Private Random seeded from the test's random source.
            final Random random = new Random(random().nextLong());
            // Now confirm BytesRefFSTEnum and TermsEnum act the
            // same:
            final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
            int num = atLeast(1000);
            for (int iter = 0; iter < num; iter++) {
                final BytesRef randomTerm = new BytesRef(getRandomString(random));
                if (VERBOSE) {
                    System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
                }
                // Seek both enums to the same random target.
                final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
                final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
                if (seekResult == TermsEnum.SeekStatus.END) {
                    // TermsEnum ran off the end, so the FST enum must report no result.
                    assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
                } else {
                    // NOTE(review): this three-argument assertSame is a helper defined outside
                    // this excerpt; presumably it compares the enums' current term and output.
                    assertSame(termsEnum, fstEnum, storeOrd);
                    // Step both enums forward up to 10 times, stopping early at the end.
                    for (int nextIter = 0; nextIter < 10; nextIter++) {
                        if (VERBOSE) {
                            System.out.println("TEST: next");
                            if (storeOrd) {
                                System.out.println("  ord=" + termsEnum.ord());
                            }
                        }
                        if (termsEnum.next() != null) {
                            if (VERBOSE) {
                                System.out.println("  term=" + termsEnum.term().utf8ToString());
                            }
                            assertNotNull(fstEnum.next());
                            assertSame(termsEnum, fstEnum, storeOrd);
                        } else {
                            if (VERBOSE) {
                                System.out.println("  end!");
                            }
                            // TermsEnum is exhausted; the FST enum must be exhausted too.
                            BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
                            if (nextResult != null) {
                                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                                fail();
                            }
                            break;
                        }
                    }
                }
            }
        }
    }
    r.close();
    dir.close();
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Random(java.util.Random) BytesRef(org.apache.lucene.util.BytesRef) LineFileDocs(org.apache.lucene.util.LineFileDocs) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) RegExp(org.apache.lucene.util.automaton.RegExp) Terms(org.apache.lucene.index.Terms) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexReader(org.apache.lucene.index.IndexReader) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 24 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class BaseTokenStreamTestCase method assertGraphStrings.

/**
 * Asserts that the token graph created by the already initialized {@link TokenStream} accepts
 * exactly the given strings.
 *
 * <p>The stream is converted to an automaton, all of its finite strings are enumerated, and each
 * is decoded as UTF-8 with {@code TokenStreamToAutomaton.POS_SEP} replaced by a space. The
 * assertion fails if the graph yields a string that is not expected, or if an expected string
 * is not yielded.
 *
 * @param tokenStream     the already initialized stream to inspect
 * @param expectedStrings every string path the graph must accept, and no others
 * @throws IOException if converting the token stream to an automaton fails
 */
public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
    Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set<String> actualStrings = new HashSet<>();
    for (IntsRef ir : actualStringPaths) {
        actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    // Every produced path must be one of the expected strings.
    for (String s : actualStrings) {
        assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s));
    }
    // Every expected string must have been produced; the message names the missing path
    // (it previously repeated the "unexpected string path" wording of the loop above).
    for (String s : expectedStrings) {
        assertTrue("Analyzer did not create expected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s));
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef)

Example 25 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XAnalyzingSuggester method replaceSep.

/**
 * Returns a copy of {@code a} in which separator and hole transitions have been rewritten.
 *
 * <p>A {@code TokenStreamToAutomaton.POS_SEP} transition is remapped to {@code SEP_LABEL} when
 * {@code preserveSep} is set, and replaced by an epsilon otherwise. A
 * {@code TokenStreamToAutomaton.HOLE} transition is always replaced by an epsilon. All other
 * transitions are copied with their original range.
 *
 * @param a automaton to rewrite; it is only read
 * @return the rewritten automaton, after {@code finishState()} has been called on it
 */
private Automaton replaceSep(Automaton a) {
    final Automaton copy = new Automaton();
    // Create the same number of states, carrying over each accept flag.
    final int total = a.getNumStates();
    for (int id = 0; id < total; id++) {
        copy.createState();
        copy.setAccept(id, a.isAccept(id));
    }
    // Go in reverse topo sort so we know we only have to
    // make one pass:
    final Transition scratch = new Transition();
    final int[] sorted = topoSortStates(a);
    for (int pos = sorted.length - 1; pos >= 0; pos--) {
        final int from = sorted[pos];
        final int outgoing = a.initTransition(from, scratch);
        for (int n = 0; n < outgoing; n++) {
            a.getNextTransition(scratch);
            final int label = scratch.min;
            if (label == TokenStreamToAutomaton.POS_SEP) {
                assert scratch.max == TokenStreamToAutomaton.POS_SEP;
                if (!preserveSep) {
                    copy.addEpsilon(from, scratch.dest);
                } else {
                    // Remap to SEP_LABEL:
                    copy.addTransition(from, scratch.dest, SEP_LABEL);
                }
            } else if (label == TokenStreamToAutomaton.HOLE) {
                assert scratch.max == TokenStreamToAutomaton.HOLE;
                // Just remove the hole: there will then be two
                // SEP tokens next to each other, which will only
                // match another hole at search time.  Note that
                // it will also match an empty-string token ... if
                // that's somehow a problem we can always map HOLE
                // to a dedicated byte (and escape it in the
                // input).
                copy.addEpsilon(from, scratch.dest);
            } else {
                // Ordinary label range: copy unchanged.
                copy.addTransition(from, scratch.dest, scratch.min, scratch.max);
            }
        }
    }
    copy.finishState();
    return copy;
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Transition(org.apache.lucene.util.automaton.Transition)

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton): 57; TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 17; IntsRef (org.apache.lucene.util.IntsRef): 13; BytesRef (org.apache.lucene.util.BytesRef): 12; ArrayList (java.util.ArrayList): 11; Directory (org.apache.lucene.store.Directory): 8; HashSet (java.util.HashSet): 7; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 7; Document (org.apache.lucene.document.Document): 6; CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 6; Transition (org.apache.lucene.util.automaton.Transition): 6; TokenStream (org.apache.lucene.analysis.TokenStream): 5; BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 5; CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 5; CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton): 5; Analyzer (org.apache.lucene.analysis.Analyzer): 4; IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 4; FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator): 4; LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata): 4; RegExp (org.apache.lucene.util.automaton.RegExp): 4