Example 16 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

From the class FuzzySuggester, the method convertAutomaton:

@Override
protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
        // Convert the code-point (UTF-32) labeled automaton into a byte
        // (UTF-8) labeled one, so it can be intersected with the byte-based suggest FST:
        Automaton utf8automaton = new UTF32ToUTF8().convert(a);
        // The conversion can introduce non-determinism; re-determinize with a
        // bounded number of states:
        utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
        return utf8automaton;
    } else {
        return a;
    }
}
Also used: Automaton (org.apache.lucene.util.automaton.Automaton), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton), UTF32ToUTF8 (org.apache.lucene.util.automaton.UTF32ToUTF8)
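
For context, the same conversion pattern can be exercised on its own. The following is a minimal sketch, assuming only the automaton package; the class name and sample string are illustrative, not part of the Lucene source:

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

public class Utf8ConversionSketch {
    public static void main(String[] args) {
        // A code-point (UTF-32) labeled automaton accepting the single string "café":
        Automaton codePoints = Automata.makeString("café");
        // Convert to a byte (UTF-8) labeled automaton, as convertAutomaton does
        // when unicodeAware is true:
        Automaton utf8 = new UTF32ToUTF8().convert(codePoints);
        // The conversion can introduce non-determinism; determinize with a
        // bounded state count to guard against blowup:
        utf8 = Operations.determinize(utf8, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
        System.out.println("deterministic=" + utf8.isDeterministic());
    }
}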

Example 17 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

From the class FuzzySuggester, the method toLevenshteinAutomata:

Automaton toLevenshteinAutomata(Automaton automaton) {
    List<Automaton> subs = new ArrayList<>();
    FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
    for (IntsRef string; (string = finiteStrings.next()) != null; ) {
        if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
            subs.add(Automata.makeString(string.ints, string.offset, string.length));
        } else {
            int[] ints = new int[string.length - nonFuzzyPrefix];
            System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
            subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
        }
    }
    if (subs.isEmpty()) {
        // matches nothing
        return Automata.makeEmpty();
    } else if (subs.size() == 1) {
        // no synonyms or anything: just a single path through the tokenstream
        return subs.get(0);
    } else {
        // multiple paths: this is really scary! is it slow?
        // maybe we should not do this and throw UOE?
        Automaton a = Operations.union(subs);
        // this only happens if you have multiple paths anyway (e.g. synonyms)
        return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
    }
}
Also used: FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator), Automaton (org.apache.lucene.util.automaton.Automaton), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton), LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata), ArrayList (java.util.ArrayList), IntsRef (org.apache.lucene.util.IntsRef)
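
The core building block above is LevenshteinAutomata. A minimal standalone sketch, with the term and edit distance chosen purely for illustration:

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;

public class LevenshteinSketch {
    public static void main(String[] args) {
        // Automaton accepting every string within 1 edit of "kitten",
        // counting a transposition as a single edit:
        LevenshteinAutomata lev = new LevenshteinAutomata("kitten", true);
        Automaton within1 = lev.toAutomaton(1);
        System.out.println(Operations.run(within1, "kittne"));  // true: one transposition
        System.out.println(Operations.run(within1, "sitting")); // false: three edits away
    }
}

The two-argument toAutomaton(maxEdits, prefix) overload used in toLevenshteinAutomata additionally pins an exact prefix in front of the fuzzy part, which is how the nonFuzzyPrefix characters are kept edit-free.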

Example 18 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

From the class AnalyzingSuggester, the method lookup:

@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == 0x1E) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == 0x1F) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // NOTE: we could almost get away with only using the first start
            // node, except that maxSurfaceFormsPerAnalyzedForm may have
            // pruned our exact match from that node:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            assert completions.isComplete;
            // Linear scan over the exact completions to find the one whose
            // surface form equals the input; the scan is bounded by the number
            // of prefix start nodes times maxSurfaceFormsPerAnalyzedForm:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Dedup: the analyzer can map multiple surface forms to the
                // same analyzed form, so the search can get duplicate surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // In exactFirst mode, don't let the exact match
                    // create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // This is the exact match, which means we
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        assert completions.isComplete;
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // In the exactFirst=true case the search may
                // produce one extra path
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}
Also used: ArrayList (java.util.ArrayList), Util (org.apache.lucene.util.fst.Util), CodecUtil (org.apache.lucene.codecs.CodecUtil), ArrayUtil (org.apache.lucene.util.ArrayUtil), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), IntsRef (org.apache.lucene.util.IntsRef), BytesRef (org.apache.lucene.util.BytesRef), Pair (org.apache.lucene.util.fst.PairOutputs.Pair), HashSet (java.util.HashSet), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton), Automaton (org.apache.lucene.util.automaton.Automaton), FST (org.apache.lucene.util.fst.FST), IOException (java.io.IOException), BytesReader (org.apache.lucene.util.fst.FST.BytesReader)
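
From the caller's side, lookup is used roughly as follows. This is a sketch only: the three-argument AnalyzingSuggester constructor matches recent lucene-solr versions, and the temp directory path and lookup key are assumptions:

import java.nio.file.Paths;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.FSDirectory;

public class LookupSketch {
    public static void main(String[] args) throws Exception {
        AnalyzingSuggester suggester = new AnalyzingSuggester(
                FSDirectory.open(Paths.get("/tmp/suggest-tmp")), "suggest",
                new StandardAnalyzer());
        // A real application calls suggester.build(...) with an InputIterator
        // over its suggestion corpus first; without it the internal FST is
        // null and, per the guard above, lookup returns an empty list.
        // Per the argument checks above, onlyMorePopular must be false and
        // contexts must be null:
        List<LookupResult> hits = suggester.lookup("app", null, false, 5);
        for (LookupResult hit : hits) {
            System.out.println(hit.key + " weight=" + hit.value);
        }
    }
}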

Example 19 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

From the class RandomPostingsTester, the method testTermsOneThread:

private void testTermsOneThread(Random random, Fields fieldsSource, EnumSet<Option> options, IndexOptions maxTestOptions, IndexOptions maxIndexOptions, boolean alwaysTestMax) throws IOException {
    ThreadState threadState = new ThreadState();
    // Test random terms/fields:
    List<TermState> termStates = new ArrayList<>();
    List<FieldAndTerm> termStateTerms = new ArrayList<>();
    boolean supportsOrds = true;
    Collections.shuffle(allTerms, random);
    int upto = 0;
    while (upto < allTerms.size()) {
        boolean useTermState = termStates.size() != 0 && random.nextInt(5) == 1;
        boolean useTermOrd = supportsOrds && useTermState == false && random.nextInt(5) == 1;
        FieldAndTerm fieldAndTerm;
        TermsEnum termsEnum;
        TermState termState = null;
        if (!useTermState) {
            // Seek by random field+term:
            fieldAndTerm = allTerms.get(upto++);
            if (LuceneTestCase.VERBOSE) {
                if (useTermOrd) {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
                } else {
                    System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
                }
            }
        } else {
            // Seek by previous saved TermState
            int idx = random.nextInt(termStates.size());
            fieldAndTerm = termStateTerms.get(idx);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("\nTEST: seek using TermState to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString());
            }
            termState = termStates.get(idx);
        }
        Terms terms = fieldsSource.terms(fieldAndTerm.field);
        assertNotNull(terms);
        termsEnum = terms.iterator();
        if (!useTermState) {
            if (useTermOrd) {
                // Try seek by ord sometimes:
                try {
                    termsEnum.seekExact(fieldAndTerm.ord);
                } catch (UnsupportedOperationException uoe) {
                    supportsOrds = false;
                    assertTrue(termsEnum.seekExact(fieldAndTerm.term));
                }
            } else {
                assertTrue(termsEnum.seekExact(fieldAndTerm.term));
            }
        } else {
            termsEnum.seekExact(fieldAndTerm.term, termState);
        }
        // Check that the seek really landed on the right term:
        assertEquals(fieldAndTerm.term, termsEnum.term());
        long termOrd;
        if (supportsOrds) {
            try {
                termOrd = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                termOrd = -1;
            }
        } else {
            termOrd = -1;
        }
        if (termOrd != -1) {
            // PostingsFormat supports ords
            assertEquals(fieldAndTerm.ord, termsEnum.ord());
        }
        boolean savedTermState = false;
        if (options.contains(Option.TERM_STATE) && !useTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
            savedTermState = true;
        }
        verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        // Sometimes save term state after pulling the enum:
        if (options.contains(Option.TERM_STATE) && !useTermState && !savedTermState && random.nextInt(5) == 1) {
            // Save away this TermState:
            termStates.add(termsEnum.termState());
            termStateTerms.add(fieldAndTerm);
            useTermState = true;
        }
        // 10% of the time make sure we can pull another enum
        // from the same term:
        if (alwaysTestMax || random.nextInt(10) == 7) {
            // Try same term again
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: try enum again on same term");
            }
            verifyEnum(random, threadState, fieldAndTerm.field, fieldAndTerm.term, termsEnum, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
        }
    }
    // Test Terms.intersect:
    for (String field : fields.keySet()) {
        while (true) {
            Automaton a = AutomatonTestUtil.randomAutomaton(random);
            CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
            if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // Keep retrying until we get an A that will really "use" the PF's intersect code:
                continue;
            }
            // System.out.println("A:\n" + a.toDot());
            BytesRef startTerm = null;
            if (random.nextBoolean()) {
                RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
                for (int iter = 0; iter < 100; iter++) {
                    int[] codePoints = ras.getRandomAcceptedString(random);
                    if (codePoints.length == 0) {
                        continue;
                    }
                    startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
                    break;
                }
                // Don't allow empty string startTerm:
                if (startTerm == null) {
                    continue;
                }
            }
            TermsEnum intersected = fieldsSource.terms(field).intersect(ca, startTerm);
            Set<BytesRef> intersectedTerms = new HashSet<BytesRef>();
            BytesRef term;
            while ((term = intersected.next()) != null) {
                if (startTerm != null) {
                    // NOTE: not <=
                    assertTrue(startTerm.compareTo(term) < 0);
                }
                intersectedTerms.add(BytesRef.deepCopyOf(term));
                verifyEnum(random, threadState, field, term, intersected, maxTestOptions, maxIndexOptions, options, alwaysTestMax);
            }
            if (ca.runAutomaton == null) {
                assertTrue(intersectedTerms.isEmpty());
            } else {
                for (BytesRef term2 : fields.get(field).keySet()) {
                    boolean expected;
                    if (startTerm != null && startTerm.compareTo(term2) >= 0) {
                        expected = false;
                    } else {
                        expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
                    }
                    assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
                }
            }
            break;
        }
    }
}
Also used: CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), Automaton (org.apache.lucene.util.automaton.Automaton), RandomAcceptedStrings (org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings), ArrayList (java.util.ArrayList), BytesRef (org.apache.lucene.util.BytesRef), HashSet (java.util.HashSet)
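
The same Terms.intersect machinery can be driven by a hand-built automaton instead of a random one. A minimal sketch, assuming the lucene-solr era of these examples (where MultiFields and RAMDirectory were still current); the field name and terms are illustrative:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class IntersectSketch {
    public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();
        try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
            for (String s : new String[] { "foo", "food", "bar" }) {
                Document doc = new Document();
                doc.add(new StringField("f", s, Field.Store.NO));
                w.addDocument(doc);
            }
        }
        try (DirectoryReader r = DirectoryReader.open(dir)) {
            // Compile a regexp into a NORMAL-type CompiledAutomaton, the only
            // type Terms.intersect accepts (mirroring the check in the test):
            CompiledAutomaton ca = new CompiledAutomaton(new RegExp("fo.*").toAutomaton());
            Terms terms = MultiFields.getTerms(r, "f");
            TermsEnum intersected = terms.intersect(ca, null); // null startTerm: start from the first term
            BytesRef term;
            while ((term = intersected.next()) != null) {
                System.out.println(term.utf8ToString()); // prints "foo" then "food"
            }
        }
    }
}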

Example 20 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

From the class FuzzyCompletionQuery, the method createWeight:

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
    Set<IntsRef> refs = new HashSet<>();
    Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
    if (unicodeAware) {
        // Mirror the convertAutomaton step shown above: convert the code-point
        // automaton to UTF-8 bytes and re-determinize with a bounded state count:
        Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
        utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
        automaton = utf8automaton;
    }
    // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
    return new FuzzyCompletionWeight(this, automaton, refs);
}
Also used: Automaton (org.apache.lucene.util.automaton.Automaton), IntsRef (org.apache.lucene.util.IntsRef), HashSet (java.util.HashSet), UTF32ToUTF8 (org.apache.lucene.util.automaton.UTF32ToUTF8)
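
For completeness, constructing the query that drives createWeight looks roughly like this. A sketch only: the field name and input are illustrative, and actually executing the query requires an index whose field was written as a SuggestField under a completion postings format:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.suggest.document.FuzzyCompletionQuery;

public class FuzzyCompletionSketch {
    public static void main(String[] args) {
        // The two-argument constructor applies the defaults: maxEdits=1,
        // transpositions=true, nonFuzzyPrefix=1, minFuzzyLength=3, unicodeAware=false:
        FuzzyCompletionQuery query = new FuzzyCompletionQuery(
                new StandardAnalyzer(), new Term("suggest_field", "aple"));
        System.out.println(query);
        // Executing it (API varies slightly by version):
        //   SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
        //   TopSuggestDocs hits = searcher.suggest(query, 5, true);
    }
}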

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton): 57 uses
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 17 uses
IntsRef (org.apache.lucene.util.IntsRef): 13 uses
BytesRef (org.apache.lucene.util.BytesRef): 12 uses
ArrayList (java.util.ArrayList): 11 uses
Directory (org.apache.lucene.store.Directory): 8 uses
HashSet (java.util.HashSet): 7 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 7 uses
Document (org.apache.lucene.document.Document): 6 uses
CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 6 uses
Transition (org.apache.lucene.util.automaton.Transition): 6 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 5 uses
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 5 uses
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 5 uses
CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton): 5 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 4 uses
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 4 uses
FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator): 4 uses
LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata): 4 uses
RegExp (org.apache.lucene.util.automaton.RegExp): 4 uses