Search in sources :

Example 31 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class TransportReindexAction method buildRemoteWhitelist.

/**
 * Compiles the reindex-from-remote whitelist into a {@link CharacterRunAutomaton}, refusing any
 * whitelist that would match every address.
 *
 * @param whitelist the whitelist entries; they are handed to {@code Regex.simpleMatchToAutomaton}
 * @return an automaton built from {@code Automata.makeEmpty()} when the whitelist is empty, otherwise
 *         the minimized automaton for the given entries
 * @throws IllegalArgumentException if the minimized automaton is total, i.e. the whitelist accepts all addresses
 */
static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) {
    final Automaton accepted;
    if (whitelist.isEmpty()) {
        // No entries: use the empty automaton.
        accepted = Automata.makeEmpty();
    } else {
        final String[] patterns = whitelist.toArray(Strings.EMPTY_ARRAY);
        accepted = MinimizationOperations.minimize(
            Regex.simpleMatchToAutomaton(patterns),
            Operations.DEFAULT_MAX_DETERMINIZED_STATES
        );
        // A total automaton accepts every string, which would defeat the purpose of a whitelist.
        if (Operations.isTotal(accepted)) {
            final String message = "Refusing to start because whitelist " + whitelist
                + " accepts all addresses. This would allow users to reindex-from-remote any URL they like"
                + " effectively having Elasticsearch make HTTP GETs for them.";
            throw new IllegalArgumentException(message);
        }
    }
    return new CharacterRunAutomaton(accepted);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 32 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class CompletionTokenStream method incrementToken.

/**
 * Emits the next string produced by the finite-strings iterator as a token.
 * The iterator is created on the first call from {@code toAutomaton()} and is capped by
 * {@code maxGraphExpansions}.
 *
 * @return {@code true} if a token was produced, {@code false} once the iterator is exhausted
 * @throws IOException declared by the overridden method; may propagate from {@code toAutomaton()}
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    // Build the iterator lazily, the first time a token is requested.
    if (null == finiteStrings) {
        finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(), maxGraphExpansions);
    }
    final IntsRef path = finiteStrings.next();
    if (path != null) {
        // Copy the path into the bytes attribute (UTF-8 at this point, per the original author's note).
        Util.toBytesRef(path, bytesAtt.builder());
        if (charTermAttribute != null) {
            // Mirror the term as UTF-16 when a char term attribute is present.
            charTermAttribute.setLength(0);
            charTermAttribute.append(bytesAtt.toUTF16());
        }
        if (payload != null) {
            payloadAttr.setPayload(this.payload);
        }
        return true;
    }
    return false;
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRef(org.apache.lucene.util.IntsRef)

Example 33 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class CompletionTokenStream method toAutomaton.

/**
 * Converts the wrapped token stream into a determinized automaton.
 * The input token stream is always closed, whether or not the conversion succeeds.
 *
 * @param unicodeAware forwarded to {@code TokenStreamToAutomaton.setUnicodeArcs}
 * @return the automaton after separator replacement and determinization
 * @throws IOException if converting the token stream fails
 */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
    final Automaton fromTokens;
    try {
        // Labels are bytes from each analyzed token, with a separator label between tokens.
        // Escaping is only needed when separators are preserved; otherwise the plain
        // converter is enough.
        final TokenStreamToAutomaton converter = preserveSep
            ? new EscapingTokenStreamToAutomaton((char) SEP_LABEL)
            : new TokenStreamToAutomaton();
        converter.setPreservePositionIncrements(preservePositionIncrements);
        converter.setUnicodeArcs(unicodeAware);
        fromTokens = converter.toAutomaton(inputTokenStream);
    } finally {
        IOUtils.closeWhileHandlingException(inputTokenStream);
    }
    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    final Automaton withSeparators = replaceSep(fromTokens, preserveSep, SEP_LABEL);
    // This automaton should not blow up during determinize:
    return Operations.determinize(withSeparators, maxGraphExpansions);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 34 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class ContextQuery method toContextAutomaton.

/**
 * Builds the automaton for the context part of a suggestion.
 * Every accepted branch ends with {@code ContextSuggestField.CONTEXT_SEPARATOR}.
 *
 * @param contexts         context values mapped to their metadata; a non-exact context is followed
 *                         by the match-all automaton before the separator
 * @param matchAllContexts when {@code true}, or when {@code contexts} is empty, the result is the
 *                         match-all automaton followed by the separator
 * @return the union of the per-context automata, or the match-all form described above
 */
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
    final Automaton anyString = Operations.repeat(Automata.makeAnyString());
    final Automaton separator = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);

    // Nothing to restrict on: accept anything up to the separator.
    if (matchAllContexts || contexts.isEmpty()) {
        return Operations.concatenate(anyString, separator);
    }

    Automaton combined = null;
    for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
        final IntsRef key = entry.getKey();
        final ContextMetaData meta = entry.getValue();

        Automaton branch = Automata.makeString(key.ints, key.offset, key.length);
        if (!meta.exact) {
            // Non-exact contexts match as a prefix.
            branch = Operations.concatenate(branch, anyString);
        }
        branch = Operations.concatenate(branch, separator);

        // The first branch seeds the result; later ones are unioned in.
        combined = (combined == null) ? branch : Operations.union(combined, branch);
    }
    return combined;
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef) HashMap(java.util.HashMap) Map(java.util.Map)

Example 35 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class AnalyzingSuggester method toAutomaton.

/**
 * Analyzes a surface form with the index analyzer and turns the resulting token stream into an
 * automaton, then applies {@code replaceSep} and {@code convertAutomaton} to it.
 *
 * @param surfaceForm the surface form, decoded with {@code utf8ToString()} before analysis
 * @param ts2a        the converter used to turn the token stream into an automaton
 * @return the converted automaton
 * @throws IOException if analysis or conversion of the token stream fails
 */
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    final String surfaceText = surfaceForm.utf8ToString();
    final Automaton analyzed;
    // Analyze the surface form; the token stream is closed as soon as it has been converted.
    try (TokenStream tokens = indexAnalyzer.tokenStream("", surfaceText)) {
        // Labels are bytes from each analyzed token, with byte 0 used as
        // separator between tokens.
        analyzed = ts2a.toAutomaton(tokens);
    }
    // NOTE(review): the result may hold more than one path, e.g. if the analyzer created a
    // graph using SynFilter or WDF; paths are presumably enumerated by the caller (confirm).
    return convertAutomaton(replaceSep(analyzed));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton)

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton)57 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)17 IntsRef (org.apache.lucene.util.IntsRef)13 BytesRef (org.apache.lucene.util.BytesRef)12 ArrayList (java.util.ArrayList)11 Directory (org.apache.lucene.store.Directory)8 HashSet (java.util.HashSet)7 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)7 Document (org.apache.lucene.document.Document)6 CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton)6 Transition (org.apache.lucene.util.automaton.Transition)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)5 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)4 LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata)4 RegExp (org.apache.lucene.util.automaton.RegExp)4