Search in sources :

Example 1 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in the elastic/elasticsearch project.

From the class XAnalyzingSuggester, the method toAutomaton:

final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
    // Build the byte-labelled automaton for the analyzed tokens: each
    // transition label is a byte from an analyzed token, with byte 0
    // serving as the separator between consecutive tokens.
    final Automaton tokenAutomaton = ts2a.toAutomaton(ts);
    // Rewrite the separator labels, then apply the subclass-specific
    // conversion before returning the result to the caller.
    return convertAutomaton(replaceSep(tokenAutomaton));
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 2 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project elasticsearch by elastic.

the class XAnalyzingSuggester method toFiniteStrings.

// EDIT: Adrien, needed by lookup providers
// NOTE: these XForks are unmaintainable, we need to get rid of them...
// EDIT: Adrien, needed by lookup providers
// NOTE: these XForks are unmaintainable, we need to get rid of them...
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
    final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    final Automaton automaton;
    // try-with-resources guarantees the stream is closed even if analysis throws.
    try (TokenStream ts = stream) {
        automaton = toAutomaton(ts, ts2a);
    }
    // Enumerate at most maxGraphExpansions paths through the token graph.
    // The iterator reuses its IntsRef buffer, so each path must be deep-copied
    // before it is stored.
    final LimitedFiniteStringsIterator it = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    final Set<IntsRef> paths = new HashSet<>();
    IntsRef path;
    while ((path = it.next()) != null) {
        paths.add(IntsRef.deepCopyOf(path));
    }
    return Collections.unmodifiableSet(paths);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRef(org.apache.lucene.util.IntsRef) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) HashSet(java.util.HashSet)

Example 3 with Automaton

Use of org.apache.lucene.util.automaton.Automaton in the apache/lucene-solr project.

the class TestSynonymGraphFilter method testRandomSyns.

/**
 * Randomized test: builds a random synonym map over binary a/b tokens mapping to
 * x/y tokens, analyzes random documents, and checks that the produced token graph
 * matches a slow reference implementation ({@code slowSynFilter}), both approximately
 * and (when determinizable) exactly via {@code Operations.sameLanguage}.
 */
public void testRandomSyns() throws Exception {
    int synCount = atLeast(10);
    double bias = random().nextDouble();
    boolean dedup = random().nextBoolean();
    boolean flatten = random().nextBoolean();
    SynonymMap.Builder b = new SynonymMap.Builder(dedup);
    List<OneSyn> syns = new ArrayList<>();
    // Makes random syns from random a / b tokens, mapping to random x / y tokens
    if (VERBOSE) {
        System.out.println("TEST: make " + synCount + " syns");
        System.out.println("  bias for a over b=" + bias);
        System.out.println("  dedup=" + dedup);
        System.out.println("  flatten=" + flatten);
    }
    int maxSynLength = 0;
    for (int i = 0; i < synCount; i++) {
        OneSyn syn = new OneSyn();
        syn.in = randomBinaryChars(1, 5, bias, 'a');
        syn.out = randomBinaryChars(1, 5, 0.5, 'x');
        syn.keepOrig = random().nextBoolean();
        syns.add(syn);
        maxSynLength = Math.max(maxSynLength, syn.in.length);
        if (VERBOSE) {
            System.out.println("  " + syn);
        }
        add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig);
    }
    // Compute max allowed lookahead for flatten filter: for each syn, sum the
    // output lengths of every syn sharing the same input (plus the input itself
    // if any of them keeps the original tokens).
    int maxFlattenLookahead = 0;
    if (flatten) {
        for (int i = 0; i < synCount; i++) {
            OneSyn syn1 = syns.get(i);
            int count = syn1.out.length;
            boolean keepOrig = syn1.keepOrig;
            for (int j = 0; j < synCount; j++) {
                // BUGFIX: was syns.get(i), which made this inner loop a no-op
                // and computed a wrong lookahead bound.
                OneSyn syn2 = syns.get(j);
                keepOrig |= syn2.keepOrig;
                if (syn1.in.equals(syn2.in)) {
                    count += syn2.out.length;
                }
            }
            if (keepOrig) {
                count += syn1.in.length;
            }
            maxFlattenLookahead = Math.max(maxFlattenLookahead, count);
        }
    }
    // Only used w/ VERBOSE:
    Analyzer aNoFlattened;
    if (VERBOSE) {
        aNoFlattened = getAnalyzer(b, true);
    } else {
        aNoFlattened = null;
    }
    Analyzer a;
    if (flatten) {
        a = getFlattenAnalyzer(b, true);
    } else {
        a = getAnalyzer(b, true);
    }
    int iters = atLeast(20);
    for (int iter = 0; iter < iters; iter++) {
        String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a'));
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter + " doc=" + doc);
        }
        Automaton expected = slowSynFilter(doc, syns, flatten);
        if (VERBOSE) {
            System.out.println("  expected:\n" + expected.toDot());
            if (flatten) {
                Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc)));
                System.out.println("  actual unflattened:\n" + unflattened.toDot());
            }
        }
        Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc)));
        if (VERBOSE) {
            System.out.println("  actual:\n" + actual.toDot());
        }
        // The syn filter must never look ahead further than the longest syn input:
        assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter.getMaxLookaheadUsed() <= maxSynLength);
        if (flatten) {
            assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead, flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead);
        }
        checkAnalysisConsistency(random(), a, random().nextBoolean(), doc);
        // output token that also happens to be in the input:
        try {
            actual = Operations.determinize(actual, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        try {
            expected = Operations.determinize(expected, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        assertTrue(approxEquals(actual, expected));
        assertTrue(Operations.sameLanguage(actual, expected));
    }
    a.close();
    // BUGFIX: the VERBOSE-only analyzer was previously leaked.
    if (aNoFlattened != null) {
        aNoFlattened.close();
    }
}
Also used : TooComplexToDeterminizeException(org.apache.lucene.util.automaton.TooComplexToDeterminizeException) Automaton(org.apache.lucene.util.automaton.Automaton) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader)

Example 4 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class GraphTokenStreamFiniteStrings method build.

/**
   * Build an automaton from the provided {@link TokenStream}.
   *
   * <p>Transitions span from a token's start position to start + positionLength,
   * so stacked/overlapping tokens become alternative paths through the graph.
   * Transition labels are integer term ids produced by {@code getTermID}
   * (presumably interning the term bytes — confirm against that helper).
   *
   * @param in the token stream to consume; callers are expected to have not yet
   *           called {@code reset()} (it is called here), and remain responsible
   *           for closing the stream
   * @return the finished automaton; empty (no accept state) if the stream
   *         produced no tokens
   * @throws IOException if the underlying stream throws
   * @throws IllegalStateException if the first token has a position increment < 1
   */
private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    in.reset();
    // pos == -1 means "no token consumed yet"; state tracks the highest
    // automaton state created so far (states map 1:1 to positions).
    int pos = -1;
    int prevIncr = 1;
    int state = -1;
    while (in.incrementToken()) {
        int currentIncr = posIncAtt.getPositionIncrement();
        if (pos == -1 && currentIncr < 1) {
            throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
        }
        // always use inc 1 while building, but save original increment
        int incr = Math.min(1, currentIncr);
        if (incr > 0) {
            pos += incr;
        }
        // A token of positionLength N arcs over N positions.
        int endPos = pos + posLengthAtt.getPositionLength();
        // Lazily create states up to endPos so the transition target exists.
        while (state < endPos) {
            state = builder.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        int id = getTermID(currentIncr, prevIncr, term);
        builder.addTransition(pos, endPos, id);
        // only save last increment on non-zero increment in case we have multiple stacked tokens
        if (currentIncr > 0) {
            prevIncr = currentIncr;
        }
    }
    in.end();
    // state == -1 means the stream was empty: leave the automaton with no
    // accept state rather than accepting the empty string.
    if (state != -1) {
        builder.setAccept(state, true);
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 5 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class TestGraphTokenizers method testOverlappedTokensLattice.

public void testOverlappedTokensLattice() throws Exception {
    // "xyz" starts at the same position as "abc" (posInc 0) and spans two
    // positions, so it overlaps the "abc" + "def" path in the lattice.
    final Token[] tokens = { token("abc", 1, 1), token("xyz", 0, 2), token("def", 1, 1) };
    final TokenStream ts = new CannedTokenStream(tokens);
    // The stream's graph must accept exactly "xyz" OR "abc def".
    final Automaton single = s2a("xyz");
    final Automaton pair = join("abc", "def");
    assertSameLanguage(Operations.union(single, pair), ts);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton)

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton)57 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)17 IntsRef (org.apache.lucene.util.IntsRef)13 BytesRef (org.apache.lucene.util.BytesRef)12 ArrayList (java.util.ArrayList)11 Directory (org.apache.lucene.store.Directory)8 HashSet (java.util.HashSet)7 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)7 Document (org.apache.lucene.document.Document)6 CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton)6 Transition (org.apache.lucene.util.automaton.Transition)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)5 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)4 LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata)4 RegExp (org.apache.lucene.util.automaton.RegExp)4