Search in sources:

Example 6 with TokenStreamToAutomaton

Use of org.apache.lucene.analysis.TokenStreamToAutomaton in the project elasticsearch, by Elastic.

From the class XFuzzySuggester, the method getTokenStreamToAutomaton.

@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
    // Take the converter configured by the base class, then make sure it
    // honors this suggester's unicodeAware setting (automaton arcs as full
    // code points rather than UTF-8 bytes).
    final TokenStreamToAutomaton converter = super.getTokenStreamToAutomaton();
    converter.setUnicodeArcs(unicodeAware);
    return converter;
}
Also used : TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 7 with TokenStreamToAutomaton

Use of org.apache.lucene.analysis.TokenStreamToAutomaton in the project lucene-solr, by Apache.

From the class FuzzySuggesterTest, the method testRandom.

/**
 * Randomized end-to-end test: builds a FuzzySuggester over random multi-token
 * inputs, then for every proper prefix of every surface form compares the
 * suggester's top-N lookup results against a brute-force reference
 * ("slowCompletor") that re-runs the analysis and automaton intersection by
 * hand.
 *
 * NOTE(review): this appears to be a scraped copy of Lucene's
 * FuzzySuggesterTest#testRandom; at least one non-printable character (the
 * "hole" marker appended when a stop token is dropped) seems to have been
 * lost in extraction -- see the empty char literal below, which is not valid
 * Java as written. TODO: restore from the upstream lucene-solr source.
 */
public void testRandom() throws Exception {
    int numQueries = atLeast(100);
    // Reference data: (surfaceForm, analyzedForm, weight) triples, used to
    // compute expected results by brute force.
    final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
    // Every proper prefix of every surface form; each becomes a lookup query.
    final TreeSet<String> allPrefixes = new TreeSet<>();
    // Guards against adding the same surface form twice.
    final Set<String> seen = new HashSet<>();
    Input[] keys = new Input[numQueries];
    // Random suggester configuration; the manual "analysis" below must mirror
    // exactly what MockTokenEatingAnalyzer + FuzzySuggester will do.
    boolean preserveSep = random().nextBoolean();
    boolean unicodeAware = random().nextBoolean();
    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }
    for (int i = 0; i < numQueries; i++) {
        int numTokens = TestUtil.nextInt(random(), 1, 4);
        String key;
        String analyzedKey;
        // Keep generating candidate keys until we produce a surface form we
        // have not used before.
        while (true) {
            key = "";
            analyzedKey = "";
            boolean lastRemoved = false;
            for (int token = 0; token < numTokens; token++) {
                String s;
                while (true) {
                    // TODO: would be nice to fix this slowCompletor/comparator to
                    // use full range, but we might lose some coverage too...
                    s = TestUtil.randomSimpleString(random());
                    if (s.length() > 0) {
                        if (token > 0) {
                            key += " ";
                        }
                        // Append a separator to the analyzed form unless it already
                        // ends with one.  NOTE(review): codePointAt expects a char
                        // index but is given a code-point count here; the two only
                        // coincide for BMP-only strings (which randomSimpleString
                        // produces) -- TODO confirm against upstream.
                        if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()) - 1) != ' ' : analyzedKey.charAt(analyzedKey.length() - 1) != ' ')) {
                            analyzedKey += " ";
                        }
                        key += s;
                        // Single-char stop tokens are "eaten" by the analyzer; when
                        // holes are preserved they leave a hole marker behind.
                        if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                            if (preserveSep && preserveHoles) {
                                // NOTE(review): '' is not valid Java -- a non-printable
                                // hole character was almost certainly lost when this
                                // snippet was extracted. TODO restore from upstream.
                                analyzedKey += '';
                            }
                            lastRemoved = true;
                        } else {
                            analyzedKey += s;
                            lastRemoved = false;
                        }
                        break;
                    }
                }
            }
            // Strip a trailing separator remnant.  NOTE(review): "(^| )$" as
            // written only removes a single trailing space (or matches the empty
            // string); the upstream regex likely contained a non-printable hole
            // character that was lost in extraction -- TODO confirm.
            analyzedKey = analyzedKey.replaceAll("(^| )$", "");
            if (preserveSep && lastRemoved) {
                analyzedKey += " ";
            }
            // Don't add same surface form more than once:
            if (!seen.contains(key)) {
                seen.add(key);
                break;
            }
        }
        // Every proper prefix (length 1..key.length()-1) becomes a query.
        for (int j = 1; j < key.length(); j++) {
            allPrefixes.add(key.substring(0, j));
        }
        // we can probably do Integer.MAX_VALUE here, but why worry.
        int weight = random().nextInt(1 << 24);
        keys[i] = new Input(key, weight);
        slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
    }
    if (VERBOSE) {
        // Don't just sort original list, to avoid VERBOSE
        // altering the test:
        List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
        Collections.sort(sorted);
        for (TermFreqPayload2 ent : sorted) {
            System.out.println("  surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
        }
    }
    // Build the suggester under test with the same random configuration the
    // manual analysis above assumed.
    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
    suggester.build(new InputArrayIterator(keys));
    for (String prefix : allPrefixes) {
        if (VERBOSE) {
            System.out.println("\nTEST: prefix=" + prefix);
        }
        final int topN = TestUtil.nextInt(random(), 1, 10);
        // 1. the suggester's answer:
        List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);
        // 2. go thru whole set to find suggestions:
        List<LookupResult> matches = new ArrayList<>();
        // "Analyze" the key:
        String[] tokens = prefix.split(" ");
        StringBuilder builder = new StringBuilder();
        boolean lastRemoved = false;
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
                builder.append(' ');
            }
            if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
                if (preserveSep && preserveHoles) {
                    // NOTE(review): appending "" is a no-op; upstream almost
                    // certainly appended a non-printable hole character that was
                    // lost in extraction -- TODO confirm.
                    builder.append("");
                }
                lastRemoved = true;
            } else {
                builder.append(token);
                lastRemoved = false;
            }
        }
        String analyzedKey = builder.toString();
        // issue open for this):
        // Trim trailing separator/hole remnants until a fixpoint is reached.
        while (true) {
            String s = analyzedKey.replaceAll("(^| )$", "");
            s = s.replaceAll("\\s+$", "");
            if (s.equals(analyzedKey)) {
                break;
            }
            analyzedKey = s;
        }
        if (analyzedKey.length() == 0) {
            // string!  You get no results, not all results...
            continue;
        }
        if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
            analyzedKey += " ";
        }
        if (VERBOSE) {
            System.out.println("  analyzed: " + analyzedKey);
        }
        TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
        // NOTE: not great that we ask the suggester to give
        // us the "answer key" (ie maybe we have a bug in
        // suggester.toLevA ...) ... but testRandom2() fixes
        // this:
        Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
        assertTrue(automaton.isDeterministic());
        // TODO: could be faster... but it's slowCompletor for a reason
        BytesRefBuilder spare = new BytesRefBuilder();
        for (TermFreqPayload2 e : slowCompletor) {
            spare.copyChars(e.analyzedForm);
            // Enumerate every path through the analyzed form's automaton and
            // step the query automaton over its bytes; any accepting state
            // reached means this entry matches the fuzzy prefix.
            FiniteStringsIterator finiteStrings = new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
            for (IntsRef string; (string = finiteStrings.next()) != null; ) {
                int p = 0;
                BytesRef ref = Util.toBytesRef(string, spare);
                boolean added = false;
                for (int i = ref.offset; i < ref.length; i++) {
                    int q = automaton.step(p, ref.bytes[i] & 0xff);
                    if (q == -1) {
                        break;
                    } else if (automaton.isAccept(q)) {
                        matches.add(new LookupResult(e.surfaceForm, e.weight));
                        added = true;
                        break;
                    }
                    p = q;
                }
                if (!added && automaton.isAccept(p)) {
                    matches.add(new LookupResult(e.surfaceForm, e.weight));
                }
            }
        }
        assertTrue(numStopChars > 0 || matches.size() > 0);
        if (matches.size() > 1) {
            // Expected order: weight descending, ties broken by key.
            Collections.sort(matches, new Comparator<LookupResult>() {

                @Override
                public int compare(LookupResult left, LookupResult right) {
                    int cmp = Float.compare(right.value, left.value);
                    if (cmp == 0) {
                        return left.compareTo(right);
                    } else {
                        return cmp;
                    }
                }
            });
        }
        if (matches.size() > topN) {
            matches = matches.subList(0, topN);
        }
        if (VERBOSE) {
            System.out.println("  expected:");
            for (LookupResult lr : matches) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
            System.out.println("  actual:");
            for (LookupResult lr : r) {
                System.out.println("    key=" + lr.key + " weight=" + lr.value);
            }
        }
        // Expected and actual hit lists must agree in size, order, and weight.
        assertEquals(prefix + "  " + topN, matches.size(), r.size());
        for (int hit = 0; hit < r.size(); hit++) {
            //System.out.println("  check hit " + hit);
            assertEquals(prefix + "  " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
            assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
        }
    }
    IOUtils.close(a, tempDir);
}
Also used : FiniteStringsIterator(org.apache.lucene.util.automaton.FiniteStringsIterator) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Input(org.apache.lucene.search.suggest.Input) TreeSet(java.util.TreeSet) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) Directory(org.apache.lucene.store.Directory) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 8 with TokenStreamToAutomaton

Use of org.apache.lucene.analysis.TokenStreamToAutomaton in the project lucene-solr, by Apache.

From the class CompletionTokenStream, the method toAutomaton.

/**
 * Converts {@code inputTokenStream} to an automaton, closing the stream when
 * done.  Token bytes become arc labels; {@code SEP_LABEL} separates tokens.
 */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
    Automaton result = null;
    try {
        // When separators are preserved, SEP_LABEL appears between tokens, so
        // occurrences of that byte inside tokens must be escaped.  Otherwise
        // the 0xff byte is not stolen and no escaping is needed.
        final TokenStreamToAutomaton converter = preserveSep
                ? new EscapingTokenStreamToAutomaton((char) SEP_LABEL)
                : new TokenStreamToAutomaton();
        converter.setPreservePositionIncrements(preservePositionIncrements);
        converter.setUnicodeArcs(unicodeAware);
        result = converter.toAutomaton(inputTokenStream);
    } finally {
        // Close the stream even if conversion threw, without masking the
        // original exception.
        IOUtils.closeWhileHandlingException(inputTokenStream);
    }
    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    result = replaceSep(result, preserveSep, SEP_LABEL);
    // This automaton should not blow up during determinize:
    return Operations.determinize(result, maxGraphExpansions);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 9 with TokenStreamToAutomaton

Use of org.apache.lucene.analysis.TokenStreamToAutomaton in the project lucene-solr, by Apache.

From the class AnalyzingSuggester, the method toAutomaton.

final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Run the surface form through the index analyzer and convert the
    // resulting token stream to an automaton (labels are token bytes, with
    // byte 0 separating tokens).
    Automaton analyzed;
    try (TokenStream stream = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
        analyzed = ts2a.toAutomaton(stream);
    }
    // Rewrite separators, then apply the suggester-specific conversion
    // (graph using SynFilter or WDF):
    return convertAutomaton(replaceSep(analyzed));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton)

Example 10 with TokenStreamToAutomaton

Use of org.apache.lucene.analysis.TokenStreamToAutomaton in the project lucene-solr, by Apache.

From the class AnalyzingSuggester, the method getTokenStreamToAutomaton.

TokenStreamToAutomaton getTokenStreamToAutomaton() {
    // Build a fresh converter configured for this suggester: keep position
    // increments as configured, and treat the final offset gap as a hole so
    // trailing removed tokens are represented in the automaton.
    final TokenStreamToAutomaton converter = new TokenStreamToAutomaton();
    converter.setPreservePositionIncrements(preservePositionIncrements);
    converter.setFinalOffsetGapAsHole(true);
    return converter;
}
Also used : TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Aggregations

TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)11 Automaton (org.apache.lucene.util.automaton.Automaton)5 HashSet (java.util.HashSet)4 IntsRef (org.apache.lucene.util.IntsRef)4 BytesRef (org.apache.lucene.util.BytesRef)3 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)3 LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator)3 TokenStream (org.apache.lucene.analysis.TokenStream)2 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)2 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)2 Directory (org.apache.lucene.store.Directory)2 IndexOutput (org.apache.lucene.store.IndexOutput)2 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)2 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)2 OfflineSorter (org.apache.lucene.util.OfflineSorter)2 Builder (org.apache.lucene.util.fst.Builder)2 PairOutputs (org.apache.lucene.util.fst.PairOutputs)2 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)2 ArrayList (java.util.ArrayList)1 TreeSet (java.util.TreeSet)1