Example 6 with Transition

Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.

The class AnalyzingSuggester, method replaceSep:

// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private Automaton replaceSep(Automaton a) {
    int numStates = a.getNumStates();
    Automaton.Builder result = new Automaton.Builder(numStates, a.getNumTransitions());
    // Copy all states over
    result.copyStates(a);
    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = Operations.topoSortStates(a);
    for (int i = 0; i < topoSortStates.length; i++) {
        int state = topoSortStates[topoSortStates.length - 1 - i];
        int count = a.initTransition(state, t);
        for (int j = 0; j < count; j++) {
            a.getNextTransition(t);
            if (t.min == TokenStreamToAutomaton.POS_SEP) {
                assert t.max == TokenStreamToAutomaton.POS_SEP;
                if (preserveSep) {
                    // Remap to SEP_LABEL:
                    result.addTransition(state, t.dest, SEP_LABEL);
                } else {
                    result.addEpsilon(state, t.dest);
                }
            } else if (t.min == TokenStreamToAutomaton.HOLE) {
                assert t.max == TokenStreamToAutomaton.HOLE;
                // Just remove the hole: there will then be two
                // SEP tokens next to each other, which will only
                // match another hole at search time.  Note that
                // it will also match an empty-string token ... if
                // that's somehow a problem we can always map HOLE
                // to a dedicated byte (and escape it in the
                // input).
                result.addEpsilon(state, t.dest);
            } else {
                result.addTransition(state, t.dest, t.min, t.max);
            }
        }
    }
    return result.finish();
}
Also used : TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Transition(org.apache.lucene.util.automaton.Transition)
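
The snippet above relies on the cursor-style transition API: initTransition positions a reusable Transition on a state's first outgoing arc and returns the arc count, and each getNextTransition call refills the same instance. Below is a minimal, self-contained sketch of that pattern; it is not taken from the project, and the automaton is just Automata.makeString("ab") for illustration:

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Transition;

public class TransitionIterationSketch {
    public static void main(String[] args) {
        // Deterministic automaton accepting exactly "ab": 0 -a-> 1 -b-> 2
        Automaton a = Automata.makeString("ab");
        // One Transition instance is reused for every arc we visit
        Transition t = new Transition();
        for (int state = 0; state < a.getNumStates(); state++) {
            // Position the cursor on this state's first transition
            int count = a.initTransition(state, t);
            for (int i = 0; i < count; i++) {
                // Fills t.source, t.dest, t.min and t.max in place
                a.getNextTransition(t);
                System.out.println(state + " -> " + t.dest + " [" + (char) t.min + "-" + (char) t.max + "]");
            }
        }
    }
}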

Example 7 with Transition

Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.

The class TestSynonymGraphFilter, method topoSort:

/*
  private String toDot(TokenStream ts) throws IOException {
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    int srcNode = -1;
    int destNode = -1;

    StringBuilder b = new StringBuilder();
    b.append("digraph Automaton {\n");
    b.append("  rankdir = LR\n");
    b.append("  node [width=0.2, height=0.2, fontsize=8]\n");
    b.append("  initial [shape=plaintext,label=\"\"]\n");
    b.append("  initial -> 0\n");

    while (ts.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (posInc != 0) {
        srcNode += posInc;
        b.append("  ");
        b.append(srcNode);
        b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
      }
      destNode = srcNode + posLenAtt.getPositionLength();
      b.append("  ");
      b.append(srcNode);
      b.append(" -> ");
      b.append(destNode);
      b.append(" [label=\"");
      b.append(termAtt);
      b.append("\"");
      if (typeAtt.type().equals("word") == false) {
        b.append(" color=red");
      }
      b.append("]\n");
    }
    ts.end();
    ts.close();

    b.append('}');
    return b.toString();
  }
  */
/** Renumbers nodes according to their topo sort */
private Automaton topoSort(Automaton in) {
    int[] newToOld = Operations.topoSortStates(in);
    int[] oldToNew = new int[newToOld.length];
    Automaton.Builder a = new Automaton.Builder();
    //System.out.println("remap:");
    for (int i = 0; i < newToOld.length; i++) {
        a.createState();
        oldToNew[newToOld[i]] = i;
        //System.out.println("  " + newToOld[i] + " -> " + i);
        if (in.isAccept(newToOld[i])) {
            a.setAccept(i, true);
        //System.out.println("    **");
        }
    }
    Transition t = new Transition();
    for (int i = 0; i < newToOld.length; i++) {
        int count = in.initTransition(newToOld[i], t);
        for (int j = 0; j < count; j++) {
            in.getNextTransition(t);
            a.addTransition(i, oldToNew[t.dest], t.min, t.max);
        }
    }
    return a.finish();
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Transition(org.apache.lucene.util.automaton.Transition)
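
The helper above first asks Operations.topoSortStates for the new-to-old ordering and then inverts it into oldToNew before copying the transitions. A small sketch, not part of the test, showing what that array looks like for a hand-built automaton; the state ids and labels are illustrative only:

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class TopoSortStatesSketch {
    public static void main(String[] args) {
        Automaton a = new Automaton();
        int s0 = a.createState();
        int s1 = a.createState();
        int s2 = a.createState();
        a.setAccept(s2, true);
        // Transitions must be added in source-state order
        a.addTransition(s0, s1, 'a');
        a.addTransition(s0, s2, 'c');
        a.addTransition(s1, s2, 'b');
        a.finishState();
        // newToOld[i] is the old state id that topoSort would renumber to i
        int[] newToOld = Operations.topoSortStates(a);
        for (int i = 0; i < newToOld.length; i++) {
            System.out.println("new " + i + " <- old " + newToOld[i]);
        }
    }
}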

Example 8 with Transition

Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.

The class TestSynonymGraphFilter, method accepts:

/** Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. */
private static boolean accepts(Automaton a, IntsRef path) {
    Set<Integer> states = new HashSet<>();
    states.add(0);
    Transition t = new Transition();
    for (int i = 0; i < path.length; i++) {
        int digit = path.ints[path.offset + i];
        Set<Integer> nextStates = new HashSet<>();
        for (int state : states) {
            int count = a.initTransition(state, t);
            for (int j = 0; j < count; j++) {
                a.getNextTransition(t);
                if (digit >= t.min && digit <= t.max) {
                    nextStates.add(t.dest);
                }
            }
        }
        states = nextStates;
        if (states.isEmpty()) {
            return false;
        }
    }
    for (int state : states) {
        if (a.isAccept(state)) {
            return true;
        }
    }
    return false;
}
Also used : Transition(org.apache.lucene.util.automaton.Transition) HashSet(java.util.HashSet)
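
Because accepts tracks a set of live states and follows every matching transition out of each of them, it handles genuine non-determinism. A minimal sketch, assuming the helper above is accessible (for example, called from within the same test class), that feeds it an automaton where state 0 has two competing 'a' arcs:

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;

Automaton a = new Automaton();
int s0 = a.createState();
int s1 = a.createState(); // dead-end branch
int s2 = a.createState(); // accepting branch
a.setAccept(s2, true);
// Two transitions on the same label make the automaton non-deterministic
a.addTransition(s0, s1, 'a');
a.addTransition(s0, s2, 'a');
a.finishState();
IntsRef path = new IntsRef(new int[] { 'a' }, 0, 1);
// true: the branch reaching the accept state is explored even though the other branch dies
boolean ok = accepts(a, path);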

Example 9 with Transition

Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.

The class TestTermAutomatonQuery, method testRandom:

public void testRandom() throws Exception {
    int numDocs = atLeast(100);
    Directory dir = newDirectory();
    // Adds occasional random synonyms:
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
            tokenizer.setEnableChecks(true);
            TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
            filt = new RandomSynonymFilter(filt);
            return new TokenStreamComponents(tokenizer, filt);
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        int numTokens = atLeast(10);
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < numTokens; j++) {
            sb.append(' ');
            sb.append((char) (97 + random().nextInt(3)));
        }
        String contents = sb.toString();
        doc.add(newTextField("field", contents, Field.Store.NO));
        doc.add(new StoredField("id", "" + i));
        if (VERBOSE) {
            System.out.println("  doc " + i + " -> " + contents);
        }
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    // Used to match ANY using MultiPhraseQuery:
    Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
    int numIters = atLeast(1000);
    for (int iter = 0; iter < numIters; iter++) {
        // Build the (finite, no any transitions) TermAutomatonQuery and
        // also the "equivalent" BooleanQuery and make sure they match the
        // same docs:
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        int count = TestUtil.nextInt(random(), 1, 5);
        Set<BytesRef> strings = new HashSet<>();
        for (int i = 0; i < count; i++) {
            StringBuilder sb = new StringBuilder();
            int numTokens = TestUtil.nextInt(random(), 1, 5);
            for (int j = 0; j < numTokens; j++) {
                if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
                    sb.append('*');
                } else {
                    sb.append((char) (97 + random().nextInt(3)));
                }
            }
            String string = sb.toString();
            MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
            for (int j = 0; j < string.length(); j++) {
                if (string.charAt(j) == '*') {
                    mpqb.add(allTerms);
                } else {
                    mpqb.add(new Term("field", "" + string.charAt(j)));
                }
            }
            bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
            strings.add(new BytesRef(string));
        }
        List<BytesRef> stringsList = new ArrayList<>(strings);
        Collections.sort(stringsList);
        Automaton a = Automata.makeStringUnion(stringsList);
        // Translate automaton to query:
        TermAutomatonQuery q = new TermAutomatonQuery("field");
        int numStates = a.getNumStates();
        for (int i = 0; i < numStates; i++) {
            q.createState();
            q.setAccept(i, a.isAccept(i));
        }
        Transition t = new Transition();
        for (int i = 0; i < numStates; i++) {
            int transCount = a.initTransition(i, t);
            for (int j = 0; j < transCount; j++) {
                a.getNextTransition(t);
                for (int label = t.min; label <= t.max; label++) {
                    if ((char) label == '*') {
                        q.addAnyTransition(t.source, t.dest);
                    } else {
                        q.addTransition(t.source, t.dest, "" + (char) label);
                    }
                }
            }
        }
        q.finish();
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter);
            for (BytesRef string : stringsList) {
                System.out.println("  string: " + string.utf8ToString());
            }
            System.out.println(q.toDot());
        }
        Query q1 = q;
        Query q2 = bq.build();
        if (random().nextInt(5) == 1) {
            if (VERBOSE) {
                System.out.println("  use random filter");
            }
            RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
            q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
            q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
        }
        TopDocs hits1 = s.search(q1, numDocs);
        TopDocs hits2 = s.search(q2, numDocs);
        Set<String> hits1Docs = toDocIDs(s, hits1);
        Set<String> hits2Docs = toDocIDs(s, hits2);
        try {
            assertEquals(hits2.totalHits, hits1.totalHits);
            assertEquals(hits2Docs, hits1Docs);
        } catch (AssertionError ae) {
            System.out.println("FAILED:");
            for (String id : hits1Docs) {
                if (hits2Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s matched but should not have", id));
                }
            }
            for (String id : hits2Docs) {
                if (hits1Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s did not match but should have", id));
                }
            }
            throw ae;
        }
    }
    IOUtils.close(w, r, dir, analyzer);
}
Also used : MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) StoredField(org.apache.lucene.document.StoredField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) Term(org.apache.lucene.index.Term) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) IndexReader(org.apache.lucene.index.IndexReader) Transition(org.apache.lucene.util.automaton.Transition) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
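
The automaton-to-query translation in the loop above can also be done by building the TermAutomatonQuery states and transitions by hand. A minimal sketch, independent of the random test, that accepts "fast wi fi" or "fast wifi" on a hypothetical field named "field":

import org.apache.lucene.search.TermAutomatonQuery;

TermAutomatonQuery q = new TermAutomatonQuery("field");
int init = q.createState();
int s1 = q.createState();
int s2 = q.createState();
int fin = q.createState();
q.setAccept(fin, true);
q.addTransition(init, s1, "fast");
// Two paths from s1 to the accept state: "wi" "fi" as two tokens, or "wifi" as one
q.addTransition(s1, s2, "wi");
q.addTransition(s2, fin, "fi");
q.addTransition(s1, fin, "wifi");
q.finish();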

Example 10 with Transition

Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.

The class TermAutomatonQuery, method rewrite:

public Query rewrite(IndexReader reader) throws IOException {
    if (Operations.isEmpty(det)) {
        return new MatchNoDocsQuery();
    }
    IntsRef single = Operations.getSingleton(det);
    if (single != null && single.length == 1) {
        return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
    }
    // TODO: can PhraseQuery really handle multiple terms at the same position?  If so, why do we even have MultiPhraseQuery?
    // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
    MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
    PhraseQuery.Builder pq = new PhraseQuery.Builder();
    Transition t = new Transition();
    int state = 0;
    int pos = 0;
    query: while (true) {
        int count = det.initTransition(state, t);
        if (count == 0) {
            if (det.isAccept(state) == false) {
                mpq = null;
                pq = null;
            }
            break;
        } else if (det.isAccept(state)) {
            mpq = null;
            pq = null;
            break;
        }
        int dest = -1;
        List<Term> terms = new ArrayList<>();
        boolean matchesAny = false;
        for (int i = 0; i < count; i++) {
            det.getNextTransition(t);
            if (i == 0) {
                dest = t.dest;
            } else if (dest != t.dest) {
                mpq = null;
                pq = null;
                break query;
            }
            matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
            if (matchesAny == false) {
                for (int termID = t.min; termID <= t.max; termID++) {
                    terms.add(new Term(field, idToTerm.get(termID)));
                }
            }
        }
        if (matchesAny == false) {
            mpq.add(terms.toArray(new Term[terms.size()]), pos);
            if (pq != null) {
                if (terms.size() == 1) {
                    pq.add(terms.get(0), pos);
                } else {
                    pq = null;
                }
            }
        }
        state = dest;
        pos++;
    }
    if (pq != null) {
        return pq.build();
    } else if (mpq != null) {
        return mpq.build();
    }
    // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
    return this;
}
Also used : Term(org.apache.lucene.index.Term) Transition(org.apache.lucene.util.automaton.Transition) ArrayList(java.util.ArrayList) List(java.util.List) IntsRef(org.apache.lucene.util.IntsRef)
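
When the rewrite succeeds, the automaton is a "sausage": every position has a single set of alternative terms, and all of that position's transitions converge on the same next state. A small sketch, with illustrative field and term values only, of the MultiPhraseQuery shape the loop above would build for an automaton matching ("big" or "large") followed by "dog":

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MultiPhraseQuery;

MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
// Position 0 has two alternatives, so the plain PhraseQuery path is abandoned here
mpqb.add(new Term[] { new Term("field", "big"), new Term("field", "large") }, 0);
// Position 1 has a single term
mpqb.add(new Term[] { new Term("field", "dog") }, 1);
MultiPhraseQuery mpq = mpqb.build();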

Aggregations

Transition (org.apache.lucene.util.automaton.Transition): 13
Automaton (org.apache.lucene.util.automaton.Automaton): 7
ArrayList (java.util.ArrayList): 4
HashSet (java.util.HashSet): 3
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 3
Term (org.apache.lucene.index.Term): 3
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 3
List (java.util.List): 2
BytesRef (org.apache.lucene.util.BytesRef): 2
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 2
IntsRef (org.apache.lucene.util.IntsRef): 2
IOException (java.io.IOException): 1
Arrays (java.util.Arrays): 1
BitSet (java.util.BitSet): 1
Collections (java.util.Collections): 1
HashMap (java.util.HashMap): 1
Iterator (java.util.Iterator): 1
LinkedList (java.util.LinkedList): 1
Map (java.util.Map): 1
Analyzer (org.apache.lucene.analysis.Analyzer): 1