Search in sources:

Example 41 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

In the class TestSynonymGraphFilter: the method topoSort.

/*
  private String toDot(TokenStream ts) throws IOException {
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    int srcNode = -1;
    int destNode = -1;

    StringBuilder b = new StringBuilder();
    b.append("digraph Automaton {\n");
    b.append("  rankdir = LR\n");
    b.append("  node [width=0.2, height=0.2, fontsize=8]\n");
    b.append("  initial [shape=plaintext,label=\"\"]\n");
    b.append("  initial -> 0\n");

    while (ts.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (posInc != 0) {
        srcNode += posInc;
        b.append("  ");
        b.append(srcNode);
        b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
      }
      destNode = srcNode + posLenAtt.getPositionLength();
      b.append("  ");
      b.append(srcNode);
      b.append(" -> ");
      b.append(destNode);
      b.append(" [label=\"");
      b.append(termAtt);
      b.append("\"");
      if (typeAtt.type().equals("word") == false) {
        b.append(" color=red");
      }
      b.append("]\n");
    }
    ts.end();
    ts.close();

    b.append('}');
    return b.toString();
  }
  */
/** Returns a copy of {@code in} whose states are renumbered into topological order. */
private Automaton topoSort(Automaton in) {
    final int[] sortedToOriginal = Operations.topoSortStates(in);
    final int numStates = sortedToOriginal.length;
    final int[] originalToSorted = new int[numStates];
    Automaton.Builder builder = new Automaton.Builder();
    // First pass: create one state per original state, in topo order, record the
    // inverse (original -> sorted) mapping, and carry over the accept flags.
    for (int newState = 0; newState < numStates; newState++) {
        int oldState = sortedToOriginal[newState];
        builder.createState();
        originalToSorted[oldState] = newState;
        if (in.isAccept(oldState)) {
            builder.setAccept(newState, true);
        }
    }
    // Second pass: copy every transition, remapping the destination state.
    Transition scratch = new Transition();
    for (int newState = 0; newState < numStates; newState++) {
        int transitionCount = in.initTransition(sortedToOriginal[newState], scratch);
        for (int k = 0; k < transitionCount; k++) {
            in.getNextTransition(scratch);
            builder.addTransition(newState, originalToSorted[scratch.dest], scratch.min, scratch.max);
        }
    }
    return builder.finish();
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Transition(org.apache.lucene.util.automaton.Transition)

Example 42 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class TestIndexWriter method testStopwordsPosIncHole2.

// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
    // Stack two stop filters: the standard English stop set, plus a custom
    // automaton-based set that removes only the token "foobar".
    Directory directory = newDirectory();
    final Automaton foobarOnly = Automata.makeString("foobar");
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
            stream = new MockTokenFilter(stream, new CharacterRunAutomaton(foobarOnly));
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, analyzer);
    Document doc = new Document();
    doc.add(new TextField("body", "just a foobar", Field.Store.NO));
    doc.add(new TextField("body", "test of gaps", Field.Store.NO));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    // The stopped tokens leave position holes, so "just" and "test" sit three
    // positions apart: body:"just ? ? test"
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("body", "just"), 0);
    builder.add(new Term("body", "test"), 3);
    PhraseQuery pq = builder.build();
    assertEquals(1, searcher.search(pq, 5).totalHits);
    reader.close();
    directory.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) PhraseQuery(org.apache.lucene.search.PhraseQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TextField(org.apache.lucene.document.TextField) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)

Example 43 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class TestTermAutomatonQuery method testRandom.

/**
 * Indexes random documents over the 3-letter alphabet {a, b, c}, then repeatedly
 * builds a random TermAutomatonQuery (where '*' becomes an "any term" transition)
 * together with an equivalent BooleanQuery of MultiPhraseQuerys, and asserts that
 * both queries match exactly the same set of documents.
 */
public void testRandom() throws Exception {
    int numDocs = atLeast(100);
    Directory dir = newDirectory();
    // Adds occasional random synonyms:
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
            tokenizer.setEnableChecks(true);
            TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
            filt = new RandomSynonymFilter(filt);
            return new TokenStreamComponents(tokenizer, filt);
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        int numTokens = atLeast(10);
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < numTokens; j++) {
            sb.append(' ');
            // Random token from {a, b, c} (97 == 'a'):
            sb.append((char) (97 + random().nextInt(3)));
        }
        String contents = sb.toString();
        doc.add(newTextField("field", contents, Field.Store.NO));
        doc.add(new StoredField("id", "" + i));
        if (VERBOSE) {
            System.out.println("  doc " + i + " -> " + contents);
        }
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    // Used to match ANY using MultiPhraseQuery:
    Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
    int numIters = atLeast(1000);
    for (int iter = 0; iter < numIters; iter++) {
        // Build the (finite, no any transitions) TermAutomatonQuery and
        // also the "equivalent" BooleanQuery and make sure they match the
        // same docs:
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        int count = TestUtil.nextInt(random(), 1, 5);
        Set<BytesRef> strings = new HashSet<>();
        for (int i = 0; i < count; i++) {
            StringBuilder sb = new StringBuilder();
            int numTokens = TestUtil.nextInt(random(), 1, 5);
            for (int j = 0; j < numTokens; j++) {
                // Sometimes insert a '*' wildcard, but never at the first or
                // last position:
                if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
                    sb.append('*');
                } else {
                    sb.append((char) (97 + random().nextInt(3)));
                }
            }
            String string = sb.toString();
            // Build the "equivalent" MultiPhraseQuery: '*' matches any of the
            // three terms at that position:
            MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
            for (int j = 0; j < string.length(); j++) {
                if (string.charAt(j) == '*') {
                    mpqb.add(allTerms);
                } else {
                    mpqb.add(new Term("field", "" + string.charAt(j)));
                }
            }
            bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
            strings.add(new BytesRef(string));
        }
        // makeStringUnion requires its input to be sorted:
        List<BytesRef> stringsList = new ArrayList<>(strings);
        Collections.sort(stringsList);
        Automaton a = Automata.makeStringUnion(stringsList);
        // Translate automaton to query:
        TermAutomatonQuery q = new TermAutomatonQuery("field");
        int numStates = a.getNumStates();
        for (int i = 0; i < numStates; i++) {
            q.createState();
            q.setAccept(i, a.isAccept(i));
        }
        Transition t = new Transition();
        for (int i = 0; i < numStates; i++) {
            int transCount = a.initTransition(i, t);
            for (int j = 0; j < transCount; j++) {
                a.getNextTransition(t);
                // Expand each [min..max] label range into one transition per
                // label; '*' becomes an "any term" transition:
                for (int label = t.min; label <= t.max; label++) {
                    if ((char) label == '*') {
                        q.addAnyTransition(t.source, t.dest);
                    } else {
                        q.addTransition(t.source, t.dest, "" + (char) label);
                    }
                }
            }
        }
        q.finish();
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter);
            for (BytesRef string : stringsList) {
                System.out.println("  string: " + string.utf8ToString());
            }
            System.out.println(q.toDot());
        }
        Query q1 = q;
        Query q2 = bq.build();
        // Sometimes wrap both queries in the same random filter; the results
        // must still agree:
        if (random().nextInt(5) == 1) {
            if (VERBOSE) {
                System.out.println("  use random filter");
            }
            RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
            q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
            q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
        }
        TopDocs hits1 = s.search(q1, numDocs);
        TopDocs hits2 = s.search(q2, numDocs);
        Set<String> hits1Docs = toDocIDs(s, hits1);
        Set<String> hits2Docs = toDocIDs(s, hits2);
        try {
            assertEquals(hits2.totalHits, hits1.totalHits);
            assertEquals(hits2Docs, hits1Docs);
        } catch (AssertionError ae) {
            // Print the symmetric difference of the two hit sets before
            // rethrowing, to make failures debuggable:
            System.out.println("FAILED:");
            for (String id : hits1Docs) {
                if (hits2Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s matched but should not have", id));
                }
            }
            for (String id : hits2Docs) {
                if (hits1Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s did not match but should have", id));
                }
            }
            throw ae;
        }
    }
    IOUtils.close(w, r, dir, analyzer);
}
Also used : MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) StoredField(org.apache.lucene.document.StoredField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) Term(org.apache.lucene.index.Term) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) IndexReader(org.apache.lucene.index.IndexReader) Transition(org.apache.lucene.util.automaton.Transition) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 44 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class BaseTokenStreamTestCase method getGraphStrings.

/** Collects every path accepted by the token-stream graph of the (already initialized) {@link TokenStream}, one space-separated string per path. */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
    Automaton graph = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    // -1 => no limit on the number of accepted strings
    Set<IntsRef> acceptedPaths = AutomatonTestUtil.getFiniteStringsRecursive(graph, -1);
    Set<String> result = new HashSet<>();
    BytesRefBuilder scratch = new BytesRefBuilder();
    for (IntsRef path : acceptedPaths) {
        String rendered = Util.toBytesRef(path, scratch).utf8ToString();
        // POS_SEP separates positions in the automaton; render it as a space.
        result.add(rendered.replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    return result;
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) Automaton(org.apache.lucene.util.automaton.Automaton) IntsRef(org.apache.lucene.util.IntsRef)

Example 45 with Automaton

use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.

the class SolrQueryParserBase method getWildcardQuery.

// called from parser
protected Query getWildcardQuery(String field, String termStr) throws SyntaxError {
    checkNullField(field);
    // *:* -> MatchAllDocsQuery
    if ("*".equals(termStr) && ("*".equals(field) || getExplicitField() == null)) {
        return newMatchAllDocsQuery();
    }
    FieldType fieldType = schema.getFieldType(field);
    termStr = analyzeIfMultitermTermText(field, termStr, fieldType);
    // can we use reversed wildcards in this field?
    ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
    if (factory == null) {
        // Solr has always used constant scoring for wildcard queries.  This should return constant scoring by default.
        return newWildcardQuery(new Term(field, termStr));
    }
    final Term term = new Term(field, termStr);
    // fsa representing the query
    Automaton automaton = WildcardQuery.toAutomaton(term);
    // TODO: we should likely use the automaton to calculate shouldReverse, too.
    if (factory.shouldReverse(termStr)) {
        // append the marker char, then reverse, to match the reversed form in the index
        automaton = Operations.reverse(Operations.concatenate(automaton, Automata.makeChar(factory.getMarkerChar())));
    } else {
        // reverse wildcardfilter is active: remove false positives
        // fsa representing false positives (markerChar*)
        Automaton falsePositives = Operations.concatenate(Automata.makeChar(factory.getMarkerChar()), Automata.makeAnyString());
        // subtract these away
        automaton = Operations.minus(automaton, falsePositives, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    return new AutomatonQuery(term, automaton) {

        // override toString so it's completely transparent
        @Override
        public String toString(String field) {
            String prefix = getField().equals(field) ? "" : getField() + ":";
            return prefix + term.text();
        }
    };
}
Also used : AutomatonQuery(org.apache.lucene.search.AutomatonQuery) Automaton(org.apache.lucene.util.automaton.Automaton) ReversedWildcardFilterFactory(org.apache.solr.analysis.ReversedWildcardFilterFactory) Term(org.apache.lucene.index.Term) FieldType(org.apache.solr.schema.FieldType)

Aggregations

Automaton (org.apache.lucene.util.automaton.Automaton)57 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)17 IntsRef (org.apache.lucene.util.IntsRef)13 BytesRef (org.apache.lucene.util.BytesRef)12 ArrayList (java.util.ArrayList)11 Directory (org.apache.lucene.store.Directory)8 HashSet (java.util.HashSet)7 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)7 Document (org.apache.lucene.document.Document)6 CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton)6 Transition (org.apache.lucene.util.automaton.Transition)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)5 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)5 CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)4 FiniteStringsIterator (org.apache.lucene.util.automaton.FiniteStringsIterator)4 LevenshteinAutomata (org.apache.lucene.util.automaton.LevenshteinAutomata)4 RegExp (org.apache.lucene.util.automaton.RegExp)4