Search in sources :

Example 31 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TestDuelingAnalyzers, the setUp method:

@Override
public void setUp() throws Exception {
    super.setUp();
    Automaton single = new Automaton();
    int initial = single.createState();
    int accept = single.createState();
    single.setAccept(accept, true);
    // Build an automaton matching this JVM's Character.isLetter definition.
    // Coalesce maximal runs of consecutive letter code points into single
    // range transitions instead of one transition per code point: the
    // accepted language is identical, but the automaton is built with far
    // fewer transitions (dozens of ranges vs. ~130k single-point edges).
    int runStart = -1;
    for (int i = 0; i <= 0x10FFFF; i++) {
        if (Character.isLetter(i)) {
            if (runStart == -1) {
                runStart = i; // open a new run of letters
            }
        } else if (runStart != -1) {
            // Run ended at i-1; emit it as one range transition.
            single.addTransition(initial, accept, runStart, i - 1);
            runStart = -1;
        }
    }
    // Flush a run that extends to the last code point.
    if (runStart != -1) {
        single.addTransition(initial, accept, runStart, 0x10FFFF);
    }
    Automaton repeat = Operations.repeat(single);
    jvmLetter = new CharacterRunAutomaton(repeat);
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 32 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TestIndexWriter, the testStopwordsPosIncHole2 method:

// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
    // use two stopfilters for testing here
    Directory dir = newDirectory();
    final Automaton secondSet = Automata.makeString("foobar");
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
            stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
    Document doc = new Document();
    doc.add(new TextField("body", "just a foobar", Field.Store.NO));
    doc.add(new TextField("body", "test of gaps", Field.Store.NO));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("body", "just"), 0);
    builder.add(new Term("body", "test"), 3);
    PhraseQuery pq = builder.build();
    // body:"just ? ? test"
    assertEquals(1, is.search(pq, 5).totalHits);
    ir.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) PhraseQuery(org.apache.lucene.search.PhraseQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TextField(org.apache.lucene.document.TextField) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)

Example 33 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class QueryParserTestBase, the testBoost method:

public void testBoost() throws Exception {
    // Analyzer whose only stopword is "on".
    CharacterRunAutomaton onStopword = new CharacterRunAutomaton(Automata.makeString("on"));
    Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, onStopword);
    CommonQueryParserConfiguration parser = getParserConfig(oneStopAnalyzer);

    // A boosted stopword still parses to a non-null query.
    Query query = getQuery("on^1.0", parser);
    assertNotNull(query);

    // Boost factors survive parsing for both quoted and bare terms.
    query = getQuery("\"hello\"^2.0", parser);
    assertNotNull(query);
    assertEquals(((BoostQuery) query).getBoost(), (float) 2.0, (float) 0.5);

    query = getQuery("hello^2.0", parser);
    assertNotNull(query);
    assertEquals(((BoostQuery) query).getBoost(), (float) 2.0, (float) 0.5);

    query = getQuery("\"on\"^1.0", parser);
    assertNotNull(query);

    // With the full English stop set, "the" is a stop word so the result
    // is an empty (match-no-docs) query and the boost is dropped.
    Analyzer englishStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    CommonQueryParserConfiguration englishParser = getParserConfig(englishStopAnalyzer);
    query = getQuery("the^3", englishParser);
    assertNotNull(query);
    assertMatchNoDocsQuery(query);
    assertFalse(query instanceof BoostQuery);
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) CommonQueryParserConfiguration(org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration)

Example 34 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class QueryParserTestBase, the testStopwords method:

public void testStopwords() throws Exception {
    // Analyzer that stops exactly the words "the" and "foo".
    CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    CommonQueryParserConfiguration parser = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords));

    // Both clauses are stopwords: the parse collapses to an empty query.
    Query parsed = getQuery("field:the OR field:foo", parser);
    assertNotNull("result is null and it shouldn't be", parsed);
    assertTrue("result is not a BooleanQuery", parsed instanceof BooleanQuery || parsed instanceof MatchNoDocsQuery);
    if (parsed instanceof BooleanQuery) {
        assertEquals(0, ((BooleanQuery) parsed).clauses().size());
    }

    // One real term plus one stopword: only the term survives.
    parsed = getQuery("field:woo OR field:the", parser);
    assertNotNull("result is null and it shouldn't be", parsed);
    assertTrue("result is not a TermQuery", parsed instanceof TermQuery);

    // Boosted sub-clause with stopwords mixed in: the boost wraps a
    // BooleanQuery whose stopword-only branches have been pruned.
    parsed = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", parser);
    assertNotNull("result is null and it shouldn't be", parsed);
    assertTrue("result is not a BoostQuery", parsed instanceof BoostQuery);
    parsed = ((BoostQuery) parsed).getQuery();
    assertTrue("result is not a BooleanQuery", parsed instanceof BooleanQuery);
    if (VERBOSE) {
        System.out.println("Result: " + parsed);
    }
    assertTrue(((BooleanQuery) parsed).clauses().size() + " does not equal: " + 2, ((BooleanQuery) parsed).clauses().size() == 2);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) CommonQueryParserConfiguration(org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration)

Example 35 with CharacterRunAutomaton

use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project SearchServices by Alfresco.

From the class MinHashFilterTest, the createMockShingleTokenizer method:

/**
 * Creates a whitespace-delimited MockTokenizer that emits one token per
 * shingle of {@code shingleSize} words.
 *
 * <p>Bug fix: the repetition count in the token-matching regex was
 * hardcoded to {@code {4}} (i.e. always 5 words per token), silently
 * ignoring the {@code shingleSize} parameter. It is now derived from
 * {@code shingleSize - 1} so a token matches exactly
 * {@code shingleSize} whitespace-separated words.
 *
 * @param shingleSize number of words per emitted shingle token; must be >= 1
 * @param shingles    input text to tokenize, or {@code null} to skip setting a reader
 * @return the configured tokenizer (with stream checks enabled)
 */
public static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
    if (shingleSize < 1) {
        throw new IllegalArgumentException("shingleSize must be >= 1, got " + shingleSize);
    }
    // One word, followed by (shingleSize - 1) whitespace-separated words.
    String tokenPattern = "[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}";
    MockTokenizer tokenizer = new MockTokenizer(new CharacterRunAutomaton(new RegExp(tokenPattern).toAutomaton()), true);
    tokenizer.setEnableChecks(true);
    if (shingles != null) {
        tokenizer.setReader(new StringReader(shingles));
    }
    return tokenizer;
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) StringReader(java.io.StringReader)

Aggregations

CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)36 RegExp (org.apache.lucene.util.automaton.RegExp)15 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)10 ArrayList (java.util.ArrayList)7 Term (org.apache.lucene.index.Term)7 PhraseQuery (org.apache.lucene.search.PhraseQuery)6 Query (org.apache.lucene.search.Query)6 TermQuery (org.apache.lucene.search.TermQuery)6 List (java.util.List)5 Analyzer (org.apache.lucene.analysis.Analyzer)5 BooleanQuery (org.apache.lucene.search.BooleanQuery)5 BoostQuery (org.apache.lucene.search.BoostQuery)5 Automaton (org.apache.lucene.util.automaton.Automaton)5 HashMap (java.util.HashMap)4 Map (java.util.Map)4 Document (org.apache.lucene.document.Document)4 FuzzyQuery (org.apache.lucene.search.FuzzyQuery)4 PrefixQuery (org.apache.lucene.search.PrefixQuery)4 TermRangeQuery (org.apache.lucene.search.TermRangeQuery)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)3