Search in sources :

Example 26 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class TestMockAnalyzer method testTwoChars.

/**
 * Tests a configuration where every two characters make a term.
 *
 * <p>The analyzer is built from the regular expression {@code ..}, so the six-character input
 * {@code "foobar"} is expected to yield the three terms {@code "fo"}, {@code "ob"} and
 * {@code "ar"}. A second check feeds the five-character input {@code "fooba"}, whose trailing
 * {@code "a"} cannot complete a two-character term.
 *
 * @throws Exception if any of the analysis helpers fails
 */
public void testTwoChars() throws Exception {
    CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("..").toAutomaton());
    Analyzer a = new MockAnalyzer(random(), single, false);
    // The two int arrays are presumably the expected start and end offsets of each term
    // (TODO confirm against assertAnalyzesTo's signature).
    assertAnalyzesTo(a, "foobar", new String[] { "fo", "ob", "ar" }, new int[] { 0, 2, 4 }, new int[] { 2, 4, 6 });
    // make sure when last term is a "partial" match that end() is correct
    // Integer.valueOf replaces the deprecated Integer(int) constructor; the argument is still an
    // Integer, so the same overload is chosen. 5 is the length of the input "fooba".
    assertTokenStreamContents(a.tokenStream("bogus", "fooba"), new String[] { "fo", "ob" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 }, Integer.valueOf(5));
    checkRandomData(random(), a, 100);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 27 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class TestMockAnalyzer method testUppercase.

/**
 * Tests a configuration where each term is one uppercase letter followed by zero or more
 * lowercase letters (regular expression {@code [A-Z][a-z]*}).
 *
 * @throws Exception if any of the analysis helpers fails
 */
public void testUppercase() throws Exception {
    CharacterRunAutomaton capitalizedWord = new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
    Analyzer analyzer = new MockAnalyzer(random(), capitalizedWord, false);

    // Consecutive capitals each form their own one-letter term.
    // The int arrays are presumably start/end offsets (TODO confirm against assertAnalyzesTo).
    String[] mixedCaseTerms = { "Foo", "Bar", "B", "A", "Z" };
    int[] mixedCaseStarts = { 0, 3, 6, 7, 8 };
    int[] mixedCaseEnds = { 3, 6, 7, 8, 9 };
    assertAnalyzesTo(analyzer, "FooBarBAZ", mixedCaseTerms, mixedCaseStarts, mixedCaseEnds);

    // The leading lowercase "a" does not appear in the expected terms.
    String[] skippedPrefixTerms = { "Foo", "Bar" };
    int[] skippedPrefixStarts = { 1, 4 };
    int[] skippedPrefixEnds = { 4, 7 };
    assertAnalyzesTo(analyzer, "aFooBar", skippedPrefixTerms, skippedPrefixStarts, skippedPrefixEnds);

    checkRandomData(random(), analyzer, 100);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 28 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class QueryParserTestBase method testStopwords.

/**
 * Checks the shape of parsed queries when some or all terms match the automaton built from
 * {@code the|foo}, which is handed to {@link MockAnalyzer} as its last constructor argument
 * (presumably acting as a stop-word filter; confirm against MockAnalyzer).
 *
 * <p>Three queries are parsed: one whose terms are all {@code the}/{@code foo}, one with a
 * single other term, and one boosted conjunction of two groups.
 *
 * @throws Exception if parsing fails
 */
public void testStopwords() throws Exception {
    CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
    CommonQueryParserConfiguration parserConfig = getParserConfig(analyzer);

    // Both terms match the automaton: expect an empty BooleanQuery or a MatchNoDocsQuery.
    Query allStopped = getQuery("field:the OR field:foo", parserConfig);
    assertNotNull("result is null and it shouldn't be", allStopped);
    boolean isBooleanQuery = allStopped instanceof BooleanQuery;
    assertTrue("result is not a BooleanQuery", isBooleanQuery || allStopped instanceof MatchNoDocsQuery);
    if (isBooleanQuery) {
        BooleanQuery emptyBoolean = (BooleanQuery) allStopped;
        assertEquals(0, emptyBoolean.clauses().size());
    }

    // Only "woo" is outside the automaton: expect a plain TermQuery.
    Query singleTerm = getQuery("field:woo OR field:the", parserConfig);
    assertNotNull("result is null and it shouldn't be", singleTerm);
    assertTrue("result is not a TermQuery", singleTerm instanceof TermQuery);

    // Boosted conjunction: expect a BoostQuery wrapping a two-clause BooleanQuery.
    Query boosted = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", parserConfig);
    assertNotNull("result is null and it shouldn't be", boosted);
    assertTrue("result is not a BoostQuery", boosted instanceof BoostQuery);
    Query wrapped = ((BoostQuery) boosted).getQuery();
    assertTrue("result is not a BooleanQuery", wrapped instanceof BooleanQuery);
    if (VERBOSE) {
        System.out.println("Result: " + wrapped);
    }
    int clauseCount = ((BooleanQuery) wrapped).clauses().size();
    assertTrue(clauseCount + " does not equal: " + 2, clauseCount == 2);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) CommonQueryParserConfiguration(org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration)

Example 29 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project SearchServices by Alfresco.

the class MinHashFilterTest method createMockShingleTokenizer.

/**
 * Creates a {@link MockTokenizer} whose tokens are shingles of {@code shingleSize}
 * whitespace-separated words.
 *
 * <p>The token pattern is one word (a run of characters other than space, tab, CR or LF)
 * followed by exactly {@code shingleSize - 1} groups of "whitespace run + word". Earlier
 * revisions hard-coded that repetition count to 4 and ignored {@code shingleSize}; the result
 * is unchanged for {@code shingleSize == 5}.
 *
 * @param shingleSize number of words per shingle; must be at least 1
 * @param shingles text to tokenize, or {@code null} to leave the tokenizer without a reader
 * @return the configured tokenizer, with checks enabled
 * @throws IllegalArgumentException if {@code shingleSize} is less than 1
 */
public static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
    if (shingleSize < 1) {
        throw new IllegalArgumentException("shingleSize must be >= 1, got " + shingleSize);
    }
    String word = "[^ \t\r\n]+";
    String gap = "[ \t\r\n]+";
    // One leading word plus (shingleSize - 1) "gap + word" groups gives shingleSize words.
    String shinglePattern = word + "(" + gap + word + "){" + (shingleSize - 1) + "}";
    MockTokenizer tokenizer = new MockTokenizer(new CharacterRunAutomaton(new RegExp(shinglePattern).toAutomaton()), true);
    tokenizer.setEnableChecks(true);
    if (shingles != null) {
        tokenizer.setReader(new StringReader(shingles));
    }
    return tokenizer;
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) StringReader(java.io.StringReader)

Example 30 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project crate by crate.

the class RegexpMatchOperator method evaluate.

/**
 * Matches the first argument against the regular expression given as the second argument.
 *
 * <p>If {@code isPcrePattern} accepts the pattern, matching is delegated to
 * {@link String#matches(String)}. Otherwise the pattern is compiled with Lucene's
 * {@link RegExp} and the resulting automaton is run over the UTF-8 bytes of the source string.
 *
 * @param txnCtx transaction context (not read by this method)
 * @param nodeCtx node context (not read by this method)
 * @param args exactly two inputs: the source string and the pattern
 * @return {@code null} if either input value is {@code null}; otherwise whether the source
 *     matches the pattern
 */
@Override
public Boolean evaluate(TransactionContext txnCtx, NodeContext nodeCtx, Input<String>[] args) {
    assert args.length == 2 : "invalid number of arguments";

    // Read and null-check the inputs one at a time, so the pattern input is never evaluated
    // when the source is null.
    final String subject = args[0].value();
    if (subject == null) {
        return null;
    }
    final String regex = args[1].value();
    if (regex == null) {
        return null;
    }

    if (!isPcrePattern(regex)) {
        ByteRunAutomaton matcher = new ByteRunAutomaton(new RegExp(regex).toAutomaton());
        byte[] utf8 = subject.getBytes(StandardCharsets.UTF_8);
        return matcher.run(utf8, 0, utf8.length);
    }
    return subject.matches(regex);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) ByteRunAutomaton(org.apache.lucene.util.automaton.ByteRunAutomaton)

Aggregations

RegExp (org.apache.lucene.util.automaton.RegExp) 30, CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton) 15, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) 11, Document (org.apache.lucene.document.Document) 9, Directory (org.apache.lucene.store.Directory) 9, BytesRef (org.apache.lucene.util.BytesRef) 9, CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton) 9, Analyzer (org.apache.lucene.analysis.Analyzer) 5, Automaton (org.apache.lucene.util.automaton.Automaton) 5, Term (org.apache.lucene.index.Term) 4, IndexReader (org.apache.lucene.index.IndexReader) 3, PhraseQuery (org.apache.lucene.search.PhraseQuery) 3, TermQuery (org.apache.lucene.search.TermQuery) 3, StringReader (java.io.StringReader) 2, TreeSet (java.util.TreeSet) 2, MockTokenizer (org.apache.lucene.analysis.MockTokenizer) 2, IndexWriter (org.apache.lucene.index.IndexWriter) 2, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter) 2, TermsEnum (org.apache.lucene.index.TermsEnum) 2, CommonQueryParserConfiguration (org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration) 2