Search in sources:

Example 1 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project crate by crate.

the class RegexpMatchOperator method evaluate.

/**
 * Evaluates the regular-expression match operator on a source value and a pattern.
 *
 * <p>When {@code isPcrePattern} reports {@code true} for the pattern, matching is delegated to
 * {@link String#matches(String)}; otherwise the pattern is compiled with Lucene's {@code RegExp}
 * and run directly against the source bytes.
 *
 * @param args exactly two inputs: the source value followed by the pattern
 * @return {@code null} if the source or the pattern value is {@code null}, otherwise whether the
 *         source matches the pattern
 */
@Override
public Boolean evaluate(Input<BytesRef>... args) {
    assert args.length == 2 : "invalid number of arguments";

    final BytesRef text = args[0].value();
    if (text == null) {
        return null;
    }

    final BytesRef rawPattern = args[1].value();
    if (rawPattern == null) {
        return null;
    }

    final String patternString = rawPattern.utf8ToString();
    if (!isPcrePattern(patternString)) {
        // Lucene path: build a byte-level automaton and run it over the source's byte slice.
        final ByteRunAutomaton matcher = new ByteRunAutomaton(new RegExp(patternString).toAutomaton());
        return matcher.run(text.bytes, text.offset, text.length);
    }
    // PCRE path: fall back to java.util.regex via String.matches.
    return text.utf8ToString().matches(patternString);
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) ByteRunAutomaton(org.apache.lucene.util.automaton.ByteRunAutomaton) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project elasticsearch by elastic.

the class SignificantTermsTests method createTestAggregatorBuilder.

/**
 * Builds a randomly configured {@link SignificantTermsAggregationBuilder}.
 *
 * <p>Every optional setting is applied only when a coin flip ({@code randomBoolean()}) says so.
 * The random draws are made in a fixed order, so a given seed always yields the same builder.
 *
 * @return the randomly configured builder
 */
@Override
protected SignificantTermsAggregationBuilder createTestAggregatorBuilder() {
    final String aggName = randomAsciiOfLengthBetween(3, 20);
    final SignificantTermsAggregationBuilder builder = new SignificantTermsAggregationBuilder(aggName, null);
    final String fieldName = randomAsciiOfLengthBetween(3, 20);

    // Value source: field only, field plus value script, or script only.
    final int sourceKind = randomInt(2);
    if (sourceKind == 0) {
        builder.field(fieldName);
    } else if (sourceKind == 1) {
        builder.field(fieldName);
        builder.script(new Script("_value + 1"));
    } else if (sourceKind == 2) {
        builder.script(new Script("doc[" + fieldName + "] + 1"));
    } else {
        fail();
    }

    if (randomBoolean()) {
        builder.missing("MISSING");
    }
    if (randomBoolean()) {
        builder.bucketCountThresholds().setRequiredSize(randomIntBetween(1, Integer.MAX_VALUE));
    }
    if (randomBoolean()) {
        builder.bucketCountThresholds().setShardSize(randomIntBetween(1, Integer.MAX_VALUE));
    }

    if (randomBoolean()) {
        // A first draw of 0 is kept as-is; draws 1..4 are replaced by a second, unbounded draw.
        int minDocCount = randomInt(4);
        if (minDocCount >= 1 && minDocCount <= 4) {
            minDocCount = randomIntBetween(0, Integer.MAX_VALUE);
        }
        builder.bucketCountThresholds().setMinDocCount(minDocCount);
    }

    if (randomBoolean()) {
        // Same scheme as above, but an out-of-range first draw fails the test.
        int shardMinDocCount = randomInt(4);
        if (shardMinDocCount < 0 || shardMinDocCount > 4) {
            fail();
        } else if (shardMinDocCount != 0) {
            shardMinDocCount = randomIntBetween(0, Integer.MAX_VALUE);
        }
        builder.bucketCountThresholds().setShardMinDocCount(shardMinDocCount);
    }

    if (randomBoolean()) {
        builder.executionHint(randomFrom(executionHints));
    }
    if (randomBoolean()) {
        builder.format("###.##");
    }
    if (randomBoolean()) {
        builder.includeExclude(pickIncludeExclude());
    }
    if (randomBoolean()) {
        builder.significanceHeuristic(pickSignificanceHeuristic());
    }
    if (randomBoolean()) {
        builder.backgroundFilter(QueryBuilders.termsQuery("foo", "bar"));
    }
    return builder;
}

/** Picks one of six include/exclude shapes: regex-based (0-2) or explicit term sets (3-5). */
private IncludeExclude pickIncludeExclude() {
    final int variant = randomInt(5);
    if (variant == 0) {
        return new IncludeExclude(new RegExp("foobar"), null);
    }
    if (variant == 1) {
        return new IncludeExclude(null, new RegExp("foobaz"));
    }
    if (variant == 2) {
        return new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz"));
    }
    if (variant == 3) {
        // Includes only; the typed null local selects the SortedSet-based constructor.
        final SortedSet<BytesRef> includes = randomTermSet();
        final SortedSet<BytesRef> excludes = null;
        return new IncludeExclude(includes, excludes);
    }
    if (variant == 4) {
        // Excludes only.
        final SortedSet<BytesRef> includes = null;
        final SortedSet<BytesRef> excludes = randomTermSet();
        return new IncludeExclude(includes, excludes);
    }
    if (variant == 5) {
        // Both sets; includes are generated before excludes.
        final SortedSet<BytesRef> includes = randomTermSet();
        final SortedSet<BytesRef> excludes = randomTermSet();
        return new IncludeExclude(includes, excludes);
    }
    fail();
    return null;
}

/** Creates a sorted set filled from 1 to 20 random ASCII terms of length 1 to 30. */
private SortedSet<BytesRef> randomTermSet() {
    final SortedSet<BytesRef> terms = new TreeSet<>();
    int remaining = randomIntBetween(1, 20);
    while (remaining-- > 0) {
        terms.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
    }
    return terms;
}

/** Picks one of the six significance heuristics, with random constructor flags where needed. */
private SignificanceHeuristic pickSignificanceHeuristic() {
    final int variant = randomInt(5);
    if (variant == 0) {
        return new PercentageScore();
    } else if (variant == 1) {
        return new ChiSquare(randomBoolean(), randomBoolean());
    } else if (variant == 2) {
        return new GND(randomBoolean());
    } else if (variant == 3) {
        return new MutualInformation(randomBoolean(), randomBoolean());
    } else if (variant == 4) {
        return new ScriptHeuristic(new Script("foo"));
    } else if (variant == 5) {
        return new JLHScore();
    }
    fail();
    return null;
}
Also used : Script(org.elasticsearch.script.Script) JLHScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore) ChiSquare(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare) RegExp(org.apache.lucene.util.automaton.RegExp) IncludeExclude(org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude) PercentageScore(org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore) TreeSet(java.util.TreeSet) ScriptHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.ScriptHeuristic) SignificanceHeuristic(org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic) MutualInformation(org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation) SignificantTermsAggregationBuilder(org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregationBuilder) GND(org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND) BytesRef(org.apache.lucene.util.BytesRef)

Example 3 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class LuceneTestCase method assertTermsEquals.

/**
 * Terms API equivalency: asserts that two {@code Terms} instances behave the same.
 *
 * <p>If either side is {@code null}, both must be {@code null}. Otherwise the statistics, the
 * offsets/positions/payloads flags, the full term enumeration and seeking are compared. When
 * {@code deep} is set, intersections with random regular-expression automata are compared too.
 *
 * @param info       message passed to the underlying assertions
 * @param leftReader reader handed to the terms-enum comparison
 * @param leftTerms  left-hand terms, may be {@code null}
 * @param rightTerms right-hand terms, may be {@code null}
 * @param deep       whether to additionally compare automaton intersections
 * @throws IOException declared for the underlying index calls
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
    final boolean eitherMissing = leftTerms == null || rightTerms == null;
    if (eitherMissing) {
        // One side is absent, so the other must be absent as well.
        assertNull(info, leftTerms);
        assertNull(info, rightTerms);
        return;
    }

    assertTermsStatisticsEquals(info, leftTerms, rightTerms);
    assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
    assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
    assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

    assertTermsEnumEquals(info, leftReader, leftTerms.iterator(), rightTerms.iterator(), true);
    assertTermsSeekingEquals(info, leftTerms, rightTerms);

    if (!deep) {
        return;
    }

    for (int remaining = atLeast(3); remaining > 0; remaining--) {
        final String pattern = AutomatonTestUtil.randomRegexp(random());
        final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(pattern, RegExp.NONE).toAutomaton());
        if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
            // Only NORMAL automata are intersected here.
            continue;
        }
        // TODO: test start term too
        final TermsEnum leftHits = leftTerms.intersect(compiled, null);
        final TermsEnum rightHits = rightTerms.intersect(compiled, null);
        assertTermsEnumEquals(info, leftReader, leftHits, rightHits, rarely());
    }
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton)

Example 4 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class SynonymTokenizer method testMaxSizeEndHighlight.

/**
 * Checks that, with the analyzed-character limit set to 36 and a {@code NullFragmenter}, the best
 * fragment still ends with the text that follows the highlighted query term.
 */
public void testMaxSizeEndHighlight() throws Exception {
    new TestHighlightRunner() {

        @Override
        public void run() throws Exception {
            // Stop words are everything matched by the regexp "i[nt]".
            CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
            TermQuery termQuery = new TermQuery(new Term("text", "searchterm"));
            String content = "this is a text with searchterm in it";

            Highlighter highlighter = getHighlighter(termQuery, "text", new SimpleHTMLFormatter());
            highlighter.setTextFragmenter(new NullFragmenter());
            highlighter.setMaxDocCharsToAnalyze(36);

            MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
            String bestFragment = highlighter.getBestFragment(analyzer, "text", content);
            assertTrue("Matched text should contain remainder of text after highlighted query ", bestFragment.endsWith("in it"));
        }
    }.start();
}
Also used : MultiTermQuery(org.apache.lucene.search.MultiTermQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) TermQuery(org.apache.lucene.search.TermQuery) TestHighlightRunner(org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Term(org.apache.lucene.index.Term)

Example 5 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class BaseDocValuesFormatTestCase method testSortedSetTermsEnum.

/**
 * Indexes one document with three sorted-set values and exercises the resulting
 * {@code TermsEnum}: iteration, {@code seekCeil}, {@code seekExact} by term and by ordinal, and
 * intersection with compiled automata.
 */
public void testSortedSetTermsEnum() throws IOException {
    Directory dir = newDirectory();
    Analyzer mockAnalyzer = new MockAnalyzer(random());
    IndexWriterConfig config = newIndexWriterConfig(mockAnalyzer);
    config.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);

    // Values are added out of order; the assertions below expect them back sorted.
    Document document = new Document();
    for (String value : new String[] { "hello", "world", "beer" }) {
        document.add(new SortedSetDocValuesField("field", new BytesRef(value)));
    }
    writer.addDocument(document);
    DirectoryReader reader = writer.getReader();
    writer.close();

    SortedSetDocValues values = getOnlyLeafReader(reader).getSortedSetDocValues("field");
    assertEquals(3, values.getValueCount());

    // Expected terms, indexed by their ordinal.
    final String[] sortedTerms = { "beer", "hello", "world" };
    TermsEnum termsEnum = values.termsEnum();

    // next()
    for (int ord = 0; ord < sortedTerms.length; ord++) {
        assertEquals(sortedTerms[ord], termsEnum.next().utf8ToString());
        assertEquals(ord, termsEnum.ord());
    }

    // seekCeil()
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));

    // seekExact()
    for (int ord = 0; ord < sortedTerms.length; ord++) {
        assertTrue(termsEnum.seekExact(new BytesRef(sortedTerms[ord])));
        assertEquals(sortedTerms[ord], termsEnum.term().utf8ToString());
        assertEquals(ord, termsEnum.ord());
    }
    assertFalse(termsEnum.seekExact(new BytesRef("bogus")));

    // seek(ord)
    for (int ord = 0; ord < sortedTerms.length; ord++) {
        termsEnum.seekExact(ord);
        assertEquals(sortedTerms[ord], termsEnum.term().utf8ToString());
        assertEquals(ord, termsEnum.ord());
    }

    // NORMAL automaton
    CompiledAutomaton containsL = new CompiledAutomaton(new RegExp(".*l.*").toAutomaton());
    termsEnum = values.intersect(containsL);
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals("world", termsEnum.next().utf8ToString());
    assertEquals(2, termsEnum.ord());
    assertNull(termsEnum.next());

    // SINGLE automaton
    CompiledAutomaton exactlyHello = new CompiledAutomaton(new RegExp("hello").toAutomaton());
    termsEnum = values.intersect(exactlyHello);
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertNull(termsEnum.next());

    reader.close();
    dir.close();
}
Also used : RegExp(org.apache.lucene.util.automaton.RegExp) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Aggregations

RegExp (org.apache.lucene.util.automaton.RegExp): 30, CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton): 15, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 11, Document (org.apache.lucene.document.Document): 9, Directory (org.apache.lucene.store.Directory): 9, BytesRef (org.apache.lucene.util.BytesRef): 9, CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 9, Analyzer (org.apache.lucene.analysis.Analyzer): 5, Automaton (org.apache.lucene.util.automaton.Automaton): 5, Term (org.apache.lucene.index.Term): 4, IndexReader (org.apache.lucene.index.IndexReader): 3, PhraseQuery (org.apache.lucene.search.PhraseQuery): 3, TermQuery (org.apache.lucene.search.TermQuery): 3, StringReader (java.io.StringReader): 2, TreeSet (java.util.TreeSet): 2, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 2, IndexWriter (org.apache.lucene.index.IndexWriter): 2, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 2, TermsEnum (org.apache.lucene.index.TermsEnum): 2, CommonQueryParserConfiguration (org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration): 2