Search in sources :

Example 11 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class TestTermsEnum method testIntersectBasic.

/**
 * Indexes three one-term documents ("aaa", "bbb", "ccc") into a single segment and
 * checks {@code Terms.intersect} against a match-everything automaton:
 * <ul>
 *   <li>with no start term, all three terms are enumerated in order;</li>
 *   <li>with a start term of "abc" (not indexed) or "aaa" (indexed), the test expects
 *       enumeration to begin at "bbb", i.e. strictly after the start term.</li>
 * </ul>
 * Each enumerated term is also expected to have its first posting on the document
 * that was indexed in the matching position (doc IDs 0, 1, 2).
 */
public void testIntersectBasic() throws Exception {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    // NOTE(review): presumably chosen so doc IDs follow insertion order, which the
    // doc ID assertions below depend on -- confirm.
    config.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

    Document first = new Document();
    first.add(newTextField("field", "aaa", Field.Store.NO));
    writer.addDocument(first);

    Document second = new Document();
    second.add(newStringField("field", "bbb", Field.Store.NO));
    writer.addDocument(second);

    Document third = new Document();
    third.add(newTextField("field", "ccc", Field.Store.NO));
    writer.addDocument(third);

    writer.forceMerge(1);
    DirectoryReader reader = writer.getReader();
    writer.close();

    LeafReader leaf = getOnlyLeafReader(reader);
    Terms fieldTerms = leaf.fields().terms("field");
    CompiledAutomaton matchAll =
        new CompiledAutomaton(new RegExp(".*", RegExp.NONE).toAutomaton(), false, false);

    // Index i in this array is both the i-th term in sort order and the doc ID holding it.
    String[] indexedTerms = { "aaa", "bbb", "ccc" };

    // No start term: every indexed term is visited.
    TermsEnum termsEnum = fieldTerms.intersect(matchAll, null);
    for (int docID = 0; docID < indexedTerms.length; docID++) {
        assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
        assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
    }
    assertNull(termsEnum.next());

    // Start term between "aaa" and "bbb", then exactly "aaa": both resume at "bbb".
    for (String startTerm : new String[] { "abc", "aaa" }) {
        termsEnum = fieldTerms.intersect(matchAll, new BytesRef(startTerm));
        for (int docID = 1; docID < indexedTerms.length; docID++) {
            assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
            assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
        }
        assertNull(termsEnum.next());
    }

    reader.close();
    directory.close();
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) RegExp(org.apache.lucene.util.automaton.RegExp) Document(org.apache.lucene.document.Document) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Example 12 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class TestAutomatonQueryUnicode method testSortOrder.

/**
 * Verifies that AutomatonQuery works correctly with Lucene's term sort order.
 *
 * <p>The regular expression accepts any term that begins with either a
 * supplementary character or a character from the Arabic presentation
 * forms block; the test expects exactly two hits for it.
 */
public void testSortOrder() throws IOException {
    RegExp pattern = new RegExp("((𩬅)|ﮔ).*");
    assertAutomatonHits(2, pattern.toAutomaton());
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) RegExp(org.apache.lucene.util.automaton.RegExp)

Example 13 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project elasticsearch by elastic.

the class TermsTests method createTestAggregatorBuilder.

/**
 * Builds a randomly configured {@link TermsAggregationBuilder}.
 *
 * <p>The name is always random; the field/script combination is chosen from three
 * variants, and every other option (missing value, sizes, min doc counts, collect
 * mode, execution hint, format, include/exclude, order, doc-count-error flag) is
 * set only when a random coin flip says so. The sequence of random draws is
 * significant for reproducibility, so the statements below are order-sensitive.
 *
 * @return the randomized builder
 */
@Override
protected TermsAggregationBuilder createTestAggregatorBuilder() {
    String aggName = randomAsciiOfLengthBetween(3, 20);
    TermsAggregationBuilder builder = new TermsAggregationBuilder(aggName, null);
    String fieldName = randomAsciiOfLengthBetween(3, 20);

    // Field only, field plus value script, or script only.
    int fieldBranch = randomInt(2);
    if (fieldBranch == 0) {
        builder.field(fieldName);
    } else if (fieldBranch == 1) {
        builder.field(fieldName);
        builder.script(new Script("_value + 1"));
    } else if (fieldBranch == 2) {
        builder.script(new Script("doc[" + fieldName + "] + 1"));
    } else {
        fail();
    }

    if (randomBoolean()) {
        builder.missing("MISSING");
    }
    if (randomBoolean()) {
        builder.size(randomIntBetween(1, Integer.MAX_VALUE));
    }
    if (randomBoolean()) {
        builder.shardSize(randomIntBetween(1, Integer.MAX_VALUE));
    }

    if (randomBoolean()) {
        // A draw of 0 is used as-is; draws 1-4 are replaced by an arbitrary non-negative count.
        int minDocCount = randomInt(4);
        if (minDocCount >= 1 && minDocCount <= 4) {
            minDocCount = randomIntBetween(0, Integer.MAX_VALUE);
        } else if (minDocCount != 0) {
            fail();
        }
        builder.minDocCount(minDocCount);
    }
    if (randomBoolean()) {
        // Same scheme as minDocCount above.
        int shardMinDocCount = randomInt(4);
        if (shardMinDocCount >= 1 && shardMinDocCount <= 4) {
            shardMinDocCount = randomIntBetween(0, Integer.MAX_VALUE);
        } else if (shardMinDocCount != 0) {
            fail();
        }
        builder.shardMinDocCount(shardMinDocCount);
    }

    if (randomBoolean()) {
        builder.collectMode(randomFrom(SubAggCollectionMode.values()));
    }
    if (randomBoolean()) {
        builder.executionHint(randomFrom(executionHints));
    }
    if (randomBoolean()) {
        builder.format("###.##");
    }

    if (randomBoolean()) {
        IncludeExclude includeExclude = null;
        switch (randomInt(6)) {
            case 0:
                // regex include only
                includeExclude = new IncludeExclude(new RegExp("foobar"), null);
                break;
            case 1:
                // regex exclude only
                includeExclude = new IncludeExclude(null, new RegExp("foobaz"));
                break;
            case 2:
                // regex include and exclude
                includeExclude = new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz"));
                break;
            case 3:
                // exact-value include only; the cast selects the SortedSet constructor
                includeExclude =
                    new IncludeExclude(randomIncludeExcludeValues(), (SortedSet<BytesRef>) null);
                break;
            case 4:
                // exact-value exclude only; the cast selects the SortedSet constructor
                includeExclude =
                    new IncludeExclude((SortedSet<BytesRef>) null, randomIncludeExcludeValues());
                break;
            case 5:
                // exact-value include and exclude (arguments are evaluated left to right,
                // so the include set is drawn first)
                includeExclude =
                    new IncludeExclude(randomIncludeExcludeValues(), randomIncludeExcludeValues());
                break;
            case 6:
                // partition-based filtering
                final int numPartitions = randomIntBetween(1, 100);
                final int partition = randomIntBetween(0, numPartitions - 1);
                includeExclude = new IncludeExclude(partition, numPartitions);
                break;
            default:
                fail();
        }
        builder.includeExclude(includeExclude);
    }

    if (randomBoolean()) {
        List<Terms.Order> randomizedOrder = randomOrder();
        builder.order(randomizedOrder);
    }
    if (randomBoolean()) {
        builder.showTermDocCountError(randomBoolean());
    }
    return builder;
}

/**
 * Creates a sorted set from between 1 and 20 random ASCII values of length 1 to 30.
 * Duplicate draws collapse, so the resulting set may hold fewer entries than were drawn.
 */
private SortedSet<BytesRef> randomIncludeExcludeValues() {
    SortedSet<BytesRef> values = new TreeSet<>();
    int draws = randomIntBetween(1, 20);
    for (int n = 0; n < draws; n++) {
        values.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
    }
    return values;
}
Also used : Script(org.elasticsearch.script.Script) TermsAggregationBuilder(org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder) RegExp(org.apache.lucene.util.automaton.RegExp) TreeSet(java.util.TreeSet) IncludeExclude(org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude) BytesRef(org.apache.lucene.util.BytesRef)

Example 14 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project elasticsearch by elastic.

the class StringTermsIT method testSingleValueFieldWithRegexFiltering.

/**
 * Runs a terms aggregation on the single-valued field of the "high_card_type"
 * documents three times, each with a different regex include/exclude
 * configuration, and checks which "val00N" buckets come back (each with a
 * document count of 1).
 */
public void testSingleValueFieldWithRegexFiltering() throws Exception {
    // include without exclude
    // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
    SearchResponse includeOnlyResponse = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude("val00.+", null)))
            .execute()
            .actionGet();
    assertSearchResponse(includeOnlyResponse);
    Terms includeOnlyAgg = includeOnlyResponse.getAggregations().get("terms");
    assertVal00Buckets(includeOnlyAgg, 0, 10);

    // include and exclude
    // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
    SearchResponse includeExcludeResponse = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude("val00.+", "(val000|val001)")))
            .execute()
            .actionGet();
    assertSearchResponse(includeExcludeResponse);
    Terms includeExcludeAgg = includeExcludeResponse.getAggregations().get("terms");
    assertVal00Buckets(includeExcludeAgg, 2, 8);

    // exclude without include
    // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
    SearchResponse excludeOnlyResponse = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude(null, new RegExp("val0[1-9]+.+"))))
            .execute()
            .actionGet();
    assertSearchResponse(excludeOnlyResponse);
    Terms excludeOnlyAgg = excludeOnlyResponse.getAggregations().get("terms");
    assertVal00Buckets(excludeOnlyAgg, 0, 10);
}

/**
 * Asserts that {@code agg} is a non-null aggregation named "terms" with exactly
 * {@code expectedBucketCount} buckets, and that for every suffix from
 * {@code firstSuffix} up to 9 a bucket keyed "val00" + suffix exists with a
 * document count of 1.
 */
private void assertVal00Buckets(Terms agg, int firstSuffix, int expectedBucketCount) throws Exception {
    assertThat(agg, notNullValue());
    assertThat(agg.getName(), equalTo("terms"));
    assertThat(agg.getBuckets().size(), equalTo(expectedBucketCount));
    for (int suffix = firstSuffix; suffix < 10; suffix++) {
        Terms.Bucket bucket = agg.getBucketByKey("val00" + suffix);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val00" + suffix));
        assertThat(bucket.getDocCount(), equalTo(1L));
    }
}
Also used : Bucket(org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket) RegExp(org.apache.lucene.util.automaton.RegExp) IncludeExclude(org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude) Terms(org.elasticsearch.search.aggregations.bucket.terms.Terms) SearchResponse(org.elasticsearch.action.search.SearchResponse) ElasticsearchAssertions.assertSearchResponse(org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse)

Example 15 with RegExp

use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.

the class TestTermsEnum method testIntersectStartTerm.

/**
 * Indexes the terms "abc", "abd", "acd" and "bcd" (one per document, single segment)
 * and checks {@code Terms.intersect} with an automaton for {@code .*d} and several
 * start terms. In every case the test expects only matching terms that sort after the
 * start term, each with its first posting on the document indexed in that position.
 */
public void testIntersectStartTerm() throws Exception {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    // NOTE(review): presumably chosen so doc IDs follow insertion order, which the
    // doc ID assertions below depend on -- confirm.
    config.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

    // Position in this array == doc ID of the document holding the term.
    String[] indexedTerms = { "abc", "abd", "acd", "bcd" };
    for (String value : indexedTerms) {
        Document document = new Document();
        document.add(newStringField("field", value, Field.Store.NO));
        writer.addDocument(document);
    }
    writer.forceMerge(1);
    DirectoryReader reader = writer.getReader();
    writer.close();

    LeafReader leaf = getOnlyLeafReader(reader);
    Terms fieldTerms = leaf.fields().terms("field");
    CompiledAutomaton endsWithD =
        new CompiledAutomaton(new RegExp(".*d", RegExp.NONE).toAutomaton(), false, false);

    // should seek to startTerm
    TermsEnum termsEnum = fieldTerms.intersect(endsWithD, new BytesRef("aad"));
    for (int docID = 1; docID < indexedTerms.length; docID++) {
        // "abd", "acd", "bcd" on docs 1, 2, 3
        assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
        assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
    }
    assertNull(termsEnum.next());

    // should fail to find ceil label on second arc, rewind
    termsEnum = fieldTerms.intersect(endsWithD, new BytesRef("add"));
    assertEquals("bcd", termsEnum.next().utf8ToString());
    assertEquals(3, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(termsEnum.next());

    // should reach end
    for (String startTerm : new String[] { "bcd", "ddd" }) {
        termsEnum = fieldTerms.intersect(endsWithD, new BytesRef(startTerm));
        assertNull(termsEnum.next());
    }

    reader.close();
    directory.close();
}
Also used : Automaton(org.apache.lucene.util.automaton.Automaton) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) RegExp(org.apache.lucene.util.automaton.RegExp) Document(org.apache.lucene.document.Document) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Aggregations

RegExp (org.apache.lucene.util.automaton.RegExp) 30, CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton) 15, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) 11, Document (org.apache.lucene.document.Document) 9, Directory (org.apache.lucene.store.Directory) 9, BytesRef (org.apache.lucene.util.BytesRef) 9, CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton) 9, Analyzer (org.apache.lucene.analysis.Analyzer) 5, Automaton (org.apache.lucene.util.automaton.Automaton) 5, Term (org.apache.lucene.index.Term) 4, IndexReader (org.apache.lucene.index.IndexReader) 3, PhraseQuery (org.apache.lucene.search.PhraseQuery) 3, TermQuery (org.apache.lucene.search.TermQuery) 3, StringReader (java.io.StringReader) 2, TreeSet (java.util.TreeSet) 2, MockTokenizer (org.apache.lucene.analysis.MockTokenizer) 2, IndexWriter (org.apache.lucene.index.IndexWriter) 2, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter) 2, TermsEnum (org.apache.lucene.index.TermsEnum) 2, CommonQueryParserConfiguration (org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration) 2