Use of org.apache.lucene.util.automaton.RegExp in the lucene-solr project (Apache): class TestTermsEnum, method testIntersectBasic.
/**
 * Tests {@link Terms#intersect} with a match-all automaton (".*"):
 * with a null start term every indexed term is visited in order, and with a
 * non-null start term iteration resumes strictly after that term (whether or
 * not the start term itself exists in the index).
 */
public void testIntersectBasic() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    // LogDocMergePolicy preserves doc order, so the docIDs asserted below are stable.
    iwc.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    doc.add(newTextField("field", "aaa", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    // Was newStringField; use newTextField like the sibling docs so all three
    // values are indexed through the same field type.
    doc.add(newTextField("field", "bbb", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "ccc", Field.Store.NO));
    w.addDocument(doc);
    // Single segment so one leaf reader sees all terms.
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    LeafReader sub = getOnlyLeafReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
    // No start term: all terms returned in sorted order.
    TermsEnum te = terms.intersect(ca, null);
    assertEquals("aaa", te.next().utf8ToString());
    assertEquals(0, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    // Start term not in the index: iteration begins at the first term after it.
    te = terms.intersect(ca, new BytesRef("abc"));
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    // Start term present in the index: it is excluded (iteration is strictly after).
    te = terms.intersect(ca, new BytesRef("aaa"));
    assertEquals("bbb", te.next().utf8ToString());
    assertEquals(1, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertEquals("ccc", te.next().utf8ToString());
    assertEquals(2, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    r.close();
    dir.close();
}
Use of org.apache.lucene.util.automaton.RegExp in the lucene-solr project (Apache): class TestAutomatonQueryUnicode, method testSortOrder.
/**
 * Test that AutomatonQuery interacts with lucene's sort order correctly.
 *
 * The regular expression accepts strings beginning either with a character
 * from the Arabic presentation forms block or with a supplementary character,
 * and is expected to match exactly two documents.
 */
public void testSortOrder() throws IOException {
    RegExp regexp = new RegExp("((𩬅)|ﮔ).*");
    Automaton automaton = regexp.toAutomaton();
    assertAutomatonHits(2, automaton);
}
Use of org.apache.lucene.util.automaton.RegExp in the elasticsearch project (Elastic): class TermsTests, method createTestAggregatorBuilder.
// Builds a randomized TermsAggregationBuilder so round-trip (serialization /
// equality) tests cover a wide cross-section of the builder's configuration
// space. Each optional setting below is applied with ~50% probability.
@Override
protected TermsAggregationBuilder createTestAggregatorBuilder() {
String name = randomAsciiOfLengthBetween(3, 20);
// null value type: the aggregation resolves it later — TODO confirm against
// TermsAggregationBuilder's other test usages.
TermsAggregationBuilder factory = new TermsAggregationBuilder(name, null);
String field = randomAsciiOfLengthBetween(3, 20);
// Pick the value source: field only, field + value script, or script only
// (randomInt(2) yields 0, 1 or 2 inclusive; default is unreachable).
int randomFieldBranch = randomInt(2);
switch(randomFieldBranch) {
case 0:
factory.field(field);
break;
case 1:
factory.field(field);
factory.script(new Script("_value + 1"));
break;
case 2:
factory.script(new Script("doc[" + field + "] + 1"));
break;
default:
fail();
}
if (randomBoolean()) {
factory.missing("MISSING");
}
if (randomBoolean()) {
factory.size(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
factory.shardSize(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
// 1-in-5 chance of keeping minDocCount at 0; otherwise any non-negative value.
int minDocCount = randomInt(4);
switch(minDocCount) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
minDocCount = randomIntBetween(0, Integer.MAX_VALUE);
break;
default:
fail();
}
factory.minDocCount(minDocCount);
}
if (randomBoolean()) {
// Same 1-in-5 "keep zero" scheme as minDocCount above.
int shardMinDocCount = randomInt(4);
switch(shardMinDocCount) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
shardMinDocCount = randomIntBetween(0, Integer.MAX_VALUE);
break;
default:
fail();
}
factory.shardMinDocCount(shardMinDocCount);
}
if (randomBoolean()) {
factory.collectMode(randomFrom(SubAggCollectionMode.values()));
}
if (randomBoolean()) {
factory.executionHint(randomFrom(executionHints));
}
if (randomBoolean()) {
factory.format("###.##");
}
if (randomBoolean()) {
// Exercise every IncludeExclude flavor: regex include and/or exclude
// (cases 0-2), explicit term sets (cases 3-5), and partition-based
// filtering (case 6).
IncludeExclude incExc = null;
switch(randomInt(6)) {
case 0:
incExc = new IncludeExclude(new RegExp("foobar"), null);
break;
case 1:
incExc = new IncludeExclude(null, new RegExp("foobaz"));
break;
case 2:
incExc = new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz"));
break;
case 3:
// Include set only (exclude is null).
SortedSet<BytesRef> includeValues = new TreeSet<>();
int numIncs = randomIntBetween(1, 20);
for (int i = 0; i < numIncs; i++) {
includeValues.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
}
SortedSet<BytesRef> excludeValues = null;
incExc = new IncludeExclude(includeValues, excludeValues);
break;
case 4:
// Exclude set only (include is null).
SortedSet<BytesRef> includeValues2 = null;
SortedSet<BytesRef> excludeValues2 = new TreeSet<>();
int numExcs2 = randomIntBetween(1, 20);
for (int i = 0; i < numExcs2; i++) {
excludeValues2.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
}
incExc = new IncludeExclude(includeValues2, excludeValues2);
break;
case 5:
// Both include and exclude sets populated.
SortedSet<BytesRef> includeValues3 = new TreeSet<>();
int numIncs3 = randomIntBetween(1, 20);
for (int i = 0; i < numIncs3; i++) {
includeValues3.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
}
SortedSet<BytesRef> excludeValues3 = new TreeSet<>();
int numExcs3 = randomIntBetween(1, 20);
for (int i = 0; i < numExcs3; i++) {
excludeValues3.add(new BytesRef(randomAsciiOfLengthBetween(1, 30)));
}
incExc = new IncludeExclude(includeValues3, excludeValues3);
break;
case 6:
// Partition filtering: select one of numPartitions hash partitions.
final int numPartitions = randomIntBetween(1, 100);
final int partition = randomIntBetween(0, numPartitions - 1);
incExc = new IncludeExclude(partition, numPartitions);
break;
default:
fail();
}
factory.includeExclude(incExc);
}
if (randomBoolean()) {
List<Terms.Order> order = randomOrder();
factory.order(order);
}
if (randomBoolean()) {
factory.showTermDocCountError(randomBoolean());
}
return factory;
}
Use of org.apache.lucene.util.automaton.RegExp in the elasticsearch project (Elastic): class StringTermsIT, method testSingleValueFieldWithRegexFiltering.
/**
 * Verifies regex-based include/exclude filtering on a single-valued string
 * field: include only, include combined with exclude, and exclude only.
 */
public void testSingleValueFieldWithRegexFiltering() throws Exception {
    // include without exclude
    // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
    SearchResponse response = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude("val00.+", null)))
            .execute().actionGet();
    assertSearchResponse(response);
    Terms termsAgg = response.getAggregations().get("terms");
    assertThat(termsAgg, notNullValue());
    assertThat(termsAgg.getName(), equalTo("terms"));
    assertThat(termsAgg.getBuckets().size(), equalTo(10));
    for (int i = 0; i < 10; i++) {
        Terms.Bucket bucket = termsAgg.getBucketByKey("val00" + i);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val00" + i));
        assertThat(bucket.getDocCount(), equalTo(1L));
    }
    // include and exclude
    // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
    response = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude("val00.+", "(val000|val001)")))
            .execute().actionGet();
    assertSearchResponse(response);
    termsAgg = response.getAggregations().get("terms");
    assertThat(termsAgg, notNullValue());
    assertThat(termsAgg.getName(), equalTo("terms"));
    assertThat(termsAgg.getBuckets().size(), equalTo(8));
    for (int i = 2; i < 10; i++) {
        Terms.Bucket bucket = termsAgg.getBucketByKey("val00" + i);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val00" + i));
        assertThat(bucket.getDocCount(), equalTo(1L));
    }
    // exclude without include
    // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
    response = client().prepareSearch("idx")
            .setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .includeExclude(new IncludeExclude(null, new RegExp("val0[1-9]+.+"))))
            .execute().actionGet();
    assertSearchResponse(response);
    termsAgg = response.getAggregations().get("terms");
    assertThat(termsAgg, notNullValue());
    assertThat(termsAgg.getName(), equalTo("terms"));
    assertThat(termsAgg.getBuckets().size(), equalTo(10));
    for (int i = 0; i < 10; i++) {
        Terms.Bucket bucket = termsAgg.getBucketByKey("val00" + i);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val00" + i));
        assertThat(bucket.getDocCount(), equalTo(1L));
    }
}
Use of org.apache.lucene.util.automaton.RegExp in the lucene-solr project (Apache): class TestTermsEnum, method testIntersectStartTerm.
/**
 * Verifies that {@link Terms#intersect} seeks correctly relative to a supplied
 * start term against the automaton ".*d": resuming mid-enumeration, rewinding
 * when a ceiling label is missing on a later arc, and reaching the end.
 */
public void testIntersectStartTerm() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    // One doc per value, in this exact order, so docIDs are 0..3.
    for (String value : new String[] {"abc", "abd", "acd", "bcd"}) {
        Document doc = new Document();
        doc.add(newStringField("field", value, Field.Store.NO));
        w.addDocument(doc);
    }
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    LeafReader sub = getOnlyLeafReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
    TermsEnum te;
    // should seek to startTerm
    te = terms.intersect(ca, new BytesRef("aad"));
    String[] expectedTerms = {"abd", "acd", "bcd"};
    int[] expectedDocs = {1, 2, 3};
    for (int i = 0; i < expectedTerms.length; i++) {
        assertEquals(expectedTerms[i], te.next().utf8ToString());
        assertEquals(expectedDocs[i], te.postings(null, PostingsEnum.NONE).nextDoc());
    }
    assertNull(te.next());
    // should fail to find ceil label on second arc, rewind
    te = terms.intersect(ca, new BytesRef("add"));
    assertEquals("bcd", te.next().utf8ToString());
    assertEquals(3, te.postings(null, PostingsEnum.NONE).nextDoc());
    assertNull(te.next());
    // should reach end
    te = terms.intersect(ca, new BytesRef("bcd"));
    assertNull(te.next());
    te = terms.intersect(ca, new BytesRef("ddd"));
    assertNull(te.next());
    r.close();
    dir.close();
}
End of aggregated usage examples.