Use of org.apache.lucene.util.automaton.CompiledAutomaton in the lucene-solr project by Apache.
Class TestTermsEnum, method testIntersectBasic.
/**
 * Indexes three single-term documents ("aaa", "bbb", "ccc"), force-merges them into one
 * segment, and verifies that intersecting the field's terms with the automaton for the
 * regexp ".*" returns the expected terms and doc IDs: first with no start term, then with
 * the start terms "abc" and "aaa".
 */
public void testIntersectBasic() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
  config.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  Document first = new Document();
  first.add(newTextField("field", "aaa", Field.Store.NO));
  writer.addDocument(first);

  Document second = new Document();
  second.add(newStringField("field", "bbb", Field.Store.NO));
  writer.addDocument(second);

  Document third = new Document();
  third.add(newTextField("field", "ccc", Field.Store.NO));
  writer.addDocument(third);

  writer.forceMerge(1);
  DirectoryReader reader = writer.getReader();
  writer.close();

  Terms fieldTerms = getOnlyLeafReader(reader).fields().terms("field");
  CompiledAutomaton compiled =
      new CompiledAutomaton(new RegExp(".*", RegExp.NONE).toAutomaton(), false, false);

  // The assertions expect the term at index i to be found in doc i.
  String[] indexedTerms = {"aaa", "bbb", "ccc"};

  // No start term: all three terms come back in order.
  TermsEnum termsEnum = fieldTerms.intersect(compiled, null);
  for (int docID = 0; docID < indexedTerms.length; docID++) {
    assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
    assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
  }
  assertNull(termsEnum.next());

  // Start term "abc" (not indexed) and "aaa" (indexed): both resume at "bbb".
  for (String startTerm : new String[] {"abc", "aaa"}) {
    termsEnum = fieldTerms.intersect(compiled, new BytesRef(startTerm));
    for (int docID = 1; docID < indexedTerms.length; docID++) {
      assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
      assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
    }
    assertNull(termsEnum.next());
  }

  reader.close();
  directory.close();
}
Use of org.apache.lucene.util.automaton.CompiledAutomaton in the lucene-solr project by Apache.
Class FuzzyTermsEnum, method getAutomatonEnum.
/**
 * Returns an automaton-based enum for the compiled automaton stored at index
 * {@code editDistance}.
 *
 * @param editDistance index into {@code automata}; asserted to be below its length
 * @param lastTerm the term reached by a previously pulled enum, or {@code null} when this is
 *     the first enum being pulled
 * @return the result of intersecting {@code terms} with the selected automaton, started from
 *     {@code null} when {@code lastTerm} is {@code null}, otherwise from
 *     {@code compiled.floor(lastTerm, ...)}
 */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException {
  assert editDistance < automata.length;
  final CompiledAutomaton compiled = automata[editDistance];
  // With no previous term this is the first enum we pull; otherwise we are switching enums
  // (e.g., to ed=1) after iterating for a while already (e.g., with ed=2).
  final BytesRef initialSeekTerm =
      lastTerm == null ? null : compiled.floor(lastTerm, new BytesRefBuilder());
  return terms.intersect(compiled, initialSeekTerm);
}
Use of org.apache.lucene.util.automaton.CompiledAutomaton in the lucene-solr project by Apache.
Class TestTermsEnum, method testIntersectStartTerm.
/**
 * Indexes the terms "abc", "abd", "acd" and "bcd" (one per document), force-merges them into
 * one segment, and verifies which terms the automaton for the regexp ".*d" returns when the
 * intersection is started from several different start terms.
 */
public void testIntersectStartTerm() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
  config.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  // The assertions below expect the term at index i to be found in doc i.
  String[] indexedTerms = {"abc", "abd", "acd", "bcd"};
  for (String value : indexedTerms) {
    Document doc = new Document();
    doc.add(newStringField("field", value, Field.Store.NO));
    writer.addDocument(doc);
  }

  writer.forceMerge(1);
  DirectoryReader reader = writer.getReader();
  writer.close();

  Terms fieldTerms = getOnlyLeafReader(reader).fields().terms("field");
  CompiledAutomaton compiled =
      new CompiledAutomaton(new RegExp(".*d", RegExp.NONE).toAutomaton(), false, false);

  // should seek to startTerm
  TermsEnum termsEnum = fieldTerms.intersect(compiled, new BytesRef("aad"));
  for (int docID = 1; docID < indexedTerms.length; docID++) {
    assertEquals(indexedTerms[docID], termsEnum.next().utf8ToString());
    assertEquals(docID, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
  }
  assertNull(termsEnum.next());

  // should fail to find ceil label on second arc, rewind
  termsEnum = fieldTerms.intersect(compiled, new BytesRef("add"));
  assertEquals("bcd", termsEnum.next().utf8ToString());
  assertEquals(3, termsEnum.postings(null, PostingsEnum.NONE).nextDoc());
  assertNull(termsEnum.next());

  // should reach end
  for (String startTerm : new String[] {"bcd", "ddd"}) {
    termsEnum = fieldTerms.intersect(compiled, new BytesRef(startTerm));
    assertNull(termsEnum.next());
  }

  reader.close();
  directory.close();
}
Use of org.apache.lucene.util.automaton.CompiledAutomaton in the lucene-solr project by Apache.
Class TestTermsEnum, method testIntersectRandom.
/**
 * Randomized test of Terms.intersect.
 *
 * Indexes a set of unique random terms, then for several iterations builds an automaton from
 * a random mix of indexed terms and other random strings. For each automaton it intersects
 * the terms of field "f" from many start terms and checks that the enum returns, in sorted
 * order, exactly the indexed terms that are also in the accept set, and that each term's
 * first posting maps back to the id recorded for it in termToID.
 */
public void testIntersectRandom() throws IOException {
final Directory dir = newDirectory();
final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
final int numTerms = atLeast(300);
//final int numTerms = 50;
// All unique terms generated so far.
final Set<String> terms = new HashSet<>();
// Terms generated since the last addDoc call.
// NOTE(review): addDoc is defined elsewhere; presumably it indexes and clears pendingTerms
// and records each term's id in termToID -- confirm against the helper.
final Collection<String> pendingTerms = new ArrayList<>();
final Map<BytesRef, Integer> termToID = new HashMap<>();
int id = 0;
while (terms.size() != numTerms) {
final String s = getRandomString();
if (!terms.contains(s)) {
terms.add(s);
pendingTerms.add(s);
// Flush the pending terms into a document on roughly 1 in 20 new terms.
if (random().nextInt(20) == 7) {
addDoc(w, pendingTerms, termToID, id++);
}
}
}
// Flush whatever is still pending.
addDoc(w, pendingTerms, termToID, id++);
// Sorted array of all indexed terms (expected enumeration order) plus a set for lookups.
final BytesRef[] termsArray = new BytesRef[terms.size()];
final Set<BytesRef> termsSet = new HashSet<>();
{
int upto = 0;
for (String s : terms) {
final BytesRef b = new BytesRef(s);
termsArray[upto++] = b;
termsSet.add(b);
}
Arrays.sort(termsArray);
}
if (VERBOSE) {
System.out.println("\nTEST: indexed terms (unicode order):");
for (BytesRef t : termsArray) {
System.out.println(" " + t.utf8ToString() + " -> id:" + termToID.get(t));
}
}
final IndexReader r = w.getReader();
w.close();
// Map each Lucene docID to the value of its "id" numeric doc values field; every doc is
// expected to have a value (nextDoc() must return i for each i).
int[] docIDToID = new int[r.maxDoc()];
NumericDocValues values = MultiDocValues.getNumericValues(r, "id");
for (int i = 0; i < r.maxDoc(); i++) {
assertEquals(i, values.nextDoc());
docIDToID[i] = (int) values.longValue();
}
for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {
// TODO: can we also test infinite As here...?
// From the random terms, pick some ratio and compile an
// automaton:
final Set<String> acceptTerms = new HashSet<>();
final TreeSet<BytesRef> sortedAcceptTerms = new TreeSet<>();
final double keepPct = random().nextDouble();
Automaton a;
if (iter == 0) {
// First iteration: empty automaton, so acceptTerms stays empty and no term is expected.
if (VERBOSE) {
System.out.println("\nTEST: empty automaton");
}
a = Automata.makeEmpty();
} else {
if (VERBOSE) {
System.out.println("\nTEST: keepPct=" + keepPct);
}
// For each indexed term, accept either the term itself (with probability about keepPct)
// or a new random string in its place.
for (String s : terms) {
final String s2;
if (random().nextDouble() <= keepPct) {
s2 = s;
} else {
s2 = getRandomString();
}
acceptTerms.add(s2);
sortedAcceptTerms.add(new BytesRef(s2));
}
a = Automata.makeStringUnion(sortedAcceptTerms);
}
// NOTE(review): the positional arguments are presumably (finite, simplify,
// maxDeterminizedStates, isBinary) -- confirm against the CompiledAutomaton constructor.
final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
// Sorted array and lookup set of accepted terms; also sanity-check that the compiled
// automaton accepts every one of them.
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
final Set<BytesRef> acceptTermsSet = new HashSet<>();
int upto = 0;
for (String s : acceptTerms) {
final BytesRef b = new BytesRef(s);
acceptTermsArray[upto++] = b;
acceptTermsSet.add(b);
assertTrue(accepts(c, b));
}
Arrays.sort(acceptTermsArray);
if (VERBOSE) {
System.out.println("\nTEST: accept terms (unicode order):");
for (BytesRef t : acceptTermsArray) {
System.out.println(" " + t.utf8ToString() + (termsSet.contains(t) ? " (exists)" : ""));
}
System.out.println(a.toDot());
}
for (int iter2 = 0; iter2 < 100; iter2++) {
// Start term is null (when there are no accept terms, or on a coin flip) or a random
// accepted term, which may or may not exist in the index.
final BytesRef startTerm = acceptTermsArray.length == 0 || random().nextBoolean() ? null : acceptTermsArray[random().nextInt(acceptTermsArray.length)];
if (VERBOSE) {
System.out.println("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.utf8ToString()));
if (startTerm != null) {
// Trace the start term through the run automaton byte by byte; no step may fail.
int state = 0;
for (int idx = 0; idx < startTerm.length; idx++) {
final int label = startTerm.bytes[startTerm.offset + idx] & 0xff;
System.out.println(" state=" + state + " label=" + label);
state = c.runAutomaton.step(state, label);
assertTrue(state != -1);
}
System.out.println(" state=" + state);
}
}
// NOTE(review): field "f" is presumably the field addDoc indexes the terms into -- confirm.
final TermsEnum te = MultiFields.getTerms(r, "f").intersect(c, startTerm);
// loc = index in termsArray of the first term the enum is expected to return.
int loc;
if (startTerm == null) {
loc = 0;
} else {
loc = Arrays.binarySearch(termsArray, BytesRef.deepCopyOf(startTerm));
if (loc < 0) {
// Not found: convert the encoded insertion point into the index of the next larger term.
loc = -(loc + 1);
} else {
// startTerm exists in index
// Expected results begin strictly after it, so skip it.
loc++;
}
}
// Advance to the first indexed term that is also accepted.
while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc])) {
loc++;
}
PostingsEnum postingsEnum = null;
while (loc < termsArray.length) {
final BytesRef expected = termsArray[loc];
final BytesRef actual = te.next();
if (VERBOSE) {
System.out.println("TEST: next() expected=" + expected.utf8ToString() + " actual=" + (actual == null ? "null" : actual.utf8ToString()));
}
assertEquals(expected, actual);
assertEquals(1, te.docFreq());
postingsEnum = TestUtil.docs(random(), te, postingsEnum, PostingsEnum.NONE);
final int docID = postingsEnum.nextDoc();
assertTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
// The document found for this term must carry the id recorded for the term.
assertEquals(docIDToID[docID], termToID.get(expected).intValue());
// Move on to the next indexed term that is also accepted.
do {
loc++;
} while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc]));
}
// Once all expected terms are consumed the enum must be exhausted.
assertNull(te.next());
}
}
r.close();
dir.close();
}
Use of org.apache.lucene.util.automaton.CompiledAutomaton in the lucene-solr project by Apache.
Class TestTermsEnum, method testIntersectRegexp.
// LUCENE-7576
/**
 * Verifies that calling Terms.intersect with the automaton compiled from the regexp
 * "do_not_match_anything" throws an IllegalArgumentException whose message directs the
 * caller to CompiledAutomaton.getTermsEnum.
 */
public void testIntersectRegexp() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);

  Document document = new Document();
  document.add(newStringField("field", "foobar", Field.Store.NO));
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  Fields allFields = MultiFields.getFields(reader);
  CompiledAutomaton compiled =
      new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
  Terms fieldTerms = allFields.terms("field");

  IllegalArgumentException thrown = expectThrows(IllegalArgumentException.class, () -> {
    fieldTerms.intersect(compiled, null);
  });
  assertEquals("please use CompiledAutomaton.getTermsEnum instead", thrown.getMessage());

  reader.close();
  writer.close();
  directory.close();
}
Aggregations