use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestSimplePatternTokenizer method testNotDeterminized.
/**
 * Verifies that constructing a {@code SimplePatternTokenizer} from an automaton with two
 * overlapping transitions out of the same state fails with {@link IllegalArgumentException}.
 */
public void testNotDeterminized() throws Exception {
  Automaton nfa = new Automaton();
  int initial = nfa.createState();
  int branchA = nfa.createState();
  int branchB = nfa.createState();
  int accept = nfa.createState();
  nfa.setAccept(accept, true);

  // Both branches leave the initial state on the same label range ('a'..'z'),
  // so the automaton is not deterministic.
  nfa.addTransition(initial, branchA, 'a', 'z');
  nfa.addTransition(initial, branchB, 'a', 'z');
  nfa.addTransition(branchA, accept, 'b');
  nfa.addTransition(branchB, accept, 'b');

  expectThrows(IllegalArgumentException.class, () -> new SimplePatternTokenizer(nfa));
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestTermsEnum method testIntersectEmptyString.
/**
 * Exercises {@code Terms.intersect} on a field whose terms include the empty string, first with
 * no start term and then with the empty string itself as the start term.
 */
public void testIntersectEmptyString() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
  config.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  Document first = new Document();
  first.add(newStringField("field", "", Field.Store.NO));
  first.add(newStringField("field", "abc", Field.Store.NO));
  writer.addDocument(first);

  Document second = new Document();
  // Add the empty string to both documents, so that singletonDocID == -1.
  // For an FST-based term dict, we expect the first arc to be
  // flagged with HAS_FINAL_OUTPUT.
  second.add(newStringField("field", "abc", Field.Store.NO));
  second.add(newStringField("field", "", Field.Store.NO));
  writer.addDocument(second);

  writer.forceMerge(1);
  DirectoryReader reader = writer.getReader();
  writer.close();

  Terms terms = getOnlyLeafReader(reader).fields().terms("field");

  // ".*" accepts every term.
  CompiledAutomaton acceptAll =
      new CompiledAutomaton(new RegExp(".*", RegExp.NONE).toAutomaton(), false, false);

  // No start term: both "" and "abc" are returned, each present in docs 0 and 1.
  TermsEnum matches = terms.intersect(acceptAll, null);
  assertEquals("", matches.next().utf8ToString());
  PostingsEnum postings = matches.postings(null, PostingsEnum.NONE);
  assertEquals(0, postings.nextDoc());
  assertEquals(1, postings.nextDoc());

  assertEquals("abc", matches.next().utf8ToString());
  postings = matches.postings(null, PostingsEnum.NONE);
  assertEquals(0, postings.nextDoc());
  assertEquals(1, postings.nextDoc());
  assertNull(matches.next());

  // Start term is the empty string: only "abc" is returned.
  matches = terms.intersect(acceptAll, new BytesRef(""));
  assertEquals("abc", matches.next().utf8ToString());
  postings = matches.postings(null, PostingsEnum.NONE);
  assertEquals(0, postings.nextDoc());
  assertEquals(1, postings.nextDoc());
  assertNull(matches.next());

  reader.close();
  directory.close();
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestTermsEnum method testIntersectBasic.
/**
 * Exercises {@code Terms.intersect} with an accept-all automaton over three single-term
 * documents, with no start term and with start terms "abc" and "aaa".
 */
public void testIntersectBasic() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
  config.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  Document document = new Document();
  document.add(newTextField("field", "aaa", Field.Store.NO));
  writer.addDocument(document);

  document = new Document();
  document.add(newStringField("field", "bbb", Field.Store.NO));
  writer.addDocument(document);

  document = new Document();
  document.add(newTextField("field", "ccc", Field.Store.NO));
  writer.addDocument(document);

  writer.forceMerge(1);
  DirectoryReader reader = writer.getReader();
  writer.close();

  Terms terms = getOnlyLeafReader(reader).fields().terms("field");
  CompiledAutomaton acceptAll =
      new CompiledAutomaton(new RegExp(".*", RegExp.NONE).toAutomaton(), false, false);

  // The term at index i was indexed in document i.
  String[] expectedTerms = {"aaa", "bbb", "ccc"};

  // No start term: every term comes back in order.
  TermsEnum matches = terms.intersect(acceptAll, null);
  for (int docID = 0; docID < expectedTerms.length; docID++) {
    assertEquals(expectedTerms[docID], matches.next().utf8ToString());
    assertEquals(docID, matches.postings(null, PostingsEnum.NONE).nextDoc());
  }
  assertNull(matches.next());

  // With start term "abc" (not indexed) or "aaa" (indexed), enumeration begins at "bbb".
  for (String startTerm : new String[] {"abc", "aaa"}) {
    matches = terms.intersect(acceptAll, new BytesRef(startTerm));
    for (int docID = 1; docID < expectedTerms.length; docID++) {
      assertEquals(expectedTerms[docID], matches.next().utf8ToString());
      assertEquals(docID, matches.postings(null, PostingsEnum.NONE).nextDoc());
    }
    assertNull(matches.next());
  }

  reader.close();
  directory.close();
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TermAutomatonQuery method finish.
/**
 * Call this once you are done adding states/transitions. Builds the automaton from the
 * builder, validates and rewrites ANY transitions when an ANY term id is set, and stores the
 * determinized result (with dead states removed) in {@code det}.
 *
 * @param maxDeterminizedStates Maximum number of states created when
 *   determinizing the automaton. Higher numbers allow this operation to
 *   consume more memory but allow more complex automatons.
 * @throws IllegalStateException if the automaton leads or ends with an ANY transition, or if
 *   the determinized automaton accepts the empty string
 */
public void finish(int maxDeterminizedStates) {
  Automaton automaton = builder.finish();

  // Reused cursor for every transition iteration below.
  Transition scratch = new Transition();

  if (anyTermID != -1) {
    // Reject a leading ANY: no transition out of state 0 may cover anyTermID.
    int outgoing = automaton.initTransition(0, scratch);
    for (int k = 0; k < outgoing; k++) {
      automaton.getNextTransition(scratch);
      if (scratch.min <= anyTermID && anyTermID <= scratch.max) {
        throw new IllegalStateException("automaton cannot lead with an ANY transition");
      }
    }

    // Reject a trailing ANY: no transition into an accept state may cover anyTermID.
    int stateCount = automaton.getNumStates();
    for (int state = 0; state < stateCount; state++) {
      outgoing = automaton.initTransition(state, scratch);
      for (int k = 0; k < outgoing; k++) {
        automaton.getNextTransition(scratch);
        boolean coversAny = scratch.min <= anyTermID && anyTermID <= scratch.max;
        if (automaton.isAccept(scratch.dest) && coversAny) {
          throw new IllegalStateException("automaton cannot end with an ANY transition");
        }
      }
    }

    int termCount = termToID.size();

    // Copy the automaton, widening each ANY transition to the full term id range
    // [0, termCount - 1] so that it also matches all other terms.
    Automaton widened = new Automaton();
    for (int state = 0; state < stateCount; state++) {
      widened.createState();
      widened.setAccept(state, automaton.isAccept(state));
    }
    for (int state = 0; state < stateCount; state++) {
      outgoing = automaton.initTransition(state, scratch);
      for (int k = 0; k < outgoing; k++) {
        automaton.getNextTransition(scratch);
        boolean coversAny = scratch.min <= anyTermID && anyTermID <= scratch.max;
        int lo = coversAny ? 0 : scratch.min;
        int hi = coversAny ? termCount - 1 : scratch.max;
        widened.addTransition(scratch.source, scratch.dest, lo, hi);
      }
    }
    widened.finishState();
    automaton = widened;
  }

  det = Operations.removeDeadStates(Operations.determinize(automaton, maxDeterminizedStates));

  // An accepting initial state means the automaton matches the empty string.
  if (det.isAccept(0)) {
    throw new IllegalStateException("cannot accept the empty string");
  }
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestReversedWildcardFilterFactory method wasReversed.
/** fragile assert: depends on our implementation, but cleanest way to check for now */
private boolean wasReversed(SolrQueryParser qp, String query) throws Exception {
Query q = qp.parse(query);
if (!(q instanceof AutomatonQuery)) {
return false;
}
Automaton automaton = ((AutomatonQuery) q).getAutomaton();
String prefix = Operations.getCommonPrefix(Operations.determinize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
return prefix.length() > 0 && prefix.charAt(0) == '';
}
Aggregations