Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.
In the class AnalyzingSuggester, method replaceSep:
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private Automaton replaceSep(Automaton a) {
  int numStates = a.getNumStates();
  Automaton.Builder result = new Automaton.Builder(numStates, a.getNumTransitions());
  // Copy all states over
  result.copyStates(a);
  // Go in reverse topo sort so we know we only have to
  // make one pass:
  Transition t = new Transition();
  int[] topoSortStates = Operations.topoSortStates(a);
  for (int i = 0; i < topoSortStates.length; i++) {
    int state = topoSortStates[topoSortStates.length - 1 - i];
    int count = a.initTransition(state, t);
    for (int j = 0; j < count; j++) {
      a.getNextTransition(t);
      if (t.min == TokenStreamToAutomaton.POS_SEP) {
        assert t.max == TokenStreamToAutomaton.POS_SEP;
        if (preserveSep) {
          // Remap to SEP_LABEL:
          result.addTransition(state, t.dest, SEP_LABEL);
        } else {
          result.addEpsilon(state, t.dest);
        }
      } else if (t.min == TokenStreamToAutomaton.HOLE) {
        assert t.max == TokenStreamToAutomaton.HOLE;
        // Just remove the hole: there will then be two
        // SEP tokens next to each other, which will only
        // match another hole at search time. Note that
        // it will also match an empty-string token ... if
        // that's somehow a problem we can always map HOLE
        // to a dedicated byte (and escape it in the
        // input).
        result.addEpsilon(state, t.dest);
      } else {
        result.addTransition(state, t.dest, t.min, t.max);
      }
    }
  }
  return result.finish();
}
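Every example on this page uses the same iteration idiom: a single reusable Transition object, initTransition(state, t) to get the number of outgoing transitions of a state, and getNextTransition(t) to fill t.source, t.dest, t.min and t.max for each one. A minimal standalone sketch of that pattern (not from the Lucene sources; dumpTransitions and the example automaton are illustrative only):

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Transition;

// Prints every labeled transition of the given automaton.
static void dumpTransitions(Automaton a) {
  Transition t = new Transition();            // reused across all states and transitions
  for (int state = 0; state < a.getNumStates(); state++) {
    int count = a.initTransition(state, t);   // number of outgoing transitions of this state
    for (int i = 0; i < count; i++) {
      a.getNextTransition(t);                 // fills t.source, t.dest, t.min, t.max
      System.out.println(t.source + " -> " + t.dest + " [" + t.min + ".." + t.max + "]");
    }
  }
}

// e.g. dumpTransitions(Automata.makeString("ab"));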
Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.
In the class TestSynonymGraphFilter, method topoSort:
/*
private String toDot(TokenStream ts) throws IOException {
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
  ts.reset();
  int srcNode = -1;
  int destNode = -1;
  StringBuilder b = new StringBuilder();
  b.append("digraph Automaton {\n");
  b.append(" rankdir = LR\n");
  b.append(" node [width=0.2, height=0.2, fontsize=8]\n");
  b.append(" initial [shape=plaintext,label=\"\"]\n");
  b.append(" initial -> 0\n");
  while (ts.incrementToken()) {
    int posInc = posIncAtt.getPositionIncrement();
    if (posInc != 0) {
      srcNode += posInc;
      b.append(" ");
      b.append(srcNode);
      b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
    }
    destNode = srcNode + posLenAtt.getPositionLength();
    b.append(" ");
    b.append(srcNode);
    b.append(" -> ");
    b.append(destNode);
    b.append(" [label=\"");
    b.append(termAtt);
    b.append("\"");
    if (typeAtt.type().equals("word") == false) {
      b.append(" color=red");
    }
    b.append("]\n");
  }
  ts.end();
  ts.close();
  b.append('}');
  return b.toString();
}
*/

/** Renumbers nodes according to their topo sort */
private Automaton topoSort(Automaton in) {
  int[] newToOld = Operations.topoSortStates(in);
  int[] oldToNew = new int[newToOld.length];
  Automaton.Builder a = new Automaton.Builder();
  //System.out.println("remap:");
  for (int i = 0; i < newToOld.length; i++) {
    a.createState();
    oldToNew[newToOld[i]] = i;
    //System.out.println(" " + newToOld[i] + " -> " + i);
    if (in.isAccept(newToOld[i])) {
      a.setAccept(i, true);
      //System.out.println(" **");
    }
  }
  Transition t = new Transition();
  for (int i = 0; i < newToOld.length; i++) {
    int count = in.initTransition(newToOld[i], t);
    for (int j = 0; j < count; j++) {
      in.getNextTransition(t);
      a.addTransition(i, oldToNew[t.dest], t.min, t.max);
    }
  }
  return a.finish();
}
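For reference, Operations.topoSortStates returns an int[] whose index is the new (topologically ordered) state id and whose value is the original state id, which is why the helper above inverts it into oldToNew. A tiny hand-built illustration (not part of the test; the automaton and variable names are made up):

Automaton.Builder b = new Automaton.Builder();
int s0 = b.createState();
int s1 = b.createState();
int s2 = b.createState();
b.setAccept(s2, true);
b.addTransition(s0, s1, 'a');
b.addTransition(s1, s2, 'b');
Automaton linear = b.finish();

// newToOld[i] is the original id of the i-th state in topo order;
// for this already-ordered chain it should simply be {0, 1, 2}.
int[] newToOld = Operations.topoSortStates(linear);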
Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.
In the class TestSynonymGraphFilter, method accepts:
/** Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. */
private static boolean accepts(Automaton a, IntsRef path) {
  Set<Integer> states = new HashSet<>();
  states.add(0);
  Transition t = new Transition();
  for (int i = 0; i < path.length; i++) {
    int digit = path.ints[path.offset + i];
    Set<Integer> nextStates = new HashSet<>();
    for (int state : states) {
      int count = a.initTransition(state, t);
      for (int j = 0; j < count; j++) {
        a.getNextTransition(t);
        if (digit >= t.min && digit <= t.max) {
          nextStates.add(t.dest);
        }
      }
    }
    states = nextStates;
    if (states.isEmpty()) {
      return false;
    }
  }
  for (int state : states) {
    if (a.isAccept(state)) {
      return true;
    }
  }
  return false;
}
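Hypothetical usage of this helper inside the test class (the automaton and labels are illustrative; the path holds one label per step, and matching always starts from state 0):

Automaton a = Automata.makeString("ab");   // deterministic here, but accepts() also handles NFAs
IntsRefBuilder path = new IntsRefBuilder();
path.append('a');
path.append('b');
assertTrue(accepts(a, path.get()));        // "ab" is accepted
path.append('b');
assertFalse(accepts(a, path.get()));       // "abb" runs past the accept state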
Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.
In the class TestTermAutomatonQuery, method testRandom:
public void testRandom() throws Exception {
  int numDocs = atLeast(100);
  Directory dir = newDirectory();
  // Adds occasional random synonyms:
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
      tokenizer.setEnableChecks(true);
      TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      filt = new RandomSynonymFilter(filt);
      return new TokenStreamComponents(tokenizer, filt);
    }
  };
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    int numTokens = atLeast(10);
    StringBuilder sb = new StringBuilder();
    for (int j = 0; j < numTokens; j++) {
      sb.append(' ');
      sb.append((char) (97 + random().nextInt(3)));
    }
    String contents = sb.toString();
    doc.add(newTextField("field", contents, Field.Store.NO));
    doc.add(new StoredField("id", "" + i));
    if (VERBOSE) {
      System.out.println(" doc " + i + " -> " + contents);
    }
    w.addDocument(doc);
  }
  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);
  // Used to match ANY using MultiPhraseQuery:
  Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
  int numIters = atLeast(1000);
  for (int iter = 0; iter < numIters; iter++) {
    // Build the (finite, no any transitions) TermAutomatonQuery and
    // also the "equivalent" BooleanQuery and make sure they match the
    // same docs:
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    int count = TestUtil.nextInt(random(), 1, 5);
    Set<BytesRef> strings = new HashSet<>();
    for (int i = 0; i < count; i++) {
      StringBuilder sb = new StringBuilder();
      int numTokens = TestUtil.nextInt(random(), 1, 5);
      for (int j = 0; j < numTokens; j++) {
        if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
          sb.append('*');
        } else {
          sb.append((char) (97 + random().nextInt(3)));
        }
      }
      String string = sb.toString();
      MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
      for (int j = 0; j < string.length(); j++) {
        if (string.charAt(j) == '*') {
          mpqb.add(allTerms);
        } else {
          mpqb.add(new Term("field", "" + string.charAt(j)));
        }
      }
      bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
      strings.add(new BytesRef(string));
    }
    List<BytesRef> stringsList = new ArrayList<>(strings);
    Collections.sort(stringsList);
    Automaton a = Automata.makeStringUnion(stringsList);
    // Translate automaton to query:
    TermAutomatonQuery q = new TermAutomatonQuery("field");
    int numStates = a.getNumStates();
    for (int i = 0; i < numStates; i++) {
      q.createState();
      q.setAccept(i, a.isAccept(i));
    }
    Transition t = new Transition();
    for (int i = 0; i < numStates; i++) {
      int transCount = a.initTransition(i, t);
      for (int j = 0; j < transCount; j++) {
        a.getNextTransition(t);
        for (int label = t.min; label <= t.max; label++) {
          if ((char) label == '*') {
            q.addAnyTransition(t.source, t.dest);
          } else {
            q.addTransition(t.source, t.dest, "" + (char) label);
          }
        }
      }
    }
    q.finish();
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
      for (BytesRef string : stringsList) {
        System.out.println(" string: " + string.utf8ToString());
      }
      System.out.println(q.toDot());
    }
    Query q1 = q;
    Query q2 = bq.build();
    if (random().nextInt(5) == 1) {
      if (VERBOSE) {
        System.out.println(" use random filter");
      }
      RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
      q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
      q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
    }
    TopDocs hits1 = s.search(q1, numDocs);
    TopDocs hits2 = s.search(q2, numDocs);
    Set<String> hits1Docs = toDocIDs(s, hits1);
    Set<String> hits2Docs = toDocIDs(s, hits2);
    try {
      assertEquals(hits2.totalHits, hits1.totalHits);
      assertEquals(hits2Docs, hits1Docs);
    } catch (AssertionError ae) {
      System.out.println("FAILED:");
      for (String id : hits1Docs) {
        if (hits2Docs.contains(id) == false) {
          System.out.println(String.format(Locale.ROOT, " id=%3s matched but should not have", id));
        }
      }
      for (String id : hits2Docs) {
        if (hits1Docs.contains(id) == false) {
          System.out.println(String.format(Locale.ROOT, " id=%3s did not match but should have", id));
        }
      }
      throw ae;
    }
  }
  IOUtils.close(w, r, dir, analyzer);
}
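Stripped of the randomization, the translation above uses only a handful of TermAutomatonQuery methods: createState(), setAccept(), addTransition()/addAnyTransition() and finish(). A minimal hand-built sketch (not from the test; states and terms are illustrative) for a query that matches the term "a" followed by either "b" or "c" in field "field":

TermAutomatonQuery q = new TermAutomatonQuery("field");
int init = q.createState();
int mid = q.createState();
int end = q.createState();
q.setAccept(end, true);
q.addTransition(init, mid, "a");   // position 0: "a"
q.addTransition(mid, end, "b");    // position 1: "b" ...
q.addTransition(mid, end, "c");    // ... or "c"
q.finish();                        // finalizes the automaton; must be called before the query is used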
Use of org.apache.lucene.util.automaton.Transition in project lucene-solr by Apache.
In the class TermAutomatonQuery, method rewrite:
public Query rewrite(IndexReader reader) throws IOException {
  if (Operations.isEmpty(det)) {
    return new MatchNoDocsQuery();
  }
  IntsRef single = Operations.getSingleton(det);
  if (single != null && single.length == 1) {
    return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
  }
  // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery?
  // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
  MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
  PhraseQuery.Builder pq = new PhraseQuery.Builder();
  Transition t = new Transition();
  int state = 0;
  int pos = 0;
  query: while (true) {
    int count = det.initTransition(state, t);
    if (count == 0) {
      if (det.isAccept(state) == false) {
        mpq = null;
        pq = null;
      }
      break;
    } else if (det.isAccept(state)) {
      mpq = null;
      pq = null;
      break;
    }
    int dest = -1;
    List<Term> terms = new ArrayList<>();
    boolean matchesAny = false;
    for (int i = 0; i < count; i++) {
      det.getNextTransition(t);
      if (i == 0) {
        dest = t.dest;
      } else if (dest != t.dest) {
        mpq = null;
        pq = null;
        break query;
      }
      matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
      if (matchesAny == false) {
        for (int termID = t.min; termID <= t.max; termID++) {
          terms.add(new Term(field, idToTerm.get(termID)));
        }
      }
    }
    if (matchesAny == false) {
      mpq.add(terms.toArray(new Term[terms.size()]), pos);
      if (pq != null) {
        if (terms.size() == 1) {
          pq.add(terms.get(0), pos);
        } else {
          pq = null;
        }
      }
    }
    state = dest;
    pos++;
  }
  if (pq != null) {
    return pq.build();
  } else if (mpq != null) {
    return mpq.build();
  }
  // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
  return this;
}
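For intuition, the "sausage" case above is a chain in which, at every position, all outgoing transitions of the current state lead to the same destination state and none of them covers anyTermID. A query whose first position is the single term "a" and whose second position is "b" or "c" fits that shape; under the loop above it would end up as roughly the following MultiPhraseQuery (a sketch of the equivalent query, not code taken from the class). A chain with exactly one term per position becomes a plain PhraseQuery instead, and any branching to different destination states falls through to return this.

MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
mpq.add(new Term[] { new Term("field", "a") }, 0);                           // position 0
mpq.add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 1);   // position 1
Query rewritten = mpq.build();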