use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestSynonymGraphFilter method topoSort.
/*
private String toDot(TokenStream ts) throws IOException {
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
ts.reset();
int srcNode = -1;
int destNode = -1;
StringBuilder b = new StringBuilder();
b.append("digraph Automaton {\n");
b.append(" rankdir = LR\n");
b.append(" node [width=0.2, height=0.2, fontsize=8]\n");
b.append(" initial [shape=plaintext,label=\"\"]\n");
b.append(" initial -> 0\n");
while (ts.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
if (posInc != 0) {
srcNode += posInc;
b.append(" ");
b.append(srcNode);
b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
}
destNode = srcNode + posLenAtt.getPositionLength();
b.append(" ");
b.append(srcNode);
b.append(" -> ");
b.append(destNode);
b.append(" [label=\"");
b.append(termAtt);
b.append("\"");
if (typeAtt.type().equals("word") == false) {
b.append(" color=red");
}
b.append("]\n");
}
ts.end();
ts.close();
b.append('}');
return b.toString();
}
*/
/**
 * Builds a copy of {@code in} whose states are renumbered to follow the order
 * returned by {@code Operations.topoSortStates(in)}.
 */
private Automaton topoSort(Automaton in) {
  final int[] sortedStates = Operations.topoSortStates(in);
  final int stateCount = sortedStates.length;
  final int[] renumbered = new int[stateCount];
  final Automaton.Builder builder = new Automaton.Builder();

  // Pass 1: create one new state per old state, remember the old -> new
  // numbering, and carry over the accept flag.
  for (int newState = 0; newState < stateCount; newState++) {
    final int oldState = sortedStates[newState];
    builder.createState();
    renumbered[oldState] = newState;
    if (in.isAccept(oldState)) {
      builder.setAccept(newState, true);
    }
  }

  // Pass 2: copy every transition, rewriting source and destination through
  // the new numbering. This runs after pass 1 so that the mapping is complete
  // for every destination state.
  final Transition scratch = new Transition();
  for (int newState = 0; newState < stateCount; newState++) {
    int remaining = in.initTransition(sortedStates[newState], scratch);
    while (remaining-- > 0) {
      in.getNextTransition(scratch);
      builder.addTransition(newState, renumbered[scratch.dest], scratch.min, scratch.max);
    }
  }
  return builder.finish();
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestIndexWriter method testStopwordsPosIncHole2.
// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
  // Two stop filters are chained here: MockTokenFilter.ENGLISH_STOPSET first,
  // then a second filter driven by an automaton built from "foobar".
  Directory directory = newDirectory();
  final Automaton extraStopWords = Automata.makeString("foobar");
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer();
      TokenStream firstPass = new MockTokenFilter(source, MockTokenFilter.ENGLISH_STOPSET);
      TokenStream secondPass = new MockTokenFilter(firstPass, new CharacterRunAutomaton(extraStopWords));
      return new TokenStreamComponents(source, secondPass);
    }
  };

  // Index a single document with two values for the same field.
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, analyzer);
  Document document = new Document();
  document.add(new TextField("body", "just a foobar", Field.Store.NO));
  document.add(new TextField("body", "test of gaps", Field.Store.NO));
  writer.addDocument(document);
  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  // body:"just ? ? test"
  PhraseQuery phrase = new PhraseQuery.Builder()
      .add(new Term("body", "just"), 0)
      .add(new Term("body", "test"), 3)
      .build();
  assertEquals(1, searcher.search(phrase, 5).totalHits);

  reader.close();
  directory.close();
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class TestTermAutomatonQuery method testRandom.
/**
 * Randomized equivalence test for {@code TermAutomatonQuery}.
 *
 * <p>Indexes documents made of random single-letter tokens (a, b, c), then on
 * each iteration builds a random set of short patterns and expresses them two
 * ways: as a {@code TermAutomatonQuery} derived from the string-union
 * automaton, and as a {@code BooleanQuery} of SHOULD {@code MultiPhraseQuery}
 * clauses. Both queries must report the same hit count and the same id set
 * (as produced by {@code toDocIDs}).
 *
 * <p>NOTE(review): the outcome depends on the order of {@code random()} calls;
 * keep statement order intact when editing.
 */
public void testRandom() throws Exception {
int numDocs = atLeast(100);
Directory dir = newDirectory();
// Adds occasional random synonyms:
Analyzer analyzer = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
tokenizer.setEnableChecks(true);
TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
filt = new RandomSynonymFilter(filt);
return new TokenStreamComponents(tokenizer, filt);
}
};
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
// Index numDocs documents, each a space-separated run of random tokens.
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
int numTokens = atLeast(10);
StringBuilder sb = new StringBuilder();
for (int j = 0; j < numTokens; j++) {
sb.append(' ');
// 97 is 'a', so every token is one of "a", "b" or "c".
sb.append((char) (97 + random().nextInt(3)));
}
String contents = sb.toString();
doc.add(newTextField("field", contents, Field.Store.NO));
// Stored loop index, used to identify the document in the results.
doc.add(new StoredField("id", "" + i));
if (VERBOSE) {
System.out.println(" doc " + i + " -> " + contents);
}
w.addDocument(doc);
}
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
// Used to match ANY using MultiPhraseQuery:
Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
int numIters = atLeast(1000);
for (int iter = 0; iter < numIters; iter++) {
// Build the (finite, no any transitions) TermAutomatonQuery and
// also the "equivalent" BooleanQuery and make sure they match the
// same docs:
BooleanQuery.Builder bq = new BooleanQuery.Builder();
int count = TestUtil.nextInt(random(), 1, 5);
// A set, so duplicate random patterns collapse to one automaton input.
Set<BytesRef> strings = new HashSet<>();
for (int i = 0; i < count; i++) {
StringBuilder sb = new StringBuilder();
int numTokens = TestUtil.nextInt(random(), 1, 5);
for (int j = 0; j < numTokens; j++) {
// '*' (match any term) is only ever placed at an interior position,
// never first or last in the pattern.
if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
sb.append('*');
} else {
sb.append((char) (97 + random().nextInt(3)));
}
}
String string = sb.toString();
// Reference query: one MultiPhraseQuery per pattern, where a '*'
// position accepts any of the three possible terms.
MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
for (int j = 0; j < string.length(); j++) {
if (string.charAt(j) == '*') {
mpqb.add(allTerms);
} else {
mpqb.add(new Term("field", "" + string.charAt(j)));
}
}
bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
strings.add(new BytesRef(string));
}
// The patterns are sorted before being handed to makeStringUnion
// (presumably a requirement of that method; confirm against its docs).
List<BytesRef> stringsList = new ArrayList<>(strings);
Collections.sort(stringsList);
Automaton a = Automata.makeStringUnion(stringsList);
// Translate automaton to query:
TermAutomatonQuery q = new TermAutomatonQuery("field");
int numStates = a.getNumStates();
// Mirror every automaton state, including its accept flag, in the query.
for (int i = 0; i < numStates; i++) {
q.createState();
q.setAccept(i, a.isAccept(i));
}
Transition t = new Transition();
for (int i = 0; i < numStates; i++) {
int transCount = a.initTransition(i, t);
for (int j = 0; j < transCount; j++) {
a.getNextTransition(t);
// A transition covers the label range [min, max]; expand it one label
// at a time, mapping '*' to an "any" transition and every other label
// to a single-character term transition.
for (int label = t.min; label <= t.max; label++) {
if ((char) label == '*') {
q.addAnyTransition(t.source, t.dest);
} else {
q.addTransition(t.source, t.dest, "" + (char) label);
}
}
}
}
q.finish();
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
for (BytesRef string : stringsList) {
System.out.println(" string: " + string.utf8ToString());
}
System.out.println(q.toDot());
}
Query q1 = q;
Query q2 = bq.build();
// Roughly one iteration in five wraps both queries with the same
// RandomQuery instance as a FILTER clause.
if (random().nextInt(5) == 1) {
if (VERBOSE) {
System.out.println(" use random filter");
}
RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
}
// Request up to numDocs hits from each query.
TopDocs hits1 = s.search(q1, numDocs);
TopDocs hits2 = s.search(q2, numDocs);
Set<String> hits1Docs = toDocIDs(s, hits1);
Set<String> hits2Docs = toDocIDs(s, hits2);
try {
assertEquals(hits2.totalHits, hits1.totalHits);
assertEquals(hits2Docs, hits1Docs);
} catch (AssertionError ae) {
// On mismatch, print which ids differ in each direction, then rethrow
// the original assertion failure.
System.out.println("FAILED:");
for (String id : hits1Docs) {
if (hits2Docs.contains(id) == false) {
System.out.println(String.format(Locale.ROOT, " id=%3s matched but should not have", id));
}
}
for (String id : hits2Docs) {
if (hits1Docs.contains(id) == false) {
System.out.println(String.format(Locale.ROOT, " id=%3s did not match but should have", id));
}
}
throw ae;
}
}
IOUtils.close(w, r, dir, analyzer);
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class BaseTokenStreamTestCase method getGraphStrings.
/**
 * Returns all paths accepted by the token stream graph produced by the already
 * initialized {@link TokenStream}.
 *
 * @param tokenStream the token stream to convert to an automaton
 * @return the accepted paths, with each {@code TokenStreamToAutomaton.POS_SEP}
 *     character replaced by a single space
 * @throws IOException declared for the token-stream conversion
 */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
  final TokenStreamToAutomaton converter = new TokenStreamToAutomaton();
  final Automaton graph = converter.toAutomaton(tokenStream);
  final Set<IntsRef> finitePaths = AutomatonTestUtil.getFiniteStringsRecursive(graph, -1);

  final char positionSeparator = (char) TokenStreamToAutomaton.POS_SEP;
  // One scratch builder is shared by every path conversion.
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  for (IntsRef path : finitePaths) {
    String decoded = Util.toBytesRef(path, scratch).utf8ToString();
    result.add(decoded.replace(positionSeparator, ' '));
  }
  return result;
}
use of org.apache.lucene.util.automaton.Automaton in project lucene-solr by apache.
the class SolrQueryParserBase method getWildcardQuery.
// called from parser
/**
 * Builds the query for a wildcard term on {@code field}.
 *
 * <p>A bare {@code *} term on field {@code *}, or with no explicit field,
 * becomes a match-all query. If the field type has a reversed-wildcard filter
 * factory, an {@code AutomatonQuery} is built from the wildcard automaton;
 * otherwise a plain wildcard query is returned.
 */
protected Query getWildcardQuery(String field, String termStr) throws SyntaxError {
  checkNullField(field);

  // *:* -> MatchAllDocsQuery
  if ("*".equals(termStr) && ("*".equals(field) || getExplicitField() == null)) {
    return newMatchAllDocsQuery();
  }

  FieldType fieldType = schema.getFieldType(field);
  final String analyzedText = analyzeIfMultitermTermText(field, termStr, fieldType);

  // can we use reversed wildcards in this field?
  ReversedWildcardFilterFactory factory = getReversedWildcardFilterFactory(fieldType);
  if (factory == null) {
    // Solr has always used constant scoring for wildcard queries. This should return constant scoring by default.
    return newWildcardQuery(new Term(field, analyzedText));
  }

  Term term = new Term(field, analyzedText);
  // fsa representing the query
  Automaton automaton = WildcardQuery.toAutomaton(term);
  // TODO: we should likely use the automaton to calculate shouldReverse, too.
  if (factory.shouldReverse(analyzedText)) {
    Automaton marker = Automata.makeChar(factory.getMarkerChar());
    automaton = Operations.reverse(Operations.concatenate(automaton, marker));
  } else {
    // reverse wildcardfilter is active: remove false positives
    // fsa representing false positives (markerChar*)
    Automaton marker = Automata.makeChar(factory.getMarkerChar());
    Automaton falsePositives = Operations.concatenate(marker, Automata.makeAnyString());
    // subtract these away
    automaton = Operations.minus(automaton, falsePositives, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  return new AutomatonQuery(term, automaton) {
    // override toString so it's completely transparent
    @Override
    public String toString(String field) {
      // The field name prefix is only shown when it differs from the requested field.
      String prefix = getField().equals(field) ? "" : getField() + ":";
      return prefix + term.text();
    }
  };
}
Aggregations