use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestDuelingAnalyzers method setUp.
@Override
public void setUp() throws Exception {
super.setUp();
Automaton single = new Automaton();
int initial = single.createState();
int accept = single.createState();
single.setAccept(accept, true);
// build an automaton matching this jvm's letter definition
for (int i = 0; i <= 0x10FFFF; i++) {
if (Character.isLetter(i)) {
single.addTransition(initial, accept, i);
}
}
Automaton repeat = Operations.repeat(single);
jvmLetter = new CharacterRunAutomaton(repeat);
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestIndexWriter method testStopwordsPosIncHole2.
// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
// use two stopfilters for testing here
Directory dir = newDirectory();
final Automaton secondSet = Automata.makeString("foobar");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer();
TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
return new TokenStreamComponents(tokenizer, stream);
}
};
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
Document doc = new Document();
doc.add(new TextField("body", "just a foobar", Field.Store.NO));
doc.add(new TextField("body", "test of gaps", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("body", "just"), 0);
builder.add(new Term("body", "test"), 3);
PhraseQuery pq = builder.build();
// body:"just ? ? test"
assertEquals(1, is.search(pq, 5).totalHits);
ir.close();
dir.close();
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class QueryParserTestBase method testBoost.
public void testBoost() throws Exception {
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
Query q = getQuery("on^1.0", qp);
assertNotNull(q);
q = getQuery("\"hello\"^2.0", qp);
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = getQuery("hello^2.0", qp);
assertNotNull(q);
assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
q = getQuery("\"on\"^1.0", qp);
assertNotNull(q);
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
q = getQuery("the^3", qp2);
// "the" is a stop word so the result is an empty query:
assertNotNull(q);
assertMatchNoDocsQuery(q);
assertFalse(q instanceof BoostQuery);
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class QueryParserTestBase method testStopwords.
public void testStopwords() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
Query result = getQuery("field:the OR field:foo", qp);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery || result instanceof MatchNoDocsQuery);
if (result instanceof BooleanQuery) {
assertEquals(0, ((BooleanQuery) result).clauses().size());
}
result = getQuery("field:woo OR field:the", qp);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a TermQuery", result instanceof TermQuery);
result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", qp);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BoostQuery", result instanceof BoostQuery);
result = ((BoostQuery) result).getQuery();
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
if (VERBOSE)
System.out.println("Result: " + result);
assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2);
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project SearchServices by Alfresco.
the class MinHashFilterTest method createMockShingleTokenizer.
public static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
MockTokenizer tokenizer = new MockTokenizer(new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){4}").toAutomaton()), true);
tokenizer.setEnableChecks(true);
if (shingles != null) {
tokenizer.setReader(new StringReader(shingles));
}
return tokenizer;
}
Aggregations