use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestMockAnalyzer method testSingleChar.
// Test some regular expressions as tokenization patterns
/**
 * Test a configuration where each character is a term.
 *
 * <p>The token automaton is compiled from the regular expression {@code .}. The fixed input
 * {@code "foobar"} is checked for its terms and start/end offsets, after which the analyzer is
 * run through {@code checkRandomData}.
 */
public void testSingleChar() throws Exception {
  CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp(".").toAutomaton());
  // try-with-resources releases the analyzer even if one of the assertions below fails.
  try (Analyzer a = new MockAnalyzer(random(), single, false)) {
    assertAnalyzesTo(
        a,
        "foobar",
        new String[] { "f", "o", "o", "b", "a", "r" },
        new int[] { 0, 1, 2, 3, 4, 5 },
        new int[] { 1, 2, 3, 4, 5, 6 });
    checkRandomData(random(), a, 100);
  }
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestMockAnalyzer method testRandomRegexps.
/**
 * Blast some random strings through differently configured tokenizers.
 *
 * <p>Every iteration builds a {@code MockTokenizer} from a random automaton, a random lowercase
 * flag and a random limit drawn via {@code TestUtil.nextInt(random(), 0, 500)}, wraps it in an
 * anonymous {@code Analyzer}, and runs {@code checkRandomData} against it. The iteration count
 * is {@code atLeast(30)} when {@code TEST_NIGHTLY} is set and {@code atLeast(1)} otherwise.
 */
public void testRandomRegexps() throws Exception {
  int iters = TEST_NIGHTLY ? atLeast(30) : atLeast(1);
  for (int i = 0; i < iters; i++) {
    // The order of these random() draws is kept as-is so seeded runs stay reproducible.
    final CharacterRunAutomaton dfa =
        new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()), Integer.MAX_VALUE);
    final boolean lowercase = random().nextBoolean();
    final int limit = TestUtil.nextInt(random(), 0, 500);
    // try-with-resources guarantees close() even when checkRandomData throws.
    try (Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer t = new MockTokenizer(dfa, lowercase, limit);
        return new TokenStreamComponents(t, t);
      }
    }) {
      checkRandomData(random(), a, 100);
    }
  }
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestMockAnalyzer method testThreeChars.
/**
 * Test a configuration where three characters makes a term.
 *
 * <p>The token automaton is compiled from the regular expression {@code ...}. Besides the
 * evenly divisible input {@code "foobar"}, the input {@code "fooba"} is analyzed to confirm
 * that a trailing partial match produces no term while the final offset is still reported as 5.
 */
public void testThreeChars() throws Exception {
  CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("...").toAutomaton());
  // try-with-resources releases the analyzer even if one of the assertions below fails.
  try (Analyzer a = new MockAnalyzer(random(), single, false)) {
    assertAnalyzesTo(
        a,
        "foobar",
        new String[] { "foo", "bar" },
        new int[] { 0, 3 },
        new int[] { 3, 6 });
    // make sure when last term is a "partial" match that end() is correct
    assertTokenStreamContents(
        a.tokenStream("bogus", "fooba"),
        new String[] { "foo" },
        new int[] { 0 },
        new int[] { 3 },
        new int[] { 1 },
        // Integer.valueOf replaces the deprecated Integer(int) constructor; the argument stays
        // a boxed Integer so the same overload is selected.
        Integer.valueOf(5));
    checkRandomData(random(), a, 100);
  }
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestMockAnalyzer method testTwoChars.
/**
 * Test a configuration where two characters makes a term.
 *
 * <p>The token automaton is compiled from the regular expression {@code ..}. Besides the
 * evenly divisible input {@code "foobar"}, the input {@code "fooba"} is analyzed to confirm
 * that the trailing single character produces no term while the final offset is still
 * reported as 5.
 */
public void testTwoChars() throws Exception {
  CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("..").toAutomaton());
  // try-with-resources releases the analyzer even if one of the assertions below fails.
  try (Analyzer a = new MockAnalyzer(random(), single, false)) {
    assertAnalyzesTo(
        a,
        "foobar",
        new String[] { "fo", "ob", "ar" },
        new int[] { 0, 2, 4 },
        new int[] { 2, 4, 6 });
    // make sure when last term is a "partial" match that end() is correct
    assertTokenStreamContents(
        a.tokenStream("bogus", "fooba"),
        new String[] { "fo", "ob" },
        new int[] { 0, 2 },
        new int[] { 2, 4 },
        new int[] { 1, 1 },
        // Integer.valueOf replaces the deprecated Integer(int) constructor; the argument stays
        // a boxed Integer so the same overload is selected.
        Integer.valueOf(5));
    checkRandomData(random(), a, 100);
  }
}
use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.
the class TestMockAnalyzer method testUppercase.
/**
 * Test a configuration where word starts with one uppercase.
 *
 * <p>The token automaton is compiled from the regular expression {@code [A-Z][a-z]*}. Two
 * fixed inputs are checked: {@code "FooBarBAZ"}, where each trailing capital becomes its own
 * term, and {@code "aFooBar"}, where the leading lowercase character yields no term and the
 * first term starts at offset 1.
 */
public void testUppercase() throws Exception {
  CharacterRunAutomaton single =
      new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
  // try-with-resources releases the analyzer even if one of the assertions below fails.
  try (Analyzer a = new MockAnalyzer(random(), single, false)) {
    assertAnalyzesTo(
        a,
        "FooBarBAZ",
        new String[] { "Foo", "Bar", "B", "A", "Z" },
        new int[] { 0, 3, 6, 7, 8 },
        new int[] { 3, 6, 7, 8, 9 });
    assertAnalyzesTo(
        a,
        "aFooBar",
        new String[] { "Foo", "Bar" },
        new int[] { 1, 4 },
        new int[] { 4, 7 });
    checkRandomData(random(), a, 100);
  }
}
Aggregations