Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by Apache.
From the class TestQueryBuilder, method testPhraseQueryPositionIncrements.
/**
 * Builds a phrase query from the text "1 stop 2" with an analyzer configured with a
 * case-insensitive "stop" automaton, and expects a phrase whose terms "1" and "2" sit at
 * positions 0 and 2.
 */
public void testPhraseQueryPositionIncrements() throws Exception {
  // Automaton matching "stop" in any mix of upper and lower case.
  CharacterRunAutomaton stopWords =
      new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  QueryBuilder queryBuilder =
      new QueryBuilder(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopWords));

  // The expected phrase keeps a one-position gap between the two terms.
  PhraseQuery expected =
      new PhraseQuery.Builder()
          .add(new Term("field", "1"), 0)
          .add(new Term("field", "2"), 2)
          .build();

  assertEquals(expected, queryBuilder.createPhraseQuery("field", "1 stop 2"));
}
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by Apache.
From the class BaseDocValuesFormatTestCase, method testSortedTermsEnum.
/**
 * Indexes three documents whose sorted doc-values field "field" holds "hello", "world" and
 * "beer", force-merges them, and then exercises the {@link TermsEnum} of the resulting
 * {@link SortedDocValues}: sequential {@code next()}, {@code seekCeil}, {@code seekExact} by
 * term and by ordinal, and {@code intersect} with two compiled automata.
 *
 * <p>The directory, writer and reader are managed with try-with-resources so they are closed
 * even when an assertion fails; on the success path they are closed in the order writer,
 * reader, directory.
 *
 * @throws IOException if indexing or reading the index fails
 */
public void testSortedTermsEnum() throws IOException {
  try (Directory directory = newDirectory()) {
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
    iwconfig.setMergePolicy(newLogMergePolicy());

    DirectoryReader ireader;
    try (RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig)) {
      Document doc = new Document();
      doc.add(new SortedDocValuesField("field", new BytesRef("hello")));
      iwriter.addDocument(doc);
      doc = new Document();
      doc.add(new SortedDocValuesField("field", new BytesRef("world")));
      iwriter.addDocument(doc);
      doc = new Document();
      doc.add(new SortedDocValuesField("field", new BytesRef("beer")));
      iwriter.addDocument(doc);
      iwriter.forceMerge(1);
      // The reader is obtained before the writer is closed, as in the original flow.
      ireader = iwriter.getReader();
    }

    try (DirectoryReader reader = ireader) {
      SortedDocValues dv = getOnlyLeafReader(reader).getSortedDocValues("field");
      assertEquals(3, dv.getValueCount());
      TermsEnum termsEnum = dv.termsEnum();

      // next()
      assertEquals("beer", termsEnum.next().utf8ToString());
      assertEquals(0, termsEnum.ord());
      assertEquals("hello", termsEnum.next().utf8ToString());
      assertEquals(1, termsEnum.ord());
      assertEquals("world", termsEnum.next().utf8ToString());
      assertEquals(2, termsEnum.ord());

      // seekCeil()
      assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
      assertEquals("hello", termsEnum.term().utf8ToString());
      assertEquals(1, termsEnum.ord());
      assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
      assertEquals("beer", termsEnum.term().utf8ToString());
      assertEquals(0, termsEnum.ord());
      assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));
      assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("aba")));
      assertEquals(0, termsEnum.ord());

      // seekExact()
      assertTrue(termsEnum.seekExact(new BytesRef("beer")));
      assertEquals("beer", termsEnum.term().utf8ToString());
      assertEquals(0, termsEnum.ord());
      assertTrue(termsEnum.seekExact(new BytesRef("hello")));
      // The default codec's name is used as the failure message here.
      assertEquals(Codec.getDefault().toString(), "hello", termsEnum.term().utf8ToString());
      assertEquals(1, termsEnum.ord());
      assertTrue(termsEnum.seekExact(new BytesRef("world")));
      assertEquals("world", termsEnum.term().utf8ToString());
      assertEquals(2, termsEnum.ord());
      assertFalse(termsEnum.seekExact(new BytesRef("bogus")));

      // seek(ord)
      termsEnum.seekExact(0);
      assertEquals("beer", termsEnum.term().utf8ToString());
      assertEquals(0, termsEnum.ord());
      termsEnum.seekExact(1);
      assertEquals("hello", termsEnum.term().utf8ToString());
      assertEquals(1, termsEnum.ord());
      termsEnum.seekExact(2);
      assertEquals("world", termsEnum.term().utf8ToString());
      assertEquals(2, termsEnum.ord());

      // NORMAL automaton
      termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
      assertEquals("hello", termsEnum.next().utf8ToString());
      assertEquals(1, termsEnum.ord());
      assertEquals("world", termsEnum.next().utf8ToString());
      assertEquals(2, termsEnum.ord());
      assertNull(termsEnum.next());

      // SINGLE automaton
      termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
      assertEquals("hello", termsEnum.next().utf8ToString());
      assertEquals(1, termsEnum.ord());
      assertNull(termsEnum.next());
    }
  }
}
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by Apache.
From the class TestMockAnalyzer, method testLength.
/** Checks a configuration that acts much like LengthFilter. */
public void testLength() throws Exception {
  // Automaton accepting any run of five or more characters.
  RegExp fiveOrMore = new RegExp(".{5,}");
  CharacterRunAutomaton tooLong = new CharacterRunAutomaton(fiveOrMore.toAutomaton());
  Analyzer lengthLimited = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, tooLong);

  String[] expectedTerms = { "ok", "fine" };
  assertAnalyzesTo(lengthLimited, "ok toolong fine notfine", expectedTerms, new int[] { 1, 2 });
}
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by Apache.
From the class TestMockAnalyzer, method testSingleChar.
// Test some regular expressions as tokenization patterns
/** Checks a tokenization pattern under which every single character becomes its own term. */
public void testSingleChar() throws Exception {
  // "." matches exactly one character, so it is used as the token pattern.
  Analyzer perCharAnalyzer =
      new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), false);

  String[] expectedTerms = { "f", "o", "o", "b", "a", "r" };
  int[] expectedStarts = { 0, 1, 2, 3, 4, 5 };
  int[] expectedEnds = { 1, 2, 3, 4, 5, 6 };
  assertAnalyzesTo(perCharAnalyzer, "foobar", expectedTerms, expectedStarts, expectedEnds);

  checkRandomData(random(), perCharAnalyzer, 100);
}
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by Apache.
From the class TestMockAnalyzer, method testThreeChars.
/**
 * Test a configuration where three characters make a term.
 *
 * <p>Also analyzes "fooba", whose trailing "ba" is only a partial match, and checks the single
 * term "foo" together with the final value 5 passed to {@code assertTokenStreamContents}.
 */
public void testThreeChars() throws Exception {
  CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("...").toAutomaton());
  Analyzer a = new MockAnalyzer(random(), single, false);
  assertAnalyzesTo(a, "foobar", new String[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 });
  // make sure when last term is a "partial" match that end() is correct
  // Integer.valueOf replaces the deprecated Integer(int) constructor.
  assertTokenStreamContents(
      a.tokenStream("bogus", "fooba"),
      new String[] { "foo" },
      new int[] { 0 },
      new int[] { 3 },
      new int[] { 1 },
      Integer.valueOf(5));
  checkRandomData(random(), a, 100);
}
Aggregations