use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
the class TestSimplePatternSplitTokenizer method testBigLookahead.
/**
 * Splits on a pattern made of one hundred 'a' characters followed by a 'b',
 * and feeds an input of two hundred 'a' characters. The input contains no 'b',
 * so the test expects the entire input to come back as a single token.
 */
public void testBigLookahead() throws Exception {
  // Split pattern: "aaaa...ab" (100 x 'a', then one 'b').
  final String pattern = new String(new char[100]).replace('\0', 'a') + 'b';
  // Input: 200 x 'a'.
  final String input = new String(new char[200]).replace('\0', 'a');

  Tokenizer tokenizer = new SimplePatternSplitTokenizer(pattern);
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  // Exactly one token, equal to the full input.
  assertTrue(tokenizer.incrementToken());
  assertEquals(input, term.toString());
  assertFalse(tokenizer.incrementToken());
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
the class TestSimplePatternSplitTokenizer method testEndOffset.
/**
 * Splits "aaabbb" on the pattern "a+" and checks that the only token is "bbb"
 * and that, after end() is called, the reported end offset is 6.
 */
public void testEndOffset() throws Exception {
  final String input = "aaabbb";

  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a+");
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  // The leading run of 'a' is the separator; only "bbb" remains as a token.
  assertTrue(tokenizer.incrementToken());
  assertEquals("bbb", term.toString());
  assertFalse(tokenizer.incrementToken());

  // The final offset is read only after end() has been called.
  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
the class TestSimplePatternSplitTokenizer method tsToString.
/**
 * Consumes the given token stream and returns its terms joined by single spaces.
 *
 * <p>The stream is reset, drained, ended and closed by this method; it is closed
 * even if consuming it throws. Before reset and after every token the attributes
 * are cleared and the term is overwritten with a bogus value, so a stream that
 * relies on state surviving between tokens produces visibly wrong output.
 *
 * <p>TODO: rewrite tests not to use string comparison.
 *
 * @param in the token stream to consume; it is closed on return
 * @return the terms produced by the stream, separated by a single space
 * @throws IOException if the stream fails while being consumed or closed
 */
private static String tsToString(TokenStream in) throws IOException {
  StringBuilder out = new StringBuilder();
  // try-with-resources guarantees close() even when reset()/incrementToken() throws.
  try (TokenStream stream = in) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    // extra safety to enforce that the state is not preserved, and also
    // assign bogus values
    stream.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    stream.reset();
    while (stream.incrementToken()) {
      if (out.length() > 0) {
        out.append(' ');
      }
      out.append(termAtt.toString());
      stream.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }
    // Complete the consumer workflow (reset -> incrementToken* -> end -> close).
    stream.end();
  }
  return out.toString();
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
the class TestSimplePatternSplitTokenizer method testSplitMultiCharWhitespace.
/**
 * Checks that a run of several whitespace characters between tokens acts as a
 * single separator, and that the reported offsets account for the full width
 * of each run.
 */
public void testSplitMultiCharWhitespace() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  // Layout: 'a' at 0, " \t" at 1-2, 'b' at 3, three spaces at 4-6, 'c' at 7.
  // The three-space run is required for 'c' to land on the expected offsets 7-8.
  t.setReader(new StringReader("a \tb   c"));
  assertTokenStreamContents(
      t,
      new String[] { "a", "b", "c" },
      new int[] { 0, 3, 7 },
      new int[] { 1, 4, 8 });
}
use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
the class TestSimplePatternSplitTokenizer method testLeadingNonToken.
/**
 * Checks that separator characters at the very start of the input produce no
 * token, and that the offsets of the following tokens still count them.
 */
public void testLeadingNonToken() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  // Layout: four leading spaces at 0-3, 'a' at 4, one space at 5, 'c' at 6.
  // The four-space prefix is required for 'a' to land on the expected offsets 4-5.
  t.setReader(new StringReader("    a c"));
  assertTokenStreamContents(
      t,
      new String[] { "a", "c" },
      new int[] { 4, 6 },
      new int[] { 5, 7 });
}
Aggregations