Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in the lucene-solr project (by apache).
From the class EdgeNGramTokenFilterTest, method testReset:
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
assertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
}
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in the lucene-solr project (by apache).
From the class NGramTokenFilterTest, method testReset:
public void testReset() throws Exception {
  // Verifies that NGramTokenFilter produces identical output after its
  // upstream tokenizer is fed the same input again via setReader().
  WhitespaceTokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("abcde"));
  NGramTokenFilter unigrams = new NGramTokenFilter(source, 1, 1);
  String[] expectedTerms = { "a", "b", "c", "d", "e" };
  int[] expectedStartOffsets = { 0, 0, 0, 0, 0 };
  int[] expectedEndOffsets = { 5, 5, 5, 5, 5 };
  int[] expectedPosIncrements = { 1, 0, 0, 0, 0 };
  assertTokenStreamContents(unigrams, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
  // Re-feed the identical input; a correct reset must reproduce the same tokens.
  source.setReader(new StringReader("abcde"));
  assertTokenStreamContents(unigrams, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
}
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in the lucene-solr project (by apache).
From the class CommonGramsFilterTest, method testReset:
public void testReset() throws Exception {
  // Verifies that CommonGramsFilter restarts from the beginning of the stream
  // after close() + setReader() + reset().
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  CommonGramsFilter commonGrams = new CommonGramsFilter(tokenizer, commonWords);
  CharTermAttribute termAtt = commonGrams.addAttribute(CharTermAttribute.class);
  commonGrams.reset();
  // Common-gram output interleaves the bigram with its trailing unigram.
  for (String expected : new String[] { "How", "How_the", "the", "the_s" }) {
    assertTrue(commonGrams.incrementToken());
    assertEquals(expected, termAtt.toString());
  }
  commonGrams.close();
  // After re-feeding the same input, the first token must be produced again.
  tokenizer.setReader(new StringReader(input));
  commonGrams.reset();
  assertTrue(commonGrams.incrementToken());
  assertEquals("How", termAtt.toString());
}
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in the Anserini project (by castorini).
From the class TweetAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  // Whitespace tokenization followed by tweet-aware lowercasing that
  // preserves entities (mentions, hashtags, URLs).
  Tokenizer tokenizer = new WhitespaceTokenizer();
  TokenStream stream = new TweetLowerCaseEntityPreservingFilter(tokenizer);
  // Porter stemmer ignores words which are marked as keywords
  return new TokenStreamComponents(tokenizer, stemming ? new PorterStemFilter(stream) : stream);
}
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in the Anserini project (by castorini).
From the class TRECAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  // Whitespace tokenization followed by tweet-aware, entity-preserving lowercasing.
  Tokenizer tokenizer = new WhitespaceTokenizer();
  return new TokenStreamComponents(tokenizer, new TweetLowerCaseEntityPreservingFilter(tokenizer));
}
Aggregations