Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache: class TestPhoneticFilter, method testRandomStrings.
/** blast some random strings through the analyzer */
public void testRandomStrings() throws IOException {
  Encoder[] encoders = new Encoder[] {
      new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone2() };
  for (final Encoder e : encoders) {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
      }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
      }
    };
    checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    b.close();
  }
}
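For reference, a minimal sketch of wiring PhoneticFilter into an Analyzer outside the test framework and reading the emitted terms. The class name, field name, sample text, and the choice of Metaphone with inject=true are illustrative assumptions; only the filter wiring mirrors the test above.

import java.io.IOException;

import org.apache.commons.codec.language.Metaphone;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PhoneticFilterSketch {
  public static void main(String[] args) throws IOException {
    // Analyzer that emits each original token plus its Metaphone code (inject=true).
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, new Metaphone(), true));
      }
    };
    try (TokenStream stream = analyzer.tokenStream("body", "lucene phonetic filter")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // original term and its phonetic form
      }
      stream.end();
    }
    analyzer.close();
  }
}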
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache: class TestBeiderMorseFilter, method testCustomAttribute.
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer) stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while (stream.incrementToken()) {
    // every phonetic variant must preserve the keyword flag set by PatternKeywordMarkerFilter
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
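A companion sketch showing how a similar chain could be used simply to inspect the Beider-Morse phonetic variants produced for a single name; the surrounding class and the printed output are illustrative, and the keyword-marker step from the test is omitted here.

import java.io.IOException;
import java.io.StringReader;

import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BeiderMorseSketch {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new KeywordTokenizer();             // whole input as one token
    tokenizer.setReader(new StringReader("D'Angelo"));
    TokenStream stream = new BeiderMorseFilter(tokenizer,
        new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());                    // one phonetic variant per token
    }
    stream.end();
    stream.close();
  }
}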
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache: class TestDaitchMokotoffSoundexFilter, method testEmptyTerm.
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean()));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
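For context, a minimal sketch of the same filter applied to a non-empty term, with inject fixed to true instead of the randomized flag used in the test; the class name, field name, and sample input are assumptions.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DaitchMokotoffSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        // inject=true keeps the original term alongside its Daitch-Mokotoff code(s)
        return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, true));
      }
    };
    try (TokenStream stream = analyzer.tokenStream("name", "Moskowitz")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString());
      }
      stream.end();
    }
    analyzer.close();
  }
}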
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache: class SmartChineseAnalyzer, method createComponents.
@Override
public TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new HMMChineseTokenizer();
  TokenStream result = tokenizer;
  // result = new LowerCaseFilter(result);
  // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
  // The porter stemming is too strict, this is not a bug, this is a feature:)
  result = new PorterStemFilter(result);
  if (!stopWords.isEmpty()) {
    result = new StopFilter(result, stopWords);
  }
  return new TokenStreamComponents(tokenizer, result);
}
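A short usage sketch, assuming the default SmartChineseAnalyzer constructor and an arbitrary sample sentence, showing how the components built above are consumed through the ordinary Analyzer API.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SmartChineseSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new SmartChineseAnalyzer();            // default stopword set
    try (TokenStream stream = analyzer.tokenStream("content", "我爱北京天安门")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString());                   // one segmented word per token
      }
      stream.end();
    }
    analyzer.close();
  }
}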
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache: class TestPhoneticFilterFactory, method assertAlgorithm.
static void assertAlgorithm(String algName, String inject, String input, String[] expected) throws Exception {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  Map<String, String> args = new HashMap<>();
  args.put("encoder", algName);
  args.put("inject", inject);
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream, expected);
}
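A hedged sketch of driving PhoneticFilterFactory outside the test helper, mirroring the same configuration steps (args map, inform, create); the encoder name, field text, and surrounding class are illustrative assumptions, and package locations may differ between Lucene versions.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

public class PhoneticFactorySketch {
  public static void main(String[] args) throws IOException {
    Map<String, String> config = new HashMap<>();
    config.put("encoder", "Metaphone");   // algorithm name resolved by the factory
    config.put("inject", "true");         // keep original tokens alongside phonetic codes
    PhoneticFilterFactory factory = new PhoneticFilterFactory(config);
    factory.inform(new ClasspathResourceLoader(PhoneticFilterFactory.class));

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("phonetic filter factory"));
    TokenStream stream = factory.create(tokenizer);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}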