Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project (by elastic): class SimpleIcuCollationTokenFilterTests, method assertCollation.
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
    // wrap each input in a KeywordTokenizer so the whole string is emitted as a single token
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(string1));
    TokenStream stream1 = factory.create(tokenizer);
    tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(string2));
    TokenStream stream2 = factory.create(tokenizer);
    assertCollation(stream1, stream2, comparison);
}
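The stream-level assertCollation overload is not shown in this snippet. A minimal sketch of what it might look like, assuming each filtered stream emits exactly one collation-key term and that only the sign of the comparison argument matters (names and assertions below are illustrative, not the project's exact code):

// Hypothetical sketch; uses org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute and JUnit assertions.
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    // compare only the sign of the ordering, since the exact key values are opaque
    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
}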
Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project (by elastic): class KuromojiAnalysisTests, method testBaseFormFilterFactory.
public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "は", "制限", "スピード", "を" };
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
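The assertSimpleTSOutput helper used throughout these Kuromoji tests is not reproduced here. A minimal sketch of what such a helper might look like, assuming it simply walks the stream and compares each term against the expected array (illustrative, not the project's exact code):

// Hypothetical sketch; uses org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute and JUnit assertions.
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue("more tokens than expected", i < expected.length);
        assertEquals("term mismatch at position " + i, expected[i++], termAttr.toString());
    }
    assertEquals("not all expected tokens were produced", expected.length, i);
    stream.end();
    stream.close();
}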
Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project (by elastic): class KuromojiAnalysisTests, method testKatakanaStemFilter.
public void testKatakanaStemFilter() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // パーティー should be stemmed by default,
    // while コピー is shorter than the minimum length and should not be stemmed
    String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
    tokenFilter = analysis.tokenFilter.get("kuromoji_ks");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // パーティー should not be stemmed since the minimum length is 6,
    // and コピー should not be stemmed either
    expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
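The analysis settings behind this test are not shown. Judging from the comments, kuromoji_ks is presumably a kuromoji_stemmer filter configured with a minimum length of 6; a hedged sketch of how such filters could be registered (the keys and values below are assumptions, not the project's actual test settings):

// Hypothetical settings sketch using org.elasticsearch.common.settings.Settings.
Settings settings = Settings.builder()
    // stock katakana stemmer with its default minimum length (4)
    .put("index.analysis.filter.kuromoji_stemmer.type", "kuromoji_stemmer")
    // same filter, but only stem katakana terms of length 6 or more,
    // which is why パーティー is left untouched in the second assertion
    .put("index.analysis.filter.kuromoji_ks.type", "kuromoji_stemmer")
    .put("index.analysis.filter.kuromoji_ks.minimum_length", 6)
    .build();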
Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project (by elastic): class KuromojiAnalysisTests, method testKuromojiUserDict.
public void testKuromojiUserDict() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_user_dict");
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "は", "制限スピード", "を", "超える" };
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
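The kuromoji_user_dict tokenizer itself is defined in test settings that are not reproduced here. Since 制限スピード comes back as a single token, the tokenizer is presumably backed by a user dictionary containing such an entry; a hedged sketch of how it might be wired up (the file name and dictionary line are hypothetical):

// Hypothetical settings sketch using org.elasticsearch.common.settings.Settings.
Settings settings = Settings.builder()
    .put("index.analysis.tokenizer.kuromoji_user_dict.type", "kuromoji_tokenizer")
    // user dictionary file, resolved relative to the config directory; it would contain
    // a line such as (surface form, segmentation, reading, part-of-speech):
    // 制限スピード,制限スピード,セイゲンスピード,カスタム名詞
    .put("index.analysis.tokenizer.kuromoji_user_dict.user_dictionary", "userdict_ja.txt")
    .build();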
Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project (by elastic): class KuromojiAnalysisTests, method testJapaneseStopFilterFactory.
public void testJapaneseStopFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("ja_stop");
    assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "制限", "超える" };
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
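The ja_stop filter configuration is likewise defined outside this snippet. Since スピード is dropped along with the particles, the filter is presumably configured with the built-in Japanese stopword set plus a custom entry; a hedged sketch under that assumption (the stopword values are guesses, and putList is the Settings.Builder API in recent Elasticsearch versions):

// Hypothetical settings sketch using org.elasticsearch.common.settings.Settings.
Settings settings = Settings.builder()
    .put("index.analysis.filter.ja_stop.type", "ja_stop")
    // built-in Japanese stopwords (particles such as は and を) plus a custom word,
    // which would explain why スピード disappears from the expected output above
    .putList("index.analysis.filter.ja_stop.stopwords", "_japanese_", "スピード")
    .build();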