Use of org.apache.lucene.analysis.ja.JapaneseTokenizer in project elasticsearch by elastic.
Class KuromojiAnalysisTests, method testBaseFormFilterFactory.
/**
 * Looks up the {@code "kuromoji_pos"} token filter, checks that it is a
 * {@link KuromojiPartOfSpeechFilterFactory}, and verifies the tokens it emits
 * for a sample sentence tokenized in {@code SEARCH} mode.
 *
 * NOTE(review): the method name refers to a base-form filter, yet the filter
 * exercised here is the part-of-speech one -- confirm whether the name is stale.
 *
 * @throws IOException if reading the token stream fails
 */
public void testBaseFormFilterFactory() throws IOException {
    final String sentence = "私は制限スピードを超える。";
    // Tokens expected to survive the filter; the verb and the full stop are absent.
    final String[] survivingTokens = { "私", "は", "制限", "スピード", "を" };

    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory posFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(posFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader(sentence));

    assertSimpleTSOutput(posFilter.create(japaneseTokenizer), survivingTokens);
}
Use of org.apache.lucene.analysis.ja.JapaneseTokenizer in project elasticsearch by elastic.
Class KuromojiAnalysisTests, method testKatakanaStemFilter.
/**
 * Exercises two katakana stemmer filters registered in the test analysis,
 * {@code "kuromoji_stemmer"} and {@code "kuromoji_ks"}. Both must be
 * {@link KuromojiKatakanaStemmerFactory} instances; they are run over the same
 * sentence and differ only in whether パーティー loses its trailing long-vowel mark.
 *
 * @throws IOException if reading the token stream fails
 */
public void testKatakanaStemFilter() throws IOException {
    final String sentence = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
    TestAnalysis analysis = createTestAnalysis();

    // --- Scenario 1: "kuromoji_stemmer" ---
    // パーティー is expected to be stemmed with the default settings,
    // while コピー is short enough (minimum length) to be left alone.
    TokenFilterFactory defaultStemmer = analysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(defaultStemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));

    Tokenizer firstTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    firstTokenizer.setReader(new StringReader(sentence));
    String[] stemmedTokens = {
        "明後日", "パーティ", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
    };
    assertSimpleTSOutput(defaultStemmer.create(firstTokenizer), stemmedTokens);

    // --- Scenario 2: "kuromoji_ks" ---
    // With a minimum length of 6, パーティー must stay intact; コピー is still not stemmed.
    TokenFilterFactory minLengthStemmer = analysis.tokenFilter.get("kuromoji_ks");
    assertThat(minLengthStemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));

    Tokenizer secondTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    secondTokenizer.setReader(new StringReader(sentence));
    String[] unstemmedTokens = {
        "明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"
    };
    assertSimpleTSOutput(minLengthStemmer.create(secondTokenizer), unstemmedTokens);
}
Use of org.apache.lucene.analysis.ja.JapaneseTokenizer in project elasticsearch by elastic.
Class KuromojiAnalysisTests, method testJapaneseStopFilterFactory.
/**
 * Looks up the {@code "ja_stop"} token filter, checks that it is a
 * {@link JapaneseStopTokenFilterFactory}, and verifies that only
 * 私, 制限 and 超える remain for the sample sentence.
 *
 * @throws IOException if reading the token stream fails
 */
public void testJapaneseStopFilterFactory() throws IOException {
    final String sentence = "私は制限スピードを超える。";
    final String[] remainingTokens = { "私", "制限", "超える" };

    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory stopFilter = analysis.tokenFilter.get("ja_stop");
    assertThat(stopFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader(sentence));

    assertSimpleTSOutput(stopFilter.create(japaneseTokenizer), remainingTokens);
}
Use of org.apache.lucene.analysis.ja.JapaneseTokenizer in project elasticsearch by elastic.
Class KuromojiAnalysisTests, method testReadingFormFilterFactory.
/**
 * Exercises two reading-form filters registered in the test analysis. For the
 * same sentence, {@code "kuromoji_rf"} is expected to yield romaji readings and
 * {@code "kuromoji_readingform"} katakana readings; both must be
 * {@link KuromojiReadingFormFilterFactory} instances.
 *
 * @throws IOException if reading the token stream fails
 */
public void testReadingFormFilterFactory() throws IOException {
    final String sentence = "今夜はロバート先生と話した";
    TestAnalysis analysis = createTestAnalysis();

    // --- Scenario 1: "kuromoji_rf" -> romaji readings ---
    TokenFilterFactory romajiFilter = analysis.tokenFilter.get("kuromoji_rf");
    assertThat(romajiFilter, instanceOf(KuromojiReadingFormFilterFactory.class));

    String[] romajiReadings = { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };
    Tokenizer romajiTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    romajiTokenizer.setReader(new StringReader(sentence));
    assertSimpleTSOutput(romajiFilter.create(romajiTokenizer), romajiReadings);

    // --- Scenario 2: "kuromoji_readingform" -> katakana readings ---
    // A fresh tokenizer is built because the first one has already been consumed.
    Tokenizer katakanaTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    katakanaTokenizer.setReader(new StringReader(sentence));
    String[] katakanaReadings = { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" };

    TokenFilterFactory katakanaFilter = analysis.tokenFilter.get("kuromoji_readingform");
    assertThat(katakanaFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(katakanaFilter.create(katakanaTokenizer), katakanaReadings);
}
Use of org.apache.lucene.analysis.ja.JapaneseTokenizer in project omegat by omegat-org.
Class LuceneJapaneseTokenizer, method getTokenStream.
/**
 * Builds the Lucene token stream used to tokenize {@code strOrig}.
 *
 * <p>Without stemming, the text is fed unchanged to a {@link JapaneseTokenizer}
 * in {@code NORMAL} mode and wrapped in a {@link TagJoiningFilter}. With
 * stemming, tags are blanked out first and a {@link JapaneseAnalyzer} in
 * {@code SEARCH} mode produces the stream, using the analyzer's default stop
 * words and stop tags only when {@code stopWordsAllowed} is set.
 *
 * @param strOrig          text to tokenize
 * @param stemsAllowed     selects the analyzer (stemming) path over the plain tokenizer path
 * @param stopWordsAllowed on the stemming path, whether the default stop words/tags apply;
 *                         not consulted on the plain tokenizer path
 * @return the token stream over the (possibly tag-blanked) text
 * @throws IOException declared by the overridden method
 */
@SuppressWarnings("resource")
@Override
protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed) throws IOException {
    if (!stemsAllowed) {
        // Plain tokenization: tags are kept in the text and handled by TagJoiningFilter.
        JapaneseTokenizer plainTokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
        plainTokenizer.setReader(new StringReader(strOrig));
        return new TagJoiningFilter(plainTokenizer);
    }

    // Tags are blanked out only on the stemming path.
    String textWithoutTags = blankOutTags(strOrig);

    CharArraySet stopWords;
    Set<String> stopTags;
    if (stopWordsAllowed) {
        stopWords = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        stopWords = CharArraySet.EMPTY_SET;
        stopTags = Collections.emptySet();
    }

    // The analyzer is intentionally not closed here: the returned stream depends on it.
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags);
    return analyzer.tokenStream("", new StringReader(textWithoutTags));
}
Aggregations