Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class KuromojiAnalysisTests, method testNbestExample.
public void testNbestExample() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_nbest_examples");
    // "鳩山積み" is ambiguous; with n-best output enabled the tokenizer keeps
    // both segmentations (鳩 + 山積み and 鳩山 + 積み), so all four tokens appear.
    String source = "鳩山積み";
    String[] expected = new String[] { "鳩", "鳩山", "山積み", "積み" };
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
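The tokenizer this test retrieves is registered through index settings elsewhere in the test fixtures. As a minimal sketch of such a registration, assuming the analysis-kuromoji plugin's nbest_examples setting (the example value below is illustrative, not the project's actual fixture):

Settings settings = Settings.builder()
    // Hypothetical tokenizer definition; the nbest_examples value is an assumption.
    .put("index.analysis.tokenizer.kuromoji_nbest_examples.type", "kuromoji_tokenizer")
    // Raise the n-best budget just enough to cover both listed segmentations.
    .put("index.analysis.tokenizer.kuromoji_nbest_examples.nbest_examples", "/鳩山積み-鳩山/鳩山積み-鳩/")
    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
    .build();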
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class KuromojiAnalysisTests, method testNbestCost.
public void testNbestCost() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_nbest_cost");
    // Same ambiguous input as above; the cost-based n-best configuration
    // must likewise surface both segmentations.
    String source = "鳩山積み";
    String[] expected = new String[] { "鳩", "鳩山", "山積み", "積み" };
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
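Similarly, the "kuromoji_nbest_cost" tokenizer would be defined with the plugin's nbest_cost setting, which widens the Viterbi search by an absolute cost budget instead of by example phrases. A sketch under the same assumptions (the value 2000 is illustrative):

Settings settings = Settings.builder()
    .put("index.analysis.tokenizer.kuromoji_nbest_cost.type", "kuromoji_tokenizer")
    // Any segmentation whose cost is within 2000 of the best path is also emitted.
    .put("index.analysis.tokenizer.kuromoji_nbest_cost.nbest_cost", "2000")
    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
    .build();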
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class KuromojiAnalysisTests, method testReadingFormFilterFactory.
public void testReadingFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    String source = "今夜はロバート先生と話した"; // "I talked with Robert-sensei tonight"
    // "kuromoji_rf" is configured to emit romaji readings.
    String[] expected_tokens_romaji = new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_romaji);
    // The first tokenizer has been consumed; create a fresh one for the second pass.
    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // "kuromoji_readingform" emits katakana readings instead.
    String[] expected_tokens_katakana = new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" };
    tokenFilter = analysis.tokenFilter.get("kuromoji_readingform");
    assertThat(tokenFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
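Both filter names resolve to the same factory; what differs is the kuromoji_readingform filter's use_romaji option (true for romaji, false, the default, for katakana). A minimal sketch of how the two might be registered; the settings below are illustrative, not the project's actual fixture:

Settings settings = Settings.builder()
    .put("index.analysis.filter.kuromoji_rf.type", "kuromoji_readingform")
    .put("index.analysis.filter.kuromoji_rf.use_romaji", true)
    .put("index.analysis.filter.kuromoji_readingform.type", "kuromoji_readingform")
    .put("index.analysis.filter.kuromoji_readingform.use_romaji", false)
    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
    .build();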
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class WordDelimiterTokenFilterFactoryTests, method testPartsAndCatenate.
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power. */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    // generate_word_parts splits on the case change; catenate_words also emits
    // the joined "PowerShot" at the same position as the first part.
    String[] expected = new String[] { "Power", "PowerShot", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
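Under the hood, this factory wraps Lucene's word delimiter filter. As a self-contained sketch of the equivalent direct Lucene usage, assuming a Lucene version in which WordDelimiterFilter is still present (newer releases deprecate it in favor of WordDelimiterGraphFilter); the class name WordDelimiterSketch and the flag combination are ours, not the project's:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WordDelimiterSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        // The same two options the test enables through index settings.
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.CATENATE_WORDS;
        try (TokenStream stream = new WordDelimiterFilter(tokenizer, flags, null)) { // null: no protected words
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // prints Power, PowerShot, Shot
            }
            stream.end();
        }
    }
}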
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class CommonGramsTokenFilterFactoryTests, method testWithoutCommonWordsMatch.
public void testWithoutCommonWordsMatch() throws IOException {
    {
        // Default mode: the configured common words ("chromosome", "protein")
        // do not occur in the input, so no bigrams are produced.
        Settings settings = Settings.builder()
            .put("index.analysis.filter.common_grams_default.type", "common_grams")
            .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        {
            TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_default");
            String source = "the quick brown is a fox Or noT";
            String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(source));
            assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
        }
    }
    {
        // Same configuration with query_mode explicitly disabled: the output is identical.
        Settings settings = Settings.builder()
            .put("index.analysis.filter.common_grams_default.type", "common_grams")
            .put("index.analysis.filter.common_grams_default.query_mode", false)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        {
            TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_default");
            String source = "the quick brown is a fox Or noT";
            String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(source));
            assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
        }
    }
}
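For contrast, when the configured common words do occur in the input, the filter emits word_word bigrams alongside the unigrams. A hedged sketch in the same style (the filter name common_grams_match is ours, and the expected sequence follows the documented behavior of the common_grams filter rather than an actual test in this class):

Settings settings = Settings.builder()
    .put("index.analysis.filter.common_grams_match.type", "common_grams")
    .putArray("index.analysis.filter.common_grams_match.common_words", "the", "is", "a")
    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
    .build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_match");
String source = "the quick brown is a fox";
// Each common word forms a bigram with its neighbors, at the same position as the unigram.
String[] expected = new String[] {
    "the", "the_quick", "quick", "brown", "brown_is",
    "is", "is_a", "a", "a_fox", "fox" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);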