Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
Class CJKFilterFactoryTests, method testHanUnigramOnly:
public void testHanUnigramOnly() throws IOException {
    // RESOURCE points at the test's analysis settings on the classpath.
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    // Every CJK character comes out as a unigram; bigrams are added only for Han runs ("学生", "試験").
    String[] expected = new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
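For reference, a minimal standalone sketch of the raw Lucene filter this factory is assumed to build: a CJKBigramFilter restricted to Han script with unigram output enabled. The class name CJKHanUnigramSketch is hypothetical, and the constructor flags are an assumption about what the test's RESOURCE settings configure.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CJKHanUnigramSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("多くの学生が試験に落ちた。"));
        // Bigram only Han characters; the third argument enables unigram output alongside the bigrams.
        try (TokenStream stream = new CJKBigramFilter(tokenizer, CJKBigramFilter.HAN, true)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // 多, く, の, 学, 学生, 生, ...
            }
            stream.end();
        }
    }
}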
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
Class CJKFilterFactoryTests, method testDefault:
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
    String source = "多くの学生が試験に落ちた。";
    // The default configuration emits overlapping bigrams across all CJK scripts and no unigrams.
    String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
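A comparable standalone sketch for the default case: the single-argument CJKBigramFilter constructor bigrams Han, Hiragana, Katakana, and Hangul and emits no unigrams, which matches what the cjk_bigram factory produces here. The class name is again hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CJKBigramDefaultSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("多くの学生が試験に落ちた。"));
        // Single-argument constructor: bigram every CJK script, no unigrams.
        try (TokenStream stream = new CJKBigramFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // 多く, くの, の学, 学生, ...
            }
            stream.end();
        }
    }
}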
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
Class ShingleTokenFilterFactoryTests, method testInverseMappingNoShingles:
public void testInverseMappingNoShingles() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick";
    // Two tokens are too few to form a shingle under this configuration, so only the unigrams survive.
    String[] expected = new String[] { "the", "quick" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
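A standalone sketch of one ShingleFilter setup that would reproduce this behavior, assuming shingle_inverse is configured with min_shingle_size = max_shingle_size = 3 and unigrams emitted only when no shingle can form; the exact values in the test's RESOURCE file may differ, and the class name is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleInverseSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick"));
        ShingleFilter shingles = new ShingleFilter(tokenizer, 3, 3); // trigrams only (assumed config)
        shingles.setOutputUnigrams(false);
        shingles.setOutputUnigramsIfNoShingles(true); // fall back to unigrams when no trigram fits
        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            System.out.println(term); // the, quick
        }
        shingles.end();
        shingles.close();
    }
}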
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
Class StemmerTokenFilterFactoryTests, method testEnglishFilterFactory:
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        // Exercise the factory across random index-created versions.
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_english.filter", "my_english")
                .put(SETTING_VERSION_CREATED, v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        // The "english" stemmer language maps to Lucene's PorterStemFilter.
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[] { "consolingli" });
    }
}
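Since the test asserts that the "english" stemmer resolves to Lucene's PorterStemFilter, the same stemming can be sketched directly against the Lucene API (class name hypothetical):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PorterStemSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("consolingly"));
        // PorterStemFilter is what the "english" stemmer language maps to in the test above.
        try (TokenStream stream = new PorterStemFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // consolingli
            }
            stream.end();
        }
    }
}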
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
Class WordDelimiterGraphTokenFilterFactoryTests, method testPartsAndCatenate:
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power. */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            // "type" is supplied by the base test class (the word delimiter graph filter type).
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    // The catenated token "PowerShot" spans two positions (posLen 2); "Power" shares its start position (increment 0).
    int[] expectedIncr = new int[] { 1, 0, 1 };
    int[] expectedPosLen = new int[] { 2, 1, 1 };
    String[] expected = new String[] { "PowerShot", "Power", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, expectedIncr, expectedPosLen, null);
}
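A standalone sketch of the underlying Lucene filter with the same two flags; the class name is hypothetical, and passing null for protected words is an assumption matching the test's minimal settings.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class WordDelimiterGraphSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.CATENATE_WORDS;
        // null = no protected words. Yields the graph {PowerShot (posLen 2), Power, Shot}.
        try (TokenStream stream = new WordDelimiterGraphFilter(tokenizer, flags, null)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute incr = stream.addAttribute(PositionIncrementAttribute.class);
            PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term + " incr=" + incr.getPositionIncrement() + " posLen=" + posLen.getPositionLength());
            }
            stream.end();
        }
    }
}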