Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class NGramTokenizerFactoryTests, method testPreTokenization:
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used
    // even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder()
        .put("min_gram", 2)
        .put("max_gram", 3)
        .put("token_chars", "letter,digit")
        .build();
    Tokenizer tokenizer = new NGramTokenizerFactory(
        IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g𐐀f "));
    assertTokenStreamContents(tokenizer,
        new String[] { "Åb", "Åbc", "bc", "dé", "déf", "éf", "g𐐀", "g𐐀f", "𐐀f" });
    settings = newAnalysisSettingsBuilder()
        .put("min_gram", 2)
        .put("max_gram", 3)
        .put("token_chars", "letter,digit,punctuation,whitespace,symbol")
        .build();
    tokenizer = new NGramTokenizerFactory(
        IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
        new String[] { " a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9" });
}
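The token_chars setting decides which character classes may occur inside a gram; any other character acts as a boundary before n-grams are generated, which is what "pretokenization" refers to here. A minimal sketch of that behavior built directly on Lucene's NGramTokenizer, overriding isTokenChar (an illustration, not the factory's actual implementation):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenizer;

    // Only letters and digits may appear inside a gram; everything else
    // (here, the spaces) splits the input before n-gramming, so no gram
    // ever straddles a word boundary. isTokenChar receives a full code
    // point, so supplementary characters like 𐐀 are handled correctly.
    Tokenizer ngram = new NGramTokenizer(2, 3) {
        @Override
        protected boolean isTokenChar(int chr) {
            return Character.isLetter(chr) || Character.isDigit(chr);
        }
    };
    ngram.setReader(new StringReader("Åbc déf g𐐀f "));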
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class BaseWordDelimiterTokenFilterFactoryTestCase, method testSplitOnCaseChange:
public void testSplitOnCaseChange() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[] { "PowerShot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
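Because split_on_case_change is false, the case transition inside "PowerShot" is not treated as a word boundary and the token survives intact; with the default of true it would split into "Power" and "Shot". A hedged sketch of the corresponding flag set on Lucene's WordDelimiterGraphFilter (choosing the graph variant is an assumption; the factory under test may wrap a different implementation):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

    Tokenizer ws = new WhitespaceTokenizer();
    ws.setReader(new StringReader("PowerShot"));
    // SPLIT_ON_CASE_CHANGE is deliberately absent from the flag set,
    // mirroring split_on_case_change=false in the test settings.
    int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
    TokenStream stream = new WordDelimiterGraphFilter(ws, flags, null);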
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class BaseWordDelimiterTokenFilterFactoryTestCase, method testCatenateAll:
public void testCatenateAll() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[] { "PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
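Here generate_word_parts and generate_number_parts are both false while catenate_all is true, so only the concatenation of each delimited run is emitted: "500-42" becomes "50042" and "wi-fi-4000" becomes "wifi4000", with none of the individual parts. A hedged sketch of the matching flag set (imports as in the previous sketch; treating Elasticsearch's remaining defaults, including possessive stemming, as enabled is my assumption):

    Tokenizer ws = new WhitespaceTokenizer();
    ws.setReader(new StringReader("PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"));
    // CATENATE_ALL emits the joined run; no GENERATE_* flags are set,
    // so the individual sub-tokens are suppressed. STEM_ENGLISH_POSSESSIVE
    // accounts for "O'Neil's" -> "ONeil" rather than "ONeils".
    int flags = WordDelimiterGraphFilter.CATENATE_ALL
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
    TokenStream stream = new WordDelimiterGraphFilter(ws, flags, null);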
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class BaseWordDelimiterTokenFilterFactoryTestCase, method testStemEnglishPossessive:
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[] {
        "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
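With stem_english_possessive disabled, the trailing "'s" of "O'Neil's" is kept and surfaces as a separate "s" token; under the default of true it would be stripped, leaving only "O" and "Neil". A hedged sketch of the flag set this corresponds to (imports as above; the graph variant is again an assumption):

    Tokenizer ws = new WhitespaceTokenizer();
    ws.setReader(new StringReader("PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"));
    // STEM_ENGLISH_POSSESSIVE is omitted, so the possessive marker
    // survives the split as its own "s" token.
    int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
    TokenStream stream = new WordDelimiterGraphFilter(ws, flags, null);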
Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.
From the class CJKFilterFactoryTests, method testHanOnly:
public void testHanOnly() throws IOException {
    ESTestCase.TestAnalysis analysis =
        AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
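The expected tokens show the "han only" behavior: only ideographic (Han) tokens such as 学生 and 試験 are combined into bigrams, while the hiragana characters pass through as the single-character tokens StandardTokenizer produced. A hedged sketch of the equivalent setup on Lucene's CJKBigramFilter (assuming the RESOURCE config maps cjk_han_only to the HAN flag alone):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.cjk.CJKBigramFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    Tokenizer std = new StandardTokenizer();
    std.setReader(new StringReader("多くの学生が試験に落ちた。"));
    // Restricting the flags to HAN bigrams ideographs only; the hiragana
    // tokens emitted by StandardTokenizer pass through unchanged.
    TokenStream stream = new CJKBigramFilter(std, CJKBigramFilter.HAN);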