Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.
From the class BaseWordDelimiterTokenFilterFactoryTestCase, method testPreserveOriginal:
public void testPreserveOriginal() throws IOException {
    // Build an analysis registry whose word-delimiter filter keeps the original token.
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");

    String input = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    // With preserve_original=true each compound token is emitted verbatim before its split parts.
    String[] expectedTokens = {
        "PowerShot", "Power", "Shot",
        "500-42", "500", "42",
        "wi-fi", "wi", "fi",
        "wi-fi-4000", "wi", "fi", "4000",
        "j2se", "j", "2", "se",
        "O'Neil's", "O", "Neil"
    };

    Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(tokenFilter.create(whitespaceTokenizer), expectedTokens);
}
Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.
From the class MockRepeatAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Standard tokenization, then a mock filter that repeats every emitted token.
    Tokenizer source = new StandardTokenizer();
    return new TokenStreamComponents(source, new MockRepeatFilter(source));
}
Use of org.apache.lucene.analysis.Tokenizer in the languagetool project by languagetool-org.
From the class LanguageToolFilterTest, method testFilter:
public void testFilter() throws Exception {
    String text = "How to?";
    Tokenizer tokenizer = new AnyCharTokenizer();
    tokenizer.setReader(new StringReader(text));
    // Wrap the raw character stream with LanguageTool annotations (lowercasing disabled).
    LanguageToolFilter filter = new LanguageToolFilter(tokenizer, new JLanguageTool(new English()), false);
    // displayTokensWithFullDetails(filter);

    String start = "_POS_SENT_START";
    // Expected stream: each word is followed by its lemma/POS annotation tokens.
    String[] terms = { start, "How", "_LEMMA_how", "_POS_WRB", "to", "_LEMMA_to", "_POS_TO", "_LEMMA_to", "_POS_IN", "?", "_POS_SENT_END" };
    int[] startOffsets = { 0, 0, 0, 0, 4, 4, 4, 4, 4, 6, 6 };
    int[] endOffsets = { 0, 3, 3, 3, 6, 6, 6, 6, 6, 7, 7 };
    String[] types = { "pos", "word", "pos", "pos", "word", "pos", "pos", "pos", "pos", "word", "pos" };
    int[] posIncrements = { 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0 };
    assertTokenStreamContents(filter, terms, startOffsets, endOffsets, types, posIncrements, 7);
}
Use of org.apache.lucene.analysis.Tokenizer in the languagetool project by languagetool-org.
From the class LanguageToolAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String s) {
    // Pass the full character stream through the LanguageTool annotation filter.
    Tokenizer source = new AnyCharTokenizer();
    return new TokenStreamComponents(source, new LanguageToolFilter(source, languageTool, toLowerCase));
}
Use of org.apache.lucene.analysis.Tokenizer in the che project by eclipse.
From the class LuceneSearcher, method makeAnalyzer:
protected Analyzer makeAnalyzer() {
    // Analysis chain: split on whitespace, then lower-case every token.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            return new TokenStreamComponents(source, new LowerCaseFilter(source));
        }
    };
}
Aggregations