Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From class ASCIIFoldingTokenFilterFactoryTests, method testPreserveOriginal.
public void testPreserveOriginal() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[] { "Anspruche", "Ansprüche" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
    // but the multi-term aware component still emits a single token
    tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
    tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    expected = new String[] { "Anspruche" };
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
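The asciifolding factory here wraps Lucene's ASCIIFoldingFilter, whose boolean constructor argument corresponds to the preserve_original setting. A minimal plain-Lucene sketch of the same folding, assuming only lucene-analyzers-common on the classpath (the class name AsciiFoldingSketch is illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AsciiFoldingSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Ansprüche"));
        // true = keep the original token alongside the folded one
        try (TokenStream stream = new ASCIIFoldingFilter(tokenizer, true)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // prints "Anspruche", then "Ansprüche"
            }
            stream.end();
        }
    }
}

The multi-term branch of the test corresponds to the same filter with preservation turned off, which is why only the folded form is emitted for multi-term queries such as prefix or wildcard matching.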
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From class ASCIIFoldingTokenFilterFactoryTests, method testDefault.
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[] { "Anspruche" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From class ShingleTokenFilterFactoryTests, method testDefault.
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle");
    String source = "the quick brown fox";
    String[] expected = new String[] { "the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
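The shingle factory builds on Lucene's ShingleFilter; the output above (unigrams interleaved with bigrams) is what the filter produces at its default shingle size of 2 with unigram output enabled. A sketch under those assumptions (the explicit 2, 2 bounds and the class name are illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDefaultSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown fox"));
        ShingleFilter shingles = new ShingleFilter(tokenizer, 2, 2); // bigram shingles
        shingles.setOutputUnigrams(true); // also keep the single terms, as the default does
        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            System.out.println(term); // the, "the quick", quick, "quick brown", brown, "brown fox", fox
        }
        shingles.end();
        shingles.close();
    }
}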
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From class ShingleTokenFilterFactoryTests, method testFillerToken.
public void testFillerToken() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_filler");
    String source = "simon the sorcerer";
    String[] expected = new String[] { "simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    assertTokenStreamContents(tokenFilter.create(stream), expected);
}
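The filler token fills the position gaps that the StopFilter leaves behind, so shingles still line up across removed stopwords. A plain-Lucene sketch of the idea; the min/max shingle sizes of 2 and 3 are assumptions inferred from the expected trigram, not read from the test's RESOURCE config:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleFillerSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("simon the sorcerer"));
        TokenStream stopped = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
        ShingleFilter shingles = new ShingleFilter(stopped, 2, 3); // assumed bounds, see note above
        shingles.setOutputUnigrams(false);
        shingles.setFillerToken("FILLER"); // stands in for the position the stopword vacated
        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            System.out.println(term); // "simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"
        }
        shingles.end();
        shingles.close();
    }
}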
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From class ShingleTokenFilterFactoryTests, method testInverseMapping.
public void testInverseMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_inverse");
    assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
    String source = "the quick brown fox";
    String[] expected = new String[] { "the_quick_brown", "quick_brown_fox" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
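The inverse mapping joins shingle terms with an underscore and drops unigrams. A sketch using ShingleFilter.setTokenSeparator; the trigram-only bounds are an assumption inferred from the expected output:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleInverseSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown fox"));
        ShingleFilter shingles = new ShingleFilter(tokenizer, 3, 3); // trigrams only, assumed bounds
        shingles.setOutputUnigrams(false);
        shingles.setTokenSeparator("_"); // join shingle terms with an underscore
        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            System.out.println(term); // "the_quick_brown", "quick_brown_fox"
        }
        shingles.end();
        shingles.close();
    }
}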