Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project che by eclipse.
From the class LuceneSearcher, method makeAnalyzer:
protected Analyzer makeAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Split on whitespace only, then lowercase each token.
            Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenStream filter = new LowerCaseFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
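For reference, here is a minimal standalone sketch (not part of the Che sources) showing what this analyzer emits: whitespace-split, lowercased tokens. The package locations assume Lucene 7.x or later, where LowerCaseFilter lives in org.apache.lucene.analysis; in Lucene 6.x and earlier it is org.apache.lucene.analysis.core.LowerCaseFilter.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceLowercaseDemo {
    public static void main(String[] args) throws IOException {
        // Same components as makeAnalyzer() above.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
            }
        };
        try (TokenStream stream = analyzer.tokenStream("content", "Find My-File.TXT now")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Prints: find, my-file.txt, now -- split on whitespace only, then lowercased.
                System.out.println(term);
            }
            stream.end();
        }
    }
}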
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From the class WordDelimiterTokenFilterFactoryTests, method testPartsAndCatenate:
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[] { "Power", "PowerShot", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
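The same parts-and-catenate behavior can be sketched with plain Lucene, bypassing the Elasticsearch factory. This is an illustration under assumptions, not the test's own code: it uses WordDelimiterGraphFilter (Lucene 6.5+), whose token ordering may differ from the older word_delimiter filter exercised above, and the demo class name is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PartsAndCatenateDemo {
    public static void main(String[] args) throws IOException {
        // Equivalent of generate_word_parts=true and catenate_words=true above.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        try (TokenStream stream = new WordDelimiterGraphFilter(tokenizer, flags, null)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Emits the parts Power and Shot plus the catenated PowerShot.
                System.out.println(term);
            }
            stream.end();
        }
    }
}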
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From the class CommonGramsTokenFilterFactoryTests, method testWithoutCommonWordsMatch:
public void testWithoutCommonWordsMatch() throws IOException {
    {
        Settings settings = Settings.builder()
            .put("index.analysis.filter.common_grams_default.type", "common_grams")
            .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        {
            TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_default");
            String source = "the quick brown is a fox Or noT";
            // None of the configured common words appear, so tokens pass through unchanged.
            String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(source));
            assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
        }
    }
    {
        // Same expectation with query_mode explicitly set to false.
        Settings settings = Settings.builder()
            .put("index.analysis.filter.common_grams_default.type", "common_grams")
            .put("index.analysis.filter.common_grams_default.query_mode", false)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
            .build();
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        {
            TokenFilterFactory tokenFilter = analysis.tokenFilter.get("common_grams_default");
            String source = "the quick brown is a fox Or noT";
            String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(source));
            assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
        }
    }
}
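Under the hood, common_grams is backed by Lucene's CommonGramsFilter, which only forms bigrams around the configured common words. Here is a standalone sketch (an assumption-level illustration, not the Elasticsearch factory) of why the output above is unchanged: neither "chromosome" nor "protein" occurs in the text, so no grams are created.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CommonGramsDemo {
    public static void main(String[] args) throws IOException {
        CharArraySet commonWords =
            new CharArraySet(Arrays.asList("chromosome", "protein"), false);
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown is a fox Or noT"));
        try (TokenStream stream = new CommonGramsFilter(tokenizer, commonWords)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // No common word matches, so the output equals the plain
                // whitespace tokens: the quick brown is a fox Or noT
                System.out.println(term);
            }
            stream.end();
        }
    }
}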
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From the class MinHashFilterFactoryTests, method testDefault:
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
        default_hash_count * default_bucket_size * default_hash_set_size);
}
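The rotation behavior can also be sketched directly against Lucene's org.apache.lucene.analysis.minhash.MinHashFilter. This assumes its (input, hashCount, bucketCount, hashSetSize, withRotation) constructor; the demo class is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.minhash.MinHashFilter;

public class MinHashRotationDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown fox"));
        // hash_count=1, bucket_count=512, hash_set_size=1, with_rotation=true,
        // matching the defaults exercised by the test above.
        try (TokenStream stream = new MinHashFilter(tokenizer, 1, 512, 1, true)) {
            stream.reset();
            int count = 0;
            while (stream.incrementToken()) {
                count++;
            }
            stream.end();
            System.out.println(count); // 512: rotation copies hashes into empty buckets
        }
    }
}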
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.
From the class MinHashFilterFactoryTests, method testSettings:
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
        .put("index.analysis.filter.test_min_hash.type", "min_hash")
        .put("index.analysis.filter.test_min_hash.hash_count", "1")
        .put("index.analysis.filter.test_min_hash.bucket_count", "2")
        .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
        .put("index.analysis.filter.test_min_hash.with_rotation", false)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
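Flipping with_rotation off in the same sketch mirrors this test: the single input token hashes into one bucket, and without rotation the other bucket emits nothing. Imports and constructor assumptions as in the previous sketch.

Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("sushi"));
// hash_count=1, bucket_count=2, hash_set_size=1, with_rotation=false
try (TokenStream stream = new MinHashFilter(tokenizer, 1, 2, 1, false)) {
    stream.reset();
    int count = 0;
    while (stream.incrementToken()) {
        count++;
    }
    stream.end();
    System.out.println(count); // 1: empty buckets stay empty without rotation
}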