Use of org.apache.lucene.analysis.StopFilter in project elasticsearch by elastic.
The class PatternAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new PatternTokenizer(pattern, -1);
    TokenStream stream = tokenizer;
    if (lowercase) {
        stream = new LowerCaseFilter(stream);
    }
    if (stopWords != null) {
        stream = new StopFilter(stream, stopWords);
    }
    return new TokenStreamComponents(tokenizer, stream);
}
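For context, a minimal sketch of consuming the stream such an analyzer builds; the field name "body", the sample text, and the assumption that lowercasing is enabled and "the" is in the stop set are illustrative, not part of the original:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch only: `analyzer` stands for an instance of the PatternAnalyzer above.
static void printTokens(Analyzer analyzer) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("body", "The Quick Fox")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // With lowercase=true and "the" as a stop word, prints "quick" then "fox".
            System.out.println(term.toString());
        }
        ts.end();
    }
}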
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.
The class TestFreeTextSuggester, method testEndingHole.
// With one ending hole, ShingleFilter produces "of _" and
// we should properly predict from that:
public void testEndingHole() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer();
            CharArraySet stopSet = StopFilter.makeStopSet("of");
            return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
        }
    };
    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of oz", 50));
    FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
    sug.build(new InputArrayIterator(keys));
    assertEquals("wizard _ oz/1.00", toString(sug.lookup("wizard of", 10)));
    // Falls back to unigram model, with backoff 0.4 times
    // prob 0.5:
assertEquals("oz/0.20", toString(sug.lookup("wizard o", 10)));
a.close();
}
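The "hole" the comment refers to is visible in the token stream itself: StopFilter removes the stop word but, by default, adds its position to the increment of the following token. A minimal sketch of inspecting that, assuming `a` is the analyzer defined in the test above and "f" is an arbitrary field name:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

static void printPositions(Analyzer a) throws IOException {
    try (TokenStream ts = a.tokenStream("f", "wizard of oz")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Prints "wizard +1" then "oz +2"; the increment of 2 is the hole
            // left where "of" was dropped, which the suggester's internal
            // ShingleFilter later fills with "_".
            System.out.println(term + " +" + posInc.getPositionIncrement());
        }
        ts.end();
    }
}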
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.
The class TestFreeTextSuggester, method testTwoEndingHoles.
// If the number of ending holes exceeds the ngrams window
// then there are no predictions, because ShingleFilter
// does not produce e.g. a hole-only "_ _" token:
public void testTwoEndingHoles() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer();
            CharArraySet stopSet = StopFilter.makeStopSet("of");
            return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
        }
    };
    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of of oz", 50));
    FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
    sug.build(new InputArrayIterator(keys));
    assertEquals("", toString(sug.lookup("wizard of of", 10)));
    a.close();
}
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.
The class AnalyzingInfixSuggesterTest, method testSuggestStopFilter.
public void testSuggestStopFilter() throws Exception {
    final CharArraySet stopWords = StopFilter.makeStopSet("a");
    Analyzer indexAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokens = new MockTokenizer();
            return new TokenStreamComponents(tokens, new StopFilter(tokens, stopWords));
        }
    };
    Analyzer queryAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokens = new MockTokenizer();
            return new TokenStreamComponents(tokens, new SuggestStopFilter(tokens, stopWords));
        }
    };
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), indexAnalyzer, queryAnalyzer, 3, false);
    Input[] keys = new Input[] { new Input("a bob for apples", 10, new BytesRef("foobaz")) };
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("a", random()), 10, true, true);
    assertEquals(1, results.size());
    assertEquals("a bob for apples", results.get(0).key);
    assertEquals("a bob for <b>a</b>pples", results.get(0).highlightKey);
    // IOUtils.close closes the suggester and both analyzers.
    IOUtils.close(suggester, indexAnalyzer, queryAnalyzer);
}
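The contrast between the two analyzers is the point of the test: at index time StopFilter drops "a" outright, while at query time SuggestStopFilter keeps a trailing stop word when no separator follows it, since it may be the prefix of a word the user is still typing ("a" matching "apples"). A minimal sketch of that query-side difference; the helper method and inputs are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;

static void compareFilters() throws IOException {
    CharArraySet stopWords = StopFilter.makeStopSet("a");

    MockTokenizer t1 = new MockTokenizer();
    t1.setReader(new StringReader("a"));
    printAll(new SuggestStopFilter(t1, stopWords)); // prints "a": trailing stop word kept

    MockTokenizer t2 = new MockTokenizer();
    t2.setReader(new StringReader("a"));
    printAll(new StopFilter(t2, stopWords));        // prints nothing: stop word removed
}

static void printAll(TokenStream ts) throws IOException {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);
    }
    ts.end();
    ts.close();
}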
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.
The class SmartChineseAnalyzer, method createComponents.
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new HMMChineseTokenizer();
    TokenStream result = tokenizer;
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The Porter stemming is too strict; this is not a bug, this is a feature :)
    result = new PorterStemFilter(result);
    if (!stopWords.isEmpty()) {
        result = new StopFilter(result, stopWords);
    }
    return new TokenStreamComponents(tokenizer, result);
}
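A minimal usage sketch, assuming the no-argument constructor (which loads the analyzer's built-in stop word list); the field name and sample text are illustrative:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static void demo() throws IOException {
    try (Analyzer smart = new SmartChineseAnalyzer()) { // default built-in stop words
        try (TokenStream ts = smart.tokenStream("body", "我喜欢读书")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // segmented words, with stop words filtered out
            }
            ts.end();
        }
    }
}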