Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache: the class BrazilianAnalyzer, method createComponents.
/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link StandardFilter},
 *         {@link StopFilter}, and {@link BrazilianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new StandardTokenizer();
  TokenStream result = new LowerCaseFilter(source);
  result = new StandardFilter(result);
  result = new StopFilter(result, stopwords);
  if (excltable != null && !excltable.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, excltable);
  }
  return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
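For context, here is a minimal usage sketch (not from the source) showing how the stream this method builds is consumed through the standard TokenStream contract; the field name "body" and the sample text are arbitrary, and imports are omitted to match the snippets above.

try (Analyzer analyzer = new BrazilianAnalyzer();
     TokenStream ts = analyzer.tokenStream("body", "As meninas correram")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  // reset() is mandatory before the first incrementToken() call
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term.toString()); // lowercased, stop-filtered, stemmed terms
  }
  ts.end();
}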
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache: the class LatvianAnalyzer, method createComponents.
/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
 *         provided), and {@link LatvianStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopwords);
  if (!stemExclusionSet.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  }
  result = new LatvianStemFilter(result);
  return new TokenStreamComponents(source, result);
}
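The stem exclusion branch above is driven by the analyzer's constructor. As a hedged sketch (assuming the conventional two-argument constructor that Lucene's language analyzers expose, taking a stop-word set and a stem exclusion set), terms placed in the exclusion set are marked as keywords and bypass LatvianStemFilter:

// Entries are matched after LowerCaseFilter has run, so either use lowercase
// forms or build the set case-insensitively, as here.
CharArraySet exclusions = new CharArraySet(Arrays.asList("Rīga"), true);
Analyzer analyzer =
    new LatvianAnalyzer(LatvianAnalyzer.getDefaultStopSet(), exclusions);
// "Rīga" now passes through unstemmed; all other terms are stemmed.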
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache: the class ShingleAnalyzerWrapperTest, method testAltFillerToken.
public void testAltFillerToken() throws Exception {
  Analyzer delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
      new int[] { 0, 0, 7, 7, 19, 19 },
      new int[] { 6, 13, 13, 19, 27, 27 },
      new int[] { 1, 0, 1, 0, 1, 1 });
  analyzer.close();

  delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please divide", "divide ", " shingles" },
      new int[] { 0, 7, 19 },
      new int[] { 13, 19, 27 },
      new int[] { 1, 1, 1 });
  analyzer.close();

  delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please divide", "divide ", " shingles" },
      new int[] { 0, 7, 19 },
      new int[] { 13, 19, 27 },
      new int[] { 1, 1, 1 });
  analyzer.close();
}
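To make the positional arguments in the calls above easier to read, here is the first construction again with each parameter annotated (the comment labels are descriptive, taken from the constructor's documented parameter meanings, not from the source snippet):

ShingleAnalyzerWrapper wrapper = new ShingleAnalyzerWrapper(
    delegate,
    ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,  // 2: smallest shingle emitted
    ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,  // 2: largest shingle emitted
    ShingleFilter.DEFAULT_TOKEN_SEPARATOR,   // " ": glue between shingle parts
    true,                                    // outputUnigrams: also emit single tokens
    false,                                   // outputUnigramsIfNoShingles
    "--");                                   // fillerToken marking removed stopwords

The last argument explains the three assertions: with "--" the stopword position shows up inside shingles as "divide --" and "-- shingles", while null and "" leave only the separator behind.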
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache: the class TestWithCJKBigramFilter, method setUp.
@Override
public void setUp() throws Exception {
  super.setUp();
  /*
   * ICUTokenizer + CJKBigramFilter
   */
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new ICUTokenizer(newAttributeFactory(),
          new DefaultICUTokenizerConfig(false, true));
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
    }
  };
  /*
   * ICUTokenizer + ICUNormalizer2Filter + CJKBigramFilter.
   *
   * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a
   * language-independent superset of CJKWidthFilter's foldings.
   */
  analyzer2 = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new ICUTokenizer(newAttributeFactory(),
          new DefaultICUTokenizerConfig(false, true));
      // We put this before the CJKBigramFilter, because the normalization might
      // combine some halfwidth katakana forms, which will affect the bigramming.
      TokenStream result = new ICUNormalizer2Filter(source);
      result = new CJKBigramFilter(result);
      return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
    }
  };
}
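As an illustrative (unverified) check of what the first analyzer emits: with the UAX#29 configuration used here, Han characters come out of ICUTokenizer one per token, and CJKBigramFilter's default configuration then joins adjacent Han tokens into overlapping bigrams; the empty-set StopFilter removes nothing. Using the assertAnalyzesTo helper inherited from BaseTokenStreamTestCase, that would look roughly like:

// Expected tokens are illustrative of the bigram behavior, not copied from the test.
assertAnalyzesTo(analyzer, "中华人民共和国",
    new String[] { "中华", "华人", "人民", "民共", "共和", "和国" });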
Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache: the class JapaneseAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, mode);
  TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
  stream = new JapanesePartOfSpeechStopFilter(stream, stoptags);
  stream = new CJKWidthFilter(stream);
  stream = new StopFilter(stream, stopwords);
  stream = new JapaneseKatakanaStemFilter(stream);
  stream = new LowerCaseFilter(stream);
  return new TokenStreamComponents(tokenizer, stream);
}
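The stopwords consumed by the StopFilter stage above arrive through the analyzer's constructor. A hedged sketch, using the public four-argument constructor (the stop words chosen are arbitrary examples, not defaults from the source):

CharArraySet stopwords = StopFilter.makeStopSet("これ", "それ"); // arbitrary examples
Analyzer analyzer = new JapaneseAnalyzer(
    null,                                    // no user dictionary
    JapaneseTokenizer.Mode.SEARCH,           // decompounds long compound nouns
    stopwords,                               // fed to the StopFilter stage above
    JapaneseAnalyzer.getDefaultStopTags());  // default part-of-speech stop tags

Note that in this chain StopFilter runs before LowerCaseFilter, so stop-word entries are matched against terms that have not yet been lowercased.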