Use of org.apache.lucene.analysis.core.StopFilter in project Vidyavana by borsosl.
The class HtmlAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Project-specific tokenizer that normalizes transliterated text.
    Tokenizer tokenizer = new TransliterationTokenizer();
    // Drop the Hungarian articles "a"/"az" and the conjunction "és";
    // ignoreCase = false makes the stop set case-sensitive.
    TokenStream filter = new StopFilter(tokenizer, new CharArraySet(Arrays.asList("a", "az", "és"), false));
    filter = new TransliterationSynonymFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
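To inspect what a chain like this emits, the analyzer can be driven directly through its token stream. A minimal sketch, assuming any Analyzer instance (such as the HtmlAnalyzer above); the field name "content" and the helper name printTokens are arbitrary choices, not part of the project:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Prints each token an Analyzer produces for the given text.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream stream = analyzer.tokenStream("content", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end(); // finalize end-of-stream state
    }
}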
Use of org.apache.lucene.analysis.core.StopFilter in project vertigo by KleeGroup.
The class DefaultAnalyzer, method createComponents:
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    // -----
    /* strip elisions (contracted French articles such as l', d') */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove articles and adjectives (stop words) */
    filter = new StopFilter(filter, stopWords);
    /* strip accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
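LuceneConstants.ELISION_ARTICLES and the stopWords field are defined elsewhere in vertigo; the sketch below uses plausible, hypothetical values only to make the chain's intent concrete — contracted articles are stripped from their host word, then common French function words are dropped:

import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet; // in Lucene 7+, org.apache.lucene.analysis.CharArraySet

// Hypothetical values; the real lists live in vertigo's LuceneConstants
// and the analyzer's stopWords field.
static final String[] ELISION_ARTICLES = { "l", "m", "t", "qu", "n", "s", "j", "d", "c" };
static final CharArraySet stopWords =
        new CharArraySet(Arrays.asList("le", "la", "les", "de", "des", "un", "une"), true);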
Use of org.apache.lucene.analysis.core.StopFilter in project neo4j by neo4j.
The class StandardFoldingAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    // Fold accented characters to ASCII before lowercasing, so the
    // stop-word matching below sees fully normalized terms.
    TokenStream tok = new ASCIIFoldingFilter(src);
    tok = new LowerCaseFilter(tok);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok);
}
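For reference, a self-contained version of the same chain. This is a sketch, not neo4j's code: the stop set is assumed to be Lucene's English defaults rather than neo4j's stopwords field, and the token length limit is assumed to be the 255 used by StandardAnalyzer:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Standalone sketch of a folding analyzer in the same spirit.
public class FoldingAnalyzerSketch extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(255); // assumed; StandardAnalyzer's default
        TokenStream tok = new ASCIIFoldingFilter(src);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, EnglishAnalyzer.getDefaultStopSet());
        return new TokenStreamComponents(src, tok);
    }
}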
Use of org.apache.lucene.analysis.core.StopFilter in project lucene-skos by behas.
The class SKOSAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    if (expansionType.equals(ExpansionType.URI)) {
        // URI expansion: the whole input is a single concept URI token.
        final KeywordTokenizer src = new KeywordTokenizer();
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        // Label expansion: standard tokenization plus SKOS label lookup.
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);
        // StandardFilter applies the standard post-tokenization normalization for us.
        TokenStream tok = new StandardFilter(src);
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the token length limit each time the reader is reset.
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
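The URI branch hinges on KeywordTokenizer emitting the entire input as one token, so a concept URI is never split before SKOSURIFilter sees it. A quick sketch of that behavior using only plain Lucene classes; the URI is an arbitrary example:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Demonstrates that KeywordTokenizer passes its input through as one token.
public class KeywordTokenizerDemo {
    public static void main(String[] args) throws IOException {
        KeywordTokenizer tok = new KeywordTokenizer();
        tok.setReader(new StringReader("http://example.org/concept/123"));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            System.out.println(term); // prints the full URI as a single token
        }
        tok.end();
        tok.close();
    }
}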
Use of org.apache.lucene.analysis.core.StopFilter in project cogcomp-nlp by CogComp.
The class ASCIIEnglishAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // Fold accents to ASCII, then strip English possessives ('s).
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    // Split on intra-word delimiters (WordDelimiterFilter.ALPHA flag, no protected words).
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
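Putting the chain end to end, the printTokens helper sketched earlier can show which terms survive folding, possessive stripping, lowercasing, stop removal, and Porter stemming. The sample sentence and expected output are illustrative, not taken from the project's tests:

// Illustrative usage; exact output may vary with the Lucene version.
printTokens(new ASCIIEnglishAnalyzer(), "The café's users are running quickly");
// Roughly expected tokens: cafe, user, run, quickli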