Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class GreekAnalyzer, method createComponents:
/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link GreekLowerCaseFilter}, {@link StandardFilter},
 *         {@link StopFilter}, and {@link GreekStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new GreekLowerCaseFilter(source);
    result = new StandardFilter(result);
    result = new StopFilter(result, stopwords);
    result = new GreekStemFilter(result);
    return new TokenStreamComponents(source, result);
}
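For reference, every override on this page is consumed the same way; a minimal sketch using the GreekAnalyzer above (the field name "body" and the sample text are illustrative, not from the Lucene sources):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GreekAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new GreekAnalyzer()) {
            // tokenStream() builds (or reuses) the chain from createComponents()
            try (TokenStream stream = analyzer.tokenStream("body", "Τα κείμενα στα ελληνικά")) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                stream.reset();                          // mandatory before incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(term.toString()); // lowercased, stopped, stemmed terms
                }
                stream.end();                            // finalize offsets
            }
        }
    }
}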
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project jackrabbit-oak by apache.
The class OakAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    TokenStream tok = new LowerCaseFilter(matchVersion, src);
    tok = new WordDelimiterFilter(tok,
            WordDelimiterFilter.GENERATE_WORD_PARTS
                    | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
                    | this.INDEX_ORIGINAL_TERM
                    | WordDelimiterFilter.GENERATE_NUMBER_PARTS,
            null);
    return new TokenStreamComponents(src, tok);
}
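A self-contained sketch of the same chain outside OakAnalyzer, against the Lucene 4.x API this snippet targets (Version.LUCENE_47 and the sample text are assumptions; this.INDEX_ORIGINAL_TERM is an Oak instance field, so it is left out here):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class OakChainDemo {
    public static void main(String[] args) throws IOException {
        Version v = Version.LUCENE_47; // assumed match version
        // StandardTokenizer keeps "foo_bar" and "O'Neil's" whole; the
        // WordDelimiterFilter is what splits them further.
        StandardTokenizer src = new StandardTokenizer(v, new StringReader("foo_bar O'Neil's"));
        TokenStream tok = new LowerCaseFilter(v, src);
        tok = new WordDelimiterFilter(tok,
                WordDelimiterFilter.GENERATE_WORD_PARTS
                        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
                        | WordDelimiterFilter.GENERATE_NUMBER_PARTS,
                null);
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            System.out.println(term.toString()); // expected: "foo", "bar", "o", "neil"
        }
        tok.end();
        tok.close();
    }
}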
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project Krill by KorAP.
The class TextAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream sink = new LowerCaseFilter(source);
    return new TokenStreamComponents(source, sink);
}
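The returned pair makes the reuse contract explicit: Lucene keeps the source so it can swap in new input via setReader() while the filter chain stays intact. A standalone sketch driving the same chain by hand (Lucene 8+ package names and the sample text are assumptions):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ManualChainDemo {
    public static void main(String[] args) throws IOException {
        final Tokenizer source = new StandardTokenizer();
        final TokenStream sink = new LowerCaseFilter(source);
        // This is the call Lucene makes on reuse: new input, same chain.
        source.setReader(new StringReader("Der alte MANN"));
        CharTermAttribute term = sink.addAttribute(CharTermAttribute.class);
        sink.reset();
        while (sink.incrementToken()) {
            System.out.println(term.toString()); // "der", "alte", "mann"
        }
        sink.end();
        sink.close();
    }
}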
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project vertigo by KleeGroup.
The class DefaultAnalyzer, method createComponents:
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    // -----
    /* strip elisions (l', d', ...) */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove articles and other stop words */
    filter = new StopFilter(filter, stopWords);
    /* strip accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
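A sketch of the elision step in isolation, since it is the French-specific part of this chain (the article list below is a hypothetical stand-in for the project-specific LuceneConstants.ELISION_ARTICLES; Lucene 7/8-style packages assumed):

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;

public class ElisionDemo {
    public static void main(String[] args) throws IOException {
        // assumed article list; case-insensitive matching
        CharArraySet articles = new CharArraySet(Arrays.asList("l", "d", "j", "qu"), true);
        Tokenizer source = new StandardTokenizer();
        // StandardTokenizer keeps "l'avion" as one token; the filter strips the article
        source.setReader(new StringReader("l'avion d'essai"));
        TokenStream stream = new ElisionFilter(source, articles);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // expected: "avion", "essai"
        }
        stream.end();
        stream.close();
    }
}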
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project neo4j by neo4j.
The class StandardFoldingAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    TokenStream tok = new ASCIIFoldingFilter(src);
    tok = new LowerCaseFilter(tok);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok);
}
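A sketch of what the folding steps produce on accented input (the sample text is illustrative, the stop filter is omitted for brevity, and DEFAULT_MAX_TOKEN_LENGTH is assumed to be StandardAnalyzer's 255-character default):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FoldingDemo {
    public static void main(String[] args) throws IOException {
        StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); // 255
        src.setReader(new StringReader("Café Zürich"));
        TokenStream tok = new ASCIIFoldingFilter(src); // fold accents to ASCII first...
        tok = new LowerCaseFilter(tok);                // ...then lowercase
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            System.out.println(term.toString()); // "cafe", "zurich"
        }
        tok.end();
        tok.close();
    }
}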