Search in sources :

Example 26 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.

From the class GreekAnalyzer, in the method createComponents.

/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link GreekLowerCaseFilter}, {@link StandardFilter},
 *         {@link StopFilter}, and {@link GreekStemFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Standard grammar-based tokenization, then the Greek-specific chain:
    // lowercase -> standard post-processing -> stop-word removal -> stemming.
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream chain =
        new GreekStemFilter(
            new StopFilter(
                new StandardFilter(
                    new GreekLowerCaseFilter(tokenizer)), stopwords));
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer)

Example 27 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project jackrabbit-oak by apache.

From the class OakAnalyzer, in the method createComponents.

/**
 * Builds the analysis chain: a {@link StandardTokenizer} followed by
 * lowercasing and word-delimiter splitting.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Combine the delimiter-filter configuration flags up front for readability.
    final int delimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
            | this.INDEX_ORIGINAL_TERM
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS;
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new WordDelimiterFilter(stream, delimiterFlags, null);
    return new TokenStreamComponents(tokenizer, stream);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 28 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project Krill by KorAP.

From the class TextAnalyzer, in the method createComponents.

/**
 * Creates the token stream for text fields: standard tokenization
 * followed by lowercasing.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter) TokenStreamComponents(org.apache.lucene.analysis.Analyzer.TokenStreamComponents)

Example 29 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project vertigo by KleeGroup.

From the class DefaultAnalyzer, in the method createComponents.

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    // -----
    /* strip elisions (e.g. French l', d') listed in LuceneConstants.ELISION_ARTICLES */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove stop words (articles, adjectives, ...) */
    filter = new StopFilter(filter, stopWords);
    /* fold accented characters to their ASCII equivalents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 30 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project neo4j by neo4j.

From the class StandardFoldingAnalyzer, in the method createComponents.

/**
 * Assembles the token stream: standard tokenization (tokens capped at
 * DEFAULT_MAX_TOKEN_LENGTH), ASCII folding, lowercasing, and stop-word
 * removal.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    final TokenStream chain =
            new StopFilter(new LowerCaseFilter(new ASCIIFoldingFilter(tokenizer)), stopwords);
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)82 Tokenizer (org.apache.lucene.analysis.Tokenizer)68 TokenStream (org.apache.lucene.analysis.TokenStream)57 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)43 StopFilter (org.apache.lucene.analysis.StopFilter)43 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)36 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)35 StringReader (java.io.StringReader)18 SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter)16 Analyzer (org.apache.lucene.analysis.Analyzer)10 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)10 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)7 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)6 ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter)6 DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter)5 StopFilter (org.apache.lucene.analysis.core.StopFilter)5 ESTestCase (org.elasticsearch.test.ESTestCase)5 HashMap (java.util.HashMap)4 TokenFilter (org.apache.lucene.analysis.TokenFilter)4 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)4