Search in sources:

Example 6 with SnowballFilter

use of org.apache.lucene.analysis.snowball.SnowballFilter in project elasticsearch by elastic.

The createComponents method of the class SnowballAnalyzer.

/**
 * Builds the analysis chain for this analyzer: a {@link StandardTokenizer}
 * whose output is passed, in order, through
 * <ol>
 *   <li>an {@link EnglishPossessiveFilter}, only when the stemmer name is
 *       "English", "Porter" or "Lovins";</li>
 *   <li>a {@link TurkishLowerCaseFilter} when the stemmer name is "Turkish",
 *       otherwise a {@link LowerCaseFilter};</li>
 *   <li>a {@link StopFilter}, only when a stop set has been configured;</li>
 *   <li>a {@link SnowballFilter} created from the stemmer name.</li>
 * </ol>
 *
 * @param fieldName name of the field being analyzed; not consulted here
 * @return the tokenizer together with the last filter of the chain
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = tokenizer;

    // The English-family stemmers need the possessive 's stripped first.
    final boolean englishFamily = name.equals("English") || name.equals("Porter") || name.equals("Lovins");
    if (englishFamily) {
        chain = new EnglishPossessiveFilter(chain);
    }

    // The Turkish stemmer expects Turkish-specific lowercasing.
    chain = name.equals("Turkish") ? new TurkishLowerCaseFilter(chain) : new LowerCaseFilter(chain);

    // Stop-word removal is optional: skipped when no stop set was supplied.
    if (stopSet != null) {
        chain = new StopFilter(chain, stopSet);
    }

    return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, name));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter)

Example 7 with SnowballFilter

use of org.apache.lucene.analysis.snowball.SnowballFilter in project lucene-solr by apache.

The createComponents method of the class SwedishAnalyzer.

/**
 * Assembles the token stream pipeline used by this analyzer.
 *
 * <p>The chain is a {@link StandardTokenizer} followed by
 * {@link StandardFilter}, {@link LowerCaseFilter} and a {@link StopFilter}
 * over this analyzer's stopwords. A {@link SetKeywordMarkerFilter} is
 * inserted only when the stem exclusion set is non-empty, and a
 * {@link SnowballFilter} backed by a {@link SwedishStemmer} always comes last.
 *
 * @param fieldName name of the field being analyzed; not consulted here
 * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         pairing the tokenizer with the final filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();

    // Unconditional prefix of the chain: standard -> lowercase -> stop words.
    TokenStream chain = new StopFilter(new LowerCaseFilter(new StandardFilter(source)), stopwords);

    // The keyword marker is only added when there is something to exclude.
    final boolean hasStemExclusions = !stemExclusionSet.isEmpty();
    if (hasStemExclusions) {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new SwedishStemmer());
    return new TokenStreamComponents(source, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) SwedishStemmer(org.tartarus.snowball.ext.SwedishStemmer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 8 with SnowballFilter

use of org.apache.lucene.analysis.snowball.SnowballFilter in project lucene-solr by apache.

The testBasic method of the class TestKeywordRepeatFilter.

/**
 * Runs "the birds are flying" through a KeywordRepeatFilter, an "English"
 * SnowballFilter and a RemoveDuplicatesTokenFilter, then checks the
 * resulting tokens.
 */
public void testBasic() throws IOException {
    // Build the chain from the innermost stage outwards.
    TokenStream repeated = new KeywordRepeatFilter(whitespaceMockTokenizer("the birds are flying"));
    TokenStream stemmed = new SnowballFilter(repeated, "English");
    TokenStream ts = new RemoveDuplicatesTokenFilter(stemmed);

    String[] expectedTerms = { "the", "birds", "bird", "are", "flying", "fli" };
    // NOTE(review): presumably the expected position increments — confirm
    // against the assertTokenStreamContents overload being called.
    int[] expectedIncrements = { 1, 1, 0, 1, 1, 0 };
    assertTokenStreamContents(ts, expectedTerms, expectedIncrements);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter)

Example 9 with SnowballFilter

use of org.apache.lucene.analysis.snowball.SnowballFilter in project lucene-solr by apache.

The testComposition method of the class TestKeywordRepeatFilter.

/**
 * Same as the single-filter case but with two KeywordRepeatFilters stacked
 * before the "English" SnowballFilter; the expected tokens are unchanged.
 */
public void testComposition() throws IOException {
    // Build the chain from the innermost stage outwards.
    TokenStream repeatedOnce = new KeywordRepeatFilter(whitespaceMockTokenizer("the birds are flying"));
    TokenStream repeatedTwice = new KeywordRepeatFilter(repeatedOnce);
    TokenStream stemmed = new SnowballFilter(repeatedTwice, "English");
    TokenStream ts = new RemoveDuplicatesTokenFilter(stemmed);

    String[] expectedTerms = { "the", "birds", "bird", "are", "flying", "fli" };
    // NOTE(review): presumably the expected position increments — confirm
    // against the assertTokenStreamContents overload being called.
    int[] expectedIncrements = { 1, 1, 0, 1, 1, 0 };
    assertTokenStreamContents(ts, expectedTerms, expectedIncrements);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter)

Example 10 with SnowballFilter

use of org.apache.lucene.analysis.snowball.SnowballFilter in project lucene-solr by apache.

The createComponents method of the class IrishAnalyzer.

/**
 * Assembles the token stream pipeline used by this analyzer.
 *
 * <p>The chain is a {@link StandardTokenizer} followed, in order, by
 * {@link StandardFilter}, a {@link StopFilter} over {@code HYPHENATIONS},
 * an {@link ElisionFilter} over {@code DEFAULT_ARTICLES}, an
 * {@link IrishLowerCaseFilter} and a second {@link StopFilter} over this
 * analyzer's stopwords. A {@link SetKeywordMarkerFilter} is inserted only
 * when the stem exclusion set is non-empty, and a {@link SnowballFilter}
 * backed by an {@link IrishStemmer} always comes last.
 *
 * @param fieldName name of the field being analyzed; not consulted here
 * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         pairing the tokenizer with the final filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();

    // Stages that run before lowercasing: hyphenation stop filter, then elision.
    TokenStream chain = new StopFilter(new StandardFilter(source), HYPHENATIONS);
    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);

    // Lowercase, then drop the configured stop words.
    chain = new StopFilter(new IrishLowerCaseFilter(chain), stopwords);

    // The keyword marker is only added when there is something to exclude.
    final boolean hasStemExclusions = !stemExclusionSet.isEmpty();
    if (hasStemExclusions) {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new IrishStemmer());
    return new TokenStreamComponents(source, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) IrishStemmer(org.tartarus.snowball.ext.IrishStemmer)

Aggregations

SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter)18 TokenStream (org.apache.lucene.analysis.TokenStream)17 Tokenizer (org.apache.lucene.analysis.Tokenizer)16 StopFilter (org.apache.lucene.analysis.StopFilter)15 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)15 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)14 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)14 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)13 ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter)2 LithuanianStemmer (org.tartarus.snowball.ext.LithuanianStemmer)2 Analyzer (org.apache.lucene.analysis.Analyzer)1 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)1 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)1 StemmerOverrideFilter (org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter)1 TurkishLowerCaseFilter (org.apache.lucene.analysis.tr.TurkishLowerCaseFilter)1 ArmenianStemmer (org.tartarus.snowball.ext.ArmenianStemmer)1 BasqueStemmer (org.tartarus.snowball.ext.BasqueStemmer)1 CatalanStemmer (org.tartarus.snowball.ext.CatalanStemmer)1 DanishStemmer (org.tartarus.snowball.ext.DanishStemmer)1 FinnishStemmer (org.tartarus.snowball.ext.FinnishStemmer)1