
Example 41 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From the class BrazilianAnalyzer, the createComponents method:

/**
   * Creates
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   * 
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link BrazilianStemFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new StandardFilter(result);
    result = new StopFilter(result, stopwords);
    if (excltable != null && !excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
    return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)
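
The analyzer wires this chain together internally; to observe its output you drive it with the standard TokenStream consumer protocol. A minimal sketch, assuming lucene-analyzers-common is on the classpath (the field name and sample text are arbitrary):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BrazilianAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        // The no-arg constructor uses the default Brazilian stop set.
        try (Analyzer analyzer = new BrazilianAnalyzer()) {
            try (TokenStream ts = analyzer.tokenStream("body", "Ela criou um exemplo de análise")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // Mandatory lifecycle: reset(), incrementToken() loop, end().
                ts.reset();
                while (ts.incrementToken()) {
                    // Stop words such as "um" and "de" should be gone; the
                    // remaining terms arrive lower-cased and stemmed.
                    System.out.println(term.toString());
                }
                ts.end();
            }
        }
    }
}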

Example 42 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From the class LatvianAnalyzer, the createComponents method:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   * 
* @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link LatvianStemFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new LatvianStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)
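
The SetKeywordMarkerFilter step only runs when a stem exclusion set was supplied; marked terms pass through LatvianStemFilter untouched. A hedged sketch of constructing the analyzer with such a set (the excluded term is an arbitrary example; note that LowerCaseFilter runs before the marker filter, so entries should be lower-case):

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;

// "rīga" will be marked as a keyword and skipped by LatvianStemFilter.
CharArraySet stemExclusions = new CharArraySet(Arrays.asList("rīga"), false);
LatvianAnalyzer analyzer = new LatvianAnalyzer(LatvianAnalyzer.getDefaultStopSet(), stemExclusions);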

Example 43 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From the class ShingleAnalyzerWrapperTest, the testAltFillerToken method:

public void testAltFillerToken() throws Exception {
    Analyzer delegate = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            CharArraySet stopSet = StopFilter.makeStopSet("into");
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new StopFilter(tokenizer, stopSet);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");
    assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" }, new int[] { 0, 0, 7, 7, 19, 19 }, new int[] { 6, 13, 13, 19, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 1 });
    analyzer.close();
    delegate = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            CharArraySet stopSet = StopFilter.makeStopSet("into");
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new StopFilter(tokenizer, stopSet);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    analyzer = new ShingleAnalyzerWrapper(delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
    assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });
    analyzer.close();
    delegate = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            CharArraySet stopSet = StopFilter.makeStopSet("into");
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new StopFilter(tokenizer, stopSet);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    analyzer = new ShingleAnalyzerWrapper(delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
    assertAnalyzesTo(analyzer, "please divide into shingles", new String[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });
    analyzer.close();
}
Also used: MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) StopFilter(org.apache.lucene.analysis.StopFilter) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)
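
The three wrapper configurations differ only in the filler token: "--" substitutes a visible placeholder where StopFilter removed "into", while null and "" both yield shingles with an empty slot, as the expected outputs above show. The wrapper passes this setting straight to the underlying ShingleFilter; a roughly equivalent direct setup, where `stopped` stands in for the upstream tokenizer-plus-StopFilter chain from the test:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;

// Assumes an upstream TokenStream that has already removed the stop words.
static TokenStream addShingles(TokenStream stopped) {
    ShingleFilter shingles = new ShingleFilter(stopped);
    shingles.setTokenSeparator(" ");   // ShingleFilter.DEFAULT_TOKEN_SEPARATOR
    shingles.setFillerToken("--");     // emitted where a stop word left a gap
    shingles.setOutputUnigrams(true);  // matches the first wrapper's `true` flag
    return shingles;
}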

Example 44 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From the class TestWithCJKBigramFilter, the setUp method:

@Override
public void setUp() throws Exception {
    super.setUp();
    /*
     * ICUTokenizer+CJKBigramFilter
     */
    analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
            TokenStream result = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
    /*
     * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
     * 
     * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
     * superset of CJKWidthFilter's foldings.
     */
    analyzer2 = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
            // we put this before the CJKBigramFilter, because the normalization might combine
            // some halfwidth katakana forms, which will affect the bigramming.
            TokenStream result = new ICUNormalizer2Filter(source);
            result = new CJKBigramFilter(result);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) CJKBigramFilter(org.apache.lucene.analysis.cjk.CJKBigramFilter) ICUNormalizer2Filter(org.apache.lucene.analysis.icu.ICUNormalizer2Filter) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer)
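
Two details worth noting. First, the trailing StopFilter over CharArraySet.EMPTY_SET removes nothing; it is just a neutral final wrapper for both chains. Second, reading the (cjkAsWords, myanmarAsWords) flags of DefaultICUTokenizerConfig: passing false for cjkAsWords keeps UAX#29 default segmentation, which emits CJK characters one at a time, which is the input CJKBigramFilter expects (this reading of the flags is an inference from the config's javadoc, not something the test states):

import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;

// cjkAsWords=false: single CJK code points (bigram-friendly);
// myanmarAsWords=true: word-level Myanmar segmentation.
ICUTokenizerConfig config = new DefaultICUTokenizerConfig(false, true);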

Example 45 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From the class JapaneseAnalyzer, the createComponents method:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, mode);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    stream = new JapanesePartOfSpeechStopFilter(stream, stoptags);
    stream = new CJKWidthFilter(stream);
    stream = new StopFilter(stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    stream = new LowerCaseFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream) CJKWidthFilter(org.apache.lucene.analysis.cjk.CJKWidthFilter) StopFilter(org.apache.lucene.analysis.StopFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)
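
Order matters in this chain: JapanesePartOfSpeechStopFilter drops tokens by POS tag before the surface-form StopFilter runs, and LowerCaseFilter comes last. Constructing the analyzer is simpler than the chain suggests; a hedged sketch, assuming the lucene-analyzers-kuromoji module is on the classpath:

import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

// No-arg form: no user dictionary, SEARCH mode, default stop words
// and default part-of-speech stop tags.
JapaneseAnalyzer analyzer = new JapaneseAnalyzer();

// Equivalent explicit form (null userDict means no user dictionary):
JapaneseAnalyzer explicit = new JapaneseAnalyzer(null,
        JapaneseTokenizer.Mode.SEARCH,
        JapaneseAnalyzer.getDefaultStopSet(),
        JapaneseAnalyzer.getDefaultStopTags());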

Aggregations

StopFilter (org.apache.lucene.analysis.StopFilter): 59
TokenStream (org.apache.lucene.analysis.TokenStream): 49
Tokenizer (org.apache.lucene.analysis.Tokenizer): 47
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 42
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 38
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 35
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 31
SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter): 15
CharArraySet (org.apache.lucene.analysis.CharArraySet): 7
Analyzer (org.apache.lucene.analysis.Analyzer): 6
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 5
DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter): 5
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4
ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 4
Reader (java.io.Reader): 3
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 3
Input (org.apache.lucene.search.suggest.Input): 3
InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 3
TokenFilter (org.apache.lucene.analysis.TokenFilter): 2
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 2