Example 46 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From class TestWordDelimiterFilter, method testPositionIncrements:

@Test
public void testPositionIncrements() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protWords));
        }
    };
    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 1 }, null, false);
    /* case change splits "solR" into sol/solR (same position) and R (next position) */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, null, new int[] { 1, 1, 0, 1 }, null, false);
    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 1, 1 }, null, false);
    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
        }
    };
    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, null, new int[] { 1, 10, 1 }, null, false);
    /* the "/" had a position increment of 10; it is folded into SOLR's increment of 11 below */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 11 }, null, false);
    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, null, new int[] { 1, 11, 0, 1 }, null, false);
    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 11, 1 }, null, false);
    Analyzer a3 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
        }
    };
    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 0, 0, 7 }, new int[] { 6, 11, 11 }, null, new int[] { 1, 0, 1 }, null, false);
    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 4, 4, 11 }, new int[] { 10, 15, 15 }, null, new int[] { 2, 0, 1 }, null, false);
    IOUtils.close(a, a2, a3);
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet), WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), StopFilter (org.apache.lucene.analysis.StopFilter), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Test (org.junit.Test)
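
The position-increment assertions above are dense; as a companion, here is a minimal sketch (not part of the test; the field name "field" and the helper name dumpPositions are illustrative) that prints each token together with its position increment using Lucene's attribute API:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

static void dumpPositions(Analyzer analyzer, String text) throws IOException {
    // tokenStream() assembles the tokenizer + filter chain for the given field
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // e.g. "solR +0" for a token stacked on the previous position
            System.out.println(term + " +" + posInc.getPositionIncrement());
        }
        ts.end();
    }
}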

Example 47 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From class TestWordDelimiterGraphFilter, method testPositionIncrements:

public void testPositionIncrements() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protWords));
        }
    };
    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 2 }, null, false);
    /* the deleted "/" leaves a gap: "solR" carries a posInc of 2 */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 13, 12, 13 }, null, new int[] { 1, 2, 0, 1 }, null, false);
    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 2, 1 }, null, false);
    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
        }
    };
    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, null, new int[] { 1, 10, 1 }, null, false);
    /* the "/" had a position increment of 10; it is folded into SOLR's increment of 11 below */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 11 }, null, false);
    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 13, 12, 13 }, null, new int[] { 1, 11, 0, 1 }, null, false);
    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 11, 1 }, null, false);
    Analyzer a3 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
            return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
        }
    };
    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucenesolr", "lucene", "solr" }, new int[] { 0, 0, 7 }, new int[] { 11, 6, 11 }, null, new int[] { 1, 0, 1 }, null, false);
    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucenesolr", "lucene", "solr" }, new int[] { 4, 4, 11 }, new int[] { 15, 10, 15 }, null, new int[] { 2, 0, 1 }, null, false);
    IOUtils.close(a, a2, a3);
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet), StopFilter (org.apache.lucene.analysis.StopFilter), WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
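
Note the contrast with Example 46: the graph filter reserves a position for the deleted "/" (SOLR arrives with a posInc of 2), whereas the older WordDelimiterFilter collapsed that gap. For use outside a test, the flag constants are public static fields on the filter class itself; a minimal sketch, assuming a Lucene 6/7-era API and substituting WhitespaceTokenizer for the test-only MockTokenizer:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.CATENATE_ALL
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
        // no protected words in this sketch (the tests above protect "NUTCH")
        return new TokenStreamComponents(tokenizer,
                new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
    }
};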

Example 48 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From class PolishAnalyzer, method createComponents:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
   *         provided), and {@link StempelFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new StempelFilter(result, new StempelStemmer(stemTable));
    return new TokenStreamComponents(source, result);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StempelStemmer (org.apache.lucene.analysis.stempel.StempelStemmer), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.StopFilter), SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter), StandardFilter (org.apache.lucene.analysis.standard.StandardFilter), StempelFilter (org.apache.lucene.analysis.stempel.StempelFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
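
For context, a minimal usage sketch. It assumes the constructor pattern that Lucene's language analyzers share (a no-arg default plus an overload taking a stop set and a stem exclusion set); the excluded term "solr" is purely illustrative:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.pl.PolishAnalyzer;

// default Polish stopwords, no stem exclusions
PolishAnalyzer defaults = new PolishAnalyzer();

// default stopwords, but keep "solr" away from the Stempel stemmer
CharArraySet stemExclusions = new CharArraySet(Arrays.asList("solr"), true);
PolishAnalyzer custom = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), stemExclusions);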

Example 49 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From class SpanishAnalyzer, method createComponents:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
   *         provided), and {@link SpanishLightStemFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SpanishLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.StopFilter), SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter), StandardFilter (org.apache.lucene.analysis.standard.StandardFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
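
The stem exclusion branch deserves a closer look: SetKeywordMarkerFilter flags matching tokens via KeywordAttribute, and keyword-aware stemmers such as SpanishLightStemFilter skip flagged tokens. A sketch of that interaction in isolation (the class names are real; the two-filter chain and the protected term "lucene" are illustrative):

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;

Tokenizer source = new WhitespaceTokenizer();
// tokens found in this set get KeywordAttribute set to true...
CharArraySet exclusions = new CharArraySet(Arrays.asList("lucene"), true);
TokenStream chain = new SetKeywordMarkerFilter(source, exclusions);
// ...and the stemmer leaves keyword-marked tokens unstemmed
chain = new SpanishLightStemFilter(chain);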

Example 50 with StopFilter

Use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

From class PersianAnalyzer, method createComponents:

/**
   * Creates
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   *
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link ArabicNormalizationFilter},
   *         {@link PersianNormalizationFilter}, and a {@link StopFilter} of Persian stop words
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new DecimalDigitFilter(result);
    result = new ArabicNormalizationFilter(result);
    /* additional persian-specific normalization */
    result = new PersianNormalizationFilter(result);
    /*
     * the order here is important: the stopword entries are themselves stored
     * in normalized form, so the StopFilter must come after the normalization
     * filters above.
     */
    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
Also used: DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter), ArabicNormalizationFilter (org.apache.lucene.analysis.ar.ArabicNormalizationFilter), TokenStream (org.apache.lucene.analysis.TokenStream), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.StopFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
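
The ordering constraint in the comment generalizes: StopFilter performs a plain set lookup on each incoming token, so the stop set and the preceding filters must agree on token form. A minimal sketch of the same principle, using lowercasing as a simplified stand-in for the Persian normalization chain:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Tokenizer source = new WhitespaceTokenizer();
CharArraySet stops = new CharArraySet(Arrays.asList("the"), false);

// correct order: "The" is lowercased to "the" and then removed
TokenStream good = new StopFilter(new LowerCaseFilter(source), stops);

// wrong order (for contrast): "The" would reach StopFilter unnormalized
// and never match the lowercase stop set:
// TokenStream bad = new LowerCaseFilter(new StopFilter(source, stops));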

Aggregations

StopFilter (org.apache.lucene.analysis.StopFilter): 59
TokenStream (org.apache.lucene.analysis.TokenStream): 49
Tokenizer (org.apache.lucene.analysis.Tokenizer): 47
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 42
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 38
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 35
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 31
SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter): 15
CharArraySet (org.apache.lucene.analysis.CharArraySet): 7
Analyzer (org.apache.lucene.analysis.Analyzer): 6
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 5
DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter): 5
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4
ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 4
Reader (java.io.Reader): 3
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 3
Input (org.apache.lucene.search.suggest.Input): 3
InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 3
TokenFilter (org.apache.lucene.analysis.TokenFilter): 2
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 2
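
Taken together, the aggregation reflects one dominant pattern: a StandardTokenizer feeding StandardFilter, LowerCaseFilter, StopFilter, an optional SetKeywordMarkerFilter, and a language-specific stemmer. A generic skeleton of that chain, with SnowballFilter (one of the stemmers listed above) standing in for the language-specific step; the class name and constructor are illustrative:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class SkeletonAnalyzer extends Analyzer {
    private final CharArraySet stopwords;        // language-specific stop set
    private final CharArraySet stemExclusionSet; // terms protected from stemming

    public SkeletonAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
        this.stopwords = stopwords;
        this.stemExclusionSet = stemExclusionSet;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer source = new StandardTokenizer();
        TokenStream result = new StandardFilter(source);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopwords);
        if (!stemExclusionSet.isEmpty())
            result = new SetKeywordMarkerFilter(result, stemExclusionSet);
        result = new SnowballFilter(result, "English"); // any Snowball stemmer name
        return new TokenStreamComponents(source, result);
    }
}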