Search in sources :

Example 51 with LowerCaseFilter

use of org.apache.lucene.analysis.LowerCaseFilter in project elasticsearch by elastic.

the class KeywordFieldTypeTests method testTermQueryWithNormalizer.

/**
 * Checks that {@code termQuery} on a field with a lowercasing search analyzer
 * yields a term query on the lowercased input, and that it is rejected with an
 * {@link IllegalArgumentException} once the field's index options are {@code NONE}.
 */
public void testTermQueryWithNormalizer() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);

    // Lowercases in both the analysis chain and the normalization chain.
    Analyzer lowercasing = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer whitespace = new WhitespaceTokenizer();
            TokenFilter lowercased = new LowerCaseFilter(whitespace);
            return new TokenStreamComponents(whitespace, lowercased);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new LowerCaseFilter(in);
        }
    };
    NamedAnalyzer searchAnalyzer = new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, lowercasing);
    fieldType.setSearchAnalyzer(searchAnalyzer);

    // The expected term is the full input, lowercased, with its whitespace intact.
    Term expectedTerm = new Term("field", "foo bar");
    assertEquals(new TermQuery(expectedTerm), fieldType.termQuery("fOo BaR", null));

    // With index options set to NONE the same call must fail.
    fieldType.setIndexOptions(IndexOptions.NONE);
    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> fieldType.termQuery("bar", null));
    assertEquals("Cannot search on field [field] since it is not indexed.", failure.getMessage());
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TermQuery(org.apache.lucene.search.TermQuery) TokenStream(org.apache.lucene.analysis.TokenStream) NamedAnalyzer(org.elasticsearch.index.analysis.NamedAnalyzer) Term(org.apache.lucene.index.Term) NamedAnalyzer(org.elasticsearch.index.analysis.NamedAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter)

Example 52 with LowerCaseFilter

use of org.apache.lucene.analysis.LowerCaseFilter in project elasticsearch by elastic.

the class SnowballAnalyzer method createComponents.

/**
 * Builds the analysis chain: a {@link StandardTokenizer}, then an
 * {@link EnglishPossessiveFilter} when the stemmer name is English, Porter or
 * Lovins, a {@link TurkishLowerCaseFilter} for Turkish or a
 * {@link LowerCaseFilter} otherwise, a {@link StopFilter} when a stop set is
 * present, and finally a {@link SnowballFilter} for the configured stemmer name.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream chain = source;

    // English stemmers: strip the possessive 's before stemming.
    final boolean englishFamily = name.equals("English") || name.equals("Porter") || name.equals("Lovins");
    if (englishFamily) {
        chain = new EnglishPossessiveFilter(chain);
    }

    // The Turkish stemmer expects the Turkish-specific lowercase filter.
    if (name.equals("Turkish")) {
        chain = new TurkishLowerCaseFilter(chain);
    } else {
        chain = new LowerCaseFilter(chain);
    }

    // Stop-word removal is optional: only applied when a stop set exists.
    if (stopSet != null) {
        chain = new StopFilter(chain, stopSet);
    }

    return new TokenStreamComponents(source, new SnowballFilter(chain, name));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter)

Example 53 with LowerCaseFilter

use of org.apache.lucene.analysis.LowerCaseFilter in project elasticsearch by elastic.

the class StandardHtmlStripAnalyzer method createComponents.

/**
 * Builds the chain {@link StandardTokenizer}, {@link StandardFilter},
 * {@link LowerCaseFilter}, followed by a {@link StopFilter} only when the
 * stopword set is non-empty.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream lowercased = new LowerCaseFilter(new StandardFilter(tokenizer));

    // No stopwords configured: the lowercased stream is the final result.
    if (stopwords.isEmpty()) {
        return new TokenStreamComponents(tokenizer, lowercased);
    }
    return new TokenStreamComponents(tokenizer, new StopFilter(lowercased, stopwords));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 54 with LowerCaseFilter

use of org.apache.lucene.analysis.LowerCaseFilter in project elasticsearch by elastic.

the class FingerprintAnalyzer method createComponents.

/**
 * Builds the fingerprint chain: {@link StandardTokenizer}, {@link LowerCaseFilter},
 * {@link ASCIIFoldingFilter}, {@link StopFilter} and finally a
 * {@link FingerprintFilter} configured with this analyzer's maximum output size
 * and separator.
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer source = new StandardTokenizer();
    final TokenStream lowercased = new LowerCaseFilter(source);
    // Second argument is false: per the Lucene API this is preserveOriginal.
    final TokenStream folded = new ASCIIFoldingFilter(lowercased, false);
    final TokenStream withoutStops = new StopFilter(folded, stopWords);
    final TokenStream fingerprint = new FingerprintFilter(withoutStops, maxOutputSize, separator);
    return new TokenStreamComponents(source, fingerprint);
}
Also used : FingerprintFilter(org.apache.lucene.analysis.miscellaneous.FingerprintFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 55 with LowerCaseFilter

use of org.apache.lucene.analysis.LowerCaseFilter in project lucene-solr by apache.

the class SoraniAnalyzer method createComponents.

/**
 * Creates the
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to analyze text for the given field.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
 *         {@link LowerCaseFilter}, {@link DecimalDigitFilter},
 *         {@link StopFilter}, {@link SetKeywordMarkerFilter} if the stem
 *         exclusion set is non-empty, and {@link SoraniStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream standard = new StandardFilter(tokenizer);
    final TokenStream normalized = new SoraniNormalizationFilter(standard);
    final TokenStream lowercased = new LowerCaseFilter(normalized);
    final TokenStream digitsHandled = new DecimalDigitFilter(lowercased);
    final TokenStream withoutStops = new StopFilter(digitsHandled, stopwords);

    // The keyword marker is inserted ahead of the stemmer only when exclusions exist.
    final TokenStream stemmerInput = stemExclusionSet.isEmpty()
        ? withoutStops
        : new SetKeywordMarkerFilter(withoutStops, stemExclusionSet);

    return new TokenStreamComponents(tokenizer, new SoraniStemFilter(stemmerInput));
}
Also used : DecimalDigitFilter(org.apache.lucene.analysis.core.DecimalDigitFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Aggregations

LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 95; TokenStream (org.apache.lucene.analysis.TokenStream): 88; StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 60; Tokenizer (org.apache.lucene.analysis.Tokenizer): 51; StopFilter (org.apache.lucene.analysis.StopFilter): 48; StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 43; SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 31; SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter): 14; DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter): 10; Analyzer (org.apache.lucene.analysis.Analyzer): 9; IOException (java.io.IOException): 5; ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 5; StringReader (java.io.StringReader): 4; HashMap (java.util.HashMap): 4; TokenFilter (org.apache.lucene.analysis.TokenFilter): 4; WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer): 4; EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 4; PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper): 4; Document (org.apache.lucene.document.Document): 4; Field (org.apache.lucene.document.Field): 4