Search in sources :

Example 11 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in the project crate, by crate.

From the class CommonAnalysisPlugin, method getPreConfiguredTokenFilters.

/**
 * Builds the list of token filters that this plugin offers pre-configured.
 *
 * <p>Every entry is created through {@code PreConfiguredTokenFilter.singleton} from a
 * fixed name, a boolean flag and a function that wraps the incoming token stream in
 * the corresponding Lucene filter. Filters that need constructor arguments are given
 * the fixed defaults spelled out below.
 *
 * <p>NOTE(review): the boolean flag is forwarded as-is; presumably it marks filters
 * that are also applied to multi-term queries — confirm against
 * {@code PreConfiguredTokenFilter}.
 *
 * @return a newly created, mutable list with one entry per registered filter name
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    // Flag masks for the two word-delimiter variants, named here to keep the registrations readable.
    final int wordDelimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    final int wordDelimiterGraphFlags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
        | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;

    final List<PreConfiguredTokenFilter> registered = new ArrayList<>();

    registered.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        stream -> new CommonGramsFilter(stream, CharArraySet.EMPTY_SET)));
    registered.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    // "delimited_payload_filter" and "delimited_payload" are two names for the same configuration.
    registered.add(PreConfiguredTokenFilter.singleton(
        "delimited_payload_filter",
        false,
        stream -> new DelimitedPayloadTokenFilter(
            stream,
            DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
            DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "delimited_payload",
        false,
        stream -> new DelimitedPayloadTokenFilter(
            stream,
            DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
            DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "dutch_stem",
        false,
        stream -> new SnowballFilter(stream, new DutchStemmer())));
    registered.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        stream -> new EdgeNGramTokenFilter(
            stream,
            EdgeNGramTokenFilterFactory.SIDE_FRONT,
            EdgeNGramTokenFilterFactory.SIDE_BACK,
            EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        stream -> new ElisionFilter(stream, FrenchAnalyzer.DEFAULT_ARTICLES)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "french_stem",
        false,
        stream -> new SnowballFilter(stream, new FrenchStemmer())));
    registered.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton(
        "length",
        false,
        stream -> new LengthFilter(stream, 0, Integer.MAX_VALUE)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        stream -> new LimitTokenCountFilter(
            stream,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "ngram",
        false,
        stream -> new NGramTokenFilter(stream, 1, 2, false)));
    registered.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton(
        "russian_stem",
        false,
        stream -> new SnowballFilter(stream, "Russian")));
    registered.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("shingle", false, stream -> {
        TokenStream shingles = new ShingleFilter(stream);
        // Graph analysis is switched off for this stream: it produces shingles of
        // different sizes, which are not aligned in terms of positions, so analysing
        // them as a graph is useless and dangerous because it may create too many paths.
        shingles.addAttribute(DisableGraphAttribute.class);
        return shingles;
    }));
    registered.add(PreConfiguredTokenFilter.singleton(
        "snowball",
        false,
        stream -> new SnowballFilter(stream, "English")));
    registered.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // StopFilter itself lives in lucene-core, while the English stop word set comes from lucene-analyzers-common.
    registered.add(PreConfiguredTokenFilter.singleton(
        "stop",
        false,
        stream -> new StopFilter(stream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    registered.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton(
        "truncate",
        false,
        stream -> new TruncateTokenFilter(stream, 10)));
    registered.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton(
        "word_delimiter",
        false,
        stream -> new WordDelimiterFilter(stream, wordDelimiterFlags, null)));
    registered.add(PreConfiguredTokenFilter.singleton(
        "word_delimiter_graph",
        false,
        stream -> new WordDelimiterGraphFilter(stream, wordDelimiterGraphFlags, null)));

    return registered;
}
Also used : LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TypeAsPayloadTokenFilter(org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter) TokenizerFactory(org.elasticsearch.index.analysis.TokenizerFactory) LithuanianAnalyzer(org.apache.lucene.analysis.lt.LithuanianAnalyzer) DecimalDigitFilter(org.apache.lucene.analysis.core.DecimalDigitFilter) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) CatalanAnalyzer(org.apache.lucene.analysis.ca.CatalanAnalyzer) CzechStemFilter(org.apache.lucene.analysis.cz.CzechStemFilter) CzechAnalyzer(org.apache.lucene.analysis.cz.CzechAnalyzer) ScandinavianFoldingFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter) Map(java.util.Map) ArmenianAnalyzer(org.apache.lucene.analysis.hy.ArmenianAnalyzer) AnalysisProvider(org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider) SwedishAnalyzer(org.apache.lucene.analysis.sv.SwedishAnalyzer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) BrazilianStemFilter(org.apache.lucene.analysis.br.BrazilianStemFilter) CharFilterFactory(org.elasticsearch.index.analysis.CharFilterFactory) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) CachingStrategy(org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) BasqueAnalyzer(org.apache.lucene.analysis.eu.BasqueAnalyzer) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) HungarianAnalyzer(org.apache.lucene.analysis.hu.HungarianAnalyzer) ClusterService(org.elasticsearch.cluster.service.ClusterService) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) 
TurkishAnalyzer(org.apache.lucene.analysis.tr.TurkishAnalyzer) PreBuiltAnalyzerProviderFactory(org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) ArrayList(java.util.ArrayList) NamedWriteableRegistry(org.elasticsearch.common.io.stream.NamedWriteableRegistry) SoraniAnalyzer(org.apache.lucene.analysis.ckb.SoraniAnalyzer) PersianNormalizationFilter(org.apache.lucene.analysis.fa.PersianNormalizationFilter) Regex(org.elasticsearch.common.regex.Regex) ClassicTokenizer(org.apache.lucene.analysis.standard.ClassicTokenizer) ArabicStemFilter(org.apache.lucene.analysis.ar.ArabicStemFilter) CJKWidthFilter(org.apache.lucene.analysis.cjk.CJKWidthFilter) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenStream(org.apache.lucene.analysis.TokenStream) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) Analyzer(org.apache.lucene.analysis.Analyzer) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Client(org.elasticsearch.client.Client) UAX29URLEmailTokenizer(org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer) GalicianAnalyzer(org.apache.lucene.analysis.gl.GalicianAnalyzer) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) UpperCaseFilter(org.apache.lucene.analysis.core.UpperCaseFilter) ItalianAnalyzer(org.apache.lucene.analysis.it.ItalianAnalyzer) BengaliAnalyzer(org.apache.lucene.analysis.bn.BengaliAnalyzer) TreeMap(java.util.TreeMap) ClassicFilter(org.apache.lucene.analysis.standard.ClassicFilter) NGramTokenizer(org.apache.lucene.analysis.ngram.NGramTokenizer) FinnishAnalyzer(org.apache.lucene.analysis.fi.FinnishAnalyzer) EdgeNGramTokenizer(org.apache.lucene.analysis.ngram.EdgeNGramTokenizer) IndonesianAnalyzer(org.apache.lucene.analysis.id.IndonesianAnalyzer) AnalysisPlugin(org.elasticsearch.plugins.AnalysisPlugin) Environment(org.elasticsearch.env.Environment) 
DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) GermanAnalyzer(org.apache.lucene.analysis.de.GermanAnalyzer) GreekAnalyzer(org.apache.lucene.analysis.el.GreekAnalyzer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) ThreadPool(org.elasticsearch.threadpool.ThreadPool) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter) BulgarianAnalyzer(org.apache.lucene.analysis.bg.BulgarianAnalyzer) NamedXContentRegistry(org.elasticsearch.common.xcontent.NamedXContentRegistry) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) Collection(java.util.Collection) HindiAnalyzer(org.apache.lucene.analysis.hi.HindiAnalyzer) RussianAnalyzer(org.apache.lucene.analysis.ru.RussianAnalyzer) BrazilianAnalyzer(org.apache.lucene.analysis.br.BrazilianAnalyzer) PortugueseAnalyzer(org.apache.lucene.analysis.pt.PortugueseAnalyzer) NorwegianAnalyzer(org.apache.lucene.analysis.no.NorwegianAnalyzer) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) PreConfiguredTokenizer(org.elasticsearch.index.analysis.PreConfiguredTokenizer) List(java.util.List) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) KStemFilter(org.apache.lucene.analysis.en.KStemFilter) DanishAnalyzer(org.apache.lucene.analysis.da.DanishAnalyzer) RomanianAnalyzer(org.apache.lucene.analysis.ro.RomanianAnalyzer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer) PreConfiguredTokenFilter(org.elasticsearch.index.analysis.PreConfiguredTokenFilter) ThaiTokenizer(org.apache.lucene.analysis.th.ThaiTokenizer) LatvianAnalyzer(org.apache.lucene.analysis.lv.LatvianAnalyzer) PatternTokenizer(org.apache.lucene.analysis.pattern.PatternTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) CJKAnalyzer(org.apache.lucene.analysis.cjk.CJKAnalyzer) 
AnalysisPlugin.requiresAnalysisSettings(org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings) IndicNormalizationFilter(org.apache.lucene.analysis.in.IndicNormalizationFilter) GermanNormalizationFilter(org.apache.lucene.analysis.de.GermanNormalizationFilter) GermanStemFilter(org.apache.lucene.analysis.de.GermanStemFilter) CJKBigramFilter(org.apache.lucene.analysis.cjk.CJKBigramFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) HindiNormalizationFilter(org.apache.lucene.analysis.hi.HindiNormalizationFilter) AnalyzerProvider(org.elasticsearch.index.analysis.AnalyzerProvider) ArabicNormalizationFilter(org.apache.lucene.analysis.ar.ArabicNormalizationFilter) IrishAnalyzer(org.apache.lucene.analysis.ga.IrishAnalyzer) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) KeywordRepeatFilter(org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter) DutchAnalyzer(org.apache.lucene.analysis.nl.DutchAnalyzer) SpanishAnalyzer(org.apache.lucene.analysis.es.SpanishAnalyzer) TrimFilter(org.apache.lucene.analysis.miscellaneous.TrimFilter) PersianAnalyzer(org.apache.lucene.analysis.fa.PersianAnalyzer) StopFilter(org.apache.lucene.analysis.StopFilter) SoraniNormalizationFilter(org.apache.lucene.analysis.ckb.SoraniNormalizationFilter) ScandinavianNormalizationFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter) Plugin(org.elasticsearch.plugins.Plugin) BengaliNormalizationFilter(org.apache.lucene.analysis.bn.BengaliNormalizationFilter) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) ThaiAnalyzer(org.apache.lucene.analysis.th.ThaiAnalyzer) PreConfiguredCharFilter(org.elasticsearch.index.analysis.PreConfiguredCharFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) NodeEnvironment(org.elasticsearch.env.NodeEnvironment) TokenFilterFactory(org.elasticsearch.index.analysis.TokenFilterFactory) 
ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) ApostropheFilter(org.apache.lucene.analysis.tr.ApostropheFilter) Collections(java.util.Collections) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) ArrayList(java.util.ArrayList) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) PreConfiguredTokenFilter(org.elasticsearch.index.analysis.PreConfiguredTokenFilter) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter)

Aggregations

ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 11; TokenStream (org.apache.lucene.analysis.TokenStream): 10; Tokenizer (org.apache.lucene.analysis.Tokenizer): 9; LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 7; StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 7; StopFilter (org.apache.lucene.analysis.StopFilter): 4; StopFilter (org.apache.lucene.analysis.core.StopFilter): 4; StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 4; LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 3; KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 3; ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 3; Analyzer (org.apache.lucene.analysis.Analyzer): 2; CharArraySet (org.apache.lucene.analysis.CharArraySet): 2; WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 2; EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 2; PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 2; WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 2; ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter): 2; ArrayList (java.util.ArrayList): 1; Collection (java.util.Collection): 1