Search in sources :

Example 1 with TrimFilter

use of org.apache.lucene.analysis.miscellaneous.TrimFilter in project lucene-solr by apache.

In the class TestSuggestSpellingConverter, method testComplicated:

/**
 * Exercises the converter with a custom analyzer that treats the whole input as a
 * single keyword token, rewrites pattern matches to a single space, lowercases the
 * result and trims surrounding whitespace.
 *
 * <p>Per the expectations below, the net effect is: field-name prefixes and query
 * syntax are dropped, runs of whitespace collapse, and the text is lowercased.
 */
public void testComplicated() throws Exception {
    // First alternative: a "name:" style prefix (letters/marks/numbers/surrogates or '_' followed by ':'),
    // optionally preceded by other characters. Second alternative: any run of characters that are not
    // letters, marks, numbers or surrogates.
    final Pattern fieldPrefixOrNoise = Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+");

    Analyzer normalizingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream replaced = new PatternReplaceFilter(source, fieldPrefixOrNoise, " ", true);
            TokenStream normalized = new TrimFilter(new LowerCaseFilter(replaced));
            return new TokenStreamComponents(source, normalized);
        }
    };
    converter.setAnalyzer(normalizingAnalyzer);

    // Each row is { raw input, the single token expected after conversion }.
    String[][] expectations = {
        { "test1 +test2", "test1 test2" },
        { "test~", "test" },
        { "field:test", "test" },
        { "This is a test", "this is a test" },
        { " This is  a test", "this is a test" },
        { "Foo (field:bar) text_hi:हिन्दी    ", "foo bar हिन्दी" }
    };
    for (String[] expectation : expectations) {
        assertConvertsTo(expectation[0], new String[] { expectation[1] });
    }
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) TrimFilter(org.apache.lucene.analysis.miscellaneous.TrimFilter) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PatternReplaceFilter(org.apache.lucene.analysis.pattern.PatternReplaceFilter) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 2 with TrimFilter

use of org.apache.lucene.analysis.miscellaneous.TrimFilter in project crate by crate.

In the class CommonAnalysisPlugin, method getPreConfiguredTokenFilters:

/**
 * Builds the list of token filters that this plugin offers under fixed names with
 * default settings.
 *
 * <p>Every entry is created through {@code PreConfiguredTokenFilter.singleton(name, flag, factory)}.
 * NOTE(review): the boolean flag presumably marks filters that are also applied to
 * multi-term queries; confirm against {@code PreConfiguredTokenFilter.singleton}.
 *
 * @return a new mutable list of the pre-configured filters, in registration order
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    // Flag masks for the two word-delimiter variants, named here to keep the registrations readable.
    final int wordDelimiterFlags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    final int wordDelimiterGraphFlags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
        | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;

    List<PreConfiguredTokenFilter> registered = new ArrayList<>();
    registered.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("common_grams", false,
        stream -> new CommonGramsFilter(stream, CharArraySet.EMPTY_SET)));
    registered.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    // "delimited_payload_filter" and "delimited_payload" are registered with the same factory.
    registered.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
        stream -> new DelimitedPayloadTokenFilter(stream,
            DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
            DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    registered.add(PreConfiguredTokenFilter.singleton("delimited_payload", false,
        stream -> new DelimitedPayloadTokenFilter(stream,
            DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
            DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    registered.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
        stream -> new SnowballFilter(stream, new DutchStemmer())));
    registered.add(PreConfiguredTokenFilter.singleton("edge_ngram", false,
        stream -> new EdgeNGramTokenFilter(stream,
            EdgeNGramTokenFilterFactory.SIDE_FRONT,
            EdgeNGramTokenFilterFactory.SIDE_BACK,
            EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    registered.add(PreConfiguredTokenFilter.singleton("elision", true,
        stream -> new ElisionFilter(stream, FrenchAnalyzer.DEFAULT_ARTICLES)));
    registered.add(PreConfiguredTokenFilter.singleton("french_stem", false,
        stream -> new SnowballFilter(stream, new FrenchStemmer())));
    registered.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("length", false,
        stream -> new LengthFilter(stream, 0, Integer.MAX_VALUE)));
    registered.add(PreConfiguredTokenFilter.singleton("limit", false,
        stream -> new LimitTokenCountFilter(stream,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    registered.add(PreConfiguredTokenFilter.singleton("ngram", false,
        stream -> new NGramTokenFilter(stream, 1, 2, false)));
    registered.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
        stream -> new SnowballFilter(stream, "Russian")));
    registered.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("shingle", false, stream -> {
        TokenStream shingles = new ShingleFilter(stream);
        // Graph analysis is disabled on this token stream because it produces shingles
        // of different size. Graph analysis on such a stream is useless and dangerous:
        // it may create too many paths, since shingles of different size are not
        // aligned in terms of positions.
        shingles.addAttribute(DisableGraphAttribute.class);
        return shingles;
    }));
    registered.add(PreConfiguredTokenFilter.singleton("snowball", false,
        stream -> new SnowballFilter(stream, "English")));
    registered.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    // "stemmer" uses the same Porter stem filter as "porter_stem" above.
    registered.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    registered.add(PreConfiguredTokenFilter.singleton("stop", false,
        stream -> new StopFilter(stream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    registered.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("truncate", false,
        stream -> new TruncateTokenFilter(stream, 10)));
    registered.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    registered.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
        stream -> new WordDelimiterFilter(stream, wordDelimiterFlags, null)));
    registered.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
        stream -> new WordDelimiterGraphFilter(stream, wordDelimiterGraphFlags, null)));
    return registered;
}
Also used : LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TypeAsPayloadTokenFilter(org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter) TokenizerFactory(org.elasticsearch.index.analysis.TokenizerFactory) LithuanianAnalyzer(org.apache.lucene.analysis.lt.LithuanianAnalyzer) DecimalDigitFilter(org.apache.lucene.analysis.core.DecimalDigitFilter) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) CatalanAnalyzer(org.apache.lucene.analysis.ca.CatalanAnalyzer) CzechStemFilter(org.apache.lucene.analysis.cz.CzechStemFilter) CzechAnalyzer(org.apache.lucene.analysis.cz.CzechAnalyzer) ScandinavianFoldingFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter) Map(java.util.Map) ArmenianAnalyzer(org.apache.lucene.analysis.hy.ArmenianAnalyzer) AnalysisProvider(org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider) SwedishAnalyzer(org.apache.lucene.analysis.sv.SwedishAnalyzer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) BrazilianStemFilter(org.apache.lucene.analysis.br.BrazilianStemFilter) CharFilterFactory(org.elasticsearch.index.analysis.CharFilterFactory) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) CachingStrategy(org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) BasqueAnalyzer(org.apache.lucene.analysis.eu.BasqueAnalyzer) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) HungarianAnalyzer(org.apache.lucene.analysis.hu.HungarianAnalyzer) ClusterService(org.elasticsearch.cluster.service.ClusterService) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) 
TurkishAnalyzer(org.apache.lucene.analysis.tr.TurkishAnalyzer) PreBuiltAnalyzerProviderFactory(org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) ArrayList(java.util.ArrayList) NamedWriteableRegistry(org.elasticsearch.common.io.stream.NamedWriteableRegistry) SoraniAnalyzer(org.apache.lucene.analysis.ckb.SoraniAnalyzer) PersianNormalizationFilter(org.apache.lucene.analysis.fa.PersianNormalizationFilter) Regex(org.elasticsearch.common.regex.Regex) ClassicTokenizer(org.apache.lucene.analysis.standard.ClassicTokenizer) ArabicStemFilter(org.apache.lucene.analysis.ar.ArabicStemFilter) CJKWidthFilter(org.apache.lucene.analysis.cjk.CJKWidthFilter) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenStream(org.apache.lucene.analysis.TokenStream) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) Analyzer(org.apache.lucene.analysis.Analyzer) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Client(org.elasticsearch.client.Client) UAX29URLEmailTokenizer(org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer) GalicianAnalyzer(org.apache.lucene.analysis.gl.GalicianAnalyzer) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) UpperCaseFilter(org.apache.lucene.analysis.core.UpperCaseFilter) ItalianAnalyzer(org.apache.lucene.analysis.it.ItalianAnalyzer) BengaliAnalyzer(org.apache.lucene.analysis.bn.BengaliAnalyzer) TreeMap(java.util.TreeMap) ClassicFilter(org.apache.lucene.analysis.standard.ClassicFilter) NGramTokenizer(org.apache.lucene.analysis.ngram.NGramTokenizer) FinnishAnalyzer(org.apache.lucene.analysis.fi.FinnishAnalyzer) EdgeNGramTokenizer(org.apache.lucene.analysis.ngram.EdgeNGramTokenizer) IndonesianAnalyzer(org.apache.lucene.analysis.id.IndonesianAnalyzer) AnalysisPlugin(org.elasticsearch.plugins.AnalysisPlugin) Environment(org.elasticsearch.env.Environment) 
DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) GermanAnalyzer(org.apache.lucene.analysis.de.GermanAnalyzer) GreekAnalyzer(org.apache.lucene.analysis.el.GreekAnalyzer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) ThreadPool(org.elasticsearch.threadpool.ThreadPool) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter) BulgarianAnalyzer(org.apache.lucene.analysis.bg.BulgarianAnalyzer) NamedXContentRegistry(org.elasticsearch.common.xcontent.NamedXContentRegistry) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) Collection(java.util.Collection) HindiAnalyzer(org.apache.lucene.analysis.hi.HindiAnalyzer) RussianAnalyzer(org.apache.lucene.analysis.ru.RussianAnalyzer) BrazilianAnalyzer(org.apache.lucene.analysis.br.BrazilianAnalyzer) PortugueseAnalyzer(org.apache.lucene.analysis.pt.PortugueseAnalyzer) NorwegianAnalyzer(org.apache.lucene.analysis.no.NorwegianAnalyzer) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) PreConfiguredTokenizer(org.elasticsearch.index.analysis.PreConfiguredTokenizer) List(java.util.List) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) KStemFilter(org.apache.lucene.analysis.en.KStemFilter) DanishAnalyzer(org.apache.lucene.analysis.da.DanishAnalyzer) RomanianAnalyzer(org.apache.lucene.analysis.ro.RomanianAnalyzer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer) PreConfiguredTokenFilter(org.elasticsearch.index.analysis.PreConfiguredTokenFilter) ThaiTokenizer(org.apache.lucene.analysis.th.ThaiTokenizer) LatvianAnalyzer(org.apache.lucene.analysis.lv.LatvianAnalyzer) PatternTokenizer(org.apache.lucene.analysis.pattern.PatternTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) CJKAnalyzer(org.apache.lucene.analysis.cjk.CJKAnalyzer) 
AnalysisPlugin.requiresAnalysisSettings(org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings) IndicNormalizationFilter(org.apache.lucene.analysis.in.IndicNormalizationFilter) GermanNormalizationFilter(org.apache.lucene.analysis.de.GermanNormalizationFilter) GermanStemFilter(org.apache.lucene.analysis.de.GermanStemFilter) CJKBigramFilter(org.apache.lucene.analysis.cjk.CJKBigramFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) HindiNormalizationFilter(org.apache.lucene.analysis.hi.HindiNormalizationFilter) AnalyzerProvider(org.elasticsearch.index.analysis.AnalyzerProvider) ArabicNormalizationFilter(org.apache.lucene.analysis.ar.ArabicNormalizationFilter) IrishAnalyzer(org.apache.lucene.analysis.ga.IrishAnalyzer) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) KeywordRepeatFilter(org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter) DutchAnalyzer(org.apache.lucene.analysis.nl.DutchAnalyzer) SpanishAnalyzer(org.apache.lucene.analysis.es.SpanishAnalyzer) TrimFilter(org.apache.lucene.analysis.miscellaneous.TrimFilter) PersianAnalyzer(org.apache.lucene.analysis.fa.PersianAnalyzer) StopFilter(org.apache.lucene.analysis.StopFilter) SoraniNormalizationFilter(org.apache.lucene.analysis.ckb.SoraniNormalizationFilter) ScandinavianNormalizationFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter) Plugin(org.elasticsearch.plugins.Plugin) BengaliNormalizationFilter(org.apache.lucene.analysis.bn.BengaliNormalizationFilter) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) ThaiAnalyzer(org.apache.lucene.analysis.th.ThaiAnalyzer) PreConfiguredCharFilter(org.elasticsearch.index.analysis.PreConfiguredCharFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) NodeEnvironment(org.elasticsearch.env.NodeEnvironment) TokenFilterFactory(org.elasticsearch.index.analysis.TokenFilterFactory) 
ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) ApostropheFilter(org.apache.lucene.analysis.tr.ApostropheFilter) Collections(java.util.Collections) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) ArrayList(java.util.ArrayList) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) PreConfiguredTokenFilter(org.elasticsearch.index.analysis.PreConfiguredTokenFilter) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)2 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)2 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 List (java.util.List)1 Map (java.util.Map)1 TreeMap (java.util.TreeMap)1 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)1 CharArraySet (org.apache.lucene.analysis.CharArraySet)1 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)1 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)1 StopFilter (org.apache.lucene.analysis.StopFilter)1 Tokenizer (org.apache.lucene.analysis.Tokenizer)1 ArabicAnalyzer (org.apache.lucene.analysis.ar.ArabicAnalyzer)1 ArabicNormalizationFilter (org.apache.lucene.analysis.ar.ArabicNormalizationFilter)1 ArabicStemFilter (org.apache.lucene.analysis.ar.ArabicStemFilter)1 BulgarianAnalyzer (org.apache.lucene.analysis.bg.BulgarianAnalyzer)1