Search in sources:

Example 1 with PreConfiguredTokenFilter

use of org.opensearch.index.analysis.PreConfiguredTokenFilter in project OpenSearch by opensearch-project.

The class AnalysisModule, method setupPreConfiguredTokenFilters.

/**
 * Builds the name-to-filter map of pre-configured token filters.
 *
 * <p>Two filters are registered directly ("lowercase" and the backwards-compatibility
 * "standard" entry); after that every filter returned by each plugin's
 * {@code getPreConfiguredTokenFilters()} is registered under the filter's own name.
 *
 * @param plugins analysis plugins whose pre-configured token filters are added to the registry
 * @return the registry contents wrapped with {@code unmodifiableMap}
 */
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    final NamedRegistry<PreConfiguredTokenFilter> registry = new NamedRegistry<>("pre-configured token_filter");

    // Filter that ships with lucene-core
    final PreConfiguredTokenFilter lowercase = PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new);
    registry.register("lowercase", lowercase);

    // "standard" only exists so that old indices keep working (bwc)
    final PreConfiguredTokenFilter standard = PreConfiguredTokenFilter.openSearchVersion(
        "standard",
        true,
        (stream, indexVersion) -> {
            // Accepted (with a deprecation warning) until version 7_5_2; rejected from 7_6_0 on
            if (indexVersion.before(LegacyESVersion.V_7_6_0) == false) {
                throw new IllegalArgumentException("The [standard] token filter has been removed.");
            }
            deprecationLogger.deprecate(
                "standard_deprecation",
                "The [standard] token filter is deprecated and will be removed in a future version."
            );
            // The stream is handed back untouched: the filter itself does nothing
            return stream;
        }
    );
    registry.register("standard", standard);

    // Plugin-provided filters, in plugin order and then in each plugin's own order
    for (AnalysisPlugin analysisPlugin : plugins) {
        for (PreConfiguredTokenFilter contributed : analysisPlugin.getPreConfiguredTokenFilters()) {
            registry.register(contributed.getName(), contributed);
        }
    }

    return unmodifiableMap(registry.getRegistry());
}
Also used : TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) PreBuiltAnalyzerProviderFactory(org.opensearch.index.analysis.PreBuiltAnalyzerProviderFactory) StopAnalyzerProvider(org.opensearch.index.analysis.StopAnalyzerProvider) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) Version(org.opensearch.Version) StopTokenFilterFactory(org.opensearch.index.analysis.StopTokenFilterFactory) DeprecationLogger(org.opensearch.common.logging.DeprecationLogger) SimpleAnalyzerProvider(org.opensearch.index.analysis.SimpleAnalyzerProvider) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) LegacyESVersion(org.opensearch.LegacyESVersion) KeywordAnalyzerProvider(org.opensearch.index.analysis.KeywordAnalyzerProvider) AnalysisPlugin.requiresAnalysisSettings(org.opensearch.plugins.AnalysisPlugin.requiresAnalysisSettings) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) Locale(java.util.Locale) Map(java.util.Map) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) Environment(org.opensearch.env.Environment) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenStream(org.apache.lucene.analysis.TokenStream) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) AbstractTokenFilterFactory(org.opensearch.index.analysis.AbstractTokenFilterFactory) Settings(org.opensearch.common.settings.Settings) IOException(java.io.IOException) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) ShingleTokenFilterFactory(org.opensearch.index.analysis.ShingleTokenFilterFactory) LowercaseNormalizerProvider(org.opensearch.index.analysis.LowercaseNormalizerProvider) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) List(java.util.List) AnalyzerProvider(org.opensearch.index.analysis.AnalyzerProvider) 
NamedRegistry(org.opensearch.common.NamedRegistry) IndexSettings(org.opensearch.index.IndexSettings) WhitespaceAnalyzerProvider(org.opensearch.index.analysis.WhitespaceAnalyzerProvider) HunspellTokenFilterFactory(org.opensearch.index.analysis.HunspellTokenFilterFactory) Collections.unmodifiableMap(java.util.Collections.unmodifiableMap) StandardAnalyzerProvider(org.opensearch.index.analysis.StandardAnalyzerProvider) NamedRegistry(org.opensearch.common.NamedRegistry) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)

Example 2 with PreConfiguredTokenFilter

use of org.opensearch.index.analysis.PreConfiguredTokenFilter in project OpenSearch by opensearch-project.

The class CommonAnalysisPlugin, method getPreConfiguredTokenFilters.

/**
 * Lists the token filters this plugin makes available without per-index configuration.
 *
 * <p>The filters are returned in the order they are added below. Three entries are
 * version-aware:
 * <ul>
 *   <li>{@code edgeNGram} and {@code nGram} throw {@link IllegalArgumentException} when the
 *       version passed to them is on or after {@code LegacyESVersion.V_7_0_0}; for earlier
 *       versions they log a deprecation and behave like {@code edge_ngram} / {@code ngram}.</li>
 *   <li>{@code word_delimiter_graph} passes {@code adjustOffsets = true} only when the
 *       version is on or after {@code LegacyESVersion.V_7_3_0}.</li>
 * </ul>
 *
 * @return a new mutable list holding every pre-configured token filter of this plugin
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    final List<PreConfiguredTokenFilter> preConfigured = new ArrayList<>();

    preConfigured.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    preConfigured.add(
        PreConfiguredTokenFilter.singleton(
            "common_grams",
            false,
            false,
            stream -> new CommonGramsFilter(stream, CharArraySet.EMPTY_SET)
        )
    );
    preConfigured.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    preConfigured.add(
        PreConfiguredTokenFilter.singleton(
            "delimited_payload",
            false,
            stream -> new DelimitedPayloadTokenFilter(
                stream,
                DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER
            )
        )
    );
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("dutch_stem", false, stream -> new SnowballFilter(stream, new DutchStemmer()))
    );
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("edge_ngram", false, false, stream -> new EdgeNGramTokenFilter(stream, 1))
    );
    // Legacy camel-case alias of "edge_ngram": refused from 7.0.0 on, deprecated before that
    preConfigured.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (stream, indexVersion) -> {
        if (indexVersion.onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException(
                "The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
                    + "Please change the filter name to [edge_ngram] instead."
            );
        }
        deprecationLogger.deprecate(
            "edgeNGram_deprecation",
            "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
                + "Please change the filter name to [edge_ngram] instead."
        );
        return new EdgeNGramTokenFilter(stream, 1);
    }));
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("elision", true, stream -> new ElisionFilter(stream, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("french_stem", false, stream -> new SnowballFilter(stream, new FrenchStemmer()))
    );
    preConfigured.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    // TODO this one seems useless (bounds of 0 and Integer.MAX_VALUE)
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("length", false, stream -> new LengthFilter(stream, 0, Integer.MAX_VALUE))
    );
    preConfigured.add(
        PreConfiguredTokenFilter.singleton(
            "limit",
            false,
            stream -> new LimitTokenCountFilter(
                stream,
                LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS
            )
        )
    );
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("ngram", false, false, stream -> new NGramTokenFilter(stream, 1, 2, false))
    );
    // Legacy camel-case alias of "ngram": refused from 7.0.0 on, deprecated before that
    preConfigured.add(PreConfiguredTokenFilter.openSearchVersion("nGram", false, false, (stream, indexVersion) -> {
        if (indexVersion.onOrAfter(LegacyESVersion.V_7_0_0)) {
            throw new IllegalArgumentException(
                "The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
                    + "Please change the filter name to [ngram] instead."
            );
        }
        deprecationLogger.deprecate(
            "nGram_deprecation",
            "The [nGram] token filter name is deprecated and will be removed in a future version. "
                + "Please change the filter name to [ngram] instead."
        );
        return new NGramTokenFilter(stream, 1, 2, false);
    }));
    preConfigured.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("russian_stem", false, stream -> new SnowballFilter(stream, "Russian"))
    );
    preConfigured.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new)
    );
    preConfigured.add(PreConfiguredTokenFilter.singleton("shingle", false, false, stream -> {
        TokenStream shingles = new ShingleFilter(stream);
        // Graph analysis is switched off for this stream: it emits shingles of different
        // sizes, which are not aligned in terms of positions, so graph analysis would be
        // useless and dangerous (it may create too many paths).
        shingles.addAttribute(DisableGraphAttribute.class);
        return shingles;
    }));
    preConfigured.add(PreConfiguredTokenFilter.singleton("snowball", false, stream -> new SnowballFilter(stream, "English")));
    preConfigured.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // StopFilter lives in lucene-core, but the English stop word set comes from lucene-analyzers-common
    preConfigured.add(
        PreConfiguredTokenFilter.singleton("stop", false, stream -> new StopFilter(stream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
    );
    preConfigured.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("truncate", false, stream -> new TruncateTokenFilter(stream, 10)));
    preConfigured.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    preConfigured.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, stream -> {
        final int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterFilter.SPLIT_ON_NUMERICS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
        return new WordDelimiterFilter(stream, flags, null);
    }));
    preConfigured.add(PreConfiguredTokenFilter.openSearchVersion("word_delimiter_graph", false, false, (stream, indexVersion) -> {
        // Offsets are only adjusted for versions on or after 7.3.0
        final boolean adjustOffsets = indexVersion.onOrAfter(LegacyESVersion.V_7_3_0);
        final int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
        return new WordDelimiterGraphFilter(stream, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
    }));

    return preConfigured;
}
Also used : LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TypeAsPayloadTokenFilter(org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter) LithuanianAnalyzer(org.apache.lucene.analysis.lt.LithuanianAnalyzer) DecimalDigitFilter(org.apache.lucene.analysis.core.DecimalDigitFilter) Regex(org.opensearch.common.regex.Regex) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) CatalanAnalyzer(org.apache.lucene.analysis.ca.CatalanAnalyzer) CzechStemFilter(org.apache.lucene.analysis.cz.CzechStemFilter) CzechAnalyzer(org.apache.lucene.analysis.cz.CzechAnalyzer) ScandinavianFoldingFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter) Map(java.util.Map) ArmenianAnalyzer(org.apache.lucene.analysis.hy.ArmenianAnalyzer) SwedishAnalyzer(org.apache.lucene.analysis.sv.SwedishAnalyzer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) NodeEnvironment(org.opensearch.env.NodeEnvironment) ScriptService(org.opensearch.script.ScriptService) Client(org.opensearch.client.Client) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) BrazilianStemFilter(org.apache.lucene.analysis.br.BrazilianStemFilter) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Settings(org.opensearch.common.settings.Settings) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) AnalyzerProvider(org.opensearch.index.analysis.AnalyzerProvider) BasqueAnalyzer(org.apache.lucene.analysis.eu.BasqueAnalyzer) HTMLStripCharFilter(org.apache.lucene.analysis.charfilter.HTMLStripCharFilter) HungarianAnalyzer(org.apache.lucene.analysis.hu.HungarianAnalyzer) IndexNameExpressionResolver(org.opensearch.cluster.metadata.IndexNameExpressionResolver) 
RepositoriesService(org.opensearch.repositories.RepositoriesService) ThreadPool(org.opensearch.threadpool.ThreadPool) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) TurkishAnalyzer(org.apache.lucene.analysis.tr.TurkishAnalyzer) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) WordDelimiterIterator(org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator) LegacyESVersion(org.opensearch.LegacyESVersion) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) SoraniAnalyzer(org.apache.lucene.analysis.ckb.SoraniAnalyzer) PersianNormalizationFilter(org.apache.lucene.analysis.fa.PersianNormalizationFilter) AnalysisProvider(org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider) ClassicTokenizer(org.apache.lucene.analysis.standard.ClassicTokenizer) ArabicStemFilter(org.apache.lucene.analysis.ar.ArabicStemFilter) CJKWidthFilter(org.apache.lucene.analysis.cjk.CJKWidthFilter) Environment(org.opensearch.env.Environment) TokenStream(org.apache.lucene.analysis.TokenStream) SetOnce(org.apache.lucene.util.SetOnce) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) Analyzer(org.apache.lucene.analysis.Analyzer) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) UAX29URLEmailTokenizer(org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer) GalicianAnalyzer(org.apache.lucene.analysis.gl.GalicianAnalyzer) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) UpperCaseFilter(org.apache.lucene.analysis.core.UpperCaseFilter) Plugin(org.opensearch.plugins.Plugin) CachingStrategy(org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy) ItalianAnalyzer(org.apache.lucene.analysis.it.ItalianAnalyzer) BengaliAnalyzer(org.apache.lucene.analysis.bn.BengaliAnalyzer) TreeMap(java.util.TreeMap) 
ClassicFilter(org.apache.lucene.analysis.standard.ClassicFilter) NamedXContentRegistry(org.opensearch.common.xcontent.NamedXContentRegistry) NGramTokenizer(org.apache.lucene.analysis.ngram.NGramTokenizer) ClusterService(org.opensearch.cluster.service.ClusterService) FinnishAnalyzer(org.apache.lucene.analysis.fi.FinnishAnalyzer) EdgeNGramTokenizer(org.apache.lucene.analysis.ngram.EdgeNGramTokenizer) IndonesianAnalyzer(org.apache.lucene.analysis.id.IndonesianAnalyzer) ResourceWatcherService(org.opensearch.watcher.ResourceWatcherService) DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) GermanAnalyzer(org.apache.lucene.analysis.de.GermanAnalyzer) AnalysisPlugin.requiresAnalysisSettings(org.opensearch.plugins.AnalysisPlugin.requiresAnalysisSettings) GreekAnalyzer(org.apache.lucene.analysis.el.GreekAnalyzer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter) BulgarianAnalyzer(org.apache.lucene.analysis.bg.BulgarianAnalyzer) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScriptPlugin(org.opensearch.plugins.ScriptPlugin) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) Collection(java.util.Collection) HindiAnalyzer(org.apache.lucene.analysis.hi.HindiAnalyzer) RussianAnalyzer(org.apache.lucene.analysis.ru.RussianAnalyzer) BrazilianAnalyzer(org.apache.lucene.analysis.br.BrazilianAnalyzer) PortugueseAnalyzer(org.apache.lucene.analysis.pt.PortugueseAnalyzer) NorwegianAnalyzer(org.apache.lucene.analysis.no.NorwegianAnalyzer) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) List(java.util.List) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) KStemFilter(org.apache.lucene.analysis.en.KStemFilter) IndexSettings(org.opensearch.index.IndexSettings) 
DanishAnalyzer(org.apache.lucene.analysis.da.DanishAnalyzer) RomanianAnalyzer(org.apache.lucene.analysis.ro.RomanianAnalyzer) ScriptContext(org.opensearch.script.ScriptContext) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer) ThaiTokenizer(org.apache.lucene.analysis.th.ThaiTokenizer) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) LatvianAnalyzer(org.apache.lucene.analysis.lv.LatvianAnalyzer) PatternTokenizer(org.apache.lucene.analysis.pattern.PatternTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) CJKAnalyzer(org.apache.lucene.analysis.cjk.CJKAnalyzer) PreBuiltAnalyzerProviderFactory(org.opensearch.index.analysis.PreBuiltAnalyzerProviderFactory) IndicNormalizationFilter(org.apache.lucene.analysis.in.IndicNormalizationFilter) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) GermanNormalizationFilter(org.apache.lucene.analysis.de.GermanNormalizationFilter) GermanStemFilter(org.apache.lucene.analysis.de.GermanStemFilter) CJKBigramFilter(org.apache.lucene.analysis.cjk.CJKBigramFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) HindiNormalizationFilter(org.apache.lucene.analysis.hi.HindiNormalizationFilter) NamedWriteableRegistry(org.opensearch.common.io.stream.NamedWriteableRegistry) DeprecationLogger(org.opensearch.common.logging.DeprecationLogger) ArabicNormalizationFilter(org.apache.lucene.analysis.ar.ArabicNormalizationFilter) IrishAnalyzer(org.apache.lucene.analysis.ga.IrishAnalyzer) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) KeywordRepeatFilter(org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter) DutchAnalyzer(org.apache.lucene.analysis.nl.DutchAnalyzer) SpanishAnalyzer(org.apache.lucene.analysis.es.SpanishAnalyzer) TrimFilter(org.apache.lucene.analysis.miscellaneous.TrimFilter) PersianAnalyzer(org.apache.lucene.analysis.fa.PersianAnalyzer) StopFilter(org.apache.lucene.analysis.StopFilter) 
SoraniNormalizationFilter(org.apache.lucene.analysis.ckb.SoraniNormalizationFilter) ScandinavianNormalizationFilter(org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter) EstonianAnalyzer(org.apache.lucene.analysis.et.EstonianAnalyzer) BengaliNormalizationFilter(org.apache.lucene.analysis.bn.BengaliNormalizationFilter) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) ThaiAnalyzer(org.apache.lucene.analysis.th.ThaiAnalyzer) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) ApostropheFilter(org.apache.lucene.analysis.tr.ApostropheFilter) Collections(java.util.Collections) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) ArrayList(java.util.ArrayList) EdgeNGramTokenFilter(org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter) NGramTokenFilter(org.apache.lucene.analysis.ngram.NGramTokenFilter) DisableGraphAttribute(org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute) DutchStemmer(org.tartarus.snowball.ext.DutchStemmer) DelimitedPayloadTokenFilter(org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter) LimitTokenCountFilter(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) FrenchStemmer(org.tartarus.snowball.ext.FrenchStemmer) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) CommonGramsFilter(org.apache.lucene.analysis.commongrams.CommonGramsFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) 
WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) TruncateTokenFilter(org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter)

Example 3 with PreConfiguredTokenFilter

use of org.opensearch.index.analysis.PreConfiguredTokenFilter in project OpenSearch by opensearch-project.

The class SynonymsAnalysisTests, method testPreconfiguredTokenFilters.

/**
 * Walks every pre-configured token filter of {@code CommonAnalysisPlugin} and asks it for its
 * synonym filter, using an index created on a random version between
 * {@code LegacyESVersion.V_7_0_0} and {@code Version.CURRENT}.
 *
 * <p>Filters whose name is in the disallowed set must fail with an
 * {@link IllegalArgumentException} carrying the "cannot be used to parse synonyms" message;
 * every other filter must return from {@code getSynonymFilter()} without throwing.
 *
 * @throws IOException declared by the test signature
 */
public void testPreconfiguredTokenFilters() throws IOException {
    final Set<String> rejectedForSynonyms = new HashSet<>(
        Arrays.asList(
            "common_grams",
            "edge_ngram",
            "edgeNGram",
            "keyword_repeat",
            "ngram",
            "nGram",
            "shingle",
            "word_delimiter",
            "word_delimiter_graph"
        )
    );

    final Settings settings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
        .put("path.home", createTempDir().toString())
        .build();
    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    final CommonAnalysisPlugin commonAnalysis = new CommonAnalysisPlugin();

    for (PreConfiguredTokenFilter candidate : commonAnalysis.getPreConfiguredTokenFilters()) {
        final String filterName = candidate.getName();

        // Allowed filters only need to produce their synonym filter without blowing up
        if (rejectedForSynonyms.contains(filterName) == false) {
            candidate.get(idxSettings, null, filterName, settings).getSynonymFilter();
            continue;
        }

        // Disallowed filters must refuse, with the exact synonym-parsing message
        IllegalArgumentException failure = expectThrows(
            IllegalArgumentException.class,
            "Expected exception for factory " + filterName,
            () -> {
                candidate.get(idxSettings, null, filterName, settings).getSynonymFilter();
            }
        );
        assertEquals(filterName, "Token filter [" + filterName + "] cannot be used to parse synonyms", failure.getMessage());
    }
}
Also used : IndexSettings(org.opensearch.index.IndexSettings) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) Settings(org.opensearch.common.settings.Settings) IndexSettings(org.opensearch.index.IndexSettings) HashSet(java.util.HashSet)

Aggregations

Settings (org.opensearch.common.settings.Settings)3 IndexSettings (org.opensearch.index.IndexSettings)3 PreConfiguredTokenFilter (org.opensearch.index.analysis.PreConfiguredTokenFilter)3 List (java.util.List)2 Map (java.util.Map)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 Collections.unmodifiableMap (java.util.Collections.unmodifiableMap)1 HashSet (java.util.HashSet)1 Locale (java.util.Locale)1 TreeMap (java.util.TreeMap)1 Supplier (java.util.function.Supplier)1 Analyzer (org.apache.lucene.analysis.Analyzer)1 CharArraySet (org.apache.lucene.analysis.CharArraySet)1 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)1 StopFilter (org.apache.lucene.analysis.StopFilter)1 ArabicAnalyzer (org.apache.lucene.analysis.ar.ArabicAnalyzer)1