Example usage of org.opensearch.index.analysis.PreConfiguredTokenFilter in the OpenSearch project (opensearch-project).
From the class AnalysisModule, method setupPreConfiguredTokenFilters:
/**
 * Builds the immutable registry of pre-configured token filters that every index
 * can reference by name without further configuration.
 *
 * @param plugins analysis plugins whose pre-configured filters are merged in;
 *                duplicate names are rejected by the {@code NamedRegistry}
 * @return an unmodifiable name-to-filter map
 */
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenFilter> registry = new NamedRegistry<>("pre-configured token_filter");

    // Filters shipped with lucene-core.
    registry.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));

    // "standard" survives only for backwards compatibility with old (pre-7.6)
    // indices, where it is a deprecated pass-through; newer versions reject it.
    registry.register("standard", PreConfiguredTokenFilter.openSearchVersion("standard", true, (reader, version) -> {
        if (version.onOrAfter(LegacyESVersion.V_7_6_0)) {
            throw new IllegalArgumentException("The [standard] token filter has been removed.");
        }
        deprecationLogger.deprecate("standard_deprecation", "The [standard] token filter is deprecated and will be removed in a future version.");
        return reader;
    }));

    // Merge in everything contributed by the installed analysis plugins.
    for (AnalysisPlugin plugin : plugins) {
        for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
            registry.register(filter.getName(), filter);
        }
    }
    return unmodifiableMap(registry.getRegistry());
}
Example usage of org.opensearch.index.analysis.PreConfiguredTokenFilter in the OpenSearch project (opensearch-project).
From the class CommonAnalysisPlugin, method getPreConfiguredTokenFilters:
/**
 * Returns the pre-configured token filters contributed by the common-analysis
 * plugin. Each entry pairs a filter name with a factory that wraps an incoming
 * Lucene TokenStream.
 *
 * NOTE(review): the boolean argument(s) after the name appear to control whether
 * the filter is also applied to multi-term queries and (for the two-boolean
 * overloads) whether it may be used when parsing synonyms — confirm against the
 * PreConfiguredTokenFilter.singleton/openSearchVersion signatures.
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
List<PreConfiguredTokenFilter> filters = new ArrayList<>();
filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
// Empty common-word set: the pre-configured form of common_grams matches nothing.
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1)));
// Legacy camelCase alias: deprecated on pre-7.0 indices, rejected from 7.0 on.
filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> {
if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
throw new IllegalArgumentException("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " + "Please change the filter name to [edge_ngram] instead.");
} else {
deprecationLogger.deprecate("edgeNGram_deprecation", "The [edgeNGram] token filter name is deprecated and will be removed in a future version. " + "Please change the filter name to [edge_ngram] instead.");
}
return new EdgeNGramTokenFilter(reader, 1);
}));
filters.add(PreConfiguredTokenFilter.singleton("elision", true, input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
// TODO this one seems useless
filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE)));
filters.add(PreConfiguredTokenFilter.singleton("limit", false, input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
// Legacy camelCase alias: same version-gated deprecation/rejection as edgeNGram.
filters.add(PreConfiguredTokenFilter.openSearchVersion("nGram", false, false, (reader, version) -> {
if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
throw new IllegalArgumentException("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " + "Please change the filter name to [ngram] instead.");
} else {
deprecationLogger.deprecate("nGram_deprecation", "The [nGram] token filter name is deprecated and will be removed in a future version. " + "Please change the filter name to [ngram] instead.");
}
return new NGramTokenFilter(reader, 1, 2, false);
}));
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> {
TokenStream ts = new ShingleFilter(input);
/**
 * We disable the graph analysis on this token stream
 * because it produces shingles of different size.
 * Graph analysis on such token stream is useless and dangerous as it may create too many paths
 * since shingles of different size are not aligned in terms of positions.
 */
ts.addAttribute(DisableGraphAttribute.class);
return ts;
}));
filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
// "stemmer" deliberately maps to the Porter stemmer, same as "porter_stem".
filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
// The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
// Offset adjustment in word_delimiter_graph changed in 7.3, so the factory is version-aware.
filters.add(PreConfiguredTokenFilter.openSearchVersion("word_delimiter_graph", false, false, (input, version) -> {
boolean adjustOffsets = version.onOrAfter(LegacyESVersion.V_7_3_0);
return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
}));
return filters;
}
Example usage of org.opensearch.index.analysis.PreConfiguredTokenFilter in the OpenSearch project (opensearch-project).
From the class SynonymsAnalysisTests, method testPreconfiguredTokenFilters:
/**
 * Verifies the synonym-parsing contract of every pre-configured token filter:
 * filters on the disallow list must throw IllegalArgumentException with the
 * expected message when asked for a synonym filter, and every other filter
 * must hand one back without complaint.
 */
public void testPreconfiguredTokenFilters() throws IOException {
    // Filters whose output token graphs make them unsuitable for parsing synonym rules.
    Set<String> disallowedFilters = new HashSet<>(Arrays.asList("common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram", "shingle", "word_delimiter", "word_delimiter_graph"));
    Settings settings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
        .put("path.home", createTempDir().toString())
        .build();
    IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings);
    CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
    for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
        String name = filter.getName();
        if (disallowedFilters.contains(name) == false) {
            // Allowed filters must produce a synonym filter without throwing.
            filter.get(indexSettings, null, name, settings).getSynonymFilter();
        } else {
            IllegalArgumentException e = expectThrows(IllegalArgumentException.class, "Expected exception for factory " + name, () -> {
                filter.get(indexSettings, null, name, settings).getSynonymFilter();
            });
            assertEquals(name, "Token filter [" + name + "] cannot be used to parse synonyms", e.getMessage());
        }
    }
}
Aggregations