Example 1 with AnalysisPlugin

Use of org.opensearch.plugins.AnalysisPlugin in the OpenSearch project (opensearch-project).

From the class AnalysisModuleTests, method testRegisterHunspellDictionary. A hedged standalone plugin sketch follows the snippet.

public void testRegisterHunspellDictionary() throws Exception {
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .build();
    Environment environment = TestEnvironment.newEnvironment(settings);
    InputStream aff = getClass().getResourceAsStream("/indices/analyze/conf_dir/hunspell/en_US/en_US.aff");
    InputStream dic = getClass().getResourceAsStream("/indices/analyze/conf_dir/hunspell/en_US/en_US.dic");
    Dictionary dictionary;
    try (Directory tmp = new NIOFSDirectory(environment.tmpFile())) {
        dictionary = new Dictionary(tmp, "hunspell", aff, dic);
    }
    AnalysisModule module = new AnalysisModule(environment, singletonList(new AnalysisPlugin() {

        @Override
        public Map<String, Dictionary> getHunspellDictionaries() {
            return singletonMap("foo", dictionary);
        }
    }));
    assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
}
Also used : Dictionary(org.apache.lucene.analysis.hunspell.Dictionary) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory) InputStream(java.io.InputStream) TestEnvironment(org.opensearch.env.TestEnvironment) Environment(org.opensearch.env.Environment) Settings(org.opensearch.common.settings.Settings) IndexSettings(org.opensearch.index.IndexSettings) Directory(org.apache.lucene.store.Directory) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)
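
The test above builds the Dictionary inline from test resources. For contrast, here is a minimal sketch of the plugin side of the same hook: a standalone plugin class exposing a Hunspell dictionary through getHunspellDictionaries(). The class name and the dictionary path are assumptions for illustration, not part of the OpenSearch sources.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Map;

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

// Hypothetical plugin class, for illustration only.
public class EnUsHunspellPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, Dictionary> getHunspellDictionaries() {
        // Assumed on-disk layout; point this at wherever the .aff/.dic files actually live.
        Path base = Path.of("config", "hunspell", "en_US");
        try (Directory tmp = new NIOFSDirectory(Files.createTempDirectory("hunspell"));
             InputStream aff = Files.newInputStream(base.resolve("en_US.aff"));
             InputStream dic = Files.newInputStream(base.resolve("en_US.dic"))) {
            // Same Lucene Dictionary constructor the test uses above.
            return Collections.singletonMap("en_us", new Dictionary(tmp, "hunspell", aff, dic));
        } catch (Exception e) {
            throw new IllegalStateException("failed to load the en_US hunspell dictionary", e);
        }
    }
}

Dictionaries registered this way are served by the HunspellService, which is the same lookup path the test asserts with getHunspellService().getDictionary("foo").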

Example 2 with AnalysisPlugin

Use of org.opensearch.plugins.AnalysisPlugin in the OpenSearch project (opensearch-project).

From the class AnalysisModuleTests, method testPluginPreConfiguredCharFilters. A hedged standalone plugin sketch follows the snippet.

/**
 * Tests that plugins can register pre-configured char filters whose behavior varies with the OpenSearch version,
 * varies with the Lucene version, or does not vary with version at all.
 */
public void testPluginPreConfiguredCharFilters() throws IOException {
    boolean noVersionSupportsMultiTerm = randomBoolean();
    boolean luceneVersionSupportsMultiTerm = randomBoolean();
    boolean opensearchVersionSupportsMultiTerm = randomBoolean();
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(new AnalysisPlugin() {

        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            return Arrays.asList(
                PreConfiguredCharFilter.singleton("no_version", noVersionSupportsMultiTerm,
                    tokenStream -> new AppendCharFilter(tokenStream, "no_version")),
                PreConfiguredCharFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
                    (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())),
                PreConfiguredCharFilter.openSearchVersion("opensearch_version", opensearchVersionSupportsMultiTerm,
                    (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())));
        }

        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
            return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
        }
    })).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
        .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.no_version.char_filter", "no_version")
        .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version")
        .put("index.analysis.analyzer.opensearch_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.opensearch_version.char_filter", "opensearch_version")
        .put(IndexMetadata.SETTING_VERSION_CREATED, version)
        .build());
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "testno_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { "test" + version.luceneVersion });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { "test" + version });
    assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""), analyzers.get("no_version").normalize("", "test").utf8ToString());
    assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""), analyzers.get("lucene_version").normalize("", "test").utf8ToString());
    assertEquals("test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""), analyzers.get("opensearch_version").normalize("", "test").utf8ToString());
}
Also used : Arrays(java.util.Arrays) Matchers.either(org.hamcrest.Matchers.either) Version(org.opensearch.Version) StopTokenFilterFactory(org.opensearch.index.analysis.StopTokenFilterFactory) Collections.singletonList(java.util.Collections.singletonList) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) Directory(org.apache.lucene.store.Directory) Map(java.util.Map) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) CustomAnalyzer(org.opensearch.index.analysis.CustomAnalyzer) Path(java.nio.file.Path) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) OpenSearchTestCase(org.opensearch.test.OpenSearchTestCase) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) Reader(java.io.Reader) StandardCharsets(java.nio.charset.StandardCharsets) UncheckedIOException(java.io.UncheckedIOException) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) List(java.util.List) MatcherAssert(org.hamcrest.MatcherAssert) Matchers.equalTo(org.hamcrest.Matchers.equalTo) IndexSettings(org.opensearch.index.IndexSettings) TokenFilter(org.apache.lucene.analysis.TokenFilter) BaseTokenStreamTestCase.assertTokenStreamContents(org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents) XContentType(org.opensearch.common.xcontent.XContentType) Dictionary(org.apache.lucene.analysis.hunspell.Dictionary) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers) MyFilterTokenFilterFactory(org.opensearch.index.analysis.MyFilterTokenFilterFactory) IndexSettingsModule(org.opensearch.test.IndexSettingsModule) TestEnvironment(org.opensearch.env.TestEnvironment) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) Tokenizer(org.apache.lucene.analysis.Tokenizer) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) CharFilter(org.apache.lucene.analysis.CharFilter) LegacyESVersion(org.opensearch.LegacyESVersion) Analysis(org.opensearch.index.analysis.Analysis) VersionUtils(org.opensearch.test.VersionUtils) Streams(org.opensearch.common.io.Streams) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) AnalysisProvider(org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider) Collections.singletonMap(java.util.Collections.singletonMap) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Environment(org.opensearch.env.Environment) TokenStream(org.apache.lucene.analysis.TokenStream) Files(java.nio.file.Files) BufferedWriter(java.io.BufferedWriter) Analyzer(org.apache.lucene.analysis.Analyzer) IOException(java.io.IOException) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) StringReader(java.io.StringReader) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory) InputStream(java.io.InputStream)
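
AppendCharFilter above is a test-only helper defined in AnalysisModuleTests; a real plugin would plug an actual Lucene char filter into the same hook. A minimal sketch, with a hypothetical class and filter name, using Lucene's HTMLStripCharFilter:

import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.opensearch.index.analysis.PreConfiguredCharFilter;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

// Hypothetical plugin class, for illustration only.
public class MyCharFilterPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
        // singleton(name, useFilterForMultitermQueries, create); the boolean plays the role of the
        // randomized *SupportsMultiTerm flags in the test above.
        return Collections.singletonList(
            PreConfiguredCharFilter.singleton("my_html_strip", false, HTMLStripCharFilter::new));
    }
}

Once registered, the name is referenced from analyzer settings exactly like "no_version" in the test, e.g. index.analysis.analyzer.my_analyzer.char_filter: my_html_strip.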

Example 3 with AnalysisPlugin

Use of org.opensearch.plugins.AnalysisPlugin in the OpenSearch project (opensearch-project).

From the class AnalysisModuleTests, method testPluginPreConfiguredTokenFilters. A hedged standalone plugin sketch follows the snippet.

/**
 * Tests that plugins can register pre-configured token filters whose behavior varies with the OpenSearch version,
 * varies with the Lucene version, or does not vary with version at all.
 */
public void testPluginPreConfiguredTokenFilters() throws IOException {
    boolean noVersionSupportsMultiTerm = randomBoolean();
    boolean luceneVersionSupportsMultiTerm = randomBoolean();
    boolean opensearchVersionSupportsMultiTerm = randomBoolean();
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(new AnalysisPlugin() {

        @Override
        public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
            return Arrays.asList(
                PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
                    tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
                PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
                    (tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
                PreConfiguredTokenFilter.openSearchVersion("opensearch_version", opensearchVersionSupportsMultiTerm,
                    (tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString())));
        }
    })).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
        .put("index.analysis.analyzer.no_version.tokenizer", "standard")
        .put("index.analysis.analyzer.no_version.filter", "no_version")
        .put("index.analysis.analyzer.lucene_version.tokenizer", "standard")
        .put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
        .put("index.analysis.analyzer.opensearch_version.tokenizer", "standard")
        .put("index.analysis.analyzer.opensearch_version.filter", "opensearch_version")
        .put(IndexMetadata.SETTING_VERSION_CREATED, version)
        .build());
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "testno_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { "test" + version.luceneVersion });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { "test" + version });
    assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""), analyzers.get("no_version").normalize("", "test").utf8ToString());
    assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""), analyzers.get("lucene_version").normalize("", "test").utf8ToString());
    assertEquals("test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""), analyzers.get("opensearch_version").normalize("", "test").utf8ToString());
}
Also used : AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) Version(org.opensearch.Version) LegacyESVersion(org.opensearch.LegacyESVersion) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers) Collections.singletonList(java.util.Collections.singletonList) List(java.util.List) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)
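
As with the char filters, AppendTokenFilter is a test-only helper; a plugin shipping a real pre-configured token filter would look like the sketch below. The class and filter names are hypothetical, and Lucene's ASCIIFoldingFilter stands in for whatever filter a real plugin would provide.

import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

// Hypothetical plugin class, for illustration only.
public class MyTokenFilterPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // Same singleton(name, useFilterForMultitermQueries, create) factory used for
        // "lowercase" in AnalysisModule.setupPreConfiguredTokenFilters (Example 4 below).
        return Collections.singletonList(
            PreConfiguredTokenFilter.singleton("my_ascii_fold", true, ASCIIFoldingFilter::new));
    }
}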

Example 4 with AnalysisPlugin

Use of org.opensearch.plugins.AnalysisPlugin in the OpenSearch project (opensearch-project).

From the class AnalysisModule, method setupPreConfiguredTokenFilters.

static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
    // Add filters available in lucene-core
    preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
    // Add "standard" for old indices (bwc)
    preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.openSearchVersion("standard", true, (reader, version) -> {
        // Deprecation path for indices created before 7.6.0 (i.e. up to 7.5.x); removed for newer versions.
        if (version.before(LegacyESVersion.V_7_6_0)) {
            deprecationLogger.deprecate("standard_deprecation", "The [standard] token filter is deprecated and will be removed in a future version.");
        } else {
            throw new IllegalArgumentException("The [standard] token filter has been removed.");
        }
        return reader;
    }));
    for (AnalysisPlugin plugin : plugins) {
        for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
            preConfiguredTokenFilters.register(filter.getName(), filter);
        }
    }
    return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}
Also used : TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) PreBuiltAnalyzerProviderFactory(org.opensearch.index.analysis.PreBuiltAnalyzerProviderFactory) StopAnalyzerProvider(org.opensearch.index.analysis.StopAnalyzerProvider) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) Version(org.opensearch.Version) StopTokenFilterFactory(org.opensearch.index.analysis.StopTokenFilterFactory) DeprecationLogger(org.opensearch.common.logging.DeprecationLogger) SimpleAnalyzerProvider(org.opensearch.index.analysis.SimpleAnalyzerProvider) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) LegacyESVersion(org.opensearch.LegacyESVersion) KeywordAnalyzerProvider(org.opensearch.index.analysis.KeywordAnalyzerProvider) AnalysisPlugin.requiresAnalysisSettings(org.opensearch.plugins.AnalysisPlugin.requiresAnalysisSettings) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) Locale(java.util.Locale) Map(java.util.Map) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) Environment(org.opensearch.env.Environment) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenStream(org.apache.lucene.analysis.TokenStream) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) AbstractTokenFilterFactory(org.opensearch.index.analysis.AbstractTokenFilterFactory) Settings(org.opensearch.common.settings.Settings) IOException(java.io.IOException) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) ShingleTokenFilterFactory(org.opensearch.index.analysis.ShingleTokenFilterFactory) LowercaseNormalizerProvider(org.opensearch.index.analysis.LowercaseNormalizerProvider) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) List(java.util.List) AnalyzerProvider(org.opensearch.index.analysis.AnalyzerProvider) NamedRegistry(org.opensearch.common.NamedRegistry) IndexSettings(org.opensearch.index.IndexSettings) WhitespaceAnalyzerProvider(org.opensearch.index.analysis.WhitespaceAnalyzerProvider) HunspellTokenFilterFactory(org.opensearch.index.analysis.HunspellTokenFilterFactory) Collections.unmodifiableMap(java.util.Collections.unmodifiableMap) StandardAnalyzerProvider(org.opensearch.index.analysis.StandardAnalyzerProvider)

Example 5 with AnalysisPlugin

Use of org.opensearch.plugins.AnalysisPlugin in the OpenSearch project (opensearch-project).

From the class AnalysisModule, method setupPreConfiguredTokenizers. A hedged plugin-side sketch follows the snippet.

static Map<String, PreConfiguredTokenizer> setupPreConfiguredTokenizers(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenizer> preConfiguredTokenizers = new NamedRegistry<>("pre-configured tokenizer");
    // Temporary shim to register old style pre-configured tokenizers
    for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
        String name = tokenizer.name().toLowerCase(Locale.ROOT);
        PreConfiguredTokenizer preConfigured;
        switch(tokenizer.getCachingStrategy()) {
            case ONE:
                preConfigured = PreConfiguredTokenizer.singleton(name, () -> tokenizer.create(Version.CURRENT));
                break;
            default:
                throw new UnsupportedOperationException("Caching strategy unsupported by temporary shim [" + tokenizer + "]");
        }
        preConfiguredTokenizers.register(name, preConfigured);
    }
    for (AnalysisPlugin plugin : plugins) {
        for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
            preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
        }
    }
    return unmodifiableMap(preConfiguredTokenizers.getRegistry());
}
Also used : NamedRegistry(org.opensearch.common.NamedRegistry) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)
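
setupPreConfiguredTokenizers consumes plugin.getPreConfiguredTokenizers(), but no example on this page shows the plugin side of that hook. A minimal sketch follows; the class and tokenizer names are hypothetical, and Lucene's WhitespaceTokenizer stands in for whatever Tokenizer a real plugin would ship.

import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

// Hypothetical plugin class, for illustration only.
public class MyTokenizerPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        // singleton() provides one shared, version-independent instance, matching the
        // ONE caching-strategy branch handled by the temporary shim above.
        return Collections.singletonList(
            PreConfiguredTokenizer.singleton("my_whitespace", WhitespaceTokenizer::new));
    }
}

The registered name can then be referenced from analyzer settings just like the "keyword" and "standard" tokenizers in Examples 2 and 3, e.g. index.analysis.analyzer.my_analyzer.tokenizer: my_whitespace.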

Aggregations

AnalysisPlugin (org.opensearch.plugins.AnalysisPlugin): 10
Settings (org.opensearch.common.settings.Settings): 7
Environment (org.opensearch.env.Environment): 7
IndexSettings (org.opensearch.index.IndexSettings): 7
List (java.util.List): 6
TokenStream (org.apache.lucene.analysis.TokenStream): 6
Version (org.opensearch.Version): 6
AnalysisRegistry (org.opensearch.index.analysis.AnalysisRegistry): 6
LegacyESVersion (org.opensearch.LegacyESVersion): 5
TestEnvironment (org.opensearch.env.TestEnvironment): 5
PreConfiguredTokenizer (org.opensearch.index.analysis.PreConfiguredTokenizer): 5
IOException (java.io.IOException): 4
Collections.singletonList (java.util.Collections.singletonList): 4
Map (java.util.Map): 4
IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata): 4
CharFilterFactory (org.opensearch.index.analysis.CharFilterFactory): 4
IndexAnalyzers (org.opensearch.index.analysis.IndexAnalyzers): 4
PreConfiguredCharFilter (org.opensearch.index.analysis.PreConfiguredCharFilter): 4
TokenFilterFactory (org.opensearch.index.analysis.TokenFilterFactory): 4
TokenizerFactory (org.opensearch.index.analysis.TokenizerFactory): 4