Search in sources :

Example 1 with PreConfiguredCharFilter

Use of org.opensearch.index.analysis.PreConfiguredCharFilter in the project OpenSearch by opensearch-project.

From the class AnalysisModuleTests, method testPluginPreConfiguredCharFilters.

/**
 * Verifies that plugins can register pre-configured char filters whose output varies with the
 * OpenSearch version, with the Lucene version, or not at all — and that each filter's declared
 * multi-term support controls whether it participates in normalization.
 */
public void testPluginPreConfiguredCharFilters() throws IOException {
    // Randomize, per filter, whether it should also run during normalization (multi-term queries).
    boolean noVersionSupportsMultiTerm = randomBoolean();
    boolean luceneVersionSupportsMultiTerm = randomBoolean();
    boolean opensearchVersionSupportsMultiTerm = randomBoolean();
    AnalysisPlugin plugin = new AnalysisPlugin() {

        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            return Arrays.asList(
                // Appends a fixed marker regardless of any version.
                PreConfiguredCharFilter.singleton(
                    "no_version",
                    noVersionSupportsMultiTerm,
                    tokenStream -> new AppendCharFilter(tokenStream, "no_version")
                ),
                // Appends the Lucene version the index was created with.
                PreConfiguredCharFilter.luceneVersion(
                    "lucene_version",
                    luceneVersionSupportsMultiTerm,
                    (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())
                ),
                // Appends the OpenSearch version the index was created with.
                PreConfiguredCharFilter.openSearchVersion(
                    "opensearch_version",
                    opensearchVersionSupportsMultiTerm,
                    (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())
                )
            );
        }

        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
            return singletonMap(
                "keyword",
                (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
                    name,
                    () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
                )
            );
        }
    };
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(plugin))
        .getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    // One analyzer per pre-configured filter, all on the mock keyword tokenizer.
    Settings analyzerSettings = Settings.builder()
        .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.no_version.char_filter", "no_version")
        .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version")
        .put("index.analysis.analyzer.opensearch_version.tokenizer", "keyword")
        .put("index.analysis.analyzer.opensearch_version.char_filter", "opensearch_version")
        .put(IndexMetadata.SETTING_VERSION_CREATED, version)
        .build();
    IndexAnalyzers analyzers = getIndexAnalyzers(registry, analyzerSettings);
    // Full analysis always applies the char filter, so the marker is always appended.
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "testno_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { "test" + version.luceneVersion });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { "test" + version });
    // Normalization applies the filter only when multi-term support was declared above.
    assertEquals(
        "test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
        analyzers.get("no_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
        analyzers.get("lucene_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""),
        analyzers.get("opensearch_version").normalize("", "test").utf8ToString()
    );
}
Also used : Arrays(java.util.Arrays) Matchers.either(org.hamcrest.Matchers.either) Version(org.opensearch.Version) StopTokenFilterFactory(org.opensearch.index.analysis.StopTokenFilterFactory) Collections.singletonList(java.util.Collections.singletonList) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) Directory(org.apache.lucene.store.Directory) Map(java.util.Map) PreConfiguredTokenizer(org.opensearch.index.analysis.PreConfiguredTokenizer) CustomAnalyzer(org.opensearch.index.analysis.CustomAnalyzer) Path(java.nio.file.Path) PreConfiguredTokenFilter(org.opensearch.index.analysis.PreConfiguredTokenFilter) OpenSearchTestCase(org.opensearch.test.OpenSearchTestCase) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) Reader(java.io.Reader) StandardCharsets(java.nio.charset.StandardCharsets) UncheckedIOException(java.io.UncheckedIOException) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) List(java.util.List) MatcherAssert(org.hamcrest.MatcherAssert) Matchers.equalTo(org.hamcrest.Matchers.equalTo) IndexSettings(org.opensearch.index.IndexSettings) TokenFilter(org.apache.lucene.analysis.TokenFilter) BaseTokenStreamTestCase.assertTokenStreamContents(org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents) XContentType(org.opensearch.common.xcontent.XContentType) Dictionary(org.apache.lucene.analysis.hunspell.Dictionary) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers) MyFilterTokenFilterFactory(org.opensearch.index.analysis.MyFilterTokenFilterFactory) IndexSettingsModule(org.opensearch.test.IndexSettingsModule) TestEnvironment(org.opensearch.env.TestEnvironment) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) Tokenizer(org.apache.lucene.analysis.Tokenizer) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) 
CharFilter(org.apache.lucene.analysis.CharFilter) LegacyESVersion(org.opensearch.LegacyESVersion) Analysis(org.opensearch.index.analysis.Analysis) VersionUtils(org.opensearch.test.VersionUtils) Streams(org.opensearch.common.io.Streams) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) AnalysisProvider(org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider) Collections.singletonMap(java.util.Collections.singletonMap) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Environment(org.opensearch.env.Environment) TokenStream(org.apache.lucene.analysis.TokenStream) Files(java.nio.file.Files) BufferedWriter(java.io.BufferedWriter) Analyzer(org.apache.lucene.analysis.Analyzer) IOException(java.io.IOException) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) StringReader(java.io.StringReader) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory) InputStream(java.io.InputStream) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) Version(org.opensearch.Version) LegacyESVersion(org.opensearch.LegacyESVersion) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers) Collections.singletonList(java.util.Collections.singletonList) List(java.util.List) Map(java.util.Map) Collections.singletonMap(java.util.Collections.singletonMap) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)

Example 2 with PreConfiguredCharFilter

Use of org.opensearch.index.analysis.PreConfiguredCharFilter in the project OpenSearch by opensearch-project.

From the class TransportAnalyzeActionTests, method setUp.

@Override
public void setUp() throws Exception {
    super.setUp();
    // Node-level settings: only a temp home path is needed to build an Environment.
    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    // Index settings wiring up: a "custom_analyzer" (standard tokenizer + "mock" filter),
    // a "my_normalizer" (lowercase), a "my_append" char filter with suffix "baz",
    // and a per-index analyze token cap of 100 used by the max-token-count tests below.
    Settings indexSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).put(IndexMetadata.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()).put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard").put("index.analysis.analyzer.custom_analyzer.filter", "mock").put("index.analysis.normalizer.my_normalizer.type", "custom").put("index.analysis.char_filter.my_append.type", "append").put("index.analysis.char_filter.my_append.suffix", "baz").put("index.analyze.max_token_count", 100).putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
    this.indexSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    Environment environment = TestEnvironment.newEnvironment(settings);
    // Inline plugin providing the mock/deprecated token filters, the append char filter
    // (both index-configured and pre-configured variants), and a mock keyword tokenizer.
    AnalysisPlugin plugin = new AnalysisPlugin() {

        // Token filter that drops stopwords; the stopset comes from the "stopword"
        // setting when present, otherwise Lucene's built-in English stopset.
        class MockFactory extends AbstractTokenFilterFactory {

            final CharacterRunAutomaton stopset;

            MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
                if (settings.hasValue("stopword")) {
                    this.stopset = new CharacterRunAutomaton(Automata.makeString(settings.get("stopword")));
                } else {
                    this.stopset = MockTokenFilter.ENGLISH_STOPSET;
                }
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new MockTokenFilter(tokenStream, this.stopset);
            }
        }

        // Pass-through token filter that emits a deprecation warning on every use,
        // for both analysis and normalization paths (distinct deprecation keys).
        class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

            DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                deprecationLogger.deprecate("deprecated_token_filter_create", "Using deprecated token filter [deprecated]");
                return tokenStream;
            }

            @Override
            public TokenStream normalize(TokenStream tokenStream) {
                deprecationLogger.deprecate("deprecated_token_filter_normalize", "Using deprecated token filter [deprecated]");
                return tokenStream;
            }
        }

        // Char filter appending a configurable suffix (default "bar") to the input.
        class AppendCharFilterFactory extends AbstractCharFilterFactory {

            final String suffix;

            AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
                super(indexSettings, name);
                this.suffix = settings.get("suffix", "bar");
            }

            @Override
            public Reader create(Reader reader) {
                return new AppendCharFilter(reader, suffix);
            }
        }

        @Override
        public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
            return singletonMap("append", AppendCharFilterFactory::new);
        }

        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            // Mock keyword tokenizer so the whole input becomes a single token.
            return singletonMap("keyword", (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
        }

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
            filters.put("mock", MockFactory::new);
            filters.put("deprecated", DeprecatedTokenFilterFactory::new);
            return filters;
        }

        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            // Pre-configured variant of "append" with a fixed "foo" suffix and no
            // multi-term (normalization) support.
            return singletonList(PreConfiguredCharFilter.singleton("append", false, reader -> new AppendCharFilter(reader, "foo")));
        }
    };
    registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
    indexAnalyzers = registry.build(this.indexSettings);
    // Node-wide default token cap vs. the per-index cap (100) set above.
    maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.getDefault(settings);
    idxMaxTokenCount = this.indexSettings.getMaxTokenCount();
}
Also used : TestEnvironment(org.opensearch.env.TestEnvironment) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) Version(org.opensearch.Version) HashMap(java.util.HashMap) Collections.singletonList(java.util.Collections.singletonList) AnalysisRegistry(org.opensearch.index.analysis.AnalysisRegistry) AppendCharFilter(org.opensearch.indices.analysis.AnalysisModuleTests.AppendCharFilter) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) AnalyzeAction(org.opensearch.action.admin.indices.analyze.AnalyzeAction) AbstractCharFilterFactory(org.opensearch.index.analysis.AbstractCharFilterFactory) Map(java.util.Map) AnalysisProvider(org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider) Collections.singletonMap(java.util.Collections.singletonMap) UUIDs(org.opensearch.common.UUIDs) Automata(org.apache.lucene.util.automaton.Automata) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Environment(org.opensearch.env.Environment) NormalizingTokenFilterFactory(org.opensearch.index.analysis.NormalizingTokenFilterFactory) TokenStream(org.apache.lucene.analysis.TokenStream) AbstractTokenFilterFactory(org.opensearch.index.analysis.AbstractTokenFilterFactory) OpenSearchTestCase(org.opensearch.test.OpenSearchTestCase) TransportAnalyzeAction(org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction) Settings(org.opensearch.common.settings.Settings) IOException(java.io.IOException) Reader(java.io.Reader) Mockito.when(org.mockito.Mockito.when) IndexService(org.opensearch.index.IndexService) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin) List(java.util.List) AnalysisModule(org.opensearch.indices.analysis.AnalysisModule) 
MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) IndexSettings(org.opensearch.index.IndexSettings) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers) IndexSettingsModule(org.opensearch.test.IndexSettingsModule) Mockito.mock(org.mockito.Mockito.mock) TokenStream(org.apache.lucene.analysis.TokenStream) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) HashMap(java.util.HashMap) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) IndexSettings(org.opensearch.index.IndexSettings) PreConfiguredCharFilter(org.opensearch.index.analysis.PreConfiguredCharFilter) Reader(java.io.Reader) AppendCharFilter(org.opensearch.indices.analysis.AnalysisModuleTests.AppendCharFilter) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TestEnvironment(org.opensearch.env.TestEnvironment) Environment(org.opensearch.env.Environment) AnalysisModule(org.opensearch.indices.analysis.AnalysisModule) AnalysisProvider(org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider) Settings(org.opensearch.common.settings.Settings) IndexSettings(org.opensearch.index.IndexSettings) AnalysisPlugin(org.opensearch.plugins.AnalysisPlugin)

Aggregations

IOException (java.io.IOException)2 Reader (java.io.Reader)2 Collections.singletonList (java.util.Collections.singletonList)2 Collections.singletonMap (java.util.Collections.singletonMap)2 List (java.util.List)2 Map (java.util.Map)2 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 Version (org.opensearch.Version)2 IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata)2 Settings (org.opensearch.common.settings.Settings)2 Environment (org.opensearch.env.Environment)2 TestEnvironment (org.opensearch.env.TestEnvironment)2 IndexSettings (org.opensearch.index.IndexSettings)2 BufferedWriter (java.io.BufferedWriter)1 InputStream (java.io.InputStream)1 StringReader (java.io.StringReader)1 UncheckedIOException (java.io.UncheckedIOException)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1