Use of org.opensearch.plugins.AnalysisPlugin in project OpenSearch by opensearch-project.
From the class AnalysisModuleTests, method testPluginPreConfiguredCharFilters.
/**
 * Tests that plugins can register pre-configured char filters that vary in behavior based on the OpenSearch version, the Lucene
 * version, or that do not vary based on version at all.
 */
public void testPluginPreConfiguredCharFilters() throws IOException {
    boolean noVersionSupportsMultiTerm = randomBoolean();
    boolean luceneVersionSupportsMultiTerm = randomBoolean();
    boolean opensearchVersionSupportsMultiTerm = randomBoolean();
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            return Arrays.asList(
                PreConfiguredCharFilter.singleton(
                    "no_version",
                    noVersionSupportsMultiTerm,
                    tokenStream -> new AppendCharFilter(tokenStream, "no_version")
                ),
                PreConfiguredCharFilter.luceneVersion(
                    "lucene_version",
                    luceneVersionSupportsMultiTerm,
                    (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())
                ),
                PreConfiguredCharFilter.openSearchVersion(
                    "opensearch_version",
                    opensearchVersionSupportsMultiTerm,
                    (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())
                )
            );
        }

        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
            return singletonMap(
                "keyword",
                (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
                    name,
                    () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
                )
            );
        }
    })).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(
        registry,
        Settings.builder()
            .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.no_version.char_filter", "no_version")
            .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version")
            .put("index.analysis.analyzer.opensearch_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.opensearch_version.char_filter", "opensearch_version")
            .put(IndexMetadata.SETTING_VERSION_CREATED, version)
            .build()
    );
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "testno_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { "test" + version.luceneVersion });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { "test" + version });
    assertEquals(
        "test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
        analyzers.get("no_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
        analyzers.get("lucene_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""),
        analyzers.get("opensearch_version").normalize("", "test").utf8ToString()
    );
}
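The AppendCharFilter used throughout this example is a helper from the OpenSearch test sources that appends a fixed string to whatever the wrapped reader produces. For readers without the tree checked out, here is a minimal sketch of such a filter against plain Lucene; the class name AppendingCharFilter and the offset handling are illustrative, not the project's actual implementation.

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharFilter;

// Sketch: pass the wrapped reader through unchanged, then emit a fixed suffix
// once the input is exhausted.
public class AppendingCharFilter extends CharFilter {
    private final char[] suffix;
    private int suffixPos = -1; // -1 while the wrapped reader still has data
    private int inputLen = 0;   // characters read from the wrapped reader so far

    public AppendingCharFilter(Reader in, String suffix) {
        super(in);
        this.suffix = suffix.toCharArray();
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (suffixPos < 0) {
            int n = input.read(cbuf, off, len);
            if (n != -1) {
                inputLen += n;
                return n;
            }
            suffixPos = 0; // input exhausted, switch over to the suffix
        }
        if (suffixPos >= suffix.length) {
            return -1;
        }
        int n = Math.min(len, suffix.length - suffixPos);
        System.arraycopy(suffix, suffixPos, cbuf, off, n);
        suffixPos += n;
        return n;
    }

    @Override
    protected int correct(int currentOff) {
        // Offsets within the original input are unchanged; suffix characters
        // all map back to the end of the input.
        return Math.min(currentOff, inputLen);
    }
}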
Use of org.opensearch.plugins.AnalysisPlugin in project OpenSearch by opensearch-project.
From the class AnalysisRegistryTests, method testDeprecationsAndExceptions.
public void testDeprecationsAndExceptions() throws IOException {
    AnalysisPlugin plugin = new AnalysisPlugin() {
        class MockFactory extends AbstractTokenFilterFactory {
            MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
                    deprecationLogger.deprecate("deprecated_token_filter", "Using deprecated token filter [deprecated]");
                }
                return tokenStream;
            }
        }

        class ExceptionFactory extends AbstractTokenFilterFactory {
            ExceptionFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
                    throw new IllegalArgumentException("Cannot use token filter [exception]");
                }
                return tokenStream;
            }
        }

        class UnusedMockFactory extends AbstractTokenFilterFactory {
            UnusedMockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                deprecationLogger.deprecate("unused_token_filter", "Using deprecated token filter [unused]");
                return tokenStream;
            }
        }

        class NormalizerFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
            NormalizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                deprecationLogger.deprecate("deprecated_normalizer", "Using deprecated token filter [deprecated_normalizer]");
                return tokenStream;
            }
        }

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
            filters.put("deprecated", MockFactory::new);
            filters.put("unused", UnusedMockFactory::new);
            filters.put("deprecated_normalizer", NormalizerFactory::new);
            filters.put("exception", ExceptionFactory::new);
            return filters;
        }
    };

    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .put("index.analysis.filter.deprecated.type", "deprecated")
        .put("index.analysis.analyzer.custom.tokenizer", "standard")
        .putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(plugin)).getAnalysisRegistry().build(idxSettings);
    // We should only get a warning from the token filter that is referenced in settings
    assertWarnings("Using deprecated token filter [deprecated]");

    indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.getPreviousVersion())
        .put("index.analysis.filter.deprecated.type", "deprecated_normalizer")
        .putList("index.analysis.normalizer.custom.filter", "lowercase", "deprecated_normalizer")
        .put("index.analysis.filter.deprecated.type", "deprecated")
        .put("index.analysis.filter.exception.type", "exception")
        .put("index.analysis.analyzer.custom.tokenizer", "standard")
        .putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated", "exception")
        .build();
    idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(plugin)).getAnalysisRegistry().build(idxSettings);
    // We should only get a warning from the normalizer, because we're on a version where 'deprecated' works fine
    assertWarnings("Using deprecated token filter [deprecated_normalizer]");

    indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .put("index.analysis.filter.exception.type", "exception")
        .put("index.analysis.analyzer.custom.tokenizer", "standard")
        .putList("index.analysis.analyzer.custom.filter", "lowercase", "exception")
        .build();
    IndexSettings exceptionSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
        new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(plugin)).getAnalysisRegistry().build(exceptionSettings);
    });
    assertEquals("Cannot use token filter [exception]", e.getMessage());
}
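The pattern this test exercises in bulk can be distilled: a token filter factory reads the version the index was created with from its IndexSettings and gates behavior on it. Below is a minimal sketch under the same assumptions as the factories above (same AbstractTokenFilterFactory base, same deprecationLogger); the class name, deprecation key, and message are invented for illustration.

class VersionGatedFactory extends AbstractTokenFilterFactory {
    VersionGatedFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Warn only for indices created on the current version; indices created
        // on earlier versions keep the legacy behavior without a warning.
        if (indexSettings.getIndexVersionCreated().onOrAfter(Version.CURRENT)) {
            deprecationLogger.deprecate("version_gated_filter", "token filter [" + name() + "] is deprecated");
        }
        return tokenStream;
    }
}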
Use of org.opensearch.plugins.AnalysisPlugin in project OpenSearch by opensearch-project.
From the class AnalysisRegistryTests, method testConfigureCamelCaseTokenFilter.
/**
* Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
*/
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .put("index.analysis.filter.testFilter.type", "mock")
        .put("index.analysis.filter.test_filter.type", "mock")
        .put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
        .putList("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
        .put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
        .putList("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    /* The snake_case version of the name should not filter out any stopwords while the
     * camelCase version will filter out English stopwords. */
    AnalysisPlugin plugin = new AnalysisPlugin() {
        class MockFactory extends AbstractTokenFilterFactory {
            MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                if (name().equals("test_filter")) {
                    return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
                }
                return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
            }
        }

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("mock", MockFactory::new);
        }
    };
    IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(plugin))
        .getAnalysisRegistry()
        .build(idxSettings);
    // This shouldn't contain English stopwords
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        assertTrue(tokenStream.incrementToken());
        assertEquals("has", charTermAttribute.toString());
        assertTrue(tokenStream.incrementToken());
        assertEquals("foo", charTermAttribute.toString());
        assertFalse(tokenStream.incrementToken());
    }
    // This *should* contain English stopwords
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        assertTrue(tokenStream.incrementToken());
        assertEquals("has", charTermAttribute.toString());
        assertTrue(tokenStream.incrementToken());
        assertEquals("a", charTermAttribute.toString());
        assertTrue(tokenStream.incrementToken());
        assertEquals("foo", charTermAttribute.toString());
        assertFalse(tokenStream.incrementToken());
    }
}
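Note that these assertions stop after the final incrementToken(); the full Lucene TokenStream contract also requires end() and close() once consumption finishes. A minimal draining loop that honors the whole workflow, using the same analyzer name and sample text as the test above:

try (TokenStream ts = indexAnalyzers.get("custom_analyzer_with_snake_case").tokenStream("foo", "has a foo")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                   // required before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(term); // each token surviving the filter chain
    }
    ts.end();                     // records end-of-stream state such as the final offset
}                                 // try-with-resources closes the stream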
Use of org.opensearch.plugins.AnalysisPlugin in project OpenSearch by opensearch-project.
From the class TransportAnalyzeActionTests, method setUp.
@Override
public void setUp() throws Exception {
    super.setUp();
    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .put(IndexMetadata.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
        .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
        .put("index.analysis.analyzer.custom_analyzer.filter", "mock")
        .put("index.analysis.normalizer.my_normalizer.type", "custom")
        .put("index.analysis.char_filter.my_append.type", "append")
        .put("index.analysis.char_filter.my_append.suffix", "baz")
        .put("index.analyze.max_token_count", 100)
        .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase")
        .build();
    this.indexSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    Environment environment = TestEnvironment.newEnvironment(settings);
    AnalysisPlugin plugin = new AnalysisPlugin() {
        class MockFactory extends AbstractTokenFilterFactory {
            final CharacterRunAutomaton stopset;

            MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
                if (settings.hasValue("stopword")) {
                    this.stopset = new CharacterRunAutomaton(Automata.makeString(settings.get("stopword")));
                } else {
                    this.stopset = MockTokenFilter.ENGLISH_STOPSET;
                }
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new MockTokenFilter(tokenStream, this.stopset);
            }
        }

        class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
            DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
                super(indexSettings, name, settings);
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                deprecationLogger.deprecate("deprecated_token_filter_create", "Using deprecated token filter [deprecated]");
                return tokenStream;
            }

            @Override
            public TokenStream normalize(TokenStream tokenStream) {
                deprecationLogger.deprecate("deprecated_token_filter_normalize", "Using deprecated token filter [deprecated]");
                return tokenStream;
            }
        }

        class AppendCharFilterFactory extends AbstractCharFilterFactory {
            final String suffix;

            AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
                super(indexSettings, name);
                this.suffix = settings.get("suffix", "bar");
            }

            @Override
            public Reader create(Reader reader) {
                return new AppendCharFilter(reader, suffix);
            }
        }

        @Override
        public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
            return singletonMap("append", AppendCharFilterFactory::new);
        }

        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            return singletonMap(
                "keyword",
                (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
                    name,
                    () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
                )
            );
        }

        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
            filters.put("mock", MockFactory::new);
            filters.put("deprecated", DeprecatedTokenFilterFactory::new);
            return filters;
        }

        @Override
        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
            return singletonList(PreConfiguredCharFilter.singleton("append", false, reader -> new AppendCharFilter(reader, "foo")));
        }
    };
    registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
    indexAnalyzers = registry.build(this.indexSettings);
    maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.getDefault(settings);
    idxMaxTokenCount = this.indexSettings.getMaxTokenCount();
}
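This setUp registers custom and pre-configured char filters; AnalysisPlugin supports the same pre-configured pattern for token filters through getPreConfiguredTokenFilters(). A hedged sketch of that hook, with Lucene's LowerCaseFilter standing in for a real filter and the name my_lowercase invented for illustration; the boolean mirrors the one passed to PreConfiguredCharFilter.singleton above and controls whether the filter also applies to multi-term queries.

@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    // "my_lowercase" is a made-up name; LowerCaseFilter wraps any TokenStream.
    return singletonList(PreConfiguredTokenFilter.singleton("my_lowercase", true, LowerCaseFilter::new));
}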
Use of org.opensearch.plugins.AnalysisPlugin in project OpenSearch by opensearch-project.
From the class AnalysisModuleTests, method testPluginPreConfiguredTokenizers.
/**
 * Tests that plugins can register pre-configured tokenizers that vary in behavior based on the OpenSearch version, the Lucene
 * version, or that do not vary based on version at all.
 */
public void testPluginPreConfiguredTokenizers() throws IOException {
    // Simple tokenizer that always spits out a single token with some preconfigured characters
    final class FixedTokenizer extends Tokenizer {
        private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        private final char[] chars;
        private boolean read = false;

        protected FixedTokenizer(String chars) {
            this.chars = chars.toCharArray();
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (read) {
                return false;
            }
            clearAttributes();
            read = true;
            term.resizeBuffer(chars.length);
            System.arraycopy(chars, 0, term.buffer(), 0, chars.length);
            term.setLength(chars.length);
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            read = false;
        }
    }
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
        @Override
        public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
            return Arrays.asList(
                PreConfiguredTokenizer.singleton("no_version", () -> new FixedTokenizer("no_version")),
                PreConfiguredTokenizer.luceneVersion("lucene_version", luceneVersion -> new FixedTokenizer(luceneVersion.toString())),
                PreConfiguredTokenizer.openSearchVersion("opensearch_version", esVersion -> new FixedTokenizer(esVersion.toString()))
            );
        }
    })).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(
        registry,
        Settings.builder()
            .put("index.analysis.analyzer.no_version.tokenizer", "no_version")
            .put("index.analysis.analyzer.lucene_version.tokenizer", "lucene_version")
            .put("index.analysis.analyzer.opensearch_version.tokenizer", "opensearch_version")
            .put(IndexMetadata.SETTING_VERSION_CREATED, version)
            .build()
    );
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "no_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { version.luceneVersion.toString() });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { version.toString() });
    // These are currently broken by https://github.com/elastic/elasticsearch/issues/24752
    // assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
    //     analyzers.get("no_version").normalize("", "test").utf8ToString());
    // assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
    //     analyzers.get("lucene_version").normalize("", "test").utf8ToString());
    // assertEquals("test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""),
    //     analyzers.get("opensearch_version").normalize("", "test").utf8ToString());
}
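FixedTokenizer can also be exercised on its own, which makes its contract obvious: one token per reset, regardless of the reader's contents. A short sketch, assuming the class defined in the test above is in scope and java.io.StringReader is imported:

FixedTokenizer tokenizer = new FixedTokenizer("hello");
tokenizer.setReader(new StringReader("this input is never read"));
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
tokenizer.reset();
assertTrue(tokenizer.incrementToken());  // exactly one token...
assertEquals("hello", term.toString()); // ...holding the fixed characters
assertFalse(tokenizer.incrementToken());
tokenizer.end();
tokenizer.close();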