Use of org.opensearch.index.analysis.PreConfiguredTokenizer in the OpenSearch project by opensearch-project.
From the class AnalysisModule, method setupPreConfiguredTokenizers:
static Map<String, PreConfiguredTokenizer> setupPreConfiguredTokenizers(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenizer> preConfiguredTokenizers = new NamedRegistry<>("pre-configured tokenizer");
    // Temporary shim to register old-style pre-configured tokenizers
    for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
        String name = tokenizer.name().toLowerCase(Locale.ROOT);
        PreConfiguredTokenizer preConfigured;
        switch (tokenizer.getCachingStrategy()) {
            case ONE:
                preConfigured = PreConfiguredTokenizer.singleton(name, () -> tokenizer.create(Version.CURRENT));
                break;
            default:
                throw new UnsupportedOperationException("Caching strategy unsupported by temporary shim [" + tokenizer + "]");
        }
        preConfiguredTokenizers.register(name, preConfigured);
    }
    for (AnalysisPlugin plugin : plugins) {
        for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
            preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
        }
    }
    return unmodifiableMap(preConfiguredTokenizers.getRegistry());
}
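The second loop above is the extension point: any AnalysisPlugin can contribute pre-configured tokenizers that land in this registry. A minimal sketch of such a plugin (not from the OpenSearch source; the class name MyAnalysisPlugin and the tokenizer name my_whitespace are hypothetical):

import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

// Hypothetical plugin contributing one pre-configured tokenizer.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        // singleton(): the tokenizer does not vary by index version,
        // so a single registry entry is shared by all indices.
        return List.of(PreConfiguredTokenizer.singleton("my_whitespace", WhitespaceTokenizer::new));
    }
}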
Use of org.opensearch.index.analysis.PreConfiguredTokenizer in the OpenSearch project by opensearch-project.
From the class CommonAnalysisPlugin, method getPreConfiguredTokenizers:
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new));
    tokenizers.add(PreConfiguredTokenizer.openSearchVersion("edge_ngram", (version) -> {
        if (version.onOrAfter(LegacyESVersion.V_7_3_0)) {
            return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
        }
        return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
    }));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1)));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new));
    // TODO deprecate and remove in API
    // This is already broken with normalization, so backwards compat isn't necessary?
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));
    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.openSearchVersion("nGram", (version) -> {
        if (version.onOrAfter(LegacyESVersion.V_7_6_0)) {
            deprecationLogger.deprecate(
                "nGram_tokenizer_deprecation",
                "The [nGram] tokenizer name is deprecated and will be removed in a future version. "
                    + "Please change the tokenizer name to [ngram] instead."
            );
        }
        return new NGramTokenizer();
    }));
    tokenizers.add(PreConfiguredTokenizer.openSearchVersion("edgeNGram", (version) -> {
        if (version.onOrAfter(LegacyESVersion.V_7_6_0)) {
            deprecationLogger.deprecate(
                "edgeNGram_tokenizer_deprecation",
                "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
                    + "Please change the tokenizer name to [edge_ngram] instead."
            );
        }
        if (version.onOrAfter(LegacyESVersion.V_7_3_0)) {
            return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
        }
        return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
    }));
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));
    return tokenizers;
}
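Because these tokenizers are registered up front, an index can reference them by name without defining anything under index.analysis.tokenizer. A minimal sketch of such index settings, in the builder style used elsewhere on this page (the class and analyzer names are hypothetical):

import org.opensearch.common.settings.Settings;

class IndexSettingsSketch {
    // Hypothetical index settings: the custom analyzer references the
    // pre-configured "edge_ngram" tokenizer purely by name; no tokenizer
    // definition is needed because CommonAnalysisPlugin registered it.
    static final Settings INDEX_SETTINGS = Settings.builder()
        .put("index.analysis.analyzer.my_analyzer.type", "custom")
        .put("index.analysis.analyzer.my_analyzer.tokenizer", "edge_ngram")
        .build();
}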
Use of org.opensearch.index.analysis.PreConfiguredTokenizer in the OpenSearch project by opensearch-project.
From the class AnalysisModuleTests, method testPluginPreConfiguredTokenizers:
/**
 * Tests that plugins can register pre-configured tokenizers that vary in behavior based on OpenSearch version,
 * based on Lucene version, or that do not vary with version at all.
 */
public void testPluginPreConfiguredTokenizers() throws IOException {
    // Simple tokenizer that always spits out a single token with some preconfigured characters
    final class FixedTokenizer extends Tokenizer {
        private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        private final char[] chars;
        private boolean read = false;

        protected FixedTokenizer(String chars) {
            this.chars = chars.toCharArray();
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (read) {
                return false;
            }
            clearAttributes();
            read = true;
            term.resizeBuffer(chars.length);
            System.arraycopy(chars, 0, term.buffer(), 0, chars.length);
            term.setLength(chars.length);
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            read = false;
        }
    }
    AnalysisRegistry registry = new AnalysisModule(TestEnvironment.newEnvironment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
        @Override
        public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
            return Arrays.asList(
                PreConfiguredTokenizer.singleton("no_version", () -> new FixedTokenizer("no_version")),
                PreConfiguredTokenizer.luceneVersion("lucene_version", luceneVersion -> new FixedTokenizer(luceneVersion.toString())),
                PreConfiguredTokenizer.openSearchVersion("opensearch_version", esVersion -> new FixedTokenizer(esVersion.toString()))
            );
        }
    })).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(
        registry,
        Settings.builder()
            .put("index.analysis.analyzer.no_version.tokenizer", "no_version")
            .put("index.analysis.analyzer.lucene_version.tokenizer", "lucene_version")
            .put("index.analysis.analyzer.opensearch_version.tokenizer", "opensearch_version")
            .put(IndexMetadata.SETTING_VERSION_CREATED, version)
            .build()
    );
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "no_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { version.luceneVersion.toString() });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { version.toString() });
    // These are currently broken by https://github.com/elastic/elasticsearch/issues/24752
    // assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
    //     analyzers.get("no_version").normalize("", "test").utf8ToString());
    // assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
    //     analyzers.get("lucene_version").normalize("", "test").utf8ToString());
    // assertEquals("test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""),
    //     analyzers.get("opensearch_version").normalize("", "test").utf8ToString());
}
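The test above exercises all three PreConfiguredTokenizer factory methods seen on this page. As a short recap of what each one means for caching (an illustrative sketch, not from the source; the class and tokenizer names are hypothetical):

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.opensearch.index.analysis.PreConfiguredTokenizer;

class CachingStrategySketch {
    static List<PreConfiguredTokenizer> variants() {
        return Arrays.asList(
            // singleton: one cached entry, identical regardless of the version
            // an index was created with.
            PreConfiguredTokenizer.singleton("fixed", WhitespaceTokenizer::new),
            // luceneVersion: cached per Lucene version, for tokenizers whose
            // behavior tracks the Lucene version of the index.
            PreConfiguredTokenizer.luceneVersion("by_lucene", luceneVersion -> new WhitespaceTokenizer()),
            // openSearchVersion: cached per OpenSearch version, as used for
            // "edge_ngram" in CommonAnalysisPlugin above.
            PreConfiguredTokenizer.openSearchVersion("by_opensearch", openSearchVersion -> new WhitespaceTokenizer())
        );
    }
}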