Use of org.opensearch.index.analysis.TokenizerFactory in project OpenSearch by opensearch-project.
Class ScriptedConditionTokenFilterFactory, method getChainAwareTokenFilterFactory:
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(
    TokenizerFactory tokenizer,
    List<CharFilterFactory> charFilters,
    List<TokenFilterFactory> previousTokenFilters,
    Function<String, TokenFilterFactory> allFilters
) {
    List<TokenFilterFactory> filters = new ArrayList<>();
    List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
    for (String filter : filterNames) {
        TokenFilterFactory tff = allFilters.apply(filter);
        if (tff == null) {
            throw new IllegalArgumentException(
                "ScriptedConditionTokenFilter [" + name() + "] refers to undefined token filter [" + filter + "]"
            );
        }
        tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
        filters.add(tff);
        existingChain.add(tff);
    }
    return new TokenFilterFactory() {

        @Override
        public String name() {
            return ScriptedConditionTokenFilterFactory.this.name();
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            Function<TokenStream, TokenStream> filter = in -> {
                for (TokenFilterFactory tff : filters) {
                    in = tff.create(in);
                }
                return in;
            };
            return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
        }
    };
}
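The anonymous factory above folds the resolved filters into a single Function and hands it to ScriptedConditionTokenFilter, which builds on Lucene's ConditionalTokenFilter: each token is routed through the inner chain only when a predicate approves it. Here is a minimal, self-contained sketch of that underlying pattern in plain Lucene (assuming lucene-core and lucene-analysis-common on the classpath; the LongTokenCondition class and its length cutoff are illustrative stand-ins for the script-driven condition, not OpenSearch code):

import java.io.IOException;
import java.io.StringReader;
import java.util.function.Function;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConditionalFilterSketch {

    // Routes a token through the wrapped filters only when it is longer than the cutoff.
    static final class LongTokenCondition extends ConditionalTokenFilter {
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final int cutoff;

        LongTokenCondition(TokenStream in, Function<TokenStream, TokenStream> inner, int cutoff) {
            super(in, inner);
            this.cutoff = cutoff;
        }

        @Override
        protected boolean shouldFilter() {
            return termAtt.length() > cutoff;
        }
    }

    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("FOO BAZINGA"));
        // The inner chain plays the role of the Function assembled in create() above.
        Function<TokenStream, TokenStream> inner = LowerCaseFilter::new;
        try (TokenStream ts = new LongTokenCondition(tokenizer, inner, 4)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // prints "FOO" (bypassed) then "bazinga" (filtered)
            }
            ts.end();
        }
    }
}

Because the condition wraps the whole inner Function, a filter in the conditional chain never sees tokens the predicate rejects.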
Use of org.opensearch.index.analysis.TokenizerFactory in project OpenSearch by opensearch-project.
Class AnalysisModuleTests, method testPluginPreConfiguredCharFilters:
/**
 * Tests that plugins can register pre-configured char filters whose behavior varies with the OpenSearch version,
 * with the Lucene version, or not at all.
 */
public void testPluginPreConfiguredCharFilters() throws IOException {
    boolean noVersionSupportsMultiTerm = randomBoolean();
    boolean luceneVersionSupportsMultiTerm = randomBoolean();
    boolean opensearchVersionSupportsMultiTerm = randomBoolean();
    AnalysisRegistry registry = new AnalysisModule(
        TestEnvironment.newEnvironment(emptyNodeSettings),
        singletonList(new AnalysisPlugin() {

            @Override
            public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
                return Arrays.asList(
                    PreConfiguredCharFilter.singleton(
                        "no_version",
                        noVersionSupportsMultiTerm,
                        tokenStream -> new AppendCharFilter(tokenStream, "no_version")
                    ),
                    PreConfiguredCharFilter.luceneVersion(
                        "lucene_version",
                        luceneVersionSupportsMultiTerm,
                        (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())
                    ),
                    PreConfiguredCharFilter.openSearchVersion(
                        "opensearch_version",
                        opensearchVersionSupportsMultiTerm,
                        (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())
                    )
                );
            }

            @Override
            public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
                // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
                return singletonMap(
                    "keyword",
                    (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory(
                        name,
                        () -> new MockTokenizer(MockTokenizer.KEYWORD, false)
                    )
                );
            }
        })
    ).getAnalysisRegistry();
    Version version = VersionUtils.randomVersion(random());
    IndexAnalyzers analyzers = getIndexAnalyzers(
        registry,
        Settings.builder()
            .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.no_version.char_filter", "no_version")
            .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version")
            .put("index.analysis.analyzer.opensearch_version.tokenizer", "keyword")
            .put("index.analysis.analyzer.opensearch_version.char_filter", "opensearch_version")
            .put(IndexMetadata.SETTING_VERSION_CREATED, version)
            .build()
    );
    assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] { "testno_version" });
    assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] { "test" + version.luceneVersion });
    assertTokenStreamContents(analyzers.get("opensearch_version").tokenStream("", "test"), new String[] { "test" + version });
    assertEquals(
        "test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
        analyzers.get("no_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
        analyzers.get("lucene_version").normalize("", "test").utf8ToString()
    );
    assertEquals(
        "test" + (opensearchVersionSupportsMultiTerm ? version.toString() : ""),
        analyzers.get("opensearch_version").normalize("", "test").utf8ToString()
    );
}
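AppendCharFilter is a helper from the OpenSearch test framework that appends a marker string to the character stream, which is what makes the version-specific suffixes observable in the assertions above. As a rough illustration of what such a char filter involves, here is a minimal sketch (the AppendingCharFilter name and the identity correct() mapping are assumptions for illustration, not the test helper's actual code):

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharFilter;

// Emits the wrapped reader's characters, then a fixed suffix.
final class AppendingCharFilter extends CharFilter {
    private final String suffix;
    private int suffixPos = 0;
    private boolean inputDone = false;

    AppendingCharFilter(Reader in, String suffix) {
        super(in);
        this.suffix = suffix;
    }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (!inputDone) {
            int n = input.read(cbuf, off, len);
            if (n != -1) {
                return n;
            }
            inputDone = true;
        }
        if (suffixPos >= suffix.length()) {
            return -1; // original text and suffix both exhausted
        }
        int n = Math.min(len, suffix.length() - suffixPos);
        suffix.getChars(suffixPos, suffixPos + n, cbuf, off);
        suffixPos += n;
        return n;
    }

    @Override
    protected int correct(int currentOff) {
        // Identity mapping: offsets inside the appended suffix point past the
        // end of the original input, which is acceptable for this sketch.
        return currentOff;
    }
}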
Use of org.opensearch.index.analysis.TokenizerFactory in project OpenSearch by opensearch-project.
Class SynonymsAnalysisTests, method testTokenFiltersBypassSynonymAnalysis:
public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
    Settings settings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
        .put("path.home", createTempDir().toString())
        .putList("word_list", "a")
        .put("hyphenation_patterns_path", "foo")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    String[] bypassingFactories = new String[] { "dictionary_decompounder" };
    CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
    for (String factory : bypassingFactories) {
        TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
        TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
        SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
        Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
        try (TokenStream ts = analyzer.tokenStream("field", "text")) {
            assertThat(ts, instanceOf(KeywordTokenizer.class));
        }
    }
}
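The mechanism being tested: buildSynonymAnalyzer asks each token filter factory for its synonym-parsing form via getSynonymFilter(), and a "bypassing" factory answers with the identity filter so the synonym rules are parsed without it, leaving the bare KeywordTokenizer the test asserts on. A minimal sketch of such a factory (BypassingFilterFactory and the lowercase filter are illustrative choices; dictionary_decompounder is what the test actually exercises):

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.opensearch.index.analysis.TokenFilterFactory;

final class BypassingFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "bypassing_lowercase";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new LowerCaseFilter(tokenStream); // normal analysis chain
    }

    @Override
    public TokenFilterFactory getSynonymFilter() {
        // Step aside while the synonym rules themselves are being parsed.
        return IDENTITY_FILTER;
    }
}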
Use of org.opensearch.index.analysis.TokenizerFactory in project OpenSearch by opensearch-project.
Class SynonymsAnalysisTests, method testDisallowedTokenFilters:
public void testDisallowedTokenFilters() throws IOException {
    Settings settings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, Version.CURRENT))
        .put("path.home", createTempDir().toString())
        .putList("common_words", "a", "b")
        .put("output_unigrams", "true")
        .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
    String[] disallowedFactories = new String[] {
        "multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram",
        "word_delimiter", "word_delimiter_graph", "fingerprint"
    };
    for (String factory : disallowedFactories) {
        TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
        TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
        SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
        IllegalArgumentException e = expectThrows(
            IllegalArgumentException.class,
            "Expected IllegalArgumentException for factory " + factory,
            () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
        );
        assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
    }
}
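Conversely, a factory that must not take part in synonym parsing can veto it from the same getSynonymFilter() hook, and the message thrown there is exactly what the test asserts. A sketch of how the disallowed factories plausibly implement this (not copied from CommonAnalysisPlugin; the class name and no-op create() are placeholders):

import org.apache.lucene.analysis.TokenStream;
import org.opensearch.index.analysis.TokenFilterFactory;

final class SynonymUnsafeFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "synonym_unsafe_ngram";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return tokenStream; // a real factory would build its n-gram / delimiter filter here
    }

    @Override
    public TokenFilterFactory getSynonymFilter() {
        // Multi-token outputs would corrupt the synonym rules, so refuse outright
        // rather than silently misparse them.
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
}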
Use of org.opensearch.index.analysis.TokenizerFactory in project OpenSearch by opensearch-project.
Class TransportAnalyzeAction, method detailAnalyze:
private static AnalyzeAction.DetailAnalyzeResponse detailAnalyze(AnalyzeAction.Request request, Analyzer analyzer, int maxTokenCount) {
    AnalyzeAction.DetailAnalyzeResponse detailResponse;
    final Set<String> includeAttributes = new HashSet<>();
    if (request.attributes() != null) {
        for (String attribute : request.attributes()) {
            includeAttributes.add(attribute.toLowerCase(Locale.ROOT));
        }
    }
    // maybe unwrap the analyzer from a NamedAnalyzer
    Analyzer potentialCustomAnalyzer = analyzer;
    if (analyzer instanceof NamedAnalyzer) {
        potentialCustomAnalyzer = ((NamedAnalyzer) analyzer).analyzer();
    }
    if (potentialCustomAnalyzer instanceof AnalyzerComponentsProvider) {
        AnalyzerComponentsProvider customAnalyzer = (AnalyzerComponentsProvider) potentialCustomAnalyzer;
        // note: this is not field-name dependent in our cases so we can leave out the argument
        int positionIncrementGap = potentialCustomAnalyzer.getPositionIncrementGap("");
        int offsetGap = potentialCustomAnalyzer.getOffsetGap("");
        AnalyzerComponents components = customAnalyzer.getComponents();
        // split the chain into char filters, tokenizer, and token filters
        CharFilterFactory[] charFilterFactories = components.getCharFilters();
        TokenizerFactory tokenizerFactory = components.getTokenizerFactory();
        TokenFilterFactory[] tokenFilterFactories = components.getTokenFilters();
        String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
        TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ? tokenFilterFactories.length : 0];
        TokenListCreator tokenizerTokenListCreator = new TokenListCreator(maxTokenCount);
        for (int textIndex = 0; textIndex < request.text().length; textIndex++) {
            String charFilteredSource = request.text()[textIndex];
            Reader reader = new StringReader(charFilteredSource);
            if (charFilterFactories != null) {
                for (int charFilterIndex = 0; charFilterIndex < charFilterFactories.length; charFilterIndex++) {
                    reader = charFilterFactories[charFilterIndex].create(reader);
                    // run a second copy of the source through this char filter so its
                    // intermediate output can be written into the response
                    Reader readerForWriteOut = new StringReader(charFilteredSource);
                    readerForWriteOut = charFilterFactories[charFilterIndex].create(readerForWriteOut);
                    charFilteredSource = writeCharStream(readerForWriteOut);
                    charFiltersTexts[charFilterIndex][textIndex] = charFilteredSource;
                }
            }
            // analyze with the tokenizer only
            Tokenizer tokenizer = tokenizerFactory.create();
            tokenizer.setReader(reader);
            tokenizerTokenListCreator.analyze(tokenizer, includeAttributes, positionIncrementGap, offsetGap);
            // analyze the output of each token filter
            if (tokenFilterFactories != null) {
                for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFilterFactories.length; tokenFilterIndex++) {
                    if (tokenFiltersTokenListCreator[tokenFilterIndex] == null) {
                        tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator(maxTokenCount);
                    }
                    TokenStream stream = createStackedTokenStream(
                        request.text()[textIndex],
                        charFilterFactories,
                        tokenizerFactory,
                        tokenFilterFactories,
                        tokenFilterIndex + 1
                    );
                    tokenFiltersTokenListCreator[tokenFilterIndex].analyze(stream, includeAttributes, positionIncrementGap, offsetGap);
                }
            }
        }
        AnalyzeAction.CharFilteredText[] charFilteredLists = new AnalyzeAction.CharFilteredText[charFiltersTexts.length];
        if (charFilterFactories != null) {
            for (int charFilterIndex = 0; charFilterIndex < charFiltersTexts.length; charFilterIndex++) {
                charFilteredLists[charFilterIndex] = new AnalyzeAction.CharFilteredText(
                    charFilterFactories[charFilterIndex].name(),
                    charFiltersTexts[charFilterIndex]
                );
            }
        }
        AnalyzeAction.AnalyzeTokenList[] tokenFilterLists = new AnalyzeAction.AnalyzeTokenList[tokenFiltersTokenListCreator.length];
        if (tokenFilterFactories != null) {
            for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFiltersTokenListCreator.length; tokenFilterIndex++) {
                tokenFilterLists[tokenFilterIndex] = new AnalyzeAction.AnalyzeTokenList(
                    tokenFilterFactories[tokenFilterIndex].name(),
                    tokenFiltersTokenListCreator[tokenFilterIndex].getArrayTokens()
                );
            }
        }
        detailResponse = new AnalyzeAction.DetailAnalyzeResponse(
            charFilteredLists,
            new AnalyzeAction.AnalyzeTokenList(tokenizerFactory.name(), tokenizerTokenListCreator.getArrayTokens()),
            tokenFilterLists
        );
    } else {
        String name;
        if (analyzer instanceof NamedAnalyzer) {
            name = ((NamedAnalyzer) analyzer).name();
        } else {
            name = analyzer.getClass().getName();
        }
        TokenListCreator tokenListCreator = new TokenListCreator(maxTokenCount);
        for (String text : request.text()) {
            tokenListCreator.analyze(analyzer.tokenStream("", text), includeAttributes, analyzer.getPositionIncrementGap(""), analyzer.getOffsetGap(""));
        }
        detailResponse = new AnalyzeAction.DetailAnalyzeResponse(new AnalyzeAction.AnalyzeTokenList(name, tokenListCreator.getArrayTokens()));
    }
    return detailResponse;
}
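createStackedTokenStream is not shown in this excerpt. A plausible reconstruction (the stackedStream name is mine, not the actual helper): rebuild the whole chain from the raw text and apply only the first `current` token filters, so the stream captured for filter N reflects everything before it plus filter N itself:

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

// Sketch: re-run the char filters and tokenizer, then the first `current` token filters.
static TokenStream stackedStream(
    String text,
    CharFilterFactory[] charFilterFactories,
    TokenizerFactory tokenizerFactory,
    TokenFilterFactory[] tokenFilterFactories,
    int current
) {
    Reader reader = new StringReader(text);
    if (charFilterFactories != null) {
        for (CharFilterFactory charFilter : charFilterFactories) {
            reader = charFilter.create(reader);
        }
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream stream = tokenizer;
    for (int i = 0; i < current; i++) {
        stream = tokenFilterFactories[i].create(stream);
    }
    return stream;
}

Calling it with current = tokenFilterIndex + 1, as the loop above does, yields one independent stream per filter stage instead of mutating a shared one.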