Search in sources:

Example 1 with CharFilterFactory

use of org.opensearch.index.analysis.CharFilterFactory in project OpenSearch by opensearch-project.

From the class ScriptedConditionTokenFilterFactory, method getChainAwareTokenFilterFactory:

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters, List<TokenFilterFactory> previousTokenFilters, Function<String, TokenFilterFactory> allFilters) {
    // Resolve each referenced filter name into a chain-aware factory. The visible
    // chain grows as we go, so each filter can see all filters resolved before it.
    List<TokenFilterFactory> resolvedFilters = new ArrayList<>();
    List<TokenFilterFactory> chainSoFar = new ArrayList<>(previousTokenFilters);
    for (String filterName : filterNames) {
        TokenFilterFactory referenced = allFilters.apply(filterName);
        if (referenced == null) {
            throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() + "] refers to undefined token filter [" + filterName + "]");
        }
        TokenFilterFactory chainAware = referenced.getChainAwareTokenFilterFactory(tokenizer, charFilters, chainSoFar, allFilters);
        resolvedFilters.add(chainAware);
        chainSoFar.add(chainAware);
    }
    // Wrap the resolved filters in a factory whose streams apply them conditionally,
    // gated by a fresh script instance per created token stream.
    return new TokenFilterFactory() {

        @Override
        public String name() {
            return ScriptedConditionTokenFilterFactory.this.name();
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            Function<TokenStream, TokenStream> applyAll = stream -> {
                for (TokenFilterFactory resolved : resolvedFilters) {
                    stream = resolved.create(stream);
                }
                return stream;
            };
            return new ScriptedConditionTokenFilter(tokenStream, applyAll, factory.newInstance());
        }
    };
}
Also used : ScriptService(org.opensearch.script.ScriptService) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) TokenStream(org.apache.lucene.analysis.TokenStream) AbstractTokenFilterFactory(org.opensearch.index.analysis.AbstractTokenFilterFactory) Script(org.opensearch.script.Script) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) Settings(org.opensearch.common.settings.Settings) IOException(java.io.IOException) Function(java.util.function.Function) ConditionalTokenFilter(org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter) ArrayList(java.util.ArrayList) ScriptType(org.opensearch.script.ScriptType) List(java.util.List) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) IndexSettings(org.opensearch.index.IndexSettings) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) AbstractTokenFilterFactory(org.opensearch.index.analysis.AbstractTokenFilterFactory) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory)

Example 2 with CharFilterFactory

use of org.opensearch.index.analysis.CharFilterFactory in project OpenSearch by opensearch-project.

From the class AnnotatedTextFieldMapperTests, method createIndexAnalyzers:

@Override
protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) {
    // Fixed set of named analyzers the annotated-text mapper tests rely on.
    NamedAnalyzer defaultAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer(), TextFieldMapper.Defaults.POSITION_INCREMENT_GAP);
    NamedAnalyzer standardAnalyzer = new NamedAnalyzer("standard", AnalyzerScope.INDEX, new StandardAnalyzer());
    NamedAnalyzer keywordAnalyzer = new NamedAnalyzer("keyword", AnalyzerScope.INDEX, new KeywordAnalyzer());
    NamedAnalyzer whitespaceAnalyzer = new NamedAnalyzer("whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer());
    // Hand-rolled "stop" token filter so the custom analyzer drops English stop words.
    TokenFilterFactory stopFilterFactory = new TokenFilterFactory() {

        @Override
        public String name() {
            return "stop";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new StopFilter(tokenStream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
    };
    NamedAnalyzer stopAnalyzer = new NamedAnalyzer("my_stop_analyzer", AnalyzerScope.INDEX, new CustomAnalyzer(new StandardTokenizerFactory(indexSettings, null, "standard", indexSettings.getSettings()), new CharFilterFactory[0], new TokenFilterFactory[] { stopFilterFactory }));
    Map<String, NamedAnalyzer> analyzers = new HashMap<>();
    analyzers.put("default", defaultAnalyzer);
    analyzers.put("standard", standardAnalyzer);
    analyzers.put("keyword", keywordAnalyzer);
    analyzers.put("whitespace", whitespaceAnalyzer);
    analyzers.put("my_stop_analyzer", stopAnalyzer);
    return new IndexAnalyzers(analyzers, Collections.emptyMap(), Collections.emptyMap());
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) TokenStream(org.apache.lucene.analysis.TokenStream) NamedAnalyzer(org.opensearch.index.analysis.NamedAnalyzer) HashMap(java.util.HashMap) StopFilter(org.apache.lucene.analysis.StopFilter) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) Matchers.containsString(org.hamcrest.Matchers.containsString) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) CustomAnalyzer(org.opensearch.index.analysis.CustomAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StandardTokenizerFactory(org.opensearch.index.analysis.StandardTokenizerFactory) IndexAnalyzers(org.opensearch.index.analysis.IndexAnalyzers)

Example 3 with CharFilterFactory

use of org.opensearch.index.analysis.CharFilterFactory in project OpenSearch by opensearch-project.

From the class TransportAnalyzeAction, method detailAnalyze:

/**
 * Builds a detailed analyze response reporting intermediate results for every stage
 * of a custom analysis chain: the text produced by each char filter, the tokens from
 * the tokenizer alone, and the tokens after each successive token filter. Falls back
 * to a single token list for analyzers whose components cannot be inspected.
 *
 * @param request       the analyze request supplying the text(s) and optional attribute filter
 * @param analyzer      the analyzer to inspect (possibly a NamedAnalyzer wrapper)
 * @param maxTokenCount cap on tokens produced per token list
 * @return a DetailAnalyzeResponse with per-stage results, or a single-list response
 *         when the analyzer does not expose its components
 */
private static AnalyzeAction.DetailAnalyzeResponse detailAnalyze(AnalyzeAction.Request request, Analyzer analyzer, int maxTokenCount) {
    AnalyzeAction.DetailAnalyzeResponse detailResponse;
    // Requested attribute names are matched case-insensitively; normalize them once.
    final Set<String> includeAttributes = new HashSet<>();
    if (request.attributes() != null) {
        for (String attribute : request.attributes()) {
            includeAttributes.add(attribute.toLowerCase(Locale.ROOT));
        }
    }
    // maybe unwrap analyzer from NamedAnalyzer
    Analyzer potentialCustomAnalyzer = analyzer;
    if (analyzer instanceof NamedAnalyzer) {
        potentialCustomAnalyzer = ((NamedAnalyzer) analyzer).analyzer();
    }
    if (potentialCustomAnalyzer instanceof AnalyzerComponentsProvider) {
        AnalyzerComponentsProvider customAnalyzer = (AnalyzerComponentsProvider) potentialCustomAnalyzer;
        // note: this is not field-name dependent in our cases so we can leave out the argument
        int positionIncrementGap = potentialCustomAnalyzer.getPositionIncrementGap("");
        int offsetGap = potentialCustomAnalyzer.getOffsetGap("");
        AnalyzerComponents components = customAnalyzer.getComponents();
        // divide charfilter, tokenizer tokenfilters
        CharFilterFactory[] charFilterFactories = components.getCharFilters();
        TokenizerFactory tokenizerFactory = components.getTokenizerFactory();
        TokenFilterFactory[] tokenFilterFactories = components.getTokenFilters();
        // charFiltersTexts[filter][text]: text as it looks after each char filter stage.
        String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
        // One token-list accumulator per token filter; created lazily in the loop below.
        TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ? tokenFilterFactories.length : 0];
        TokenListCreator tokenizerTokenListCreator = new TokenListCreator(maxTokenCount);
        for (int textIndex = 0; textIndex < request.text().length; textIndex++) {
            String charFilteredSource = request.text()[textIndex];
            Reader reader = new StringReader(charFilteredSource);
            if (charFilterFactories != null) {
                for (int charFilterIndex = 0; charFilterIndex < charFilterFactories.length; charFilterIndex++) {
                    // 'reader' accumulates the chained filters for tokenization below;
                    // a second, independent reader is consumed here to capture the
                    // intermediate text for the response (a Reader can be read only once).
                    reader = charFilterFactories[charFilterIndex].create(reader);
                    Reader readerForWriteOut = new StringReader(charFilteredSource);
                    readerForWriteOut = charFilterFactories[charFilterIndex].create(readerForWriteOut);
                    charFilteredSource = writeCharStream(readerForWriteOut);
                    charFiltersTexts[charFilterIndex][textIndex] = charFilteredSource;
                }
            }
            // analyzing only tokenizer
            Tokenizer tokenizer = tokenizerFactory.create();
            tokenizer.setReader(reader);
            tokenizerTokenListCreator.analyze(tokenizer, includeAttributes, positionIncrementGap, offsetGap);
            // analyzing each tokenfilter
            if (tokenFilterFactories != null) {
                for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFilterFactories.length; tokenFilterIndex++) {
                    if (tokenFiltersTokenListCreator[tokenFilterIndex] == null) {
                        tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator(maxTokenCount);
                    }
                    // Rebuild the whole pipeline from scratch, stacking filters 0..tokenFilterIndex,
                    // so the response shows the token stream as of each filter stage.
                    TokenStream stream = createStackedTokenStream(request.text()[textIndex], charFilterFactories, tokenizerFactory, tokenFilterFactories, tokenFilterIndex + 1);
                    tokenFiltersTokenListCreator[tokenFilterIndex].analyze(stream, includeAttributes, positionIncrementGap, offsetGap);
                }
            }
        }
        // Assemble per-stage results into the response structures.
        AnalyzeAction.CharFilteredText[] charFilteredLists = new AnalyzeAction.CharFilteredText[charFiltersTexts.length];
        if (charFilterFactories != null) {
            for (int charFilterIndex = 0; charFilterIndex < charFiltersTexts.length; charFilterIndex++) {
                charFilteredLists[charFilterIndex] = new AnalyzeAction.CharFilteredText(charFilterFactories[charFilterIndex].name(), charFiltersTexts[charFilterIndex]);
            }
        }
        AnalyzeAction.AnalyzeTokenList[] tokenFilterLists = new AnalyzeAction.AnalyzeTokenList[tokenFiltersTokenListCreator.length];
        if (tokenFilterFactories != null) {
            for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFiltersTokenListCreator.length; tokenFilterIndex++) {
                tokenFilterLists[tokenFilterIndex] = new AnalyzeAction.AnalyzeTokenList(tokenFilterFactories[tokenFilterIndex].name(), tokenFiltersTokenListCreator[tokenFilterIndex].getArrayTokens());
            }
        }
        detailResponse = new AnalyzeAction.DetailAnalyzeResponse(charFilteredLists, new AnalyzeAction.AnalyzeTokenList(tokenizerFactory.name(), tokenizerTokenListCreator.getArrayTokens()), tokenFilterLists);
    } else {
        // Opaque analyzer: report one flat token list under the analyzer's name
        // (or its class name when it is not a NamedAnalyzer).
        String name;
        if (analyzer instanceof NamedAnalyzer) {
            name = ((NamedAnalyzer) analyzer).name();
        } else {
            name = analyzer.getClass().getName();
        }
        TokenListCreator tokenListCreator = new TokenListCreator(maxTokenCount);
        for (String text : request.text()) {
            tokenListCreator.analyze(analyzer.tokenStream("", text), includeAttributes, analyzer.getPositionIncrementGap(""), analyzer.getOffsetGap(""));
        }
        detailResponse = new AnalyzeAction.DetailAnalyzeResponse(new AnalyzeAction.AnalyzeTokenList(name, tokenListCreator.getArrayTokens()));
    }
    return detailResponse;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) NamedAnalyzer(org.opensearch.index.analysis.NamedAnalyzer) AnalyzerComponentsProvider(org.opensearch.index.analysis.AnalyzerComponentsProvider) Reader(java.io.Reader) StringReader(java.io.StringReader) AnalyzerComponents(org.opensearch.index.analysis.AnalyzerComponents) NamedAnalyzer(org.opensearch.index.analysis.NamedAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) HashSet(java.util.HashSet) TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory)

Example 4 with CharFilterFactory

use of org.opensearch.index.analysis.CharFilterFactory in project OpenSearch by opensearch-project.

From the class TransportAnalyzeAction, method createStackedTokenStream:

private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    // Re-run the full char-filter + tokenizer pipeline over the source text,
    // then stack only the first 'current' token filters on top of the tokenizer.
    Reader chained = new StringReader(source);
    for (int charFilterIndex = 0; charFilterIndex < charFilterFactories.length; charFilterIndex++) {
        chained = charFilterFactories[charFilterIndex].create(chained);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(chained);
    TokenStream stacked = tokenizer;
    for (int filterIndex = 0; filterIndex < current; filterIndex++) {
        stacked = tokenFilterFactories[filterIndex].create(stacked);
    }
    return stacked;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) StringReader(java.io.StringReader) Reader(java.io.Reader) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 5 with CharFilterFactory

use of org.opensearch.index.analysis.CharFilterFactory in project OpenSearch by opensearch-project.

From the class MapperService, method reloadSearchAnalyzers:

public synchronized List<String> reloadSearchAnalyzers(AnalysisRegistry registry) throws IOException {
    logger.info("reloading search analyzers");
    // Rebuild all factories from the current index settings so reloadable
    // analyzers pick up any changed resources.
    final Map<String, TokenizerFactory> tokenizers = registry.buildTokenizerFactories(indexSettings);
    final Map<String, CharFilterFactory> charFilters = registry.buildCharFilterFactories(indexSettings);
    final Map<String, TokenFilterFactory> tokenFilters = registry.buildTokenFilterFactories(indexSettings);
    final Map<String, Settings> analyzerSettingsByName = indexSettings.getSettings().getGroups("index.analysis.analyzer");
    final List<String> reloaded = new ArrayList<>();
    for (NamedAnalyzer namedAnalyzer : indexAnalyzers.getAnalyzers().values()) {
        // Only ReloadableCustomAnalyzer instances support in-place reloading.
        if (!(namedAnalyzer.analyzer() instanceof ReloadableCustomAnalyzer)) {
            continue;
        }
        ReloadableCustomAnalyzer reloadable = (ReloadableCustomAnalyzer) namedAnalyzer.analyzer();
        String analyzerName = namedAnalyzer.name();
        reloadable.reload(analyzerName, analyzerSettingsByName.get(analyzerName), tokenizers, charFilters, tokenFilters);
        reloaded.add(analyzerName);
    }
    return reloaded;
}
Also used : TokenizerFactory(org.opensearch.index.analysis.TokenizerFactory) NamedAnalyzer(org.opensearch.index.analysis.NamedAnalyzer) CharFilterFactory(org.opensearch.index.analysis.CharFilterFactory) ArrayList(java.util.ArrayList) TokenFilterFactory(org.opensearch.index.analysis.TokenFilterFactory) ReloadableCustomAnalyzer(org.opensearch.index.analysis.ReloadableCustomAnalyzer) Settings(org.opensearch.common.settings.Settings) IndexSettings(org.opensearch.index.IndexSettings)

Aggregations

CharFilterFactory (org.opensearch.index.analysis.CharFilterFactory)7 TokenStream (org.apache.lucene.analysis.TokenStream)6 TokenFilterFactory (org.opensearch.index.analysis.TokenFilterFactory)6 NamedAnalyzer (org.opensearch.index.analysis.NamedAnalyzer)4 TokenizerFactory (org.opensearch.index.analysis.TokenizerFactory)4 Reader (java.io.Reader)3 Settings (org.opensearch.common.settings.Settings)3 IndexSettings (org.opensearch.index.IndexSettings)3 IndexAnalyzers (org.opensearch.index.analysis.IndexAnalyzers)3 IOException (java.io.IOException)2 StringReader (java.io.StringReader)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 List (java.util.List)2 StopFilter (org.apache.lucene.analysis.StopFilter)2 Tokenizer (org.apache.lucene.analysis.Tokenizer)2 KeywordAnalyzer (org.apache.lucene.analysis.core.KeywordAnalyzer)2 WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)2 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)2 CustomAnalyzer (org.opensearch.index.analysis.CustomAnalyzer)2