Example 6 with CharFilterFactory

Use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class AnalyzerFactoryTask, method setParams.

/**
   * Sets the params.
   * Analysis component factory names may optionally include the "Factory" suffix.
   *
   * @param params analysis pipeline specification: name, (optional) positionIncrementGap,
   *               (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory,
   *               and 0+ TokenFilterFactory's
   */
@Override
@SuppressWarnings("fallthrough")
public void setParams(String params) {
    super.setParams(params);
    ArgType expectedArgType = ArgType.ANALYZER_ARG;
    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
    stok.commentChar('#');
    stok.quoteChar('"');
    stok.quoteChar('\'');
    stok.eolIsSignificant(false);
    stok.ordinaryChar('(');
    stok.ordinaryChar(')');
    stok.ordinaryChar(':');
    stok.ordinaryChar(',');
    try {
        while (stok.nextToken() != StreamTokenizer.TT_EOF) {
            switch(stok.ttype) {
                case ',':
                    {
                        // Do nothing
                        break;
                    }
                case StreamTokenizer.TT_WORD:
                    {
                        if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
                            final String argName = stok.sval;
                            if (!argName.equalsIgnoreCase("name") && !argName.equalsIgnoreCase("positionIncrementGap") && !argName.equalsIgnoreCase("offsetGap")) {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'");
                            }
                            stok.nextToken();
                            if (stok.ttype != ':') {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
                            }
                            stok.nextToken();
                            String argValue = stok.sval;
                            switch(stok.ttype) {
                                case StreamTokenizer.TT_NUMBER:
                                    {
                                        argValue = Double.toString(stok.nval);
                                        // Drop the ".0" from numbers, for integer arguments
                                        argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
                                    // Intentional fallthrough
                                    }
                                case '"':
                                case '\'':
                                case StreamTokenizer.TT_WORD:
                                    {
                                        if (argName.equalsIgnoreCase("name")) {
                                            factoryName = argValue;
                                            expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
                                        } else {
                                            int intArgValue = 0;
                                            try {
                                                intArgValue = Integer.parseInt(argValue);
                                            } catch (NumberFormatException e) {
                                                throw new RuntimeException("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e);
                                            }
                                            if (argName.equalsIgnoreCase("positionIncrementGap")) {
                                                positionIncrementGap = intArgValue;
                                            } else if (argName.equalsIgnoreCase("offsetGap")) {
                                                offsetGap = intArgValue;
                                            }
                                        }
                                        break;
                                    }
                                case StreamTokenizer.TT_EOF:
                                    {
                                        throw new RuntimeException("Unexpected EOF: " + stok.toString());
                                    }
                                default:
                                    {
                                        throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                                    }
                            }
                        } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
                            final String argName = stok.sval;
                            if (argName.equalsIgnoreCase("positionIncrementGap") || argName.equalsIgnoreCase("offsetGap")) {
                                stok.nextToken();
                                if (stok.ttype != ':') {
                                    throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
                                }
                                stok.nextToken();
                                int intArgValue = (int) stok.nval;
                                switch(stok.ttype) {
                                    case '"':
                                    case '\'':
                                    case StreamTokenizer.TT_WORD:
                                        {
                                            intArgValue = 0;
                                            try {
                                                intArgValue = Integer.parseInt(stok.sval.trim());
                                            } catch (NumberFormatException e) {
                                                throw new RuntimeException("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e);
                                            }
                                        // Intentional fall-through
                                        }
                                    case StreamTokenizer.TT_NUMBER:
                                        {
                                            if (argName.equalsIgnoreCase("positionIncrementGap")) {
                                                positionIncrementGap = intArgValue;
                                            } else if (argName.equalsIgnoreCase("offsetGap")) {
                                                offsetGap = intArgValue;
                                            }
                                            break;
                                        }
                                    case StreamTokenizer.TT_EOF:
                                        {
                                            throw new RuntimeException("Unexpected EOF: " + stok.toString());
                                        }
                                    default:
                                        {
                                            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                                        }
                                }
                                break;
                            }
                            try {
                                final Class<? extends CharFilterFactory> clazz;
                                clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
                                createAnalysisPipelineComponent(stok, clazz);
                            } catch (IllegalArgumentException e) {
                                try {
                                    final Class<? extends TokenizerFactory> clazz;
                                    clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
                                    createAnalysisPipelineComponent(stok, clazz);
                                    expectedArgType = ArgType.TOKENFILTER;
                                } catch (IllegalArgumentException e2) {
                                    throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + argName + "' as CharFilterFactory or TokenizerFactory");
                                }
                            }
                        } else {
                            // expectedArgType = ArgType.TOKENFILTER
                            final String className = stok.sval;
                            final Class<? extends TokenFilterFactory> clazz;
                            try {
                                clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
                            } catch (IllegalArgumentException e) {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory");
                            }
                            createAnalysisPipelineComponent(stok, clazz);
                        }
                        break;
                    }
                default:
                    {
                        throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                    }
            }
        }
    } catch (RuntimeException e) {
        if (e.getMessage() != null && e.getMessage().startsWith("Line #")) {
            throw e;
        } else {
            throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
        }
    } catch (Throwable t) {
        throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
    }
    final AnalyzerFactory analyzerFactory = new AnalyzerFactory(charFilterFactories, tokenizerFactory, tokenFilterFactories);
    analyzerFactory.setPositionIncrementGap(positionIncrementGap);
    analyzerFactory.setOffsetGap(offsetGap);
    getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
}
Also used: TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), StringReader (java.io.StringReader), StreamTokenizer (java.io.StreamTokenizer), AnalyzerFactory (org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory)
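
For orientation, here is the kind of pipeline specification setParams accepts, shown as it might appear in a benchmark .alg file. This is a hedged illustration based on the javadoc above: the component names are real factory names with the optional "Factory" suffix dropped, but the exact line is not taken from any shipped config:

-AnalyzerFactory(name:'html-whitespace-shingles',
                 positionIncrementGap:100,
                 offsetGap:10,
                 HTMLStripCharFilter,
                 WhitespaceTokenizer,
                 LowerCaseFilter,
                 ShingleFilter(maxShingleSize:2))

The name, positionIncrementGap, and offsetGap arguments are consumed by the switch above; the parenthesized per-component arguments are handed off to createAnalysisPipelineComponent.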

Example 7 with CharFilterFactory

Use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class TestFactories, method doTestTokenizer.

private void doTestTokenizer(String tokenizer) throws IOException {
    Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
    TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
    if (factory != null) {
        // if it implements MultiTermAware, sanity check its impl
        if (factory instanceof MultiTermAwareComponent) {
            AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
            assertNotNull(mtc);
            // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
            assertFalse(mtc instanceof CharFilterFactory);
        }
        // beast it just a little, it shouldn't throw exceptions:
        // (it should have thrown them in initialize)
        Analyzer a = new FactoryAnalyzer(factory, null, null);
        checkRandomData(random(), a, 20, 20, false, false);
        a.close();
    }
}
Also used: MultiTermAwareComponent (org.apache.lucene.analysis.util.MultiTermAwareComponent), TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), AbstractAnalysisFactory (org.apache.lucene.analysis.util.AbstractAnalysisFactory), Analyzer (org.apache.lucene.analysis.Analyzer)
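
Underneath, doTestTokenizer relies on tokenizer factories being registered via Java's SPI. A minimal sketch of that lookup machinery, assuming the standard "whitespace" registration (all calls below are the public TokenizerFactory API):

import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;

// Resolve and instantiate a factory by SPI name with an empty argument map.
TokenizerFactory factory = TokenizerFactory.forName("whitespace", new HashMap<>());
Tokenizer tokenizer = factory.create();

// TestFactories drives doTestTokenizer over every registered name:
for (String name : TokenizerFactory.availableTokenizers()) {
    System.out.println(name + " -> " + TokenizerFactory.lookupClass(name).getName());
}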

Example 8 with CharFilterFactory

Use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class AnalyzerFactory, method create.

public Analyzer create() {
    return new Analyzer() {

        private final Integer positionIncrementGap = AnalyzerFactory.this.positionIncrementGap;

        private final Integer offsetGap = AnalyzerFactory.this.offsetGap;

        @Override
        public Reader initReader(String fieldName, Reader reader) {
            if (charFilterFactories != null && charFilterFactories.size() > 0) {
                Reader wrappedReader = reader;
                for (CharFilterFactory charFilterFactory : charFilterFactories) {
                    wrappedReader = charFilterFactory.create(wrappedReader);
                }
                reader = wrappedReader;
            }
            return reader;
        }

        @Override
        protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer tokenizer = tokenizerFactory.create();
            TokenStream tokenStream = tokenizer;
            for (TokenFilterFactory filterFactory : tokenFilterFactories) {
                tokenStream = filterFactory.create(tokenStream);
            }
            return new TokenStreamComponents(tokenizer, tokenStream);
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return null == positionIncrementGap ? super.getPositionIncrementGap(fieldName) : positionIncrementGap;
        }

        @Override
        public int getOffsetGap(String fieldName) {
            return null == offsetGap ? super.getOffsetGap(fieldName) : offsetGap;
        }
    };
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), Reader (java.io.Reader), Analyzer (org.apache.lucene.analysis.Analyzer), Tokenizer (org.apache.lucene.analysis.Tokenizer), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)
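
A minimal usage sketch for this factory, assuming the constructor shown in Example 6 and the standard SPI names "htmlStrip", "whitespace", and "lowercase"; the field name and input text are illustrative:

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;

public static void demo() throws IOException {
    AnalyzerFactory analyzerFactory = new AnalyzerFactory(
        Collections.singletonList(CharFilterFactory.forName("htmlStrip", new HashMap<>())),
        TokenizerFactory.forName("whitespace", new HashMap<>()),
        Collections.singletonList(TokenFilterFactory.forName("lowercase", new HashMap<>())));
    try (Analyzer analyzer = analyzerFactory.create();
         TokenStream ts = analyzer.tokenStream("body", "<b>Hello</b> World")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // initReader() strips the markup, then the filter lowercases:
            // prints "hello", then "world"
            System.out.println(term.toString());
        }
        ts.end();
    }
}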

Example 9 with CharFilterFactory

Use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class AnalysisRequestHandlerBase, method analyzeValue.

/**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
    Analyzer analyzer = context.getAnalyzer();
    if (!TokenizerChain.class.isInstance(analyzer)) {
        try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
            NamedList<List<NamedList>> namedList = new NamedList<>();
            namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
            return namedList;
        } catch (IOException e) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
    }
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
    NamedList<Object> namedList = new NamedList<>();
    if (0 < cfiltfacs.length) {
        String source = value;
        for (CharFilterFactory cfiltfac : cfiltfacs) {
            Reader reader = new StringReader(source);
            reader = cfiltfac.create(reader);
            source = writeCharStream(namedList, reader);
        }
    }
    TokenStream tokenStream = tfac.create();
    ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
    List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
        for (final AttributeSource tok : tokens) {
            tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
        }
        // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
        tokenStream = tokenFilterFactory.create(listBasedTokenStream);
        tokens = analyzeTokenStream(tokenStream);
        namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
        listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
    }
    return namedList;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), AttributeSource (org.apache.lucene.util.AttributeSource), NamedList (org.apache.solr.common.util.NamedList), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), Reader (java.io.Reader), StringReader (java.io.StringReader), IOException (java.io.IOException), Analyzer (org.apache.lucene.analysis.Analyzer), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), TokenizerChain (org.apache.solr.analysis.TokenizerChain), ArrayList (java.util.ArrayList), List (java.util.List), Tokenizer (org.apache.lucene.analysis.Tokenizer), SolrException (org.apache.solr.common.SolrException)
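
The char-filter loop near the top of analyzeValue can be exercised in isolation. A hedged sketch, assuming "htmlStrip" as the SPI key for HTMLStripCharFilterFactory and that entity decoding is part of that filter's behavior:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.util.CharFilterFactory;

public static void stripDemo() throws IOException {
    Reader reader = new StringReader("<p>caf&eacute;</p>");
    // Each factory wraps the previous Reader, exactly as the loop over cfiltfacs does.
    reader = CharFilterFactory.forName("htmlStrip", new HashMap<>()).create(reader);
    StringBuilder sb = new StringBuilder();
    for (int ch = reader.read(); ch != -1; ch = reader.read()) {
        sb.append((char) ch);
    }
    System.out.println(sb.toString().trim());  // roughly "café": markup removed
}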

Example 10 with CharFilterFactory

Use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class LukeRequestHandler, method getAnalyzerInfo.

private static SimpleOrderedMap<Object> getAnalyzerInfo(Analyzer analyzer) {
    SimpleOrderedMap<Object> aninfo = new SimpleOrderedMap<>();
    aninfo.add("className", analyzer.getClass().getName());
    if (analyzer instanceof TokenizerChain) {
        TokenizerChain tchain = (TokenizerChain) analyzer;
        CharFilterFactory[] cfiltfacs = tchain.getCharFilterFactories();
        if (0 < cfiltfacs.length) {
            SimpleOrderedMap<Map<String, Object>> cfilters = new SimpleOrderedMap<>();
            for (CharFilterFactory cfiltfac : cfiltfacs) {
                Map<String, Object> tok = new HashMap<>();
                String className = cfiltfac.getClass().getName();
                tok.put("className", className);
                tok.put("args", cfiltfac.getOriginalArgs());
                cfilters.add(className.substring(className.lastIndexOf('.') + 1), tok);
            }
            aninfo.add("charFilters", cfilters);
        }
        SimpleOrderedMap<Object> tokenizer = new SimpleOrderedMap<>();
        TokenizerFactory tfac = tchain.getTokenizerFactory();
        tokenizer.add("className", tfac.getClass().getName());
        tokenizer.add("args", tfac.getOriginalArgs());
        aninfo.add("tokenizer", tokenizer);
        TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
        if (0 < filtfacs.length) {
            SimpleOrderedMap<Map<String, Object>> filters = new SimpleOrderedMap<>();
            for (TokenFilterFactory filtfac : filtfacs) {
                Map<String, Object> tok = new HashMap<>();
                String className = filtfac.getClass().getName();
                tok.put("className", className);
                tok.put("args", filtfac.getOriginalArgs());
                filters.add(className.substring(className.lastIndexOf('.') + 1), tok);
            }
            aninfo.add("filters", filters);
        }
    }
    return aninfo;
}
Also used: TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), HashMap (java.util.HashMap), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), TokenizerChain (org.apache.solr.analysis.TokenizerChain), Map (java.util.Map), TreeMap (java.util.TreeMap)
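
A hedged sketch of building an analyzer this method can introspect, using the three-argument TokenizerChain constructor implied by the accessors above and standard SPI names:

TokenizerChain chain = new TokenizerChain(
    new CharFilterFactory[] { CharFilterFactory.forName("htmlStrip", new HashMap<>()) },
    TokenizerFactory.forName("standard", new HashMap<>()),
    new TokenFilterFactory[] { TokenFilterFactory.forName("lowercase", new HashMap<>()) });
// getAnalyzerInfo(chain) would then report the char filter under "charFilters",
// the tokenizer's className and args under "tokenizer", and the lowercase
// filter under "filters", each keyed by its simple class name.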
