Search in sources :

Example 11 with CharFilterFactory

use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class ManagedIndexSchema defines the method informResourceLoaderAwareObjectsInChain:

/**
 * After creating a new FieldType, its analysis chain may contain components that
 * implement the {@code ResourceLoaderAware} interface; these must be informed after
 * they are loaded, as they depend on this callback to complete their initialization
 * (e.g. reading stopword or synonym resource files).
 *
 * @param chain the analyzer chain whose char filter, tokenizer, and token filter
 *              factories should be informed
 * @throws SolrException (SERVER_ERROR) wrapping any IOException raised while informing
 */
protected void informResourceLoaderAwareObjectsInChain(TokenizerChain chain) {
    // Char filters, the tokenizer, and token filters all follow the exact same
    // inform-if-aware pattern; delegate to one helper instead of repeating the
    // try/catch three times.
    for (CharFilterFactory charFilterFactory : chain.getCharFilterFactories()) {
        informIfResourceLoaderAware(charFilterFactory);
    }
    informIfResourceLoaderAware(chain.getTokenizerFactory());
    for (TokenFilterFactory tokenFilterFactory : chain.getTokenFilterFactories()) {
        informIfResourceLoaderAware(tokenFilterFactory);
    }
}

/**
 * Informs {@code component} with this schema's resource loader iff it implements
 * {@code ResourceLoaderAware}; any IOException is rethrown as a server error
 * because a half-initialized analysis component would corrupt the schema.
 */
private void informIfResourceLoaderAware(Object component) {
    if (component instanceof ResourceLoaderAware) {
        try {
            ((ResourceLoaderAware) component).inform(loader);
        } catch (IOException e) {
            throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
    }
}
Also used : TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware) IOException(java.io.IOException) SolrException(org.apache.solr.common.SolrException) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 12 with CharFilterFactory

use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class FieldType defines the method getAnalyzerProperties:

/**
 * Returns a description of the given analyzer, by either reporting the Analyzer class
 * name (and optionally luceneMatchVersion) if it's not a TokenizerChain, or if it is,
 * querying each analysis factory for its name and args.
 *
 * @param analyzer the analyzer to describe
 * @return an ordered map containing either the analyzer class (and version), or the
 *         char filter / tokenizer / token filter factory properties of the chain
 */
protected static SimpleOrderedMap<Object> getAnalyzerProperties(Analyzer analyzer) {
    SimpleOrderedMap<Object> analyzerProps = new SimpleOrderedMap<>();
    if (analyzer instanceof TokenizerChain) {
        TokenizerChain tokenizerChain = (TokenizerChain) analyzer;

        // Char filters: optional, possibly several.
        CharFilterFactory[] charFilterFactories = tokenizerChain.getCharFilterFactories();
        if (0 < charFilterFactories.length) {
            List<SimpleOrderedMap<Object>> charFilterProps = new ArrayList<>();
            for (CharFilterFactory charFilterFactory : charFilterFactories) {
                charFilterProps.add(factoryProperties(charFilterFactory.getClassArg(),
                        charFilterFactory.getOriginalArgs(),
                        charFilterFactory.isExplicitLuceneMatchVersion()));
            }
            analyzerProps.add(CHAR_FILTERS, charFilterProps);
        }

        // Tokenizer: a TokenizerChain always has exactly one.
        TokenizerFactory tokenizerFactory = tokenizerChain.getTokenizerFactory();
        analyzerProps.add(TOKENIZER, factoryProperties(tokenizerFactory.getClassArg(),
                tokenizerFactory.getOriginalArgs(),
                tokenizerFactory.isExplicitLuceneMatchVersion()));

        // Token filters: optional, possibly several.
        TokenFilterFactory[] filterFactories = tokenizerChain.getTokenFilterFactories();
        if (0 < filterFactories.length) {
            List<SimpleOrderedMap<Object>> filterProps = new ArrayList<>();
            for (TokenFilterFactory filterFactory : filterFactories) {
                filterProps.add(factoryProperties(filterFactory.getClassArg(),
                        filterFactory.getOriginalArgs(),
                        filterFactory.isExplicitLuceneMatchVersion()));
            }
            analyzerProps.add(FILTERS, filterProps);
        }
    } else {
        // Not a TokenizerChain: just report the concrete Analyzer class, plus its
        // luceneMatchVersion when it differs from the library default.
        analyzerProps.add(CLASS_NAME, analyzer.getClass().getName());
        if (analyzer.getVersion() != Version.LATEST) {
            analyzerProps.add(LUCENE_MATCH_VERSION_PARAM, analyzer.getVersion().toString());
        }
    }
    return analyzerProps;
}

/**
 * Builds the property map for a single analysis factory: the configured class name
 * plus every original argument except CLASS_NAME. The luceneMatchVersion argument is
 * reported only when it was explicitly configured on the factory. Extracted because
 * the identical loop previously appeared three times (char filters, tokenizer,
 * token filters).
 *
 * @param classArg the factory's configured class argument
 * @param factoryArgs the factory's original args; may be null
 * @param explicitLuceneMatchVersion whether luceneMatchVersion was explicitly set
 */
private static SimpleOrderedMap<Object> factoryProperties(String classArg,
        Map<String, String> factoryArgs, boolean explicitLuceneMatchVersion) {
    SimpleOrderedMap<Object> props = new SimpleOrderedMap<>();
    props.add(CLASS_NAME, classArg);
    if (null != factoryArgs) {
        for (Map.Entry<String, String> entry : factoryArgs.entrySet()) {
            String key = entry.getKey();
            if (CLASS_NAME.equals(key)) {
                continue; // already reported above
            }
            if (LUCENE_MATCH_VERSION_PARAM.equals(key) && !explicitLuceneMatchVersion) {
                continue; // suppress an implicitly inherited version
            }
            props.add(key, entry.getValue());
        }
    }
    return props;
}
Also used : TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) TokenizerChain(org.apache.solr.analysis.TokenizerChain)

Example 13 with CharFilterFactory

use of org.apache.lucene.analysis.util.CharFilterFactory in project lucene-solr by apache.

The class FieldTypePluginLoader defines the method readAnalyzer:

//
// Builds an Analyzer from a schema <analyzer> DOM node. Two mutually exclusive
// forms are supported:
//   <analyzer class="com.example.MyAnalyzer"/>                       (explicit class)
//   <analyzer><charFilter .../><tokenizer .../><filter .../></analyzer>  (factory chain)
//
// Returns null when node is null; throws SolrException on invalid configuration
// (explicit class combined with nested factories, a missing or duplicate
// tokenizer, or an analyzer class that cannot be loaded).
//
private Analyzer readAnalyzer(Node node) throws XPathExpressionException {
    final SolrResourceLoader loader = schema.getResourceLoader();
    // No <analyzer> element configured for this field type.
    if (node == null)
        return null;
    NamedNodeMap attrs = node.getAttributes();
    String analyzerName = DOMUtil.getAttr(attrs, "class");
    // check for all of these up front, so we can error if used in 
    // conjunction with an explicit analyzer class.
    NodeList charFilterNodes = (NodeList) xpath.evaluate("./charFilter", node, XPathConstants.NODESET);
    NodeList tokenizerNodes = (NodeList) xpath.evaluate("./tokenizer", node, XPathConstants.NODESET);
    NodeList tokenFilterNodes = (NodeList) xpath.evaluate("./filter", node, XPathConstants.NODESET);
    if (analyzerName != null) {
        // An explicit class= analyzer may not mix with nested factories; other
        // own custom nodes (ie: <description> or something like that) are fine.
        if (0 != charFilterNodes.getLength() || 0 != tokenizerNodes.getLength() || 0 != tokenFilterNodes.getLength()) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer class='" + analyzerName + "' can not be combined with nested analysis factories");
        }
        try {
            // No need to be core-aware as Analyzers are not in the core-aware list
            final Class<? extends Analyzer> clazz = loader.findClass(analyzerName, Analyzer.class);
            Analyzer analyzer = clazz.newInstance();
            // An explicit luceneMatchVersion attribute on the <analyzer> wins over
            // the schema-wide default.
            final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM);
            final Version luceneMatchVersion = (matchVersionStr == null) ? schema.getDefaultLuceneMatchVersion() : Config.parseLuceneVersionString(matchVersionStr);
            if (luceneMatchVersion == null) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer '" + clazz.getName() + "' needs a 'luceneMatchVersion' parameter");
            }
            analyzer.setVersion(luceneMatchVersion);
            return analyzer;
        } catch (Exception e) {
            log.error("Cannot load analyzer: " + analyzerName, e);
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Cannot load analyzer: " + analyzerName, e);
        }
    }
    // Load the CharFilters. Each anonymous loader below collects factories into a
    // local list via init(); register() returns null because map registration is
    // not used for analysis factories.
    final ArrayList<CharFilterFactory> charFilters = new ArrayList<>();
    AbstractPluginLoader<CharFilterFactory> charFilterLoader = new AbstractPluginLoader<CharFilterFactory>("[schema.xml] analyzer/charFilter", CharFilterFactory.class, false, false) {

        @Override
        protected CharFilterFactory create(SolrResourceLoader loader, String name, String className, Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            // Resolve and record the effective luceneMatchVersion; remember whether
            // it was explicitly configured so introspection can report it later.
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion, CharFilterFactory.class.getSimpleName()).toString());
            CharFilterFactory factory = loader.newInstance(className, CharFilterFactory.class, getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(CharFilterFactory plugin, Node node) throws Exception {
            if (plugin != null) {
                charFilters.add(plugin);
            }
        }

        @Override
        protected CharFilterFactory register(String name, CharFilterFactory plugin) {
            // used for map registration
            return null;
        }
    };
    charFilterLoader.load(loader, charFilterNodes);
    // Load the Tokenizer
    // Although an analyzer only allows a single Tokenizer, we load a list to make sure
    // the configuration is ok
    final ArrayList<TokenizerFactory> tokenizers = new ArrayList<>(1);
    AbstractPluginLoader<TokenizerFactory> tokenizerLoader = new AbstractPluginLoader<TokenizerFactory>("[schema.xml] analyzer/tokenizer", TokenizerFactory.class, false, false) {

        @Override
        protected TokenizerFactory create(SolrResourceLoader loader, String name, String className, Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion, TokenizerFactory.class.getSimpleName()).toString());
            TokenizerFactory factory = loader.newInstance(className, TokenizerFactory.class, getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(TokenizerFactory plugin, Node node) throws Exception {
            // Fail fast on a second <tokenizer> element rather than silently
            // ignoring it.
            if (!tokenizers.isEmpty()) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The schema defines multiple tokenizers for: " + node);
            }
            tokenizers.add(plugin);
        }

        @Override
        protected TokenizerFactory register(String name, TokenizerFactory plugin) {
            // used for map registration
            return null;
        }
    };
    tokenizerLoader.load(loader, tokenizerNodes);
    // Make sure something was loaded
    if (tokenizers.isEmpty()) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "analyzer without class or tokenizer");
    }
    // Load the Filters
    final ArrayList<TokenFilterFactory> filters = new ArrayList<>();
    AbstractPluginLoader<TokenFilterFactory> filterLoader = new AbstractPluginLoader<TokenFilterFactory>("[schema.xml] analyzer/filter", TokenFilterFactory.class, false, false) {

        @Override
        protected TokenFilterFactory create(SolrResourceLoader loader, String name, String className, Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion, TokenFilterFactory.class.getSimpleName()).toString());
            TokenFilterFactory factory = loader.newInstance(className, TokenFilterFactory.class, getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(TokenFilterFactory plugin, Node node) throws Exception {
            if (plugin != null) {
                filters.add(plugin);
            }
        }

        @Override
        protected TokenFilterFactory register(String name, TokenFilterFactory plugin) throws Exception {
            // used for map registration
            return null;
        }
    };
    filterLoader.load(loader, tokenFilterNodes);
    // Assemble the chain: char filters -> the single tokenizer -> token filters.
    return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]), tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
}
Also used : AbstractPluginLoader(org.apache.solr.util.plugin.AbstractPluginLoader) NamedNodeMap(org.w3c.dom.NamedNodeMap) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) KeywordTokenizerFactory(org.apache.lucene.analysis.core.KeywordTokenizerFactory) NodeList(org.w3c.dom.NodeList) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) XPathExpressionException(javax.xml.xpath.XPathExpressionException) SolrException(org.apache.solr.common.SolrException) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) SolrResourceLoader(org.apache.solr.core.SolrResourceLoader) TokenizerChain(org.apache.solr.analysis.TokenizerChain) Version(org.apache.lucene.util.Version) SolrException(org.apache.solr.common.SolrException)

Example 14 with CharFilterFactory

use of org.apache.lucene.analysis.util.CharFilterFactory in project tika by apache.

The class AnalyzerDeserializer defines the method buildCharFilters:

/**
 * Deserializes the "charfilters" element of a JSON analyzer definition into
 * CharFilterFactory instances resolved through Lucene's SPI registry.
 *
 * @param el JSON element expected to be an array of {factory, params} objects
 * @param analyzerName analyzer name, used only for exception messages
 * @return the factories in declaration order; {@code null} if {@code el} is null or JsonNull
 * @throws IOException if a ResourceLoaderAware factory fails to load its resources
 * @throws IllegalArgumentException on any malformed or unresolvable configuration
 */
private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException {
    if (el == null || el.isJsonNull()) {
        return null;
    }
    if (!el.isJsonArray()) {
        throw new IllegalArgumentException("Expecting array for charfilters, but got:" + el.toString() + " for " + analyzerName);
    }
    JsonArray jsonArray = (JsonArray) el;
    List<CharFilterFactory> ret = new LinkedList<CharFilterFactory>();
    for (JsonElement filterMap : jsonArray) {
        if (!(filterMap instanceof JsonObject)) {
            throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in char filter factory;" + " not: " + filterMap.toString() + " in " + analyzerName);
        }
        JsonElement factoryEl = ((JsonObject) filterMap).get(FACTORY);
        if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
            throw new IllegalArgumentException("Expecting value for factory in char filter factory builder in:" + analyzerName);
        }
        String factoryName = factoryEl.getAsString();
        // Expand the "oala." shorthand. BUGFIX: use replace() (literal substring),
        // not replaceAll() — in a regex the '.' in "oala." matches ANY character,
        // so replaceAll would also rewrite names like "...oalaX...".
        factoryName = factoryName.replace("oala.", "org.apache.lucene.analysis.");
        JsonElement paramsEl = ((JsonObject) filterMap).get(PARAMS);
        Map<String, String> params = mapify(paramsEl);
        // Map the fully-qualified class name back to the SPI name Lucene expects.
        String spiName = "";
        for (String s : CharFilterFactory.availableCharFilters()) {
            Class<?> clazz = CharFilterFactory.lookupClass(s);
            if (clazz.getName().equals(factoryName)) {
                spiName = s;
                break;
            }
        }
        if (spiName.isEmpty()) {
            throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name '" + factoryName + "' does not exist.");
        }
        try {
            CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params);
            if (charFilterFactory instanceof ResourceLoaderAware) {
                // Resolve any resource files the factory needs relative to this
                // class's classpath.
                ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
            }
            ret.add(charFilterFactory);
        } catch (IllegalArgumentException e) {
            // Re-throw with the analyzer name for context, preserving the cause.
            throw new IllegalArgumentException("While trying to load " + analyzerName + ": " + e.getMessage(), e);
        }
    }
    // toArray on an empty list already yields an empty array, so no special case
    // for ret.size() == 0 is needed.
    return ret.toArray(new CharFilterFactory[0]);
}
Also used : CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) JsonObject(com.google.gson.JsonObject) LinkedList(java.util.LinkedList) JsonArray(com.google.gson.JsonArray) JsonElement(com.google.gson.JsonElement) ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware)

Example 15 with CharFilterFactory

use of org.apache.lucene.analysis.util.CharFilterFactory in project stanbol by apache.

The class LuceneLabelTokenizer defines the method activate:

/**
 * Activates this component: initialises the (optional) CharFilterFactory, the
 * (required) TokenizerFactory, the (optional) list of TokenFilterFactories, and
 * the language configuration from the component's configuration properties.
 *
 * @param ctx the OSGi component context carrying the configuration
 * @throws ConfigurationException if a required property is missing or a value has
 *         an unsupported type
 */
@Activate
protected void activate(ComponentContext ctx) throws ConfigurationException {
    //init the Solr ResourceLoader used for initialising the components
    resourceLoader = new StanbolResourceLoader(parentResourceLoader);
    //init the Solr CharFilterFactory (optional)
    Object value = ctx.getProperties().get(PROPERTY_CHAR_FILTER_FACTORY);
    if (value != null && !value.toString().isEmpty() && !DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        Entry<String, Map<String, String>> charFilterConfig = parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, value.toString());
        charFilterFactory = initAnalyzer(PROPERTY_CHAR_FILTER_FACTORY, charFilterConfig.getKey(), CharFilterFactory.class, charFilterConfig.getValue());
    } else {
        charFilterFactory = null;
    }
    //now initialise the TokenizerFactory (required)
    value = ctx.getProperties().get(PROPERTY_TOKENIZER_FACTORY);
    if (value == null || value.toString().isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        // (fixed "Tokemizer" typo in the error message)
        throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The class name of the Lucene Tokenizer MUST BE configured");
    }
    // BUGFIX: was parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, ...) — a copy/paste
    // slip that attributed tokenizer configuration errors to the char filter property.
    Entry<String, Map<String, String>> tokenizerConfig = parseConfigLine(PROPERTY_TOKENIZER_FACTORY, value.toString());
    tokenizerFactory = initAnalyzer(PROPERTY_TOKENIZER_FACTORY, tokenizerConfig.getKey(), TokenizerFactory.class, tokenizerConfig.getValue());
    //initialise the list of Token Filters; Collection, String[] and single String
    //values are all accepted configuration forms.
    Collection<String> values;
    value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
    if (value == null) {
        values = Collections.emptyList();
    } else if (value instanceof Collection<?>) {
        values = new ArrayList<String>(((Collection<?>) value).size());
        // iterate as Collection<?> to avoid an unchecked cast to Collection<Object>
        for (Object v : (Collection<?>) value) {
            values.add(v.toString());
        }
    } else if (value instanceof String[]) {
        values = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        values = Collections.singleton((String) value);
    } else {
        throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '" + value.getClass() + "' of the parsed value is not supported (supported are " + "Collections, String[] and String values)!");
    }
    for (String filterConfigLine : values) {
        if (filterConfigLine == null || filterConfigLine.isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(filterConfigLine)) {
            //ignore null, empty and the default value
            continue;
        }
        // BUGFIX: was parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, ...) — same
        // copy/paste slip as above; errors now reference the token filter property.
        Entry<String, Map<String, String>> filterConfig = parseConfigLine(PROPERTY_TOKEN_FILTER_FACTORY, filterConfigLine);
        TokenFilterFactory tff = initAnalyzer(PROPERTY_TOKEN_FILTER_FACTORY, filterConfig.getKey(), TokenFilterFactory.class, filterConfig.getValue());
        filterFactories.add(tff);
    }
    //init the language configuration (required)
    value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
    if (value == null) {
        throw new ConfigurationException(LabelTokenizer.SUPPORTED_LANUAGES, "The language " + "configuration MUST BE present!");
    }
    langConf.setConfiguration(ctx.getProperties());
}
Also used : StanbolResourceLoader(org.apache.stanbol.commons.solr.utils.StanbolResourceLoader) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) ConfigurationException(org.osgi.service.cm.ConfigurationException) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Activate(org.apache.felix.scr.annotations.Activate)

Aggregations

CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory)26 TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)16 TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory)12 Analyzer (org.apache.lucene.analysis.Analyzer)7 MultiTermAwareComponent (org.apache.lucene.analysis.util.MultiTermAwareComponent)6 TokenizerChain (org.apache.solr.analysis.TokenizerChain)5 Reader (java.io.Reader)4 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 AbstractAnalysisFactory (org.apache.lucene.analysis.util.AbstractAnalysisFactory)4 StringReader (java.io.StringReader)3 Map (java.util.Map)3 TokenStream (org.apache.lucene.analysis.TokenStream)3 Tokenizer (org.apache.lucene.analysis.Tokenizer)3 ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware)3 SolrException (org.apache.solr.common.SolrException)3 JsonElement (com.google.gson.JsonElement)2 JsonObject (com.google.gson.JsonObject)2 IOException (java.io.IOException)2 KeywordAnalyzer (org.apache.lucene.analysis.core.KeywordAnalyzer)2