Search in sources :

Example 1 with StanbolResourceLoader

use of org.apache.stanbol.commons.solr.utils.StanbolResourceLoader in project stanbol by apache.

The class KuromojiNlpEngine, method activate.

/**
 * Activates the engine and builds the Japanese (Kuromoji) analysis chain.
 * <p>
 * After delegating to {@code super.activate(ce)} this method creates the
 * {@link StanbolResourceLoader} used to initialise the Lucene components,
 * then instantiates the {@link JapaneseTokenizerFactory} and registers the
 * base form, part-of-speech stop and Katakana stemming filter factories
 * (in this order) in {@code filterFactories}.
 *
 * @param ce the {@link org.osgi.service.component.ComponentContext}
 * @throws ConfigurationException declared by this method; presumably raised
 *         by {@code super.activate(ce)} for an invalid configuration (TODO confirm)
 * @throws IOException declared by this method; presumably raised when a
 *         component fails to load its resources via the resource loader (TODO confirm)
 */
@Activate
protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
    // NOTE: the message previously named the "smartcn" engine (copy/paste error)
    log.info("activating kuromoji nlp engine");
    super.activate(ce);
    //init the Solr ResourceLoader used for initialising the components:
    //first a ResourceLoader for the classloader of this class, wrapping a
    //second StanbolResourceLoader that delegates to the parentResourceLoader
    //(if present).
    resourceLoader = new StanbolResourceLoader(KuromojiNlpEngine.class.getClassLoader(), new StanbolResourceLoader(parentResourceLoader));
    tokenizerFactory = new JapaneseTokenizerFactory(TOKENIZER_FACTORY_CONFIG);
    //the tokenizer factory needs the resource loader to initialise itself
    ((ResourceLoaderAware) tokenizerFactory).inform(resourceLoader);
    //base form filter
    TokenFilterFactory baseFormFilterFactory = new JapaneseBaseFormFilterFactory(BASE_FORM_FILTER_CONFIG);
    filterFactories.add(baseFormFilterFactory);
    //POS filter (also initialised via the resource loader)
    TokenFilterFactory posFilterFactory = new JapanesePartOfSpeechStopFilterFactory(POS_FILTER_CONFIG);
    ((ResourceLoaderAware) posFilterFactory).inform(resourceLoader);
    filterFactories.add(posFilterFactory);
    //Stemming
    TokenFilterFactory stemmFilterFactory = new JapaneseKatakanaStemFilterFactory(STEMM_FILTER_CONFIG);
    filterFactories.add(stemmFilterFactory);
}
Also used : StanbolResourceLoader(org.apache.stanbol.commons.solr.utils.StanbolResourceLoader) JapaneseTokenizerFactory(org.apache.lucene.analysis.ja.JapaneseTokenizerFactory) JapanesePartOfSpeechStopFilterFactory(org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory) JapaneseKatakanaStemFilterFactory(org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware) JapaneseBaseFormFilterFactory(org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Activate(org.apache.felix.scr.annotations.Activate)

Example 2 with StanbolResourceLoader

use of org.apache.stanbol.commons.solr.utils.StanbolResourceLoader in project stanbol by apache.

The class LuceneLabelTokenizer, method activate.

/**
 * Activates the component by reading the analyzer configuration from the
 * properties of the parsed context.
 * <p>
 * The following steps are performed in order:
 * <ol>
 * <li>create the {@link StanbolResourceLoader} based on the {@code parentResourceLoader}
 * <li>initialise the optional {@link CharFilterFactory}
 *     ({@code PROPERTY_CHAR_FILTER_FACTORY}); set to {@code null} if not configured
 * <li>initialise the required {@link TokenizerFactory}
 *     ({@code PROPERTY_TOKENIZER_FACTORY})
 * <li>initialise zero or more {@link TokenFilterFactory} instances
 *     ({@code PROPERTY_TOKEN_FILTER_FACTORY}) and add them to {@code filterFactories}
 * <li>apply the language configuration to {@code langConf}
 * </ol>
 * Values that are {@code null}, empty or equal to {@code DEFAULT_CLASS_NAME_CONFIG}
 * are treated as "not configured".
 *
 * @param ctx the component context providing the configuration properties
 * @throws ConfigurationException if the tokenizer factory is not configured,
 *         the token filter property has an unsupported type or the language
 *         configuration is missing. Presumably also raised by
 *         {@code parseConfigLine} / {@code initAnalyzer} for invalid values
 *         (TODO confirm against those helpers).
 */
@Activate
protected void activate(ComponentContext ctx) throws ConfigurationException {
    //init the Solr ResourceLoader used for initialising the components
    resourceLoader = new StanbolResourceLoader(parentResourceLoader);
    //init the Solr CharFilterFactory (optional)
    Object value = ctx.getProperties().get(PROPERTY_CHAR_FILTER_FACTORY);
    if (value != null && !value.toString().isEmpty() && !DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        Entry<String, Map<String, String>> charFilterConfig = parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, value.toString());
        charFilterFactory = initAnalyzer(PROPERTY_CHAR_FILTER_FACTORY, charFilterConfig.getKey(), CharFilterFactory.class, charFilterConfig.getValue());
    } else {
        charFilterFactory = null;
    }
    //now initialise the TokenizerFactory (required)
    value = ctx.getProperties().get(PROPERTY_TOKENIZER_FACTORY);
    if (value == null || value.toString().isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The class name of the Lucene Tokemizer MUST BE configured");
    }
    //parse with the tokenizer property (was the char filter property by
    //mistake) so that parse errors are reported for the correct property
    Entry<String, Map<String, String>> tokenizerConfig = parseConfigLine(PROPERTY_TOKENIZER_FACTORY, value.toString());
    tokenizerFactory = initAnalyzer(PROPERTY_TOKENIZER_FACTORY, tokenizerConfig.getKey(), TokenizerFactory.class, tokenizerConfig.getValue());
    //initialise the list of Token Filters
    Collection<String> values;
    value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
    if (value == null) {
        values = Collections.emptyList();
    } else if (value instanceof Collection<?>) {
        Collection<?> configured = (Collection<?>) value;
        values = new ArrayList<String>(configured.size());
        for (Object v : configured) {
            //keep null elements as null; they are skipped by the loop below
            values.add(v == null ? null : v.toString());
        }
    } else if (value instanceof String[]) {
        values = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        values = Collections.singleton((String) value);
    } else {
        throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '" + value.getClass() + "' of the parsed value is not supported (supported are " + "Collections, String[] and String values)!");
    }
    for (String filterConfigLine : values) {
        if (filterConfigLine == null || filterConfigLine.isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(filterConfigLine)) {
            //ignore null, empty and the default value
            continue;
        }
        //parse with the token filter property (was the char filter property
        //by mistake) so that parse errors are reported for the correct property
        Entry<String, Map<String, String>> filterConfig = parseConfigLine(PROPERTY_TOKEN_FILTER_FACTORY, filterConfigLine);
        TokenFilterFactory tff = initAnalyzer(PROPERTY_TOKEN_FILTER_FACTORY, filterConfig.getKey(), TokenFilterFactory.class, filterConfig.getValue());
        filterFactories.add(tff);
    }
    //init the language configuration
    value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
    if (value == null) {
        throw new ConfigurationException(LabelTokenizer.SUPPORTED_LANUAGES, "The language " + "configuration MUST BE present!");
    }
    langConf.setConfiguration(ctx.getProperties());
}
Also used : StanbolResourceLoader(org.apache.stanbol.commons.solr.utils.StanbolResourceLoader) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) ConfigurationException(org.osgi.service.cm.ConfigurationException) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Activate(org.apache.felix.scr.annotations.Activate)

Aggregations

Activate (org.apache.felix.scr.annotations.Activate)2 TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)2 StanbolResourceLoader (org.apache.stanbol.commons.solr.utils.StanbolResourceLoader)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 JapaneseBaseFormFilterFactory (org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory)1 JapaneseKatakanaStemFilterFactory (org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory)1 JapanesePartOfSpeechStopFilterFactory (org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory)1 JapaneseTokenizerFactory (org.apache.lucene.analysis.ja.JapaneseTokenizerFactory)1 CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory)1 ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware)1 TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory)1 ConfigurationException (org.osgi.service.cm.ConfigurationException)1