Use of org.apache.lucene.analysis.ja.JapaneseTokenizerFactory in project stanbol by apache.
The class KuromojiNlpEngine, method activate.
/**
 * Activates the Kuromoji (Japanese) NLP engine.
 * <p>
 * Delegates to the superclass activation, builds the resource loader used to
 * initialise the Lucene analysis components, creates the Japanese tokenizer
 * factory and registers the token filter factories in this order: base form,
 * part-of-speech stop filter, katakana stemming.
 *
 * @param ce the {@link org.osgi.service.component.ComponentContext}
 * @throws ConfigurationException declared for configuration problems
 *         (presumably reported by the superclass activation)
 * @throws IOException if a resource-loader-aware factory fails while being
 *         informed of the resource loader
 */
@Activate
protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
    // The previous message said "smartcn", a leftover from the Chinese engine.
    log.info("activating kuromoji tokenizing engine");
    super.activate(ce);
    // Init the Solr ResourceLoader used for initialising the components:
    // first a loader for this class's classloader, which wraps a second
    // loader around the parentResourceLoader.
    resourceLoader = new StanbolResourceLoader(KuromojiNlpEngine.class.getClassLoader(),
            new StanbolResourceLoader(parentResourceLoader));
    tokenizerFactory = new JapaneseTokenizerFactory(TOKENIZER_FACTORY_CONFIG);
    // The factory must be informed of the resource loader before it is used.
    ((ResourceLoaderAware) tokenizerFactory).inform(resourceLoader);
    // base form filter
    TokenFilterFactory baseFormFilterFactory = new JapaneseBaseFormFilterFactory(BASE_FORM_FILTER_CONFIG);
    filterFactories.add(baseFormFilterFactory);
    // POS filter (resource-loader aware, so it is informed before being registered)
    TokenFilterFactory posFilterFactory = new JapanesePartOfSpeechStopFilterFactory(POS_FILTER_CONFIG);
    ((ResourceLoaderAware) posFilterFactory).inform(resourceLoader);
    filterFactories.add(posFilterFactory);
    // Stemming
    TokenFilterFactory stemmFilterFactory = new JapaneseKatakanaStemFilterFactory(STEMM_FILTER_CONFIG);
    filterFactories.add(stemmFilterFactory);
}
Aggregations