Search in sources :

Example 6 with TextAnalyzerConfig

Use of org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig in project stanbol by apache.

Shown below: the method activateTextAnalyzerConfig of the class KeywordLinkingEngine.

/**
 * Initialises the {@link TextAnalyzer} related configuration of this engine.
 * <p>
 * The following properties are read from the parsed configuration:
 * <ul>
 * <li>{@link #PROCESSED_LANGUAGES}: a comma separated list of languages.
 * If the property is not present {@code DEFAULT_LANGUAGES} is used. If it
 * is present but blank an empty set is used. Blank entries of the list are
 * ignored.</li>
 * <li>{@link #MIN_POS_TAG_PROBABILITY}: either a {@link Number} or the
 * string representation of a double. If the property is not present (or
 * blank) {@code DEFAULT_MIN_POS_TAG_PROBABILITY} is used. Values greater
 * than 1 and NaN are rejected. According to the error message of this
 * method values &lt; 0 deactivate the feature.</li>
 * <li>{@link #KEYWORD_TOKENIZER}: either a {@link Boolean} or its string
 * representation. If the property is not present (or blank) the setting of
 * the newly created {@link TextAnalyzerConfig} is not changed.</li>
 * </ul>
 * Finally the {@code analysedContentFactory} is (re-)created based on the
 * new configuration.
 *
 * @param configuration the OSGI component configuration
 * @throws ConfigurationException if the value of
 * {@link #MIN_POS_TAG_PROBABILITY} can not be parsed as a double, is NaN
 * or is greater than 1
 */
protected final void activateTextAnalyzerConfig(Dictionary<String, Object> configuration) throws ConfigurationException {
    nlpConfig = new TextAnalyzerConfig();
    Object value;

    //the processed languages config
    value = configuration.get(PROCESSED_LANGUAGES);
    if (value == null) {
        this.languages = DEFAULT_LANGUAGES;
    } else if (value.toString().trim().isEmpty()) {
        this.languages = Collections.emptySet();
    } else {
        languages = new HashSet<String>();
        //NOTE: String#split(..) never returns null elements, so only
        //      blank entries (e.g. "en,,de" or "en, ") need to be skipped
        for (String language : value.toString().split(",")) {
            language = language.trim();
            if (!language.isEmpty()) {
                languages.add(language);
            }
        }
    }

    //the min POS tag probability config
    value = configuration.get(MIN_POS_TAG_PROBABILITY);
    double minPosTagProb;
    if (value instanceof Number) {
        minPosTagProb = ((Number) value).doubleValue();
    } else if (value != null && !value.toString().trim().isEmpty()) {
        //blank values are treated like missing ones (same as for the
        //processed languages) instead of failing with a parse error
        try {
            minPosTagProb = Double.parseDouble(value.toString().trim());
        } catch (NumberFormatException e) {
            throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, "Unable to parse the min POS tag probability from the parsed value " + value, e);
        }
    } else {
        minPosTagProb = DEFAULT_MIN_POS_TAG_PROBABILITY;
    }
    //NaN needs to be checked explicitly: "NaN" is accepted by the double
    //parser and any comparison with NaN (including "> 1") is false
    if (Double.isNaN(minPosTagProb) || minPosTagProb > 1) {
        throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, "The configured min POS tag probability MUST BE in the range [0..1] " + "or < 0 to deactivate this feature (parsed value " + value + ")!");
    }
    nlpConfig.setMinPosTagProbability(minPosTagProb);

    //the keyword tokenizer config
    value = configuration.get(KEYWORD_TOKENIZER);
    if (value instanceof Boolean) {
        nlpConfig.forceKeywordTokenizer((Boolean) value);
    } else if (value != null) {
        //trim before parsing: Boolean.valueOf(" true ") would be false
        String keywordTokenizerState = value.toString().trim();
        if (!keywordTokenizerState.isEmpty()) {
            nlpConfig.forceKeywordTokenizer(Boolean.valueOf(keywordTokenizerState));
        }
    }
    //nlpConfig.enablePosTypeChunker(false);
    //nlpConfig.enableChunker(false);
    analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP, nlpConfig);
}
Also used : ConfigurationException(org.osgi.service.cm.ConfigurationException) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) HashSet(java.util.HashSet)

Aggregations

TextAnalyzerConfig (org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig)6 Test (org.junit.Test)5 TextAnalyzer (org.apache.stanbol.commons.opennlp.TextAnalyzer)3 AnalysedText (org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText)3 HashMap (java.util.HashMap)2 EntityLinkerConfig (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig)2 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 KeywordLinkingEngine (org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine)1 EntityLinker (org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker)1 LinkedEntity (org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity)1 Suggestion (org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion)1 OpenNlpAnalysedContentFactory (org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory)1 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)1 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)1