Search in sources :

Example 1 with ChunkerME

use of opennlp.tools.chunker.ChunkerME in project stanbol by apache.

From the class OpenNlpChunkingEngine, the method computeEnhancements:

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        //no chunker available for this language -> nothing to do
        return;
    }
    //init the Phrase TagSet
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        //for now only created to avoid checks for tagSet == null
        //TODO: in future we might want to automatically create posModels based
        //on tagged texts. However this makes no sense as long we can not
        //persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    //holds PhraseTags created for chunk tags that were not part of the tagSet
    //(will hold all PhraseTags in case an adhoc tagSet is used)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    //hold the write lock while adding Chunk annotations to the AnalysedText
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        //process each sentence separately
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    //the chunker requires a POS tag for every token
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            //NOTE: posList.size() == tokenTextList.size() is guaranteed, because a
            //missing POS tag throws an EngineException above
            String[] tokenPos = posList.toArray(new String[posList.size()]);
            if (log.isTraceEnabled()) {
                //use parameterized logging so the placeholder is actually substituted
                log.trace("Tokens: {}", Arrays.toString(tokenStrings));
            }
            //free memory
            tokenTextList.clear();
            //free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}", Arrays.toString(chunkTags));
            }
            //free memory
            tokenStrings = null;
            //free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            double chunkProps = 0;
            int chunkTokenCount = 0;
            PhraseTag tag = null;
            int i;
            /*
                 * This assumes:
                 *  - 'B-{tag}' ... for start of a new chunk
                 *  - '???' ... anything other for continuing the current chunk
                 *  - 'O' ... no chunk (ends current chunk)
                 */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    //add the current phrase
                    //add at AnalysedText level, because offsets are absolute
                    //NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
                    //reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProps = 0;
                }
                if (start) {
                    //create the new tag
                    tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), //skip 'B-'
                    language);
                }
                if (tag != null) {
                    //count this token for the current chunk
                    chunkProps = chunkProps + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            //write a still-open chunk that ran to the end of the sentence
            if (tag != null) {
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Also used : ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) Section(org.apache.stanbol.enhancer.nlp.model.Section) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) ChunkerME(opennlp.tools.chunker.ChunkerME)

Example 2 with ChunkerME

use of opennlp.tools.chunker.ChunkerME in project stanbol by apache.

From the class OpenNlpChunkingEngine, the method initChunker:

/**
 * Creates a {@link ChunkerME} for the given language, or returns
 * {@code null} when no model can be obtained (the failure is logged,
 * never propagated).
 */
private ChunkerME initChunker(String language) {
    //check if the parsed language is ok
    isLangaugeConfigured(this, languageConfiguration, language, true);
    //an explicitly configured model name overrides the default model
    String modelName = languageConfiguration.getParameter(language, MODEL_PARAM_NAME);
    ChunkerModel model;
    try {
        model = modelName == null
                //the default model
                ? openNLP.getChunkerModel(language)
                : openNLP.getModel(ChunkerModel.class, modelName, null);
    } catch (IOException e) {
        log.warn("Unable to load Chunker model for language '" + language + "' (model: " + (modelName == null ? "default" : modelName) + ")", e);
        return null;
    } catch (RuntimeException e) {
        log.warn("Error while creating ChunkerModel for language '" + language + "' (model: " + (modelName == null ? "default" : modelName) + ")", e);
        return null;
    }
    if (model == null) {
        log.trace("no Chunker Model for language {}", language);
        return null;
    }
    return new ChunkerME(model);
}
Also used : ChunkerModel(opennlp.tools.chunker.ChunkerModel) ChunkerME(opennlp.tools.chunker.ChunkerME) IOException(java.io.IOException)

Example 3 with ChunkerME

use of opennlp.tools.chunker.ChunkerME in project stanbol by apache.

From the class TextAnalyzer, the method getChunker:

/**
 * Lazily initialised {@link ChunkerME} for the configured language;
 * {@code null} when chunking is disabled, the POS-type chunker is
 * forced, or no model is available. A failed load is remembered via
 * {@code chunkerNotAvailable} so it is not retried.
 */
protected final ChunkerME getChunker() {
    //chunking switched off or superseded by the POS-type based chunker
    if (!config.enableChunker || config.forcePosTypeChunker) {
        return null;
    }
    //already initialised, or a previous attempt failed -> return cached state
    if (chunker != null || chunkerNotAvailable) {
        return chunker;
    }
    try {
        ChunkerModel chunkerModel = openNLP.getChunkerModel(language);
        if (chunkerModel == null) {
            log.debug("No Chunker Model for language {}", language);
            chunkerNotAvailable = true;
        } else {
            chunker = new ChunkerME(chunkerModel);
        }
    } catch (IOException e) {
        log.info("Unable to load Chunker Model for language " + language, e);
        chunkerNotAvailable = true;
    }
    return chunker;
}
Also used : ChunkerModel(opennlp.tools.chunker.ChunkerModel) ChunkerME(opennlp.tools.chunker.ChunkerME) IOException(java.io.IOException)

Aggregations

ChunkerME (opennlp.tools.chunker.ChunkerME)3 IOException (java.io.IOException)2 ChunkerModel (opennlp.tools.chunker.ChunkerModel)2 ArrayList (java.util.ArrayList)1 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)1 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)1 Section (org.apache.stanbol.enhancer.nlp.model.Section)1 Token (org.apache.stanbol.enhancer.nlp.model.Token)1 PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)1 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)1 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1