
Example 6 with Section

Use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

The class OpenNlpPosTaggingEngine, method detectSentences:

private List<Section> detectSentences(AnalysedText at, String language) {
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    List<Section> sentences;
    if (sentenceDetector != null) {
        sentences = new ArrayList<Section>();
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
            sentences.add(sentence);
        }
    } else {
        sentences = null;
    }
    return sentences;
}
Also used: SentenceDetector (opennlp.tools.sentdetect.SentenceDetector), Section (org.apache.stanbol.enhancer.nlp.model.Section), Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)
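
A minimal read-back sketch (an assumption, not part of the Stanbol sources): once detectSentences(at, language) has added the Sentence spans, they can be iterated again through AnalysedText.getSentences(), e.g. inside the same engine for logging or follow-up processing. java.util.Iterator and org.apache.stanbol.enhancer.nlp.model.Sentence are assumed to be imported, and at / log to be in scope.

// hypothetical read-back, executed after detectSentences(at, language)
Iterator<Sentence> sentenceIt = at.getSentences();
while (sentenceIt.hasNext()) {
    Sentence sentence = sentenceIt.next();
    // Span#getSpan() returns the text covered by the sentence
    log.debug("sentence [{},{}): {}", new Object[] { sentence.getStart(), sentence.getEnd(), sentence.getSpan() });
}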

Example 7 with Section

Use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

The class OpenNlpTokenizerEngine, method computeEnhancements:

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, true);
    Tokenizer tokenizer = getTokenizer(language);
    if (tokenizer == null) {
        log.warn("Tokenizer for language {} is no longer available. " + "This might happen if the model becomes unavailable during enhancement. " + "If this happens more often it might also indicate a bug in the used " + "EnhancementJobManager implementation as the availability is also checked " + "in the canEnhance(..) method of this Enhancement Engine.", language);
        return;
    }
    //Try to use sentences for tokenizing
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        //if no sentences are annotated
        sections = Collections.singleton(at).iterator();
    }
    //for all sentences (or the whole Text - if no sentences available)
    while (sections.hasNext()) {
        Section section = sections.next();
        //Tokenize section
        opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
        for (int i = 0; i < tokenSpans.length; i++) {
            Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
            log.trace(" > add {}", token);
        }
    }
}
Also used: AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText), Token (org.apache.stanbol.enhancer.nlp.model.Token), Tokenizer (opennlp.tools.tokenize.Tokenizer), SimpleTokenizer (opennlp.tools.tokenize.SimpleTokenizer), Section (org.apache.stanbol.enhancer.nlp.model.Section)
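
A minimal downstream sketch (an assumption, not taken from the engine itself): after computeEnhancements(ci) has run, the Tokens added above can be read back from the AnalysedText content part, for example by a later engine in the chain. NlpEngineHelper.getAnalysedText and java.util.Iterator are assumed to be imported, and this / ci / log to be in scope.

// hypothetical access from a downstream enhancement engine
AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
Iterator<Token> tokens = at.getTokens();
while (tokens.hasNext()) {
    Token token = tokens.next();
    // each Token covers the character range [getStart(), getEnd()) of the analysed text
    log.debug("token [{},{}): '{}'", new Object[] { token.getStart(), token.getEnd(), token.getSpan() });
}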

Example 8 with Section

Use of org.apache.stanbol.enhancer.nlp.model.Section in project stanbol by apache.

The class EntityCoReferenceEngine, method extractNersAndNounPhrases:

/**
 * Extracts the NERs and the noun phrases from the given text and puts them in the given lists.
 *
 * @param ci the ContentItem whose AnalysedText is processed
 * @param ners map collecting the NER Spans of each sentence, keyed by sentence number
 * @param nounPhrases list collecting the detected noun phrases
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        // process as single sentence
        sections = Collections.singleton(at).iterator();
    }
    int sentenceCnt = 0;
    while (sections.hasNext()) {
        sentenceCnt++;
        Section section = sections.next();
        List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
        List<Span> sectionNers = new ArrayList<Span>();
        Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
        while (chunks.hasNext()) {
            Span chunk = chunks.next();
            Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (ner != null) {
                sectionNers.add(chunk);
            }
            Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
                sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
            }
        }
        for (NounPhrase nounPhrase : sectionNounPhrases) {
            Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
            while (tokens.hasNext()) {
                Span token = tokens.next();
                if (nounPhrase.containsSpan(token)) {
                    nounPhrase.addToken(token);
                }
            }
            for (Span sectionNer : sectionNers) {
                if (nounPhrase.containsSpan(sectionNer)) {
                    nounPhrase.addNerChunk(sectionNer);
                }
            }
        }
        nounPhrases.addAll(sectionNounPhrases);
        if (!sectionNers.isEmpty()) {
            ners.put(sentenceCnt, sectionNers);
        }
    }
}
Also used: NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag), NounPhrase (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase), ArrayList (java.util.ArrayList), PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag), Section (org.apache.stanbol.enhancer.nlp.model.Section), Span (org.apache.stanbol.enhancer.nlp.model.Span), AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)
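
A minimal caller sketch (an assumption, kept close to the calls visible above): the two collections handed to extractNersAndNounPhrases(..) can be inspected afterwards, e.g. to log the NER spans grouped by their sentence number. LinkedHashMap, ArrayList, Map and List are assumed to be imported, and ci / log to be in scope of the same engine.

// hypothetical caller inside the same engine
Map<Integer, List<Span>> ners = new LinkedHashMap<Integer, List<Span>>();
List<NounPhrase> nounPhrases = new ArrayList<NounPhrase>();
extractNersAndNounPhrases(ci, ners, nounPhrases);
for (Map.Entry<Integer, List<Span>> entry : ners.entrySet()) {
    for (Span nerSpan : entry.getValue()) {
        Value<NerTag> ner = nerSpan.getAnnotation(NlpAnnotations.NER_ANNOTATION);
        // NerTag#getTag() is assumed to return the tag string of the NER annotation
        log.debug("sentence {}: NER '{}' ({})", new Object[] { entry.getKey(), nerSpan.getSpan(), ner.value().getTag() });
    }
}
log.debug("collected {} noun phrases", nounPhrases.size());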

Aggregations

Section (org.apache.stanbol.enhancer.nlp.model.Section): 8
ArrayList (java.util.ArrayList): 5
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 5
Token (org.apache.stanbol.enhancer.nlp.model.Token): 5
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk): 2
Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence): 2
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag): 2
PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag): 2
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag): 2
NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText): 2
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText): 2
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 2
LinkedHashMap (java.util.LinkedHashMap): 1
List (java.util.List): 1
ChunkerME (opennlp.tools.chunker.ChunkerME): 1
NameFinderME (opennlp.tools.namefind.NameFinderME): 1
POSTagger (opennlp.tools.postag.POSTagger): 1
SentenceDetector (opennlp.tools.sentdetect.SentenceDetector): 1
SimpleTokenizer (opennlp.tools.tokenize.SimpleTokenizer): 1
Tokenizer (opennlp.tools.tokenize.Tokenizer): 1