Search in sources :

Example 21 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityLinkingEngine method canEnhance.

/**
 * Decides whether this engine is able to process the given content item.
 * <p>
 * {@link #CANNOT_ENHANCE} is returned when
 * <ul>
 * <li>the engine runs in offline mode but the configured EntitySearcher
 * does not support offline mode,</li>
 * <li>no language could be determined for the content item or no text
 * processing configuration exists for that language, or</li>
 * <li>the content item has no AnalysedText content part, or that part does
 * not contain any token.</li>
 * </ul>
 * Otherwise {@link #ENHANCE_ASYNC} is returned.
 *
 * @param ci the content item to check
 * @return {@link #ENHANCE_ASYNC} or {@link #CANNOT_ENHANCE}
 * @throws EngineException declared by the engine contract
 */
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    log.trace("canEnhancer {}", ci.getUri());

    final boolean searcherUnusableOffline = isOfflineMode() && !entitySearcher.supportsOfflineMode();
    if (searcherUnusableOffline) {
        log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!", getClass().getSimpleName(), getName());
        return CANNOT_ENHANCE;
    }

    // 'false': presumably a lookup that does not throw when the information is missing - confirm in NlpEngineHelper
    final String lang = getLanguage(this, ci, false);
    final boolean languageConfigured = lang != null && textProcessingConfig.getConfiguration(lang) != null;
    if (!languageConfigured) {
        log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.", new Object[] { getName(), ci.getUri(), lang });
        return CANNOT_ENHANCE;
    }

    // besides a language, the AnalysedText content part with at least one Token is required
    final AnalysedText analysedText = getAnalysedText(this, ci, false);
    if (analysedText == null || !analysedText.getTokens().hasNext()) {
        return CANNOT_ENHANCE;
    }
    return ENHANCE_ASYNC;
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText)

Example 22 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityCoReferenceEngine method extractNersAndNounPhrases.

/**
 * Walks over the sentences of the content item's {@link AnalysedText} and
 * collects, per sentence, the chunks carrying a NER annotation and the
 * chunks annotated as noun phrases. If the text has no sentence
 * annotations the whole text is processed as a single sentence.
 * <p>
 * Every collected noun phrase is additionally populated with the tokens
 * and NER chunks of the same sentence that it contains.
 *
 * @param ci
 *            the content item providing the AnalysedText
 * @param ners
 *            output map; receives the NER chunks keyed by the sentence
 *            number (starting at 1). Sentences without NERs get no entry.
 * @param nounPhrases
 *            output list; receives all noun phrases in sentence order
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText analysedText = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sectionIt = analysedText.getSentences();
    if (!sectionIt.hasNext()) {
        // no sentences detected: fall back to the whole text as the only section
        sectionIt = Collections.singleton(analysedText).iterator();
    }
    // sentence numbers are 1-based
    for (int sentenceNo = 1; sectionIt.hasNext(); sentenceNo++) {
        Section current = sectionIt.next();
        List<NounPhrase> phrasesInSection = new ArrayList<NounPhrase>();
        List<Span> nersInSection = new ArrayList<Span>();

        // first pass: classify the chunks of this section
        for (Iterator<Span> chunkIt = current.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk)); chunkIt.hasNext();) {
            Span chunk = chunkIt.next();
            Value<NerTag> nerTag = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (nerTag != null) {
                nersInSection.add(chunk);
            }
            Value<PhraseTag> phraseTag = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            boolean isNounPhrase = phraseTag != null && phraseTag.value().getCategory() == LexicalCategory.Noun;
            if (isNounPhrase) {
                phrasesInSection.add(new NounPhrase(chunk, sentenceNo));
            }
        }

        // second pass: attach contained tokens and NER chunks to each noun phrase
        for (NounPhrase candidate : phrasesInSection) {
            for (Iterator<Span> tokenIt = current.getEnclosed(EnumSet.of(SpanTypeEnum.Token)); tokenIt.hasNext();) {
                Span token = tokenIt.next();
                if (candidate.containsSpan(token)) {
                    candidate.addToken(token);
                }
            }
            for (Span nerChunk : nersInSection) {
                if (candidate.containsSpan(nerChunk)) {
                    candidate.addNerChunk(nerChunk);
                }
            }
        }

        nounPhrases.addAll(phrasesInSection);
        if (!nersInSection.isEmpty()) {
            ners.put(sentenceNo, nersInSection);
        }
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) NounPhrase(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase) ArrayList(java.util.ArrayList) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(org.apache.stanbol.enhancer.nlp.model.Span) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText)

Example 23 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class NlpEngineHelper method initAnalysedText.

/**
 * Retrieves - or if not present - creates the {@link AnalysedText} content
 * part for the parsed {@link ContentItem}. If no {@link Blob} with the
 * mime type '<code>text/plain</code>' is present this method
 * throws an {@link IllegalStateException} (this method internally uses
 * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
 * <code>true</code> as third parameter). Engines should therefore call
 * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
 * <code>false</code> as third parameter in their
 * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
 * <i>NOTE:</i> This method is intended for Engines that want to create an
 * empty {@link AnalysedText} content part. Engines that assume that this
 * content part is already present (e.g. if they consume already existing
 * annotations) should use the
 * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
 * method instead.
 * @param engine the EnhancementEngine calling this method (used for logging)
 * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
 * the {@link AnalysedText} instance (if not present).
 * @param ci the {@link ContentItem}
 * @return the AnalysedText
 * @throws EngineException on any exception while accessing the
 * '<code>text/plain</code>' Blob
 * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
 * present as content part of the parsed {@link ContentItem} or the parsed
 * {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that
 * {@link IllegalStateException} are only thrown if the {@link AnalysedText}
 * ContentPart is not yet present in the parsed {@link ContentItem}
 */
public static AnalysedText initAnalysedText(EnhancementEngine engine, AnalysedTextFactory analysedTextFactory, ContentItem ci) throws EngineException {
    AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    if (at != null) {
        log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
        return at;
    }
    if (analysedTextFactory == null) {
        throw new IllegalStateException("Unable to initialise AnalysedText ContentPart because the parsed AnalysedTextFactory is NULL");
    }
    // resolved before taking the write lock; the AnalysedText is created from this Blob
    Entry<IRI, Blob> textBlob = getPlainText(engine, ci, true);
    //we need to create
    ci.getLock().writeLock().lock();
    try {
        //try again to retrieve (maybe a concurrent thread has created
        //the content part in the meantime)
        at = AnalysedTextUtils.getAnalysedText(ci);
        if (at == null) {
            log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
            at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
        }
    } catch (IOException e) {
        throw new EngineException("Unable to create AnalysedText instance for Blob " + textBlob.getKey() + " of ContentItem " + ci.getUri() + "!", e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
    return at;
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException)

Example 24 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class CorefFeatureSupportTest method testSerializationAndParse.

/**
 * Round-trip check: the serialized form must contain both expected
 * coreference JSON fragments, and parsing it back must pass the
 * AnalysedText equality assertion.
 */
@Test
public void testSerializationAndParse() throws IOException {
    final String json = getSerializedString();
    // serialization side: both coref snippets have to show up in the output
    Assert.assertTrue(json.contains(jsonCorefCheckObama));
    Assert.assertTrue(json.contains(jsonCorefCheckHe));
    // parsing side: verify the AnalysedText rebuilt from the serialized form
    assertAnalysedTextEquality(getParsedAnalysedText(json));
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Test(org.junit.Test)

Example 25 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class CeliAnalyzedTextSentimentAnalysisEngine method computeEnhancements.

/**
 * Sends the text of the content item's {@link AnalysedText} to the CELI
 * sentiment analysis client and stores every returned sentiment expression
 * as a sentiment annotation on a Token covering the reported snippet.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the call to the CELI service fails with an
 * {@link IOException} or a {@link SOAPException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText analysedText = getAnalysedText(this, ci, true);
    final String lang = getLanguage(this, ci, true);
    // result intentionally unused; presumably fails for unconfigured languages - confirm in the helper
    isLangaugeConfigured(this, languageConfig, lang, true);

    final List<SentimentExpression> expressions;
    try {
        expressions = this.client.extractSentimentExpressions(analysedText.getSpan(), lang);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Sentiment Analysis service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/response to the CELI Sentiment Analysis service!", e);
    }

    for (SentimentExpression expression : expressions) {
        // Register the sentiment expression as a Token on the text. NOTE: if a Token with the
        // same start/end positions already exists, addToken returns that existing instance.
        analysedText.addToken(expression.getStartSnippet(), expression.getEndSnippet())
                .addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, new Value<Double>(expression.getSentimentPolarityAsDoubleValue()));
    }
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) SOAPException(javax.xml.soap.SOAPException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException)

Aggregations

AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)32 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)15 Token (org.apache.stanbol.enhancer.nlp.model.Token)13 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)13 IOException (java.io.IOException)9 IRI (org.apache.clerezza.commons.rdf.IRI)9 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)8 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)8 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)8 Test (org.junit.Test)7 Graph (org.apache.clerezza.commons.rdf.Graph)6 NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText)6 Language (org.apache.clerezza.commons.rdf.Language)5 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)5 Section (org.apache.stanbol.enhancer.nlp.model.Section)5 Span (org.apache.stanbol.enhancer.nlp.model.Span)5 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)5 ArrayList (java.util.ArrayList)4 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)4 Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value)4