
Example 16 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityCoMentionEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured " + "to be processed by this Engine. As this is already checked within the " + "canEnhance(..) method this may indicate an bug in the used " + "EnhanceemntJobManager implementation!");
    }
    if (log.isDebugEnabled()) {
        log.debug("compute co-mentions for ContentItem {} language {}  text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100) });
    }
    LabelTokenizer labelTokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
    if (labelTokenizer == null) {
        throw new EngineException(this, ci, "No LabelTokenizer available!", null);
    }
    //create the in-memory database for the mentioned Entities
    ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(labelTokenizer, language, linkerConfig.getDefaultLanguage());
    Graph metadata = ci.getMetadata();
    Set<IRI> textAnnotations = new HashSet<IRI>();
    ci.getLock().readLock().lock();
    try {
        //iterate over all TextAnnotations (mentions of Entities)
        for (Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
            IRI ta = (IRI) it.next().getSubject();
            entityMentionIndex.registerTextAnnotation(ta, metadata);
            //store the registered text annotations
            textAnnotations.add(ta);
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
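    //entityMentionIndex is intentionally passed twice below: once as the
    //EntitySearcher used to look up already mentioned Entities and once as the
    //callback that receives linking-state notifications (the builder appears
    //to implement both roles)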
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig, entityMentionIndex, linkerConfig, labelTokenizer, entityMentionIndex);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    //TODO: write results
    ci.getLock().writeLock().lock();
    try {
        writeComentions(ci, entityLinker.getLinkedEntities().values(), language, textAnnotations);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) EntitySearcherException(org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Triple(org.apache.clerezza.commons.rdf.Triple) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) LabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer) ContentItemMentionBuilder(org.apache.stanbol.enhancer.engines.entitycomention.impl.ContentItemMentionBuilder) HashSet(java.util.HashSet)
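
The part of this example worth isolating is the lock discipline around ci.getMetadata(): the TextAnnotation iteration runs under the ContentItem's read lock, and the co-mention results are later written under its write lock. A minimal sketch of the read side, assuming RDF_TYPE is the usual static import from org.apache.stanbol.enhancer.servicesapi.rdf.Properties (the helper name collectSubjects is illustrative, not part of the Stanbol API):

import java.util.Iterator;
import java.util.Set;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;

//collect the subjects of all resources with the given rdf:type, holding the
//ContentItem read lock for the whole iteration over the metadata graph
void collectSubjects(ContentItem ci, IRI type, Set<IRI> target) {
    Graph metadata = ci.getMetadata();
    ci.getLock().readLock().lock();
    try {
        for (Iterator<Triple> it = metadata.filter(null, RDF_TYPE, type); it.hasNext(); ) {
            target.add((IRI) it.next().getSubject());
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
}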

Example 17 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityCoMentionEngine method canEnhance.

@Override
public int canEnhance(ContentItem ci) throws EngineException {
    String language = getLanguage(this, ci, false);
    if (language == null || textProcessingConfig.getConfiguration(language) == null) {
        log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.", new Object[] { getName(), ci.getUri(), language });
        return CANNOT_ENHANCE;
    }
    //we need a detected language and the AnalysedText contentPart with Tokens
    AnalysedText at = getAnalysedText(this, ci, false);
    return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText)
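
Examples 16 and 17 are the two halves of the EnhancementEngine contract: canEnhance(..) performs the cheap checks (is the language configured, is an AnalysedText with Tokens present) and computeEnhancements(..) relies on them, which is why it throws an IllegalStateException when a precondition that canEnhance(..) already verified fails again. A rough skeleton of that contract (the class name is illustrative):

import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;

public class SketchEngine implements EnhancementEngine {

    @Override
    public String getName() {
        return "sketch";
    }

    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        //cheap checks only (language, required contentParts); this sketch
        //simply accepts every ContentItem for asynchronous processing
        return ENHANCE_ASYNC;
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        //the expensive work; preconditions were already vetted by canEnhance(..)
    }
}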

Example 18 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityLinkingEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    log.trace(" enhance ci {}", ci.getUri());
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        throw new EngineException(this, ci, "Offline mode is not supported by the used EntitySearcher!", null);
    }
    AnalysedText at = getAnalysedText(this, ci, true);
    log.debug("  > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100) });
    }
    log.debug("  > Language {}", language);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured " + "to be processed by this Engine. As this is already checked within the " + "canEnhance(..) method this may indicate an bug in the used " + "EnhanceemntJobManager implementation!");
    }
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig, entitySearcher, linkerConfig, labelTokenizer);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    if (log.isInfoEnabled()) {
        entityLinker.logStatistics(log);
    }
    //write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language, linkerConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) EntitySearcherException(org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker)
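
All engines on this page consume the AnalysedText through the same two accessors: getSpan() for the full text (used above for the abbreviated debug log) and getTokens() for a flat Iterator over all Tokens (used by canEnhance(..) in Example 17 to test that tokens exist). A minimal sketch of that access pattern (the dump method is illustrative):

import java.util.Iterator;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Token;

//print every token of the analysed text together with its character offsets
void dump(AnalysedText at) {
    System.out.println("text: " + at.getSpan());
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        System.out.println("  [" + token.getStart() + "," + token.getEnd() + "] " + token.getSpan());
    }
}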

Example 19 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class EntityCoReferenceEngineTest method testSpatialCoref.

@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int politicianStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(politicianStartIdx, politicianStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanPolitician = sentence2.addChunk(theStartIdx, politicianStartIdx + "politician".length());
    theGermanPolitician.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanPolitician));
    Value<CorefFeature> subordinateCorefValue = theGermanPolitician.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertTrue(!subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)
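
The test relies on two factories, ciFactory and atFactory, whose setup lies outside the excerpt. A plausible initialization, assuming the in-memory implementations that ship with Stanbol are used (this setup is an assumption, not part of the excerpt):

import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;

//in-memory factories suitable for unit tests (no OSGi environment required)
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();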

Example 20 with AnalysedText

use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.

the class FstLinkingEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at;
    if (linkingMode != LinkingModeEnum.PLAIN) {
        //require AnalysedText contentPart
        at = getAnalysedText(this, ci, true);
    } else {
        //AnalysedText is optional in LinkingModeEnum.PLAIN
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        } catch (ClassCastException e) {
            //an unexpected contentPart was registered under the URI where the AnalysedText is expected
            at = null;
        }
    }
    final String content;
    if (at != null) {
        //we can get the content from the AnalysedText
        content = at.getSpan();
    } else {
        //no AnalysedText ... read the content from the text/plain blob
        try {
            content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
        } catch (IOException e) {
            throw new EngineException(this, ci, "Unable to access plain/text content!", e);
        }
    }
    log.debug("  > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    log.debug("  > Language {}", language);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(content, 100) });
    }
    // TODO: we need to do the same for the default matching language
    TaggingSession session;
    try {
        session = TaggingSession.createSession(indexConfig, language);
    } catch (CorpusException e) {
        throw new EngineException(this, ci, e);
    }
    if (!session.hasCorpus()) {
        //no corpus available for processing the request
        return;
    }
    long taggingStart = System.currentTimeMillis();
    final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
    try {
        //process the language of the document
        Corpus corpus = null;
        if (session.getLanguageCorpus() != null) {
            corpus = session.getLanguageCorpus();
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, corpus, tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { corpus.getIndexedField(), System.currentTimeMillis() - t, d });
        }
        if (session.getDefaultCorpus() != null) {
            if (corpus == null) {
                corpus = session.getDefaultCorpus();
            }
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, session.getDefaultCorpus(), tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
        }
        long taggingEnd = System.currentTimeMillis();
        if (corpus == null) {
            throw new EngineException(this, ci, "No FST corpus found to process contentItem " + "language '" + session.getLanguage() + "'!", null);
        } else {
            if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
                log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
            }
        }
        int matches = match(content, tags.values(), session.entityMentionTypes);
        log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms", new Object[] { matches, session.getSessionDocLoaded(), session.getSessionDocCached(), session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
        if (log.isDebugEnabled() && session.getDocumentCache() != null) {
            log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, e);
    } finally {
        session.close();
    }
    if (log.isTraceEnabled()) {
        log.trace("Tagged Entities:");
        for (Tag tag : tags.values()) {
            log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
        }
    }
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
    //help the GC
    tags.clear();
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Corpus(org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag)
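
One design choice here deserves a note: tags is a NavigableMap keyed by int[] spans, which only works because Tag.SPAN_COMPARATOR supplies a total order over the keys (a plain int[] has neither a useful equals nor a natural ordering, so a TreeMap needs an explicit Comparator). An illustrative comparator showing what such an ordering has to do; this sketches the contract and is not the Stanbol implementation:

import java.util.Comparator;

//order spans by start offset; on equal starts put the longer span first, so
//that enclosing tags sort before the tags they contain
Comparator<int[]> spanComparator = (a, b) -> {
    int byStart = Integer.compare(a[0], b[0]);
    return byStart != 0 ? byStart : Integer.compare(b[1], a[1]);
};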

Aggregations

AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 32
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 15
Token (org.apache.stanbol.enhancer.nlp.model.Token): 13
NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText): 13
IOException (java.io.IOException): 9
IRI (org.apache.clerezza.commons.rdf.IRI): 9
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 8
Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence): 8
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag): 8
Test (org.junit.Test): 7
Graph (org.apache.clerezza.commons.rdf.Graph): 6
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText): 6
Language (org.apache.clerezza.commons.rdf.Language): 5
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 5
Section (org.apache.stanbol.enhancer.nlp.model.Section): 5
Span (org.apache.stanbol.enhancer.nlp.model.Span): 5
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag): 5
ArrayList (java.util.ArrayList): 4
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk): 4
Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value): 4