Search in sources :

Example 1 with Corpus

Use of org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus in the Apache Stanbol project.

From the class FstLinkingEngine, method computeEnhancements.

/**
 * Tags the text of the parsed content item against the corpora of a
 * {@link TaggingSession} and writes the results as enhancements.
 * <p>
 * Processing steps, in order:
 * <ol>
 * <li>Obtain the {@code AnalysedText}. It is requested as required unless
 *     the linking mode is {@code LinkingModeEnum.PLAIN}, where it is
 *     optional and the plain text content is used as fallback.</li>
 * <li>Create a {@link TaggingSession} for the language of the content item.
 *     If the session has no corpus the method returns without writing
 *     anything.</li>
 * <li>Run {@code tag(..)} for the language corpus and for the default
 *     corpus (each only if present), collecting all tags in one map.</li>
 * <li>Run {@code match(..)} over the collected tags.</li>
 * <li>Write the enhancements while holding the write lock of the content
 *     item.</li>
 * </ol>
 * The session is closed on every path once it was created, including the
 * early return for sessions without a corpus.
 *
 * @param ci the content item to process
 * @throws EngineException if the plain text content can not be read, if the
 *         session can not be created ({@code CorpusException}), if neither a
 *         language nor a default corpus is present, or if an
 *         {@code IOException} occurs while tagging/matching. The helper
 *         calls for the AnalysedText and the language are invoked with
 *         {@code true} as last argument - presumably they also throw when
 *         the requested information is missing (NOTE(review): confirm).
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at;
    if (linkingMode != LinkingModeEnum.PLAIN) {
        //require AnalysedText contentPart
        at = getAnalysedText(this, ci, true);
    } else {
        //AnalysedText is optional in LinkingModeEnum.PLAIN
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        } catch (ClassCastException e) {
            //unexpected contentPart found under the URI expecting the AnalysedText
            //deliberately ignored: fall back to the plain text content below
            at = null;
        }
    }
    final String content;
    if (at != null) {
        //we can get the content from the AnalysedText
        content = at.getSpan();
    } else {
        //no AnalysedText ... read it from the text/plain blob
        try {
            content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
        } catch (IOException e) {
            throw new EngineException(this, ci, "Unable to access plain/text content!", e);
        }
    }
    log.debug("  > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    log.debug("  > Language {}", language);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(content, 100) });
    }
    // TODO: we need to do the same for the default matching language
    TaggingSession session;
    try {
        session = TaggingSession.createSession(indexConfig, language);
    } catch (CorpusException e) {
        throw new EngineException(this, ci, e);
    }
    //declared outside of the try block as the tags are needed after the
    //session was closed
    final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
    try {
        //NOTE: this check is done inside the try block so that the finally
        //clause also closes sessions that have no corpus
        if (!session.hasCorpus()) {
            //no corpus available for processing the request
            return;
        }
        long taggingStart = System.currentTimeMillis();
        //process the language of the document
        Corpus corpus = null;
        if (session.getLanguageCorpus() != null) {
            corpus = session.getLanguageCorpus();
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, corpus, tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { corpus.getIndexedField(), System.currentTimeMillis() - t, d });
        }
        //additionally process the default corpus (results go to the same map)
        if (session.getDefaultCorpus() != null) {
            if (corpus == null) {
                //only used to remember that at least one corpus was processed
                corpus = session.getDefaultCorpus();
            }
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, session.getDefaultCorpus(), tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
        }
        long taggingEnd = System.currentTimeMillis();
        if (corpus == null) {
            throw new EngineException(this, ci, "No FST corpus found to process contentItem " + "language '" + session.getLanguage() + "'!", null);
        } else {
            //the sum is only of interest if both corpora were processed
            if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
                log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
            }
        }
        int matches = match(content, tags.values(), session.entityMentionTypes);
        log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms", new Object[] { matches, session.getSessionDocLoaded(), session.getSessionDocCached(), session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
        if (log.isDebugEnabled() && session.getDocumentCache() != null) {
            log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, e);
    } finally {
        session.close();
    }
    if (log.isTraceEnabled()) {
        log.trace("Tagged Entities:");
        for (Tag tag : tags.values()) {
            log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
        }
    }
    //writing to the content item requires the write lock
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
    //help the GC
    tags.clear();
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Corpus(org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag)

Aggregations

IOException (java.io.IOException)1 TreeMap (java.util.TreeMap)1 Corpus (org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus)1 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)1 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)1 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1