Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityCoMentionEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured "
                + "to be processed by this Engine. As this is already checked within the "
                + "canEnhance(..) method this may indicate a bug in the used "
                + "EnhancementJobManager implementation!");
    }
    if (log.isDebugEnabled()) {
        log.debug("compute co-mentions for ContentItem {} language {} text={}",
                new Object[] { ci.getUri().getUnicodeString(), language,
                        StringUtils.abbreviate(at.getSpan(), 100) });
    }
    LabelTokenizer labelTokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
    if (labelTokenizer == null) {
        throw new EngineException(this, ci, "No LabelTokenizer available!", null);
    }
    //create the in-memory database for the mentioned Entities
    ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(
            labelTokenizer, language, linkerConfig.getDefaultLanguage());
    Graph metadata = ci.getMetadata();
    Set<IRI> textAnnotations = new HashSet<IRI>();
    ci.getLock().readLock().lock();
    try {
        //iterate over all TextAnnotations (mentions of Entities)
        for (Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
            IRI ta = (IRI) it.next().getSubject();
            entityMentionIndex.registerTextAnnotation(ta, metadata);
            //store the registered text annotations
            textAnnotations.add(ta);
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    //the mention index is passed twice: as the EntitySearcher and as the linking state callback
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig,
            entityMentionIndex, linkerConfig, labelTokenizer, entityMentionIndex);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    //write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeComentions(ci, entityLinker.getLinkedEntities().values(), language, textAnnotations);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
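For context, here is a minimal sketch of the read-lock idiom used above when scanning the metadata graph for TextAnnotations. The helper name countTextAnnotations is hypothetical; all API calls appear in the snippet itself.

//Hypothetical helper illustrating the read-lock idiom: all read access to
//the ContentItem metadata happens under the read lock, and the lock is
//always released in a finally block.
private int countTextAnnotations(ContentItem ci) {
    int count = 0;
    ci.getLock().readLock().lock();
    try {
        Iterator<Triple> it = ci.getMetadata().filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (it.hasNext()) {
            it.next();
            count++;
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    return count;
}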
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityCoMentionEngine, method canEnhance.
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    String language = getLanguage(this, ci, false);
    if (language == null || textProcessingConfig.getConfiguration(language) == null) {
        log.debug("Engine {} ignores ContentItem {} because language {} is not configured.",
                new Object[] { getName(), ci.getUri(), language });
        return CANNOT_ENHANCE;
    }
    //we need a detected language and the AnalysedText contentPart with Tokens
    AnalysedText at = getAnalysedText(this, ci, false);
    return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
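The return value encodes the engine's contract with the job manager. A sketch of how a caller is expected to honour it, assuming a simplified synchronous job manager; CANNOT_ENHANCE and ENHANCE_ASYNC are the constants referenced above, defined on the EnhancementEngine interface.

//Sketch of the expected caller behaviour: computeEnhancements(..) is only
//invoked when canEnhance(..) did not return CANNOT_ENHANCE.
int state = engine.canEnhance(ci);
if (state != EnhancementEngine.CANNOT_ENHANCE) {
    //ENHANCE_ASYNC additionally permits execution on a worker thread
    engine.computeEnhancements(ci);
}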
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityLinkingEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    log.trace(" enhance ci {}", ci.getUri());
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        throw new EngineException(this, ci, "Offline mode is not supported by the used EntitySearcher!", null);
    }
    AnalysedText at = getAnalysedText(this, ci, true);
    log.debug(" > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}",
                new Object[] { ci.getUri().getUnicodeString(), language,
                        StringUtils.abbreviate(at.getSpan(), 100) });
    }
    log.debug(" > Language {}", language);
    LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured "
                + "to be processed by this Engine. As this is already checked within the "
                + "canEnhance(..) method this may indicate a bug in the used "
                + "EnhancementJobManager implementation!");
    }
    EntityLinker entityLinker = new EntityLinker(at, language, languageConfig,
            entitySearcher, linkerConfig, labelTokenizer);
    //process
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + entityLinker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
    }
    if (log.isInfoEnabled()) {
        entityLinker.logStatistics(log);
    }
    //write results (requires a write lock)
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language,
                linkerConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
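The write path mirrors the read-lock idiom shown earlier: any modification of the metadata graph must happen under the write lock. A minimal sketch, using the same helper calls as the test below; the triple added here is just an example.

//Minimal sketch, assuming `engine` is this EnhancementEngine instance.
ci.getLock().writeLock().lock();
try {
    IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    ci.getMetadata().add(new TripleImpl(ta, DC_LANGUAGE, new PlainLiteralImpl("en")));
} finally {
    ci.getLock().writeLock().unlock();
}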
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityCoReferenceEngineTest, method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION,
            Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1,
            SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int chancellorStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(chancellorStartIdx, chancellorStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanChancellor = sentence2.addChunk(theStartIdx, chancellorStartIdx + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION,
            Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
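The test builds its NLP input by hand, and the general pattern is worth isolating: spans are added with absolute character offsets into the text, and annotations wrap tag objects via Value.value(..). A minimal sketch; the leading token "Angela" and its tags are assumptions for illustration.

//Minimal sketch, assuming the analysed text starts with the token "Angela".
AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
Token token = at.addToken(0, "Angela".length());
token.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("Angela", LexicalCategory.Noun, Pos.ProperNoun)));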
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class FstLinkingEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at;
    if (linkingMode != LinkingModeEnum.PLAIN) {
        //require AnalysedText contentPart
        at = getAnalysedText(this, ci, true);
    } else {
        //AnalysedText is optional in LinkingModeEnum.PLAIN
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        } catch (ClassCastException e) {
            //unexpected contentPart found under the URI expecting the AnalysedText
            at = null;
        }
    }
    final String content;
    if (at != null) {
        //we can get the content from the AnalysedText
        content = at.getSpan();
    } else {
        //no AnalysedText ... read it from the text/plain blob
        try {
            content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
        } catch (IOException e) {
            throw new EngineException(this, ci, "Unable to access plain/text content!", e);
        }
    }
    log.debug(" > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    log.debug(" > Language {}", language);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}",
                new Object[] { ci.getUri().getUnicodeString(), language,
                        StringUtils.abbreviate(content, 100) });
    }
    // TODO: we need to do the same for the default matching language
    TaggingSession session;
    try {
        session = TaggingSession.createSession(indexConfig, language);
    } catch (CorpusException e) {
        throw new EngineException(this, ci, e);
    }
    if (!session.hasCorpus()) {
        //no corpus available for processing the request
        return;
    }
    long taggingStart = System.currentTimeMillis();
    final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
    try {
        //process the language of the document
        Corpus corpus = null;
        if (session.getLanguageCorpus() != null) {
            corpus = session.getLanguageCorpus();
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, corpus, tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] {
                    corpus.getIndexedField(), System.currentTimeMillis() - t, d });
        }
        if (session.getDefaultCorpus() != null) {
            if (corpus == null) {
                corpus = session.getDefaultCorpus();
            }
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, session.getDefaultCorpus(), tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] {
                    session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
        }
        long taggingEnd = System.currentTimeMillis();
        if (corpus == null) {
            throw new EngineException(this, ci, "No FST corpus found to process contentItem "
                    + "language '" + session.getLanguage() + "'!", null);
        } else {
            if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
                log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
            }
        }
        int matches = match(content, tags.values(), session.entityMentionTypes);
        log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms", new Object[] {
                matches, session.getSessionDocLoaded(), session.getSessionDocCached(),
                session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
        if (log.isDebugEnabled() && session.getDocumentCache() != null) {
            log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, e);
    } finally {
        session.close();
    }
    if (log.isTraceEnabled()) {
        log.trace("Tagged Entities:");
        for (Tag tag : tags.values()) {
            log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
        }
    }
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
    //help the GC
    tags.clear();
}
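A condensed sketch of the TaggingSession life-cycle used above, with the actual FST tagging elided: the session is created per request and language, and close() is called in a finally block so corpus resources are released even when tagging fails. Only calls shown in the snippet are used; note that the method above additionally returns early, before entering its try block, when no corpus is available.

//Condensed life-cycle sketch; the tagging work itself is elided.
TaggingSession session = TaggingSession.createSession(indexConfig, language);
try {
    if (session.hasCorpus()) {
        //... run the FST tagging over the content and collect Tags ...
    }
} finally {
    session.close();
}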