Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityLinkingEngine, method canEnhance.
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    log.trace("canEnhance {}", ci.getUri());
    if (isOfflineMode() && !entitySearcher.supportsOfflineMode()) {
        log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!", getClass().getSimpleName(), getName());
        return CANNOT_ENHANCE;
    }
    String language = getLanguage(this, ci, false);
    if (language == null || textProcessingConfig.getConfiguration(language) == null) {
        log.debug("Engine {} ignores ContentItem {} because language {} is not configured.", new Object[] { getName(), ci.getUri(), language });
        return CANNOT_ENHANCE;
    }
    //we need a detected language and the AnalysedText content part with Tokens
    AnalysedText at = getAnalysedText(this, ci, false);
    return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
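Note how each helper is called with false as the last argument, so canEnhance only probes for prerequisites without throwing. The matching computeEnhancements typically re-fetches the same resources with true so that a missing AnalysedText or unsupported language fails loudly at processing time. The skeleton below is a minimal sketch, not the actual EntityLinkingEngine implementation; the processDocument helper is hypothetical.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // true: throw instead of returning null if the prerequisites checked in canEnhance are gone
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    // hypothetical helper that runs the actual entity linking over the Tokens of 'at'
    processDocument(at, language);
}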
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class EntityCoReferenceEngine, method extractNersAndNounPhrases.
/**
 * Extracts the NERs and the noun phrases from the given text and puts them into the given collections.
 *
 * @param ci the {@link ContentItem} to process
 * @param ners the map collecting, per sentence number, the NER {@link Span}s found in that sentence
 * @param nounPhrases the list collecting the detected {@link NounPhrase}s
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        // process as single sentence
        sections = Collections.singleton(at).iterator();
    }
    int sentenceCnt = 0;
    while (sections.hasNext()) {
        sentenceCnt++;
        Section section = sections.next();
        List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
        List<Span> sectionNers = new ArrayList<Span>();
        Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
        while (chunks.hasNext()) {
            Span chunk = chunks.next();
            Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (ner != null) {
                sectionNers.add(chunk);
            }
            Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
                sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
            }
        }
        for (NounPhrase nounPhrase : sectionNounPhrases) {
            Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
            while (tokens.hasNext()) {
                Span token = tokens.next();
                if (nounPhrase.containsSpan(token)) {
                    nounPhrase.addToken(token);
                }
            }
            for (Span sectionNer : sectionNers) {
                if (nounPhrase.containsSpan(sectionNer)) {
                    nounPhrase.addNerChunk(sectionNer);
                }
            }
        }
        nounPhrases.addAll(sectionNounPhrases);
        if (!sectionNers.isEmpty()) {
            ners.put(sentenceCnt, sectionNers);
        }
    }
}
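A caller typically prepares the two collections before invoking the method and then works with the filled structures, roughly as sketched below. The surrounding computeEnhancements context (the ci variable) is assumed here, not taken from the engine's actual source.
Map<Integer, List<Span>> ners = new HashMap<Integer, List<Span>>();
List<NounPhrase> nounPhrases = new ArrayList<NounPhrase>();
extractNersAndNounPhrases(ci, ners, nounPhrases);
// ners now maps the 1-based sentence number to the NER Spans of that sentence,
// and every NounPhrase carries its Tokens plus any NER chunks it encloses.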
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class NlpEngineHelper, method initAnalysedText.
/**
 * Retrieves - or if not present - creates the {@link AnalysedText} content
 * part for the parsed {@link ContentItem}. If no {@link Blob} with the
 * mime type '<code>text/plain</code>' is present this method
 * throws an {@link IllegalStateException} (this method internally uses
 * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
 * <code>true</code> as third parameter). Users of this method should
 * therefore call {@link #getPlainText(EnhancementEngine, ContentItem, boolean)}
 * with <code>false</code> as third parameter in their
 * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
 * <i>NOTE:</i> This method is intended for Engines that want to create an
 * empty {@link AnalysedText} content part. Engines that assume that this
 * content part is already present (e.g. because they consume already existing
 * annotations) should use the
 * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
 * method instead.
 * @param engine the EnhancementEngine calling this method (used for logging)
 * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
 * the {@link AnalysedText} instance (if not present).
 * @param ci the {@link ContentItem}
 * @return the AnalysedText
 * @throws EngineException on any exception while accessing the
 * '<code>text/plain</code>' Blob
 * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
 * present as content part of the parsed {@link ContentItem} or the parsed
 * {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that an
 * {@link IllegalStateException} is only thrown if the {@link AnalysedText}
 * ContentPart is not yet present in the parsed {@link ContentItem}
 */
public static AnalysedText initAnalysedText(EnhancementEngine engine, AnalysedTextFactory analysedTextFactory, ContentItem ci) throws EngineException {
    AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    if (at == null) {
        if (analysedTextFactory == null) {
            throw new IllegalStateException("Unable to initialise AnalysedText " + "ContentPart because the parsed AnalysedTextFactory is NULL");
        }
        Entry<IRI, Blob> textBlob = getPlainText(engine, ci, true);
        //we need to create the content part
        ci.getLock().writeLock().lock();
        try {
            //try again to retrieve (maybe a concurrent thread has created
            //the content part in the meantime)
            at = AnalysedTextUtils.getAnalysedText(ci);
            if (at == null) {
                log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
                at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
            }
        } catch (IOException e) {
            throw new EngineException("Unable to create AnalysedText instance for Blob " + textBlob.getKey() + " of ContentItem " + ci.getUri() + "!", e);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } else {
        log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
    }
    return at;
}
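Following the javadoc above, an engine that wants to create its own AnalysedText content part would probe for the plain-text Blob non-destructively in canEnhance and call initAnalysedText only in computeEnhancements. A minimal sketch of that pairing, assuming the engine holds an AnalysedTextFactory reference named analysedTextFactory:
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    // false: just check for a 'text/plain' Blob instead of throwing if it is missing
    return getPlainText(this, ci, false) != null ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // creates the AnalysedText content part unless another engine already did
    AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    // ... add Sentences, Tokens and annotations to 'at' ...
}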
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class CorefFeatureSupportTest, method testSerializationAndParse.
@Test
public void testSerializationAndParse() throws IOException {
    String serialized = getSerializedString();
    Assert.assertTrue(serialized.contains(jsonCorefCheckObama));
    Assert.assertTrue(serialized.contains(jsonCorefCheckHe));
    AnalysedText parsedAt = getParsedAnalysedText(serialized);
    assertAnalysedTextEquality(parsedAt);
}
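The serialization helpers and the equality assertion are defined elsewhere in the test hierarchy. A minimal version of such an equality check could walk the spans of the original and the parsed AnalysedText in parallel, roughly as sketched below; this is an illustrative assumption, not the test's actual assertAnalysedTextEquality implementation.
private static void assertSameSpans(AnalysedText expected, AnalysedText parsed) {
    Iterator<Span> expectedSpans = expected.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    Iterator<Span> parsedSpans = parsed.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    while (expectedSpans.hasNext()) {
        Assert.assertTrue(parsedSpans.hasNext());
        Span expectedSpan = expectedSpans.next();
        Span parsedSpan = parsedSpans.next();
        // same span type and same character offsets in the analysed text
        Assert.assertEquals(expectedSpan.getType(), parsedSpan.getType());
        Assert.assertEquals(expectedSpan.getStart(), parsedSpan.getStart());
        Assert.assertEquals(expectedSpan.getEnd(), parsedSpan.getEnd());
    }
    Assert.assertFalse(parsedSpans.hasNext());
}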
Use of org.apache.stanbol.enhancer.nlp.model.AnalysedText in project stanbol by apache.
The class CeliAnalyzedTextSentimentAnalysisEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<SentimentExpression> seList;
    try {
        seList = this.client.extractSentimentExpressions(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Sentiment Analysis service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/response to the CELI Sentiment Analysis service!", e);
    }
    for (SentimentExpression se : seList) {
        //Add the Sentiment Expression as Token to the text. NOTE that if a Token with the
        //same start/end positions already exists, this method returns the existing instance
        Token token = at.addToken(se.getStartSnippet(), se.getEndSnippet());
        token.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, new Value<Double>(se.getSentimentPolarityAsDoubleValue()));
    }
}
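A downstream engine or test can read these sentiment values back from the same content part; a short sketch using only the AnalysedText token API already shown in the snippets above:
Iterator<Token> tokens = at.getTokens();
while (tokens.hasNext()) {
    Token token = tokens.next();
    Value<Double> sentiment = token.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
    if (sentiment != null) {
        // sentiment.value() is the polarity added above; token.getSpan() is the covered text
        log.debug("sentiment {} for '{}'", sentiment.value(), token.getSpan());
    }
}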