use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.
the class RestfulNlpAnalysisEngine method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // validate that the service is active
    checkRESTfulNlpAnalysisService();
    // get/create the AnalysedText
    final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
    final Blob blob = at.getBlob();
    // send the text to the server
    final String language = getLanguage(this, ci, true);
    final HttpPost request = new HttpPost(analysisServiceUrl);
    request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(),
            ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    try {
        AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {
            public AnalysedText run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new AnalysisResponseHandler(at));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException) {
            // force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else if (e instanceof IOException) {
            // force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    if (writeTextAnnotations) {
        // if enabled, fise:TextAnnotations are created for Named Entities and Sentiments
        double positiveSent = 0.0;
        int positiveCount = 0;
        double negativeSent = 0.0;
        int negativeCount = 0;
        int sentimentCount = 0;
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
        Sentence context = null;
        Graph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try {
            // write TextAnnotations for Named Entities
            while (spans.hasNext()) {
                Span span = spans.next();
                switch (span.getType()) {
                    case Sentence:
                        context = (Sentence) span;
                        // fall-through intended!
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if (nerAnno != null) {
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            // add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                                    new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                                    new PlainLiteralImpl(context == null
                                            ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart())
                                            : context.getSpan(), lang)));
                            // add the NER type
                            if (nerAnno.value().getType() != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, nerAnno.value().getType()));
                            }
                            if (nerAnno.probability() >= 0) {
                                metadata.add(new TripleImpl(ta, ENHANCER_CONFIDENCE, lf.createTypedLiteral(nerAnno.probability())));
                            }
                        }
                        Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                        if (sentimentAnnotation != null) {
                            // this span has a sentiment assigned
                            Double sentiment = sentimentAnnotation.value();
                            // create a fise:TextAnnotation for the sentiment
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment)));
                            // add the generic dc:type used for all Sentiment annotations
                            metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                            // determine the specific dc:type for the sentiment annotation
                            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(span.getType());
                            if (ssoType != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, ssoType));
                            }
                            // keep statistics for the overall sentiment of the document
                            sentimentCount++;
                            if (sentiment > 0) {
                                positiveSent += sentiment;
                                positiveCount++;
                            } else if (sentiment < 0) {
                                negativeSent += sentiment;
                                negativeCount++;
                            }
                        }
                        break;
                }
            }
            // add the annotation for the overall sentiment of the document
            if (sentimentCount > 0) {
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // calculate the average sentiment for the document
                // TODO: think of a better way to calculate a general sentiment value for a document
                metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY,
                        lf.createTypedLiteral((positiveSent + negativeSent) / sentimentCount)));
                if (positiveCount > 0) {
                    // average positive sentiment for the document
                    metadata.add(new TripleImpl(ta, POSITIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(positiveSent / positiveCount)));
                }
                if (negativeCount > 0) {
                    // average negative sentiment for the document
                    metadata.add(new TripleImpl(ta, NEGATIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(negativeSent / negativeCount)));
                }
                metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                metadata.add(new TripleImpl(ta, DC_TYPE, DOCUMENT_SENTIMENT_TYPE));
            }
            // no sentiment annotations present ... nothing to do
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    // else do not write fise:TextAnnotations
}
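The document-level sentiment block at the end of the method reduces the per-span values to three numbers: an overall average over all sentiment-bearing spans, plus separate positive and negative averages. A minimal, self-contained sketch of just that arithmetic, independent of the Stanbol APIs; the input list stands in for the collected SENTIMENT_ANNOTATION values, and the class and variable names are illustrative:

import java.util.Arrays;
import java.util.List;

/** Minimal sketch of the document-level sentiment averaging shown above. */
public class SentimentAverages {

    public static void main(String[] args) {
        // stand-in for the per-span sentiment values collected while iterating
        List<Double> sentiments = Arrays.asList(0.8, -0.4, 0.6, -0.9);
        double positiveSent = 0.0, negativeSent = 0.0;
        int positiveCount = 0, negativeCount = 0;
        for (Double sentiment : sentiments) {
            if (sentiment > 0) {
                positiveSent += sentiment;
                positiveCount++;
            } else if (sentiment < 0) {
                negativeSent += sentiment;
                negativeCount++;
            }
        }
        int sentimentCount = sentiments.size();
        // the overall average mixes positive and negative contributions
        System.out.println("overall:  " + (positiveSent + negativeSent) / sentimentCount);
        if (positiveCount > 0) {
            System.out.println("positive: " + positiveSent / positiveCount);
        }
        if (negativeCount > 0) {
            System.out.println("negative: " + negativeSent / negativeCount);
        }
    }
}

For the four sample values this prints roughly 0.025 overall, 0.7 positive, and -0.65 negative, which is exactly the shape of data the engine attaches to the document-level fise:TextAnnotation.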
use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.
the class Nif20MetadataEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    // now iterate over the AnalysedText data and create the RDF representation
    // TODO: make configurable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        // write the context
        IRI text = writeSpan(metadata, base, at, language, at);
        metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            // TODO: filter Spans based on additional requirements
            // (1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            // write the context
            metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
            // (2) add the relations between the different spans
            switch (span.getType()) {
                case Sentence:
                    if (sentence != null && writePrevNext) {
                        metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousSentence.getUri(), sentence));
                    }
                    if (word != null) {
                        metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null && writeHierary) {
                        metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        if (writeHierary) {
                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
                        }
                        // metadata.add(new TripleImpl(sentence, Nif20.word.getUri(), current));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
                            firstWordInSentence = false;
                        }
                    }
                    if (writeHierary && phrase != null && !phrase.equals(current)) {
                        metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
                    }
                    if (word != null && writePrevNext) {
                        metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            // (3) add specific information such as POS, chunk type ...
            Nif20Helper.writePhrase(metadata, span, current);
            Nif20Helper.writePos(metadata, span, current);
            // TODO: sentiment support
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
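The sentence and word chaining in this method follows a single pattern: remember the URI of the previous element of each span type and, when the next one arrives, emit a symmetric pair of nif:next*/nif:previous* triples. A reduced sketch of that pattern, with plain Strings standing in for the IRI and Graph types (all names are illustrative):

import java.util.Arrays;
import java.util.Iterator;

/** Reduced sketch of the nif:previousWord / nif:nextWord chaining pattern. */
public class PrevNextChaining {

    public static void main(String[] args) {
        Iterator<String> words = Arrays.asList("urn:w1", "urn:w2", "urn:w3").iterator();
        String previous = null; // last word URI seen; null before the first word
        while (words.hasNext()) {
            String current = words.next();
            if (previous != null) {
                // in the engine these become triples added to the metadata Graph
                System.out.println(previous + " nif:nextWord " + current);
                System.out.println(current + " nif:previousWord " + previous);
            }
            previous = current;
        }
    }
}

The engine keeps one such "previous" variable per span type (sentence, phrase, word), which is why the switch statement updates sentence, phrase, or word at the end of each case.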
use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.
the class SentimentSummarizationEngine method extractSentiments.
/**
 * Extracts {@link Sentiment}s for words with a {@link NlpAnnotations#SENTIMENT_ANNOTATION}.
 * The {@link NlpAnnotations#POS_ANNOTATION}s are used to link those words with
 * {@link LexicalCategory#Noun}s.
 * @param at the AnalysedText to process
 * @param language the language of the analyzed text
 * @return the {@link Sentiment} instances organised along {@link Sentence}s. If
 * no {@link Sentence}s are present on the parsed {@link AnalysedText}, then all
 * {@link Sentiment}s are added to the {@link AnalysedText} itself. Otherwise only
 * {@link Sentiment}s not contained within a {@link Sentence} are added to the
 * {@link AnalysedText}.
 */
private List<SentimentPhrase> extractSentiments(AnalysedText at, String language) {
    // we do use Sentences (optional) and Tokens (required)
    Iterator<Span> tokenIt = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
    List<Sentiment> sentimentTokens = new ArrayList<Sentiment>(32);
    NavigableMap<Integer, Token> negations = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> nounsAndPronouns = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> verbs = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> conjuctions = new TreeMap<Integer, Token>();
    NavigableMap<Integer, Token> sectionBorders = new TreeMap<Integer, Token>();
    boolean firstTokenInSentence = true;
    Sentence sentence = null;
    final List<SentimentPhrase> sentimentPhrases = new ArrayList<SentimentPhrase>();
    while (tokenIt.hasNext()) {
        Span span = tokenIt.next();
        switch (span.getType()) {
            case Token:
                Token word = (Token) span;
                Integer wordIndex = sentimentTokens.size();
                Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                boolean addToList = false;
                Sentiment sentiment = null;
                if (sentimentAnnotation != null && sentimentAnnotation.value() != null
                        && !sentimentAnnotation.value().equals(ZERO)) {
                    sentiment = new Sentiment(word, sentimentAnnotation.value(),
                            sentence == null || word.getEnd() > sentence.getEnd() ? null : sentence);
                    addToList = true;
                }
                if (isNegation((Token) span, language)) {
                    addToList = true;
                    negations.put(wordIndex, word);
                } else if (isNoun(word, firstTokenInSentence, language) || isPronoun(word, language)) {
                    addToList = true;
                    nounsAndPronouns.put(wordIndex, word);
                } else if (isSectionBorder(word, language)) {
                    addToList = true;
                    sectionBorders.put(wordIndex, word);
                } else if (isVerb(word, language)) {
                    addToList = true;
                    verbs.put(wordIndex, word);
                } else if (isCoordinatingConjuction(word, language)) {
                    addToList = true;
                    conjuctions.put(wordIndex, word);
                } else if (isCountable(word, language)) {
                    addToList = true;
                }
                if (log.isDebugEnabled()) {
                    Value<PosTag> pos = word.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                    log.debug(" [{}] '{}' pos: {}, sentiment {}", new Object[] {
                            addToList ? sentimentTokens.size() : "-", word.getSpan(),
                            pos.value().getCategories(),
                            sentiment == null ? "none" : sentiment.getValue() });
                }
                if (addToList) {
                    // add the token
                    sentimentTokens.add(sentiment);
                }
                firstTokenInSentence = false;
                break;
            case Sentence:
                // cleanup the previous sentence
                sentimentPhrases.addAll(summarizeSentence(sentimentTokens, negations, nounsAndPronouns, verbs, conjuctions, sectionBorders));
                negations.clear();
                nounsAndPronouns.clear();
                sentimentTokens.clear();
                verbs.clear();
                // reset the conjunctions for the next sentence as well
                conjuctions.clear();
                sectionBorders.clear();
                firstTokenInSentence = true;
                sentence = (Sentence) span;
                break;
            case TextSection:
                break;
            default:
                break;
        }
    }
    sentimentPhrases.addAll(summarizeSentence(sentimentTokens, negations, nounsAndPronouns, verbs, conjuctions, sectionBorders));
    return sentimentPhrases;
}
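The NavigableMaps keyed by token index exist so that summarizeSentence (not shown in this section) can cheaply find the negation, noun, or verb closest to a sentiment-bearing word. A hypothetical sketch of such a nearest-neighbour lookup using floorEntry/ceilingEntry; the map contents and the assumption about how summarizeSentence uses them are illustrative only:

import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

/** Hypothetical sketch: nearest-noun lookup via a NavigableMap keyed by token index. */
public class NearestNounLookup {

    public static void main(String[] args) {
        NavigableMap<Integer, String> nouns = new TreeMap<Integer, String>();
        nouns.put(2, "service");
        nouns.put(9, "quality");
        int sentimentIndex = 6; // index of a sentiment-bearing word, e.g. "excellent"
        Map.Entry<Integer, String> before = nouns.floorEntry(sentimentIndex);
        Map.Entry<Integer, String> after = nouns.ceilingEntry(sentimentIndex);
        // pick whichever noun is closer to the sentiment word
        if (before != null && (after == null
                || sentimentIndex - before.getKey() <= after.getKey() - sentimentIndex)) {
            System.out.println("nearest noun: " + before.getValue());
        } else if (after != null) {
            System.out.println("nearest noun: " + after.getValue());
        }
    }
}

This is why the method records token indices rather than character offsets: index distance in the sentiment token list is what matters when relating words within a sentence.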
use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.
the class AnalyzedTextSerializer method serialize.
/**
 * Serializes the given {@link AnalysedText} to the {@link OutputStream} using
 * the {@link Charset}.
 * @param at the {@link AnalysedText} to serialize
 * @param out the {@link OutputStream}
 * @param charset the {@link Charset}. UTF-8 is used as the default if <code>null</code>
 * is passed
 */
public void serialize(AnalysedText at, OutputStream out, Charset charset) throws IOException {
    if (at == null) {
        throw new IllegalArgumentException("The parsed AnalysedText MUST NOT be NULL!");
    }
    if (out == null) {
        throw new IllegalArgumentException("The parsed OutputStream MUST NOT be NULL");
    }
    if (charset == null) {
        charset = UTF8;
    }
    JsonFactory jsonFactory = mapper.getJsonFactory();
    JsonGenerator jg = jsonFactory.createJsonGenerator(new OutputStreamWriter(out, charset));
    jg.useDefaultPrettyPrinter();
    jg.writeStartObject();
    jg.writeArrayFieldStart("spans");
    jg.writeTree(writeSpan(at));
    for (Iterator<Span> it = at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class)); it.hasNext(); ) {
        jg.writeTree(writeSpan(it.next()));
    }
    jg.writeEndArray();
    jg.writeEndObject();
    jg.close();
}
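A typical call, assuming an AnalyzedTextSerializer instance and an AnalysedText obtained elsewhere; how those are acquired is environment-specific, and this usage sketch relies only on the serialize signature shown above (a null charset falls back to UTF-8):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

// illustrative helper; the serializer and the AnalysedText must be provided by the caller
void writeAnalysis(AnalyzedTextSerializer serializer, AnalysedText at) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // passing null for the charset falls back to UTF-8 (see the method above)
    serializer.serialize(at, out, null);
    System.out.println(new String(out.toByteArray(), StandardCharsets.UTF_8));
}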
use of org.apache.stanbol.enhancer.nlp.model.Span in project stanbol by apache.
the class EntityCoReferenceEngine method extractNersAndNounPhrases.
/**
 * Extracts the NERs and the noun phrases from the given text and collects them in the given map and list.
 *
 * @param ci the {@link ContentItem} to process
 * @param ners map collecting the NER {@link Span}s per sentence index
 * @param nounPhrases list collecting the extracted {@link NounPhrase}s
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        // process as a single sentence
        sections = Collections.singleton(at).iterator();
    }
    int sentenceCnt = 0;
    while (sections.hasNext()) {
        sentenceCnt++;
        Section section = sections.next();
        List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
        List<Span> sectionNers = new ArrayList<Span>();
        Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
        while (chunks.hasNext()) {
            Span chunk = chunks.next();
            Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (ner != null) {
                sectionNers.add(chunk);
            }
            Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
                sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
            }
        }
        for (NounPhrase nounPhrase : sectionNounPhrases) {
            Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
            while (tokens.hasNext()) {
                Span token = tokens.next();
                if (nounPhrase.containsSpan(token)) {
                    nounPhrase.addToken(token);
                }
            }
            for (Span sectionNer : sectionNers) {
                if (nounPhrase.containsSpan(sectionNer)) {
                    nounPhrase.addNerChunk(sectionNer);
                }
            }
        }
        nounPhrases.addAll(sectionNounPhrases);
        if (!sectionNers.isEmpty()) {
            ners.put(sentenceCnt, sectionNers);
        }
    }
}
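NounPhrase.containsSpan is not shown in this section; a plausible implementation (an assumption, not the actual Stanbol source) is a simple offset-containment test using the same Span start/end offsets the other snippets already rely on:

// assumed sketch: a candidate Span is contained if its character
// offsets fall inside the chunk that backs the noun phrase
boolean containsSpan(Span chunk, Span candidate) {
    return candidate.getStart() >= chunk.getStart()
            && candidate.getEnd() <= chunk.getEnd();
}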