use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.
the class SmartcnTokenizerEngine method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates a bug in the used EnhancementJobManager implementation. "
                + "Please report this on dev@stanbol.apache.org or create a "
                + "JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no sentences yet ... use this engine to first detect the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            // the TokenStream contract requires reset() before incrementToken()
            sentences.reset();
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                    + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from "
                + "CharSequenceReader of AnalysedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
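For context, the engine above follows the standard Lucene TokenStream consumer contract: request attributes, reset(), loop over incrementToken(), then end() and close(). Below is a minimal, self-contained sketch of that contract, assuming a Lucene version where Tokenizer offers a no-arg constructor plus setReader() (5.x and later; on 4.x the Reader is passed to the constructor instead):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamContractDemo {

    public static void main(String[] args) throws IOException {
        StandardTokenizer stream = new StandardTokenizer();
        stream.setReader(new StringReader("Stanbol enhances content items."));
        // attributes must be requested before consuming the stream
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        try {
            // the TokenStream contract: reset() before the first incrementToken()
            stream.reset();
            while (stream.incrementToken()) {
                // the start/end offsets are what AnalysedText#addToken(start, end) expects
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            stream.end();
        } finally {
            stream.close();
        }
    }
}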
use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.
the class SentimentSummarizationEngine method summarizeSentence.
/**
 * @param sentimentTokens the sentiments detected for the tokens of the sentence
 * @param negations negation tokens, indexed by token position
 * @param nounsAndPronouns nouns and pronouns, indexed by token position
 * @param verbs verbs, indexed by token position
 * @param conjunctions conjunctions, indexed by token position
 * @param sectionBorders tokens marking section borders, indexed by token position
 */
private List<SentimentPhrase> summarizeSentence(List<Sentiment> sentimentTokens,
        NavigableMap<Integer, Token> negations, NavigableMap<Integer, Token> nounsAndPronouns,
        NavigableMap<Integer, Token> verbs, NavigableMap<Integer, Token> conjunctions,
        NavigableMap<Integer, Token> sectionBorders) {
    List<Sentiment> processedSentiments = new ArrayList<Sentiment>();
    Integer[] searchSpan = new Integer[] { -1, -1 };
    for (int i = 0; i < sentimentTokens.size(); i++) {
        Integer index = Integer.valueOf(i);
        Sentiment sentiment = sentimentTokens.get(i);
        if (sentiment != null) {
            // check for a new section
            if (index.compareTo(searchSpan[1]) > 0) {
                searchSpan[0] = sectionBorders.floorKey(index);
                if (searchSpan[0] == null) {
                    searchSpan[0] = Integer.valueOf(0);
                }
                searchSpan[1] = sectionBorders.ceilingKey(index);
                if (searchSpan[1] == null) {
                    searchSpan[1] = Integer.valueOf(sentimentTokens.size() - 1);
                }
            }
            // for negations use the negation context
            Integer[] context = getNegationContext(index, conjunctions, searchSpan);
            for (Token negationToken : negations.subMap(context[0], true, context[1], true).values()) {
                sentiment.addNegate(negationToken);
            }
            // for nouns use the sentiment context
            context = getSentimentContext(index, sentiment, verbs, conjunctions, nounsAndPronouns, searchSpan);
            for (Token word : nounsAndPronouns.subMap(context[0], true, context[1], true).values()) {
                sentiment.addAbout(word);
            }
            processedSentiments.add(sentiment);
        }
    }
    // now combine the processed sentiments to SentimentPhrases
    Collections.sort(processedSentiments, sentimentComparator);
    List<SentimentPhrase> sentimentPhrases = new ArrayList<SentimentPhrase>();
    SentimentPhrase phrase = null;
    for (Sentiment sentiment : processedSentiments) {
        if (phrase == null || sentiment.getStart() > phrase.getEndIndex()) {
            phrase = new SentimentPhrase(sentiment);
            sentimentPhrases.add(phrase);
        } else {
            phrase.addSentiment(sentiment);
        }
    }
    return sentimentPhrases;
}
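The context handling above leans entirely on java.util.NavigableMap: floorKey/ceilingKey locate the enclosing section borders, and subMap(from, true, to, true) collects the tokens inside that span. A minimal JDK-only sketch of the same lookup pattern, where the maps and indices are made-up stand-ins for the method's parameters:

import java.util.NavigableMap;
import java.util.TreeMap;

public class ContextWindowDemo {

    public static void main(String[] args) {
        // token index -> token text; stands in for NavigableMap<Integer, Token>
        NavigableMap<Integer, String> sectionBorders = new TreeMap<Integer, String>();
        sectionBorders.put(3, ",");
        sectionBorders.put(9, ";");

        int index = 5; // position of the current sentiment token
        // nearest border at or before the index (null -> start of sentence)
        Integer start = sectionBorders.floorKey(index);
        if (start == null) start = 0;
        // nearest border at or after the index (null -> end of sentence)
        Integer end = sectionBorders.ceilingKey(index);
        if (end == null) end = 12; // sentimentTokens.size() - 1 in the engine

        System.out.println("search span: [" + start + "," + end + "]"); // [3,9]

        // subMap(inclusive, inclusive) yields all entries inside the span,
        // exactly how negations and nouns are collected above
        NavigableMap<Integer, String> negations = new TreeMap<Integer, String>();
        negations.put(4, "not");
        negations.put(11, "never");
        System.out.println(negations.subMap(start, true, end, true).values()); // [not]
    }
}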
use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.
the class CeliAnalyzedTextLemmatizerEngineTest method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    // Add some Tokens with POS annotations to test the usage of
    // existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        // skip the test if the remote CELI service is not available
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    // now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info(" - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
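The test relies on a validateMorphFeatureProbability(..) helper that is not part of this excerpt. A hypothetical sketch of what such a helper could look like, under the assumption that it checks the probabilities of MorphoFeatures values whose POS tags match the given lexical category:

// Hypothetical sketch only; the real helper is not shown in this excerpt,
// so its exact semantics are an assumption.
private void validateMorphFeatureProbability(List<Value<MorphoFeatures>> mfs,
        LexicalCategory category, double expectedProb) {
    for (Value<MorphoFeatures> mf : mfs) {
        for (PosTag pos : mf.value().getPosList()) {
            if (pos.getCategories().contains(category)) {
                // probabilities forwarded from the POS annotation should survive lemmatization
                Assert.assertEquals(expectedProb, mf.probability(), 0.001);
            }
        }
    }
}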
use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.
the class CeliAnalyzedTextSentimentAnalysisEngineTest method testEngine.
@Test
public void testEngine() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
    Assert.assertEquals("it", EnhancementEngineHelper.getLanguage(ci));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        // skip the test if the remote CELI service is not available
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    // now validate the enhancements
    int sentimentExpressionCnt = 0;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
        if (sentimentExpressionsList != null && !sentimentExpressionsList.isEmpty()) {
            sentimentExpressionCnt++;
        }
    }
    Assert.assertTrue("2 sentiment expressions should be recognized in: " + text, sentimentExpressionCnt == 2);
}
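Only the count of annotated tokens is asserted above; the annotation values themselves are Doubles. A minimal sketch (not part of the test, reusing its at and log fields) of how those values could be read and aggregated after the engine has run, assuming sentiment values in the usual [-1,1] range:

// aggregate the Double values carried by SENTIMENT_ANNOTATION
double sentimentSum = 0;
int annotated = 0;
for (Iterator<Token> it = at.getTokens(); it.hasNext(); ) {
    Token token = it.next();
    Value<Double> sentiment = token.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
    if (sentiment != null) {
        sentimentSum += sentiment.value(); // assumed range: [-1,1]
        annotated++;
    }
}
if (annotated > 0) {
    log.info("average sentiment: {}", sentimentSum / annotated);
}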
use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.
the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer"
                + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/"
                + "response to the CELI lemmatizer service!", e);
    }
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            // ignore terms without readings
            continue;
        }
        // Add the LexicalEntry as Token to the text. NOTE that if a Token
        // with the same start/end positions already exists, this method
        // returns the existing instance.
        Token token = at.addToken(term.getFrom(), term.getTo());
        // Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        // do not override with a lower probability
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            // add the readings (MorphoFeatures)
            if (mf != null) {
                // use the POS tags of the morpho analysis and compare them
                // with the existing POS tags
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                // add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf,
                        posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
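The NOTE in the method above is worth illustrating: AnalysedText#addToken(start, end) is documented to return the already-registered Token when one exists for the same span, so annotations added by earlier engines in the chain are preserved. A minimal sketch, assuming an initialized AnalysedText at:

// same span -> the existing Token instance is returned, annotations intact
Token first = at.addToken(0, 5);
first.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("N", LexicalCategory.Noun), 0.8));
Token second = at.addToken(0, 5);
assert first == second;
assert !second.getAnnotations(NlpAnnotations.POS_ANNOTATION).isEmpty();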