Use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.
The class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe:
@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    // Add some Tokens with POS annotations to test the usage of
    // existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // deactivate test
        return;
    }
    // now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info(" - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
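The helper validateMorphFeatureProbability called above is not part of this excerpt. A minimal sketch of what such a helper could look like, assuming only the Value API shown on this page; the expectedCategory and expectedProb parameters mirror the call sites, and the checks are illustrative rather than the actual assertions of the Stanbol test:

private void validateMorphFeatureProbability(List<Value<MorphoFeatures>> mfs,
        LexicalCategory expectedCategory, double expectedProb) {
    // Illustrative only: the real helper presumably also compares the annotated
    // MorphoFeatures against expectedCategory and expectedProb, which are kept
    // here just to match the call sites above.
    Assert.assertFalse("no MorphoFeatures annotations present", mfs.isEmpty());
    for (Value<MorphoFeatures> mf : mfs) {
        Assert.assertNotNull(mf.value());
        double prob = mf.probability();
        // probabilities are either unknown or a value in (0..1]
        Assert.assertTrue("unexpected probability " + prob,
            prob == Value.UNKNOWN_PROBABILITY || (prob > 0d && prob <= 1d));
    }
}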
Use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.
The class CeliAnalyzedTextSentimentAnalysisEngineTest, method testEngine:
@Test
public void testEngine() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
    Assert.assertEquals("it", EnhancementEngineHelper.getLanguage(ci));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // deactivate test
        return;
    }
    // now validate the enhancements
    int sentimentExpressionCnt = 0;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
        if (sentimentExpressionsList != null && sentimentExpressionsList.size() > 0) {
            sentimentExpressionCnt++;
        }
    }
    Assert.assertTrue("2 sentiment expressions should be recognized in: " + text, sentimentExpressionCnt == 2);
}
Use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.
The class AnalyzedTextSerializerAndParserTest, method testSerialization:
@Test
public void testSerialization() throws IOException {
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
    serializer.serialize(analysedTextWithData, bout, null);
    // get the serialized String and check for some expected elements
    byte[] data = bout.toByteArray();
    String serialized = new String(data, Charset.forName("UTF-8"));
    log.info(serialized);
    Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
    Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
    // deserialize
    AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
    AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null, atFactory.createAnalysedText(textBlob.getValue()));
    Assert.assertEquals(analysedTextWithData, parsedAt);
    Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    while (origSpanIt.hasNext() && parsedSpanIt.hasNext()) {
        Span orig = origSpanIt.next();
        Span parsed = parsedSpanIt.next();
        Assert.assertEquals(orig, parsed);
        Set<String> origKeys = orig.getKeys();
        Set<String> parsedKeys = parsed.getKeys();
        Assert.assertEquals(origKeys, parsedKeys);
        for (String key : origKeys) {
            List<Value<?>> origValues = orig.getValues(key);
            List<Value<?>> parsedValues = parsed.getValues(key);
            Assert.assertEquals(origValues, parsedValues);
        }
    }
    Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans", origSpanIt.hasNext());
    Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans", parsedSpanIt.hasNext());
}
Use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.
The class AnalyzedTextSerializer, method writeSpan:
private ObjectNode writeSpan(Span span) throws IOException {
    log.trace("write {}", span);
    ObjectNode jSpan = mapper.createObjectNode();
    jSpan.put("type", span.getType().name());
    jSpan.put("start", span.getStart());
    jSpan.put("end", span.getEnd());
    for (String key : span.getKeys()) {
        List<Value<?>> values = span.getValues(key);
        if (values.size() == 1) {
            jSpan.put(key, writeValue(values.get(0)));
        } else {
            ArrayNode jValues = jSpan.putArray(key);
            for (Value<?> value : values) {
                jValues.add(writeValue(value));
            }
            // NOTE: putArray(key) already attached jValues to jSpan, so this put is redundant but harmless
            jSpan.put(key, jValues);
        }
    }
    log.trace(" ... {}", jSpan);
    return jSpan;
}
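For orientation, the JSON emitted for a single Token span by this method looks roughly like the sketch below. The values are made up; the "type", "start" and "end" keys come from writeSpan above, and the annotation key and "class" field match the assertions in the serialization test earlier on this page, while the remaining fields of each annotation object depend on how writeValue(..) serializes the concrete value class:

{
  "type" : "Token",
  "start" : 4,
  "end" : 15,
  "stanbol.enhancer.nlp.pos" : {
    "class" : "org.apache.stanbol.enhancer.nlp.pos.PosTag",
    ...
  }
}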
Use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.
The class SentimentEngine, method computeEnhancements:
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText analysedText = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    SentimentClassifier classifier = classifiers.get(language);
    if (classifier == null) {
        throw new IllegalStateException("Sentiment Classifier for language '" + language
            + "' not available. As this is also checked in canEnhance this may indicate a bug in the used EnhancementJobManager!");
    }
    // TODO: locking for AnalysedText not yet defined
    // ci.getLock().writeLock().lock();
    // try {
    Iterator<Token> tokens = analysedText.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        Set<LexicalCategory> cats = null;
        boolean process = false;
        if (!adjectivesOnly) {
            process = true;
            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if (posTag != null && (posTag.probability() == Value.UNKNOWN_PROBABILITY
                    || posTag.probability() >= (minPOSConfidence / 2.0))) {
                cats = classifier.getCategories(posTag.value());
            } else {
                // no POS tag or probability too low
                cats = Collections.emptySet();
            }
        } else {
            // check PosTags if we need to lookup this word
            Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
            boolean ignore = false;
            while (!ignore && !process && posTags.hasNext()) {
                Value<PosTag> value = posTags.next();
                PosTag tag = value.value();
                cats = classifier.getCategories(tag);
                boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun);
                ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence);
                process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence / 2.0));
            }
        }
        // else process all tokens ... no POS tag checking needed
        if (process) {
            String word = token.getSpan();
            double sentiment = 0.0;
            if (cats.isEmpty()) {
                sentiment = classifier.classifyWord(null, word);
            } else {
                // in case of multiple lexical categories we build the average
                // over the non-zero sentiments for the word
                int catSentNum = 0;
                for (LexicalCategory cat : cats) {
                    double catSent = classifier.classifyWord(cat, word);
                    if (catSent != 0.0) {
                        catSentNum++;
                        sentiment = sentiment + catSent;
                    }
                }
                if (catSentNum > 0) {
                    sentiment = sentiment / (double) catSentNum;
                }
            }
            if (sentiment != 0.0) {
                token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
            }
            // else do not set sentiments with 0.0
        }
        // else do not process
    }
    // } finally {
    //     ci.getLock().writeLock().unlock();
    // }
}
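After computeEnhancements has run, the sentiment scores can be read back from the token annotations in the same way the sentiment test above does. A minimal sketch, assuming at is the AnalysedText of the processed ContentItem and log is the engine's logger (variable names are illustrative):

// Read back the sentiment annotations written by the engine above.
Iterator<Token> tokens = at.getTokens();
while (tokens.hasNext()) {
    Token token = tokens.next();
    // Value.value() holds the sentiment score added under SENTIMENT_ANNOTATION
    for (Value<Double> sentiment : token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION)) {
        log.info("token '{}' has sentiment {}", token.getSpan(), sentiment.value());
    }
}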