Search in sources :

Example 1 with Value

use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.

From the class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe:

@Test
public void testEngineDe() throws IOException, EngineException {
    // Build a ContentItem around the German sample text and attach an AnalysedText.
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    // Declare the content language so the engine accepts the item.
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    // Pre-annotate three tokens with POS tags so the lemmatizer can reuse
    // existing POS information instead of tagging itself.
    Token verbToken = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token adjToken = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    adjToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token nounToken = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        // A remote-service outage deactivates (skips) this test.
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    // Validate that each pre-annotated token received MorphoFeatures with a lemma.
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    Iterator<Token> tokens = at.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        String span = token.getSpan();
        if (de_verb.equals(span)) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(span)) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(span)) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        // Every MorphoFeatures annotation must carry a lemma.
        for (Value<MorphoFeatures> mf : mfs) {
            log.info("  - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
Also used : PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 2 with Value

use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.

From the class CeliAnalyzedTextSentimentAnalysisEngineTest, method testEngine:

@Test
public void testEngine() throws IOException, EngineException {
    // Build a ContentItem around the Italian sample text and attach an AnalysedText.
    ContentItem ci = ciFactory.createContentItem(new StringSource(text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    // Declare the content language so the engine accepts the item.
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
    Assert.assertEquals("it", EnhancementEngineHelper.getLanguage(ci));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        // A remote-service outage deactivates (skips) this test.
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    // Count the tokens that received a sentiment annotation.
    int sentimentExpressionCnt = 0;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
        if (sentimentExpressionsList != null && !sentimentExpressionsList.isEmpty()) {
            sentimentExpressionCnt++;
        }
    }
    // assertEquals reports the actual count on failure (assertTrue with '== 2' did not)
    Assert.assertEquals("2 sentiment expressions should be recognized in: " + text, 2, sentimentExpressionCnt);
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) Token(org.apache.stanbol.enhancer.nlp.model.Token) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) CeliAnalyzedTextLemmatizerEngineTest(org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliAnalyzedTextLemmatizerEngineTest) Test(org.junit.Test)

Example 3 with Value

use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.

From the class AnalyzedTextSerializerAndParserTest, method testSerialization:

@Test
public void testSerialization() throws IOException {
    // Serialize the pre-built AnalysedText fixture to a UTF-8 JSON string.
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
    serializer.serialize(analysedTextWithData, bout, null);
    byte[] data = bout.toByteArray();
    String serialized = new String(data, Charset.forName("UTF-8"));
    log.info(serialized);
    // Spot-check that the expected structural elements are present in the JSON.
    Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
    Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
    Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
    Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
    Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
    // Parse the bytes back and verify the round trip reproduces the original.
    AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
    AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null, atFactory.createAnalysedText(textBlob.getValue()));
    Assert.assertEquals(analysedTextWithData, parsedAt);
    // Walk both span trees in parallel: same spans, same keys, same values.
    Iterator<Span> expectedSpans = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    Iterator<Span> actualSpans = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
    while (expectedSpans.hasNext() && actualSpans.hasNext()) {
        Span expected = expectedSpans.next();
        Span actual = actualSpans.next();
        Assert.assertEquals(expected, actual);
        Set<String> expectedKeys = expected.getKeys();
        Set<String> actualKeys = actual.getKeys();
        Assert.assertEquals(expectedKeys, actualKeys);
        for (String key : expectedKeys) {
            Assert.assertEquals(expected.getValues(key), actual.getValues(key));
        }
    }
    // Neither side may have spans the other lacks.
    Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans", expectedSpans.hasNext());
    Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans", actualSpans.hasNext());
}
Also used : SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Span(org.apache.stanbol.enhancer.nlp.model.Span) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) ByteArrayInputStream(java.io.ByteArrayInputStream) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) Test(org.junit.Test)

Example 4 with Value

use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.

From the class AnalyzedTextSerializer, method writeSpan:

/**
 * Serializes a single {@link Span} (type, start/end offsets and all its
 * annotation values) to a Jackson {@link ObjectNode}.
 *
 * @param span the span to serialize
 * @return the JSON object representing the span
 * @throws IOException if writing an annotation value fails
 */
private ObjectNode writeSpan(Span span) throws IOException {
    // fixed typo in trace message ("wirte" -> "write")
    log.trace("write {}", span);
    ObjectNode jSpan = mapper.createObjectNode();
    jSpan.put("type", span.getType().name());
    jSpan.put("start", span.getStart());
    jSpan.put("end", span.getEnd());
    for (String key : span.getKeys()) {
        List<Value<?>> values = span.getValues(key);
        if (values.size() == 1) {
            // single value: write it directly under the key
            jSpan.put(key, writeValue(values.get(0)));
        } else {
            // multiple values: putArray(key) already attaches the array to
            // jSpan under this key, so no additional put(key, ...) is needed
            ArrayNode jValues = jSpan.putArray(key);
            for (Value<?> value : values) {
                jValues.add(writeValue(value));
            }
        }
    }
    log.trace(" ... {}", jSpan);
    return jSpan;
}
Also used : ObjectNode(org.codehaus.jackson.node.ObjectNode) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value) ArrayNode(org.codehaus.jackson.node.ArrayNode)

Example 5 with Value

use of org.apache.stanbol.enhancer.nlp.model.annotation.Value in project stanbol by apache.

From the class SentimentEngine, method computeEnhancements:

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText analysedText = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    SentimentClassifier classifier = classifiers.get(language);
    if (classifier == null) {
        throw new IllegalStateException("Sentiment Classifier for language '" + language + "' not available. As this is also checked in " + " canEnhance this may indicate an Bug in the used " + "EnhancementJobManager!");
    }
    // TODO: locking for AnalysedText not yet defined
    // ci.getLock().writeLock().lock();
    // try {
    Iterator<Token> tokens = analysedText.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        Set<LexicalCategory> cats = null;
        boolean process = false;
        if (!adjectivesOnly) {
            process = true;
            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            // BUGFIX: the original condition was
            //   posTag != null && p == UNKNOWN || p >= threshold
            // which parses as (posTag != null && p == UNKNOWN) || p >= threshold
            // and dereferences posTag even when it is null -> NPE. The null
            // check must guard BOTH probability comparisons.
            if (posTag != null && (posTag.probability() == Value.UNKNOWN_PROBABILITY || posTag.probability() >= (minPOSConfidence / 2.0))) {
                cats = classifier.getCategories(posTag.value());
            } else {
                // no POS tags or probability to low
                cats = Collections.emptySet();
            }
        } else {
            // check PosTags if we need to lookup this word
            Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
            boolean ignore = false;
            while (!ignore && !process && posTags.hasNext()) {
                Value<PosTag> value = posTags.next();
                PosTag tag = value.value();
                cats = classifier.getCategories(tag);
                boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun);
                // ignore: confidently NOT an adjective/noun; process: confidently IS one
                ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence);
                process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence / 2.0));
            }
        }
        // else process all tokens ... no POS tag checking needed
        if (process) {
            String word = token.getSpan();
            double sentiment = 0.0;
            if (cats.isEmpty()) {
                sentiment = classifier.classifyWord(null, word);
            } else {
                // in case of multiple Lexical Cats
                // we build the average over NOT NULL sentiments for the word
                int catSentNum = 0;
                for (LexicalCategory cat : cats) {
                    double catSent = classifier.classifyWord(cat, word);
                    if (catSent != 0.0) {
                        catSentNum++;
                        sentiment = sentiment + catSent;
                    }
                }
                if (catSentNum > 0) {
                    sentiment = sentiment / (double) catSentNum;
                }
            }
            if (sentiment != 0.0) {
                token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
            }
        // else do not set sentiments with 0.0
        }
    // else do not process
    }
// } finally {
// ci.getLock().writeLock().unlock();
// }
}
Also used : Token(org.apache.stanbol.enhancer.nlp.model.Token) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) SentimentClassifier(org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value)

Aggregations

Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value)6 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 Token (org.apache.stanbol.enhancer.nlp.model.Token)3 Test (org.junit.Test)3 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 Span (org.apache.stanbol.enhancer.nlp.model.Span)2 SpanTypeEnum (org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum)2 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)2 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)2 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)2 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 CeliAnalyzedTextLemmatizerEngineTest (org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliAnalyzedTextLemmatizerEngineTest)1 SentimentClassifier (org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier)1 MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures)1 LexicalCategory (org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)1 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)1 ArrayNode (org.codehaus.jackson.node.ArrayNode)1