Example 11 with Token

Use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

The class CeliAnalyzedTextSentimentAnalysisEngineTest, method testEngine.

@Test
public void testEngine() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
    Assert.assertEquals("it", EnhancementEngineHelper.getLanguage(ci));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    //compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        //deactivate test
        return;
    }
    //now validate the enhancements
    int sentimentExpressionCnt = 0;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
        if (sentimentExpressionsList != null && !sentimentExpressionsList.isEmpty()) {
            sentimentExpressionCnt++;
        }
    }
    Assert.assertTrue("2 sentiment expressions should be recognized in: " + text, sentimentExpressionCnt == 2);
}
Also used: AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value), Token (org.apache.stanbol.enhancer.nlp.model.Token), StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem), CeliAnalyzedTextLemmatizerEngineTest (org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliAnalyzedTextLemmatizerEngineTest), Test (org.junit.Test)
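
The token-iteration pattern above recurs throughout these tests. A minimal sketch that extracts it into a reusable helper (countAnnotatedTokens is an illustrative name, not part of the Stanbol API; Annotation is assumed to be org.apache.stanbol.enhancer.nlp.model.annotation.Annotation):

static int countAnnotatedTokens(AnalysedText at, Annotation<Double> annotation) {
    int count = 0;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        //guard against both null and empty results, as the test above does
        List<Value<Double>> values = tokens.next().getAnnotations(annotation);
        if (values != null && !values.isEmpty()) {
            count++;
        }
    }
    return count;
}

With such a helper the final assertion reduces to Assert.assertEquals(2, countAnnotatedTokens(at, NlpAnnotations.SENTIMENT_ANNOTATION)).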

Example 12 with Token

Use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

The class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.

@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    //Add some Tokens with POS annotations to test the usage of
    //existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    //compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        //deactivate test
        return;
    }
    //now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info("  - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
Also used: PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), Token (org.apache.stanbol.enhancer.nlp.model.Token), AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag), Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value), StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures), ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem), Test (org.junit.Test)
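
The start and end arguments of addToken(..) are character offsets into the analysed text, so for a fixture they can be derived from the text itself. A minimal sketch of that offset bookkeeping (assuming the word occurs exactly once in the span; the variable names and the 0.98 probability are illustrative):

String span = at.getSpan();
int start = span.indexOf("verbrachten");
Token token = at.addToken(start, start + "verbrachten".length());
token.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("V", LexicalCategory.Verb), 0.98));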

Example 13 with Token

Use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

The class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
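    //NOTE: the "Langauge" spelling below matches the upstream NlpEngineHelper method name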
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as a Token to the text. NOTE that if a
        //Token with the same start/end positions already exists, this
        //method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with lower prob
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare it
                //with existing POS tags.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Also used: EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), Token (org.apache.stanbol.enhancer.nlp.model.Token), IOException (java.io.IOException), LexicalCategory (org.apache.stanbol.enhancer.nlp.pos.LexicalCategory), NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText), AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag), SOAPException (javax.xml.soap.SOAPException), MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures), CeliMorphoFeatures (org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures), EnumMap (java.util.EnumMap)
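
The EnumMap built above records, per lexical category, the probability of the first matching POS annotation; each morphological reading is then assigned the best probability among its own categories. A standalone sketch of that lookup step (bestCategoryProbability is a hypothetical helper, not part of the engine):

static double bestCategoryProbability(Map<LexicalCategory, Double> tokenLexCats, Set<LexicalCategory> readingCats) {
    double best = -1;
    for (LexicalCategory cat : readingCats) {
        Double prob = tokenLexCats.get(cat);
        if (prob != null && prob > best) {
            best = prob;
        }
    }
    //fall back to the unknown marker when no category matches
    return best < 0 ? Value.UNKNOWN_PROBABILITY : best;
}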

Example 14 with Token

Use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

The class TestKuromojiNlpEngine, method testEngine.

@Test
public void testEngine() throws EngineException {
    LiteralFactory lf = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    //assert the results
    Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(Properties.DC_CREATOR, lf.createTypedLiteral(engine.getClass().getName()));
    expected.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations(contentItem.getMetadata(), text, expected));
    AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(at);
    List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());
    //TODO: values in the following arrays are based on the first run of the
    // engine. So this is only to detect changes in results. It cannot validate
    // that the tokenization and NER detections are correct - sorry I do not
    // speak Japanese ...
    int[] expectedChunks = new int[] { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = new int[] { 25, 25, 25, 24, 33, 17, 32 };
    int sentIndex = 0;
    for (Sentence sent : sentences) {
        List<Chunk> sentenceNer = AnalysedTextUtils.asList(sent.getChunks());
        Assert.assertEquals(expectedChunks[sentIndex], sentenceNer.size());
        for (Chunk chunk : sentenceNer) {
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(nerValue);
            Assert.assertNotNull(nerValue.value().getType());
        }
        List<Token> tokens = AnalysedTextUtils.asList(sent.getTokens());
        Assert.assertEquals(expectedTokens[sentIndex], tokens.size());
        for (Token token : tokens) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(posValue);
        }
        sentIndex++;
    }
}
Also used: IRI (org.apache.clerezza.commons.rdf.IRI), NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag), HashMap (java.util.HashMap), RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm), Token (org.apache.stanbol.enhancer.nlp.model.Token), Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk), LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory), AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag), Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence), Test (org.junit.Test)
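
The nested sentence/chunk/token walk above doubles as a handy debugging aid. A hedged sketch that extracts it into a small helper (logSentenceStats is illustrative, not part of the Stanbol API; an org.slf4j.Logger is assumed):

static void logSentenceStats(AnalysedText at, Logger log) {
    for (Iterator<Sentence> sentences = at.getSentences(); sentences.hasNext(); ) {
        Sentence sentence = sentences.next();
        int chunks = AnalysedTextUtils.asList(sentence.getChunks()).size();
        int tokens = AnalysedTextUtils.asList(sentence.getTokens()).size();
        log.info("Sentence [{},{}): {} chunks, {} tokens",
                sentence.getStart(), sentence.getEnd(), chunks, tokens);
    }
}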

Example 15 with Token

Use of org.apache.stanbol.enhancer.nlp.model.Token in project stanbol by apache.

The class CorefFeatureSupportTest, method initCorefAnnotations.

private static void initCorefAnnotations() {
    Sentence sentence1 = at.addSentence(0, sentenceText1.indexOf(".") + 1);
    Token obama = sentence1.addToken(0, "Obama".length());
    Sentence sentence2 = at.addSentence(sentenceText1.indexOf(".") + 2, sentenceText2.indexOf(".") + 1);
    int heStartIdx = sentence2.getSpan().indexOf("He");
    Token he = sentence2.addToken(heStartIdx, heStartIdx + "He".length());
    Set<Span> obamaMentions = new HashSet<Span>();
    obamaMentions.add(he);
    obama.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(true, obamaMentions)));
    Set<Span> heMentions = new HashSet<Span>();
    heMentions.add(obama);
    he.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(false, heMentions)));
}
Also used: CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature), Token (org.apache.stanbol.enhancer.nlp.model.Token), Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence), Span (org.apache.stanbol.enhancer.nlp.model.Span), HashSet (java.util.HashSet)
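
A minimal sketch of reading the two annotations back, mirroring the setup above (it assumes CorefFeature exposes isRepresentative() and getMentions() accessors matching its constructor arguments):

Value<CorefFeature> corefValue = obama.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
CorefFeature coref = corefValue.value();
//"Obama" was registered as the representative mention of the chain
boolean representative = coref.isRepresentative();
//the mention set contains the "He" token added above
Set<Span> mentions = coref.getMentions();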

Aggregations

Token (org.apache.stanbol.enhancer.nlp.model.Token): 23
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 13
PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag): 12
Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence): 9
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 8
ArrayList (java.util.ArrayList): 7
Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk): 7
Section (org.apache.stanbol.enhancer.nlp.model.Section): 5
NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag): 5
NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText): 5
IOException (java.io.IOException): 4
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 4
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 4
NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText): 4
StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource): 4
IRI (org.apache.clerezza.commons.rdf.IRI): 3
Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value): 3
MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures): 3
PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag): 3
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 3