Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    // Build a ContentItem for the German sample text and attach an AnalysedText layer.
    ContentItem contentItem = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(contentItem);
    AnalysedText analysedText = atFactory.createAnalysedText(contentItem, contentItem.getBlob());
    Assert.assertNotNull(analysedText);
    // Declare the language in the metadata so the engine can detect it.
    contentItem.getMetadata().add(new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(contentItem));
    // Pre-annotate three tokens with POS tags to test that the lemmatizer
    // reuses existing POS annotations.
    Token verbToken = analysedText.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token adjectiveToken = analysedText.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    adjectiveToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token nounToken = analysedText.addToken(de_nounStart, de_nounStart + de_noun.length());
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    // Run the engine; skip the test gracefully when the remote CELI service is unavailable.
    try {
        engine.computeEnhancements(contentItem);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        //deactivate test
        return;
    }
    // Validate that each pre-annotated token received MorphoFeatures with a lemma.
    boolean verbFound = false;
    boolean adjectiveFound = false;
    boolean nounFound = false;
    Iterator<Token> tokenIt = analysedText.getTokens();
    while (tokenIt.hasNext()) {
        Token token = tokenIt.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> morphoValues = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            verbFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            adjectiveFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            nounFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> morphoValue : morphoValues) {
            log.info(" - {}", morphoValue);
            Assert.assertNotNull(morphoValue.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", verbFound);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", adjectiveFound);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", nounFound);
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Required pre-conditions: an AnalysedText layer and a configured language.
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        // fixed typo in the error message ("wile" -> "while")
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exist this
        //Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Collect the lexical categories of the existing POS annotations of
        //THIS token. NOTE: the map is (re)created per term so that categories
        //and probabilities of previously processed tokens do not leak into
        //the probability lookup for the current one.
        Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with lower prob (annotations are
                        //sorted by descending probability)
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare it
                //with existing POS tags: take the highest probability of a
                //matching lexical category as the annotation probability.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability (or UNKNOWN
                //if no existing POS annotation matched)
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class TestKuromojiNlpEngine, method testEngine.
@Test
public void testEngine() throws EngineException {
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    // Validate the TextAnnotations written to the ContentItem metadata.
    Map<IRI, RDFTerm> expectedProperties = new HashMap<IRI, RDFTerm>();
    expectedProperties.put(Properties.DC_CREATOR, literalFactory.createTypedLiteral(engine.getClass().getName()));
    expectedProperties.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations(contentItem.getMetadata(), text, expectedProperties));
    // Validate the AnalysedText content part created by the engine.
    AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(at);
    List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());
    //TODO: values in the following arrays are based on the first run of the
    // engine. So this is only to detect changes in results. It can not validate
    // that the tokenization and NER detections are correct - sorry I do not
    // speak Japanese ...
    int[] expectedChunks = new int[] { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = new int[] { 25, 25, 25, 24, 33, 17, 32 };
    for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
        Sentence sentence = sentences.get(sentIndex);
        // Every NER chunk must carry a typed NER annotation.
        List<Chunk> nerChunks = AnalysedTextUtils.asList(sentence.getChunks());
        Assert.assertEquals(expectedChunks[sentIndex], nerChunks.size());
        for (Chunk chunk : nerChunks) {
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(nerValue);
            Assert.assertNotNull(nerValue.value().getType());
        }
        // Every token must carry a POS annotation.
        List<Token> sentenceTokens = AnalysedTextUtils.asList(sentence.getTokens());
        Assert.assertEquals(expectedTokens[sentIndex], sentenceTokens.size());
        for (Token token : sentenceTokens) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(posValue);
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class NounPhraseFilterer, method filter.
/**
 * Removes from the given list all noun phrases that either lack a
 * language specific determiner (looked up in the configured determiner
 * set for the given language) or contain fewer noun/adjective tokens
 * than the configured minimum. TODO: should this be configurable so that
 * single word noun phrases can also be included?
 *
 * @param nounPhrases the noun phrases to filter in place
 * @param language the language used to look up the determiner set
 */
public void filter(List<NounPhrase> nounPhrases, String language) {
    Set<String> determiners = withinTextRefDeterminers.get(language);
    Iterator<NounPhrase> phraseIt = nounPhrases.iterator();
    while (phraseIt.hasNext()) {
        NounPhrase phrase = phraseIt.next();
        boolean determinerFound = false;
        short posCount = 0;
        for (Span token : phrase.getTokens()) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if (posValue == null) {
                continue; // token without POS annotation is ignored
            }
            PosTag posTag = posValue.value();
            // count nouns and adjectives towards the minimum POS number
            if (posTag.hasCategory(LexicalCategory.Noun) || posTag.hasCategory(LexicalCategory.Adjective)) {
                posCount++;
            }
            // a determiner only counts if its (lower cased) span is in the
            // language specific determiner set
            if (!determinerFound && posTag.hasPos(Pos.Determiner) && determiners.contains(token.getSpan().toLowerCase())) {
                determinerFound = true;
            }
        }
        if (!determinerFound || posCount < MIN_POS_NUMBER) {
            phraseIt.remove();
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class NIFHelper, method writePos.
/**
 * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the parsed
 * RDF graph by using the parsed segmentUri as subject. If the annotated
 * element has no POS annotation nothing is written.
 * @param graph the graph to add the triples to
 * @param annotated the annotated element (e.g. a {@link Token})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PosTag> posAnnotation = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posAnnotation == null) {
        return; // no POS annotation present - nothing to write
    }
    PosTag tag = posAnnotation.value();
    if (tag.isMapped()) {
        // link the mapped OLiA Pos individuals and lexical categories
        for (Pos pos : tag.getPos()) {
            graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), pos.getUri()));
        }
        for (LexicalCategory category : tag.getCategories()) {
            graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), category.getUri()));
        }
    }
    // always write the raw tag string and the annotation confidence
    graph.add(new TripleImpl(segmentUri, SsoOntology.posTag.getUri(), lf.createTypedLiteral(tag.getTag())));
    graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, lf.createTypedLiteral(posAnnotation.probability())));
}
Aggregations