Search in sources:

Example 1 with LexicalCategory

use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.

the class SentimentEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText analysedText = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    SentimentClassifier classifier = classifiers.get(language);
    if (classifier == null) {
        throw new IllegalStateException("Sentiment Classifier for language '" + language
                + "' not available. As this is also checked in canEnhance "
                + "this may indicate a bug in the used EnhancementJobManager!");
    }
    //TODO: locking for AnalysedText not yet defined
    Iterator<Token> tokens = analysedText.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        Set<LexicalCategory> cats = null;
        boolean process = false;
        if (!adjectivesOnly) {
            //process all tokens that have a sufficiently confident POS tag
            process = true;
            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            //NOTE: the parentheses around the '||' are required - without them
            //a null posTag would be dereferenced by the right operand (NPE)
            if (posTag != null && (posTag.probability() == Value.UNKNOWN_PROBABILITY
                    || posTag.probability() >= (minPOSConfidence / 2.0))) {
                cats = classifier.getCategories(posTag.value());
            } else {
                //no POS tag or its probability is too low
                cats = Collections.emptySet();
            }
        } else {
            //check the PosTags to decide if we need to look up this word:
            //only Adjectives and Nouns are considered in adjectivesOnly mode
            Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
            boolean ignore = false;
            while (!ignore && !process && posTags.hasNext()) {
                Value<PosTag> value = posTags.next();
                PosTag tag = value.value();
                cats = classifier.getCategories(tag);
                boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun);
                //a confident non-matching tag lets us skip the token entirely;
                //a (half-)confident matching tag selects it for processing
                ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence);
                process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence / 2.0));
            }
        }
        if (process) {
            String word = token.getSpan();
            double sentiment = 0.0;
            if (cats.isEmpty()) {
                //no lexical category known - classify without one
                sentiment = classifier.classifyWord(null, word);
            } else {
                //in case of multiple lexical categories we build the average
                //over the non-zero sentiments for the word
                int catSentNum = 0;
                for (LexicalCategory cat : cats) {
                    double catSent = classifier.classifyWord(cat, word);
                    if (catSent != 0.0) {
                        catSentNum++;
                        sentiment = sentiment + catSent;
                    }
                }
                if (catSentNum > 0) {
                    sentiment = sentiment / (double) catSentNum;
                }
            }
            if (sentiment != 0.0) {
                token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
            }
        //else do not set sentiments with 0.0
        }
    // else do not process this token
    }
}
Also used : Token(org.apache.stanbol.enhancer.nlp.model.Token) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) SentimentClassifier(org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Value(org.apache.stanbol.enhancer.nlp.model.annotation.Value)

Example 2 with LexicalCategory

use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.

the class CeliLemmatizerEnhancementEngineTest method validateMorphoFeatureProperty.

/**
     * Validates the morphological feature properties expected on the parsed
     * TextAnnotation for the constant test term {@link #TERM}: the OLIA
     * lexical category ({@code rdf:type}), gender, number and the lemma form.
     * @param enhancements The graph with the enhancements
     * @param textAnnotation the TextAnnotation to check
     */
private void validateMorphoFeatureProperty(Graph enhancements, BlankNodeOrIRI textAnnotation) {
    //This test checks for known morpho features of a given input (constant TERM)
    Iterator<Triple> morphoFeatureIterator = enhancements.filter(textAnnotation, RDF_TYPE, null);
    assertTrue("No POS Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    while (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            LexicalCategory cat = LexicalCategory.valueOf(key);
            assertTrue("Part of Speech of " + TERM + " should be " + LexicalCategory.Noun, (cat == LexicalCategory.Noun));
        }
    }
    //gender
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_GENDER, null);
    assertTrue("No Gender Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            Gender cat = Gender.valueOf(key);
            assertTrue("Gender of " + TERM + " should be " + Gender.Feminine, (cat == Gender.Feminine));
        }
    }
    //number
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_NUMBER, null);
    assertTrue("No Number Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            NumberFeature cat = NumberFeature.valueOf(key);
            //message previously reported Gender.Feminine - it checks the number
            assertTrue("Number of " + TERM + " should be " + NumberFeature.Singular, (cat == NumberFeature.Singular));
        }
    }
    //lemma form
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm, null);
    assertTrue("No Lemma Form value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Lemma Forms value are expected of type Literal", morphoFeature instanceof Literal);
        assertFalse("Lemma forms MUST NOT be empty", ((Literal) morphoFeature).getLexicalForm().isEmpty());
        String feature = ((Literal) morphoFeature).getLexicalForm();
        assertTrue("Lemma of " + TERM + " should be " + TERM, (feature.equals(TERM)));
    }
}
Also used : Triple(org.apache.clerezza.commons.rdf.Triple) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) NumberFeature(org.apache.stanbol.enhancer.nlp.morpho.NumberFeature) Literal(org.apache.clerezza.commons.rdf.Literal) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Gender(org.apache.stanbol.enhancer.nlp.morpho.Gender) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)

Example 3 with LexicalCategory

use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.

the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.

/**
 * Sends the text of the {@link AnalysedText} to the CELI morphological
 * analysis service and adds the parsed readings as
 * {@link NlpAnnotations#MORPHO_ANNOTATION} values to the Tokens.
 * The probability of a reading is taken from the best matching POS
 * annotation already present on the Token.
 *
 * @throws EngineException if the call to the remote CELI service fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exist this
        //Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Collect the POS annotations of THIS token. NOTE: the map is created
        //per term - a single map reused over all terms would leak categories
        //(and probabilities) of earlier tokens into later ones.
        Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with lower prob
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare them
                //with the existing POS tags to find the best probability
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) SOAPException(javax.xml.soap.SOAPException) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) CeliMorphoFeatures(org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures) EnumMap(java.util.EnumMap)

Example 4 with LexicalCategory

use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.

the class Nif20Helper method writePos.

/**
     * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the parsed
     * RDF graph by using the parsed segmentUri as subject
     * @param graph the graph
     * @param annotated the annotated element (e.g. a {@link Token})
     * @param segmentUri the URI of the resource representing the parsed 
     * annotated element in the graph
     */
/**
     * Writes the {@link NlpAnnotations#POS_ANNOTATION} of the parsed annotated
     * element to the parsed RDF graph, using the {@link Nif20} vocabulary and
     * the parsed segmentUri as subject.
     * @param graph the graph to add the triples to
     * @param annotated the annotated element (e.g. a {@link Token})
     * @param segmentUri the URI of the resource representing the parsed 
     * annotated element in the graph
     */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PosTag> posAnno = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posAnno == null) {
        return; //no POS annotation present - nothing to write
    }
    PosTag tag = posAnno.value();
    if (tag.isMapped()) {
        //add the OLiA concepts the tag is mapped to
        for (Pos pos : tag.getPos()) {
            graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), pos.getUri()));
        }
        for (LexicalCategory cat : tag.getCategories()) {
            graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), cat.getUri()));
        }
    }
    //the plain string tag
    graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), lf.createTypedLiteral(tag.getTag())));
    //set the oliaConf (removes existing conf values, e.g. for a single word phrase)
    setOliaConf(graph, segmentUri, posAnno);
}
Also used : PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Pos(org.apache.stanbol.enhancer.nlp.pos.Pos) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)

Example 5 with LexicalCategory

use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.

the class PosTagSupport method serialize.

/**
 * Serializes the parsed {@link PosTag} to a JSON object holding the string
 * "tag", the ordinal(s) of the {@link Pos} values under "pos" and the
 * ordinal(s) of {@link LexicalCategory} values not already implied by a
 * Pos value under "lc". Single values are written as plain numbers,
 * multiple values as arrays.
 */
@Override
public ObjectNode serialize(ObjectMapper mapper, PosTag value) {
    ObjectNode node = mapper.createObjectNode();
    node.put("tag", value.getTag());
    int posCount = value.getPos().size();
    if (posCount == 1) {
        //single value -> plain number
        node.put("pos", value.getPos().iterator().next().ordinal());
    } else if (posCount > 0) {
        //multiple values -> array of ordinals
        ArrayNode posArray = mapper.createArrayNode();
        for (Pos pos : value.getPos()) {
            posArray.add(pos.ordinal());
        }
        node.put("pos", posArray);
    }
    if (!value.getCategories().isEmpty()) {
        //only write categories that are not already covered by a Pos element
        EnumSet<LexicalCategory> remaining = EnumSet.copyOf(value.getCategories());
        for (Pos pos : value.getPos()) {
            remaining.removeAll(pos.categories());
        }
        if (remaining.size() == 1) {
            node.put("lc", remaining.iterator().next().ordinal());
        } else if (!remaining.isEmpty()) {
            ArrayNode lcArray = mapper.createArrayNode();
            for (LexicalCategory lc : remaining) {
                lcArray.add(lc.ordinal());
            }
            node.put("lc", lcArray);
        }
    }
    return node;
}
Also used : ObjectNode(org.codehaus.jackson.node.ObjectNode) Pos(org.apache.stanbol.enhancer.nlp.pos.Pos) ArrayNode(org.codehaus.jackson.node.ArrayNode) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)

Aggregations

LexicalCategory (org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)8 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)5 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)3 Pos (org.apache.stanbol.enhancer.nlp.pos.Pos)3 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)2 Token (org.apache.stanbol.enhancer.nlp.model.Token)2 NlpEngineHelper.getAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText)2 IOException (java.io.IOException)1 EnumMap (java.util.EnumMap)1 Vector (java.util.Vector)1 SOAPException (javax.xml.soap.SOAPException)1 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 Literal (org.apache.clerezza.commons.rdf.Literal)1 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)1 Triple (org.apache.clerezza.commons.rdf.Triple)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 CeliMorphoFeatures (org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures)1 SentimentClassifier (org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier)1 Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value)1