use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.
the class SentimentEngine method computeEnhancements.
/**
 * Computes word level sentiment annotations for the tokens of the
 * {@link AnalysedText} of the supplied ContentItem. Results are stored as
 * {@code SENTIMENT_ANNOTATION} values directly on the processed tokens.
 * <p/>
 * Token selection depends on the {@code adjectivesOnly} setting:
 * <ul>
 * <li>if disabled, every token is classified. The POS annotation is only
 * used to select the lexical categories for the lookup, and only if its
 * probability is unknown or at least half of {@code minPOSConfidence};</li>
 * <li>if enabled, a token is classified only if one of its POS annotations
 * maps to {@link LexicalCategory#Adjective} or {@link LexicalCategory#Noun}
 * with an unknown probability or one of at least half of
 * {@code minPOSConfidence}. A POS annotation of any other category with an
 * unknown probability or one of at least {@code minPOSConfidence} causes
 * the token to be skipped.</li>
 * </ul>
 * If several lexical categories apply, the sentiment of the word is the
 * average over all categories with a sentiment other than {@code 0.0}.
 * Sentiments of {@code 0.0} are not written.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 * if the underlying process failed to work as
 * expected
 * @throws IllegalStateException if no sentiment classifier is registered
 * for the language of the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText analysedText = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    SentimentClassifier classifier = classifiers.get(language);
    if (classifier == null) {
        throw new IllegalStateException("Sentiment Classifier for language '" + language + "' not available. As this is also checked in " + " canEnhance this may indicate an Bug in the used " + "EnhancementJobManager!");
    }
    //TODO: locking for AnalysedText not yet defined. Once it is, the loop
    //      below should run while holding ci.getLock().writeLock().
    Iterator<Token> tokens = analysedText.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        Set<LexicalCategory> cats = null;
        boolean process = false;
        if (!adjectivesOnly) {
            //process all tokens; POS tags are only used to narrow the lookup
            process = true;
            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            //NOTE: the probability checks MUST be grouped behind the null
            //check. Without the parentheses '&&' binds tighter than '||'
            //and tokens without a POS annotation cause a NullPointerException
            if (posTag != null
                    && (posTag.probability() == Value.UNKNOWN_PROBABILITY
                            || posTag.probability() >= (minPOSConfidence / 2.0))) {
                cats = classifier.getCategories(posTag.value());
            } else {
                //no POS tags or probability to low
                cats = Collections.emptySet();
            }
        } else {
            //check PosTags if we need to lookup this word
            Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
            boolean ignore = false;
            while (!ignore && !process && posTags.hasNext()) {
                Value<PosTag> value = posTags.next();
                PosTag tag = value.value();
                cats = classifier.getCategories(tag);
                boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun);
                //a confident tag of an other category stops the search ...
                ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence);
                //... while a matching tag only needs half the confidence
                process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence / 2.0));
            }
            //NOTE: cats may still be null here (token without POS tags),
            //but in this case process is false and cats is never read
        }
        if (process) {
            String word = token.getSpan();
            double sentiment = 0.0;
            if (cats.isEmpty()) {
                //no category information: lookup the word without category
                sentiment = classifier.classifyWord(null, word);
            } else {
                //in case of multiple Lexical Cats
                //we build the average over NOT NULL sentiments for the word
                int catSentNum = 0;
                for (LexicalCategory cat : cats) {
                    double catSent = classifier.classifyWord(cat, word);
                    if (catSent != 0.0) {
                        catSentNum++;
                        sentiment = sentiment + catSent;
                    }
                }
                if (catSentNum > 0) {
                    sentiment = sentiment / (double) catSentNum;
                }
            }
            if (sentiment != 0.0) {
                token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
            }
            //else do not set sentiments with 0.0
        }
        // else do not process
    }
}
use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.
the class CeliLemmatizerEnhancementEngineTest method validateMorphoFeatureProperty.
/**
 * Validates the morphological information written for the parsed
 * TextAnnotation of the constant {@code TERM}. The following is checked:
 * <ul>
 * <li>[1..*] {@code RDF_TYPE} values that are {@link IRI}s. Values within
 * the {@code OLIA_NAMESPACE} are expected to be {@link LexicalCategory#Noun}</li>
 * <li>a {@link CeliMorphoFeatures#HAS_GENDER} value; if within the
 * {@code OLIA_NAMESPACE} it is expected to be {@link Gender#Feminine}</li>
 * <li>a {@link CeliMorphoFeatures#HAS_NUMBER} value; if within the
 * {@code OLIA_NAMESPACE} it is expected to be {@link NumberFeature#Singular}</li>
 * <li>a non empty {@link Literal} lemma form that equals {@code TERM}</li>
 * </ul>
 * @param enhancements The graph with the enhancements
 * @param textAnnotation the TextAnnotation to check
 */
private void validateMorphoFeatureProperty(Graph enhancements, BlankNodeOrIRI textAnnotation) {
    //This test checks for known morpho features of a given input (constant TERM)

    //(1) part of speech: all type values within the OLIA namespace
    Iterator<Triple> morphoFeatureIterator = enhancements.filter(textAnnotation, RDF_TYPE, null);
    assertTrue("No POS Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    while (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            LexicalCategory cat = LexicalCategory.valueOf(key);
            assertTrue("Part of Speech of " + TERM + " should be " + LexicalCategory.Noun, (cat == LexicalCategory.Noun));
        }
    }

    //(2) gender: only the first value is validated
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_GENDER, null);
    assertTrue("No Gender Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            Gender cat = Gender.valueOf(key);
            assertTrue("Gender of " + TERM + " should be " + Gender.Feminine, (cat == Gender.Feminine));
        }
    }

    //(3) number: only the first value is validated
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_NUMBER, null);
    assertTrue("No Number Morpho Feature value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Morpho Feature values are expected to be IRIs", morphoFeature instanceof IRI);
        String feature = ((IRI) morphoFeature).getUnicodeString();
        assertFalse("Morpho Feature MUST NOT be empty", feature.isEmpty());
        if (feature.startsWith(OLIA_NAMESPACE)) {
            String key = feature.substring(OLIA_NAMESPACE.length());
            NumberFeature cat = NumberFeature.valueOf(key);
            //the message previously reported Gender.Feminine as expected value
            assertTrue("Number of " + TERM + " should be " + NumberFeature.Singular, (cat == NumberFeature.Singular));
        }
    }

    //(4) lemma form: only the first value is validated
    morphoFeatureIterator = enhancements.filter(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm, null);
    //the message previously reported a missing Number Morpho Feature
    assertTrue("No Lemma Form value found for TextAnnotation " + textAnnotation + "!", morphoFeatureIterator.hasNext());
    if (morphoFeatureIterator.hasNext()) {
        RDFTerm morphoFeature = morphoFeatureIterator.next().getObject();
        assertTrue("Lemma Forms value are expected of type Literal", morphoFeature instanceof Literal);
        assertFalse("Lemma forms MUST NOT be empty", ((Literal) morphoFeature).getLexicalForm().isEmpty());
        String feature = ((Literal) morphoFeature).getLexicalForm();
        assertTrue("Lemma of " + TERM + " should be " + TERM, (feature.equals(TERM)));
    }
}
use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.
the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.
/**
 * Sends the text of the {@link AnalysedText} to the CELI morphological
 * analysis service and adds the returned readings as
 * {@link NlpAnnotations#MORPHO_ANNOTATION}s to the according {@link Token}s.
 * <p/>
 * For every returned term with at least one reading a Token is added (or
 * looked up) for the span of the term. The probability of each added
 * {@link MorphoFeatures} value is the highest probability recorded for a
 * lexical category that is shared by the reading and the existing POS
 * annotations of that Token, or {@link Value#UNKNOWN_PROBABILITY} if no
 * category is shared.
 *
 * @param ci the content item to process
 * @throws EngineException if calling the CELI service fails or the
 * request/response can not be encoded/decoded
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    //lexical category -> probability for the POS annotations of the
    //currently processed token. The instance is reused for all tokens.
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //reset the state of the previous token. Otherwise categories (and
        //probabilities) of earlier tokens would be used for this one and
        //would also prevent the values of this token from being recorded
        tokenLexCats.clear();
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exist this
        //Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    //do not override with lower prob
                    //NOTE(review): keeping the first value only yields the
                    //highest one if annotations are returned sorted by
                    //probability - confirm against the Annotated contract
                    if (!tokenLexCats.containsKey(cat)) {
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare it
                //with existing POS tags.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                //select the highest probability of all shared categories
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbabiliy
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.
the class Nif20Helper method writePos.
/**
 * Writes the {@link NlpAnnotations#POS_ANNOTATION} of the parsed annotated
 * element to the parsed RDF graph by using the {@link Nif20} vocabulary and
 * the parsed segmentUri as subject. Nothing is written if the element has
 * no POS annotation.
 * @param graph the graph
 * @param annotated the annotated element (e.g. a {@link Token})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    final Value<PosTag> posAnnotation = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posAnnotation == null) {
        //no POS annotation present ... nothing to write
        return;
    }
    final PosTag tag = posAnnotation.value();
    if (tag.isMapped()) {
        //both the Pos values and the LexicalCategories are written as
        //values of the same oliaCategory property
        for (Pos pos : tag.getPos()) {
            graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), pos.getUri()));
        }
        for (LexicalCategory category : tag.getCategories()) {
            graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), category.getUri()));
        }
    }
    //the string tag is written regardless of the tag being mapped or not
    graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), lf.createTypedLiteral(tag.getTag())));
    //set the oliaConf
    //NOTE(review): setOliaConf is expected to replace existing conf values
    //(e.g. for a single word phrase) - confirm against its implementation
    setOliaConf(graph, segmentUri, posAnnotation);
}
use of org.apache.stanbol.enhancer.nlp.pos.LexicalCategory in project stanbol by apache.
the class PosTagSupport method serialize.
/**
 * Serializes the parsed {@link PosTag} to a JSON object with the fields
 * <ul>
 * <li>{@code "tag"}: the string tag</li>
 * <li>{@code "pos"}: the ordinal(s) of the {@link Pos} values; omitted if
 * the tag has none</li>
 * <li>{@code "lc"}: the ordinal(s) of those {@link LexicalCategory} values
 * that are not already covered by one of the {@link Pos} values; omitted
 * if none remain</li>
 * </ul>
 * A single ordinal is written as number, multiple ones as array.
 * @param mapper used to create the JSON nodes
 * @param value the PosTag to serialize
 * @return the JSON object representing the parsed PosTag
 */
@Override
public ObjectNode serialize(ObjectMapper mapper, PosTag value) {
    final ObjectNode json = mapper.createObjectNode();
    json.put("tag", value.getTag());
    putOrdinals(mapper, json, "pos", value.getPos());
    if (value.getCategories().isEmpty()) {
        return json;
    }
    //we need only the categories not covered by Pos elements
    final EnumSet<LexicalCategory> uncovered = EnumSet.noneOf(LexicalCategory.class);
    uncovered.addAll(value.getCategories());
    for (Pos pos : value.getPos()) {
        uncovered.removeAll(pos.categories());
    }
    putOrdinals(mapper, json, "lc", uncovered);
    return json;
}

/**
 * Writes the ordinals of the parsed enum values to the field with the
 * parsed key: a single value as number, multiple values as array. The
 * field is not written at all if no values are parsed.
 */
private void putOrdinals(ObjectMapper mapper, ObjectNode target, String key,
        java.util.Collection<? extends Enum<?>> values) {
    if (values.size() == 1) {
        target.put(key, values.iterator().next().ordinal());
    } else if (!values.isEmpty()) {
        ArrayNode ordinals = mapper.createArrayNode();
        for (Enum<?> element : values) {
            ordinals.add(element.ordinal());
        }
        target.put(key, ordinals);
    }
}
Aggregations