use of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in project stanbol by apache.
the class CeliAnalyzedTextLemmatizerEngineTest method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(ci);
    AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
    Assert.assertNotNull(at);
    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
    // Add some Tokens with POS annotations to test the usage of
    // existing POS annotations by the lemmatizer
    Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbrachten.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token schonen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    schonen.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
    urlaub.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
    // compute the enhancements
    try {
        engine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // deactivate test
        return;
    }
    // now validate the enhancements
    boolean foundVerb = false;
    boolean foundAdjective = false;
    boolean foundNoun = false;
    for (Iterator<Token> tokens = at.getTokens(); tokens.hasNext(); ) {
        Token token = tokens.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            foundVerb = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            foundAdjective = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            foundNoun = !mfs.isEmpty();
            validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> mf : mfs) {
            log.info(" - {}", mf);
            Assert.assertNotNull(mf.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
}
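The validateMorphFeatureProbability(..) helper called above is not part of this excerpt. A minimal sketch of what such a check could look like, assuming the engine propagates the probability of the matching pre-set POS annotation to the MorphoFeatures value (the assertion tolerance and names are illustrative):

private static void validateMorphFeatureProbability(List<Value<MorphoFeatures>> mfs,
        LexicalCategory expectedCategory, double expectedProb) {
    for (Value<MorphoFeatures> mf : mfs) {
        // every reading is expected to carry a POS tag of the expected category
        boolean categoryFound = false;
        for (PosTag pos : mf.value().getPosList()) {
            categoryFound = categoryFound || pos.getCategories().contains(expectedCategory);
        }
        Assert.assertTrue("no POS tag of category " + expectedCategory, categoryFound);
        // the probability of the pre-existing POS annotation should be
        // propagated to the MorphoFeatures annotation (hypothetical check)
        Assert.assertEquals(expectedProb, mf.probability(), 0.001);
    }
}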
use of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in project stanbol by apache.
the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            // ignore terms without readings
            continue;
        }
        // Add the LexicalEntry as Token to the Text. NOTE that if a
        // Token with the same start/end positions already exists this
        // method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        // Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        // do not override with a lower probability
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            // add the readings (MorphoFeatures)
            if (mf != null) {
                // use the POS tags of the morpho analysis and compare them
                // with the existing POS tags
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                // add the morpho features with the posProbability
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
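Downstream of this engine, the lemma can be read back from the AnalysedText content part. A short consumer sketch using only the API calls already seen above (the printout format is illustrative):

// Sketch: read the lemmata written by the engine above from the AnalysedText.
for (Iterator<Token> it = at.getTokens(); it.hasNext(); ) {
    Token token = it.next();
    for (Value<MorphoFeatures> mf : token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION)) {
        // the probability is the one merged from matching POS categories, or
        // Value.UNKNOWN_PROBABILITY if no matching category was found
        System.out.println(token.getSpan() + " -> " + mf.value().getLemma()
                + " (p=" + mf.probability() + ")");
    }
}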
use of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in project stanbol by apache.
the class AnalyzedTextSerializerAndParserTest method setup.
@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    int sentence = text.indexOf('.') + 1;
    Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
    expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
    Token the = sent1.addToken(0, 3);
    expectedTokens.put(the, "The");
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token stanbol = sent1.addToken(4, 11);
    expectedTokens.put(stanbol, "Stanbol");
    stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    // use index to create Tokens
    int enhancerStart = sent1.getSpan().indexOf("enhancer");
    Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(enhancer, "enhancer");
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    MorphoFeatures morpho = new MorphoFeatures("enhance");
    morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
    morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
    morpho.addDefinitness(Definitness.Definite);
    morpho.addPerson(Person.First);
    morpho.addPos(new PosTag("PN", Pos.ProperNoun));
    morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
    morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morpho.addTense(new TenseTag("test-tense", Tense.Present));
    morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
    // create a chunk
    Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
    expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
    stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
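Given the class name, these fixtures presumably feed serialize/parse round-trip tests. A minimal sketch of such a round trip, assuming the AnalyzedTextSerializer and AnalyzedTextParser utilities of Stanbol's nlp.json module (the exact method signatures and the 'serializer'/'parser' instances are assumptions here):

// Sketch of a serialize/parse round trip over the fixture built above;
// signatures of serialize(..)/parse(..) are assumed, not confirmed by this excerpt.
ByteArrayOutputStream out = new ByteArrayOutputStream();
serializer.serialize(analysedTextWithData, out, Charset.forName("UTF-8"));
AnalysedText parsed = parser.parse(new ByteArrayInputStream(out.toByteArray()),
        Charset.forName("UTF-8"), textBlob);
Assert.assertNotNull(parsed);
// annotations such as the MorphoFeatures on 'enhancer' must survive the trip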
use of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in project stanbol by apache.
the class MorphoFeaturesSupport method parse.
@Override
public MorphoFeatures parse(ObjectNode jMorpho, AnalysedText at) {
    JsonNode jLemma = jMorpho.path("lemma");
    if (!jLemma.isTextual()) {
        throw new IllegalStateException("Field 'lemma' MUST provide a String value (parsed JSON: " + jMorpho + ")");
    }
    MorphoFeatures morpho = new MorphoFeatures(jLemma.asText());
    JsonNode node = jMorpho.path("case");
    if (node.isArray()) {
        ArrayNode jCases = (ArrayNode) node;
        for (int i = 0; i < jCases.size(); i++) {
            JsonNode member = jCases.get(i);
            if (member.isObject()) {
                ObjectNode jCase = (ObjectNode) member;
                JsonNode tag = jCase.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Case> type = JsonUtils.parseEnum(jCase, "type", Case.class);
                    if (type.isEmpty()) {
                        morpho.addCase(new CaseTag(tag.getTextValue()));
                    } else {
                        morpho.addCase(new CaseTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse CaseTag because 'tag' value is " + "missing or is not a String (json: " + jCase.toString() + ")");
                }
            } else {
                log.warn("Unable to parse CaseTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse CaseTags (Json Array expected as value for field 'case' but was " + node + ")");
    }
    if (jMorpho.has("definitness")) {
        for (Definitness d : JsonUtils.parseEnum(jMorpho, "definitness", Definitness.class)) {
            morpho.addDefinitness(d);
        }
    }
    node = jMorpho.path("gender");
    if (node.isArray()) {
        ArrayNode jGenders = (ArrayNode) node;
        for (int i = 0; i < jGenders.size(); i++) {
            JsonNode member = jGenders.get(i);
            if (member.isObject()) {
                ObjectNode jGender = (ObjectNode) member;
                JsonNode tag = jGender.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Gender> type = JsonUtils.parseEnum(jGender, "type", Gender.class);
                    if (type.isEmpty()) {
                        morpho.addGender(new GenderTag(tag.getTextValue()));
                    } else {
                        morpho.addGender(new GenderTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse GenderTag because 'tag' value is " + "missing or is not a String (json: " + jGender.toString() + ")");
                }
            } else {
                log.warn("Unable to parse GenderTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse GenderTag (Json Array expected as value for field 'gender' but was " + node + ")");
    }
    node = jMorpho.path("number");
    if (node.isArray()) {
        ArrayNode jNumbers = (ArrayNode) node;
        for (int i = 0; i < jNumbers.size(); i++) {
            JsonNode member = jNumbers.get(i);
            if (member.isObject()) {
                ObjectNode jNumber = (ObjectNode) member;
                JsonNode tag = jNumber.path("tag");
                if (tag.isTextual()) {
                    EnumSet<NumberFeature> type = JsonUtils.parseEnum(jNumber, "type", NumberFeature.class);
                    if (type.isEmpty()) {
                        morpho.addNumber(new NumberTag(tag.getTextValue()));
                    } else {
                        morpho.addNumber(new NumberTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse NumberTag because 'tag' value is " + "missing or is not a String (json: " + jNumber.toString() + ")");
                }
            } else {
                log.warn("Unable to parse NumberTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse NumberTag (Json Array expected as value for field 'number' but was " + node + ")");
    }
    if (jMorpho.has("person")) {
        for (Person p : JsonUtils.parseEnum(jMorpho, "person", Person.class)) {
            morpho.addPerson(p);
        }
    }
    node = jMorpho.path("pos");
    if (node.isArray()) {
        ArrayNode jPosTags = (ArrayNode) node;
        for (int i = 0; i < jPosTags.size(); i++) {
            JsonNode member = jPosTags.get(i);
            if (member.isObject()) {
                ObjectNode jPosTag = (ObjectNode) member;
                morpho.addPos(getPosTagParser().parse(jPosTag, at));
            } else {
                log.warn("Unable to parse PosTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse PosTag (Json Array expected as value for field 'pos' but was " + node + ")");
    }
    node = jMorpho.path("tense");
    if (node.isArray()) {
        ArrayNode jTenses = (ArrayNode) node;
        for (int i = 0; i < jTenses.size(); i++) {
            JsonNode member = jTenses.get(i);
            if (member.isObject()) {
                ObjectNode jTense = (ObjectNode) member;
                JsonNode tag = jTense.path("tag");
                if (tag.isTextual()) {
                    EnumSet<Tense> type = JsonUtils.parseEnum(jTense, "type", Tense.class);
                    if (type.isEmpty()) {
                        morpho.addTense(new TenseTag(tag.getTextValue()));
                    } else {
                        morpho.addTense(new TenseTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse TenseTag because 'tag' value is " + "missing or is not a String (json: " + jTense.toString() + ")");
                }
            } else {
                log.warn("Unable to parse TenseTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse TenseTag (Json Array expected as value for field 'tense' but was " + node + ")");
    }
    node = jMorpho.path("verb-mood");
    if (node.isArray()) {
        ArrayNode jVerbMoods = (ArrayNode) node;
        for (int i = 0; i < jVerbMoods.size(); i++) {
            JsonNode member = jVerbMoods.get(i);
            if (member.isObject()) {
                ObjectNode jVerbMood = (ObjectNode) member;
                JsonNode tag = jVerbMood.path("tag");
                if (tag.isTextual()) {
                    EnumSet<VerbMood> type = JsonUtils.parseEnum(jVerbMood, "type", VerbMood.class);
                    if (type.isEmpty()) {
                        morpho.addVerbForm(new VerbMoodTag(tag.getTextValue()));
                    } else {
                        morpho.addVerbForm(new VerbMoodTag(tag.getTextValue(), type.iterator().next()));
                    }
                } else {
                    log.warn("Unable to parse VerbMoodTag because 'tag' value is " + "missing or is not a String (json: " + jVerbMood.toString() + ")");
                }
            } else {
                log.warn("Unable to parse VerbMoodTag from " + member.toString());
            }
        }
    } else if (!node.isMissingNode()) {
        log.warn("Unable to parse VerbMoodTag (Json Array expected as value for field 'verb-mood' but was " + node + ")");
    }
    return morpho;
}
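Taken together, the branches above imply a JSON layout with a textual "lemma" field plus arrays of {tag, type} objects per morphological feature. A hedged sketch that builds such a node and feeds it to the parser; it uses the Jackson 1.x (org.codehaus.jackson) API implied by the getTextValue() calls above, and all tag/type values plus the 'support' and 'at' variables are invented for illustration:

import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ObjectNode;

// Build a jMorpho node in the shape parse(..) expects (values illustrative).
ObjectMapper mapper = new ObjectMapper();
ObjectNode jMorpho = mapper.createObjectNode();
jMorpho.put("lemma", "enhance");
ObjectNode jCase = jMorpho.putArray("case").addObject();
jCase.put("tag", "test-case-1");
jCase.put("type", "Comitative"); // read via JsonUtils.parseEnum(jCase, "type", Case.class)
ObjectNode jGender = jMorpho.putArray("gender").addObject();
jGender.put("tag", "test-gender");
jGender.put("type", "Masculine");
MorphoFeatures morpho = support.parse(jMorpho, at); // 'support'/'at' assumed in scope
assert "enhance".equals(morpho.getLemma());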
use of org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org mailing list or " + "create a JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context needs to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        // required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // Sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the previous token ended a sentence (or this is the first
                // token), so a new sentence starts at this offset
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotations are written later based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                // and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            // the inflection form and type are read but currently not mapped
            // to the MorphoFeatures
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                // if present add the morpho features to the Token
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        // and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalysedText content part", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    // finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
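The NerData helper collected in nerList above is not shown in this excerpt. A plausible minimal sketch, inferred purely from the accesses above (the tag, start, end and context fields and the NerData(nerTag, startOffset) constructor); the real class may differ:

// Hypothetical sketch of the NerData holder, inferred from the code above.
class NerData {
    final NerTag tag;  // NER tag (entity type) of the detected entity
    final int start;   // start offset of the entity within the text
    int end;           // end offset, extended while tokens continue the entity
    String context;    // span of the containing sentence, set once known
    NerData(NerTag tag, int start) {
        this.tag = tag;
        this.start = start;
        this.end = start;
    }
}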