Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by Apache.
The class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.
/**
 * Runs the lemmatizer engine on the German test text and checks that the
 * three tokens that were pre-annotated with POS tags (verb, adjective, noun)
 * receive MorphoFeatures annotations after the engine has run.
 * <p>
 * If the engine fails with an {@link EngineException} the exception is handed
 * to {@code RemoteServiceHelper.checkServiceUnavailable(..)}; when that call
 * returns normally the test ends without further validation.
 */
@Test
public void testEngineDe() throws IOException, EngineException {
    // --- fixture: content item + analysed text, language set to "de" ---
    final ContentItem contentItem = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(contentItem);
    final AnalysedText analysedText = atFactory.createAnalysedText(contentItem, contentItem.getBlob());
    Assert.assertNotNull(analysedText);
    contentItem.getMetadata().add(
        new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(contentItem));

    // --- pre-existing POS annotations the lemmatizer is expected to pick up ---
    final int verbEnd = de_verbStart + de_verb.length();
    final Token verbToken = analysedText.addToken(de_verbStart, verbEnd);
    final PosTag verbTag = new PosTag("V", LexicalCategory.Verb);
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(verbTag, de_verbProb));

    final int adjectiveEnd = de_adjectiveStart + de_adjective.length();
    final Token adjectiveToken = analysedText.addToken(de_adjectiveStart, adjectiveEnd);
    final PosTag adjectiveTag = new PosTag("ADJ", LexicalCategory.Adjective);
    adjectiveToken.addAnnotation(POS_ANNOTATION, Value.value(adjectiveTag, de_adjectiveProb));

    final int nounEnd = de_nounStart + de_noun.length();
    final Token nounToken = analysedText.addToken(de_nounStart, nounEnd);
    final PosTag nounTag = new PosTag("NC", LexicalCategory.Noun);
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(nounTag, de_nounProb));

    Assert.assertEquals("Can not enhance Test ContentItem",
        EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));

    // --- run the engine ---
    try {
        engine.computeEnhancements(contentItem);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        // reaching this point means the helper did not rethrow: skip validation
        return;
    }

    // --- validate: every MorphoFeatures value needs a lemma, and the three
    //     pre-annotated words must carry at least one MorphoFeatures value ---
    boolean verbSeen = false;
    boolean adjectiveSeen = false;
    boolean nounSeen = false;
    final Iterator<Token> tokenIt = analysedText.getTokens();
    while (tokenIt.hasNext()) {
        final Token current = tokenIt.next();
        log.info("Token: {}", current);
        final List<Value<MorphoFeatures>> features =
            current.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        final String span = current.getSpan();
        final boolean annotated = !features.isEmpty();
        if (de_verb.equals(span)) {
            verbSeen = annotated;
            validateMorphFeatureProbability(features, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(span)) {
            adjectiveSeen = annotated;
            validateMorphFeatureProbability(features, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(span)) {
            nounSeen = annotated;
            validateMorphFeatureProbability(features, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> feature : features) {
            log.info(" - {}", feature);
            Assert.assertNotNull(feature.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", verbSeen);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", adjectiveSeen);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", nounSeen);
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by Apache.
The class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.
/**
 * Sends the text of the content item to the CELI lemmatizer client and adds
 * one {@code MORPHO_ANNOTATION} per parsed reading to the token that spans
 * the returned lexical entry.
 * <p>
 * If the token already carries mapped POS annotations, the probability of a
 * matching lexical category is reused as probability of the MorphoFeatures
 * value (the highest one among the categories of the reading); otherwise
 * {@code Value.UNKNOWN_PROBABILITY} is used.
 *
 * @param ci the content item to process
 * @throws EngineException if the call to the lemmatizer client fails with an
 *         {@link IOException} or a {@code SOAPException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    // Lexical categories (with probability) of the POS annotations of the
    // token currently processed. The instance is reused for all tokens and
    // therefore MUST be reset for every token (see below).
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            // ignore terms without readings
            continue;
        }
        // Drop the categories collected for the previous token. Without this
        // the POS probabilities of earlier tokens would be applied to the
        // readings of later tokens that have no (or different) POS tags.
        tokenLexCats.clear();
        // Add the LexicalEntry as Token to the Text. NOTE that if a
        // Token with the same start/end positions already exist this
        // Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        // Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    // first annotation wins for a category
                    // NOTE(review): this only keeps the highest probability if
                    // getAnnotations(..) returns values ordered by descending
                    // probability - confirm against the Token API
                    if (!tokenLexCats.containsKey(cat)) {
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            // add the readings (MorphoFeatures); readings that can not be
            // parsed (null) are skipped
            if (mf != null) {
                // use the POS tags of the morpho analysis and compare them
                // with the existing POS tags of the token.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                // highest probability of any category shared with the token
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                // add the morpho features with the posProbability; a negative
                // value means that no matching category was found
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by Apache.
The class AnalyzedTextSerializerAndParserTest, method setup.
/**
 * Builds the shared test fixture: a content item for the test text, its
 * plain text blob and an AnalysedText holding one sentence, three tokens
 * (with POS, sentiment and morpho annotations) and one chunk (with NER and
 * phrase annotations). Every created span is also registered, together with
 * its expected text, in the corresponding {@code expected*} map.
 */
@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();

    // first sentence: everything up to and including the first '.'
    final int firstSentenceEnd = text.indexOf('.') + 1;
    final Sentence firstSentence = analysedTextWithData.addSentence(0, firstSentenceEnd);
    expectedSentences.put(firstSentence,
        "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");

    // token "The" (fixed offsets)
    final Token tokenThe = firstSentence.addToken(0, 3);
    expectedTokens.put(tokenThe, "The");
    final PosTag prepositionTag = new PosTag("PREP", Pos.Preposition);
    tokenThe.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(prepositionTag, 0.85));

    // token "Stanbol" (fixed offsets) with POS and sentiment
    final Token tokenStanbol = firstSentence.addToken(4, 11);
    expectedTokens.put(tokenStanbol, "Stanbol");
    final PosTag stanbolPos = new PosTag("PN", Pos.ProperNoun);
    tokenStanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(stanbolPos, 0.95));
    tokenStanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));

    // token "enhancer": offsets are looked up in the sentence text
    final String enhancerWord = "enhancer";
    final int enhancerOffset = firstSentence.getSpan().indexOf(enhancerWord);
    final Token tokenEnhancer = firstSentence.addToken(enhancerOffset, enhancerOffset + enhancerWord.length());
    expectedTokens.put(tokenEnhancer, "enhancer");
    // two POS annotations on the same token
    final PosTag enhancerProperNoun = new PosTag("PN", Pos.ProperNoun);
    tokenEnhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(enhancerProperNoun, 0.95));
    final PosTag enhancerNoun = new PosTag("N", LexicalCategory.Noun);
    tokenEnhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(enhancerNoun, 0.87));

    // morpho features using one value of every supported feature type
    final MorphoFeatures features = new MorphoFeatures("enhance");
    features.addCase(new CaseTag("test-case-1", Case.Comitative));
    features.addCase(new CaseTag("test-case-2", Case.Abessive));
    features.addDefinitness(Definitness.Definite);
    features.addPerson(Person.First);
    features.addPos(new PosTag("PN", Pos.ProperNoun));
    features.addGender(new GenderTag("test-gender", Gender.Masculine));
    features.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    features.addTense(new TenseTag("test-tense", Tense.Present));
    features.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    tokenEnhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(features));

    // chunk spanning "Stanbol enhancer"
    final int chunkStart = tokenStanbol.getStart();
    final int chunkEnd = tokenEnhancer.getEnd();
    final Chunk nounPhrase = analysedTextWithData.addChunk(chunkStart, chunkEnd);
    expectedChunks.put(nounPhrase, "Stanbol enhancer");
    final NerTag organisation = new NerTag("organization", DBPEDIA_ORGANISATION);
    nounPhrase.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(organisation));
    final PhraseTag phrase = new PhraseTag("NP", LexicalCategory.Noun);
    nounPhrase.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(phrase, 0.98));
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by Apache.
The class MorphoFeaturesSupport, method serialize.
/**
 * Converts the parsed MorphoFeatures into a JSON object.
 * <p>
 * The {@code "lemma"} field is always written. The fields {@code "case"},
 * {@code "gender"}, {@code "number"}, {@code "tense"} and {@code "verb-mood"}
 * are arrays of <code>{tag, type}</code> objects, {@code "pos"} is an array
 * built with the POS tag serializer, and {@code "definitness"} and
 * {@code "person"} are a plain string for a single value or an array of
 * strings for several values. Fields for empty lists are omitted.
 *
 * @param mapper the mapper used to create the JSON nodes
 * @param morpho the features to serialize
 * @return the JSON representation
 */
@Override
public ObjectNode serialize(ObjectMapper mapper, MorphoFeatures morpho) {
    final ObjectNode result = mapper.createObjectNode();
    result.put("lemma", morpho.getLemma());

    final List<CaseTag> cases = morpho.getCaseList();
    if (!cases.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (CaseTag entry : cases) {
            array.add(createTagNode(mapper, entry.getTag(), entry.getCase()));
        }
        result.put("case", array);
    }

    final List<Definitness> definitnessValues = morpho.getDefinitnessList();
    if (definitnessValues.size() == 1) {
        // a single value is written as plain string
        result.put("definitness", definitnessValues.get(0).name());
    } else if (!definitnessValues.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (Definitness entry : definitnessValues) {
            array.add(entry.name());
        }
        result.put("definitness", array);
    }

    final List<GenderTag> genders = morpho.getGenderList();
    if (!genders.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (GenderTag entry : genders) {
            array.add(createTagNode(mapper, entry.getTag(), entry.getGender()));
        }
        result.put("gender", array);
    }

    final List<NumberTag> numbers = morpho.getNumberList();
    if (!numbers.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (NumberTag entry : numbers) {
            array.add(createTagNode(mapper, entry.getTag(), entry.getNumber()));
        }
        result.put("number", array);
    }

    final List<Person> personValues = morpho.getPersonList();
    if (personValues.size() == 1) {
        // a single value is written as plain string
        result.put("person", personValues.get(0).name());
    } else if (!personValues.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (Person entry : personValues) {
            array.add(entry.name());
        }
        result.put("person", array);
    }

    final List<PosTag> posTags = morpho.getPosList();
    if (!posTags.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (PosTag entry : posTags) {
            array.add(getPosTagSerializer().serialize(mapper, entry));
        }
        result.put("pos", array);
    }

    final List<TenseTag> tenses = morpho.getTenseList();
    if (!tenses.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (TenseTag entry : tenses) {
            array.add(createTagNode(mapper, entry.getTag(), entry.getTense()));
        }
        result.put("tense", array);
    }

    final List<VerbMoodTag> moods = morpho.getVerbMoodList();
    if (!moods.isEmpty()) {
        final ArrayNode array = mapper.createArrayNode();
        for (VerbMoodTag entry : moods) {
            array.add(createTagNode(mapper, entry.getTag(), entry.getVerbForm()));
        }
        result.put("verb-mood", array);
    }
    return result;
}

/**
 * Creates the <code>{"tag": .., "type": ..}</code> object used for tag based
 * features. The {@code "type"} field is only written if a type is present.
 *
 * @param mapper the mapper used to create the node
 * @param tag the string tag
 * @param type the mapped enum value or {@code null} if the tag is not mapped
 * @return the created node
 */
private ObjectNode createTagNode(ObjectMapper mapper, String tag, Enum<?> type) {
    final ObjectNode node = mapper.createObjectNode();
    node.put("tag", tag);
    if (type != null) {
        node.put("type", type.name());
    }
    return node;
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by Apache.
The class OpenNlpChunkingEngine, method computeEnhancements.
/**
 * Chunks the tokens of the AnalysedText of the supplied ContentItem and adds
 * the detected phrases as {@code Chunk}s annotated with a
 * {@code PHRASE_ANNOTATION}.
 * <p>
 * Processing is done sentence by sentence; if the text has no sentences the
 * whole text is processed as a single section. Every token needs a POS
 * annotation, as the POS tags are passed to the chunker together with the
 * token texts. The probability of a chunk is the average of the chunker
 * probabilities of its tokens. All chunks are written while holding the
 * write lock of the ContentItem. If no chunker is available for the language
 * the method returns without doing anything.
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if a token without POS annotation is encountered
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfiguration, language, true);
    ChunkerME chunker = initChunker(language);
    if (chunker == null) {
        return;
    }
    // init the Phrase TagSet
    TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
    if (tagSet == null) {
        log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
        // for now only created to avoid checks for tagSet == null
        // TODO: in future we might want to automatically create posModels based
        // on tagged texts. However this makes no sense as long we can not
        // persist TagSets.
        tagSet = new TagSet<PhraseTag>("dummy", language);
    }
    // holds PhraseTags created for chunk tags that are not part of the
    // TagSet (per language; shared between calls via languageAdhocTags)
    Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
    if (adhocTags == null) {
        adhocTags = new HashMap<String, PhraseTag>();
        languageAdhocTags.put(language, adhocTags);
    }
    ci.getLock().writeLock().lock();
    try {
        Iterator<? extends Section> sentences = at.getSentences();
        if (!sentences.hasNext()) {
            // no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        // buffers reused for all sentences
        List<String> tokenTextList = new ArrayList<String>(64);
        List<String> posList = new ArrayList<String>(64);
        List<Token> tokenList = new ArrayList<Token>(64);
        // process each sentence separately
        while (sentences.hasNext()) {
            // (1) get Tokens and POS information for the sentence
            Section sentence = sentences.next();
            Iterator<Token> tokens = sentence.getTokens();
            while (tokens.hasNext()) {
                Token token = tokens.next();
                tokenList.add(token);
                tokenTextList.add(token.getSpan());
                Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                if (posValue == null) {
                    throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
                } else {
                    posList.add(posValue.value().getTag());
                }
            }
            String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
            // size the array by the list that is actually converted
            String[] tokenPos = posList.toArray(new String[posList.size()]);
            if (log.isTraceEnabled()) {
                // pass the value as argument so that '{}' gets substituted
                log.trace("Tokens: {}", Arrays.toString(tokenStrings));
            }
            // free memory
            tokenTextList.clear();
            // free memory
            posList.clear();
            // (2) Chunk the sentence
            String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
            // probabilities of the chunk tags (one per token)
            double[] chunkProb = chunker.probs();
            if (log.isTraceEnabled()) {
                log.trace("Chunks: {}", Arrays.toString(chunkTags));
            }
            // free memory
            tokenStrings = null;
            // free memory
            tokenPos = null;
            // (3) Process the results and write the Annotations
            // sum of the probabilities of the tokens of the current chunk
            double chunkProps = 0;
            // number of tokens of the current chunk
            int chunkTokenCount = 0;
            // tag of the current chunk; null if not within a chunk
            PhraseTag tag = null;
            // declared outside of the loop as it is needed to close a chunk
            // that reaches to the end of the sentence
            int i;
            /*
             * This assumes:
             * - 'B-{tag}' ... for start of a new chunk
             * - '???' ... anything other for continuing the current chunk
             * - 'O' ... no chunk (ends current chunk)
             */
            for (i = 0; i < tokenList.size(); i++) {
                boolean start = chunkTags[i].charAt(0) == 'B';
                boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                if (end) {
                    // add the current phrase
                    // add at AnalysedText level, because offsets are absolute
                    // NOTE we are already at the next token when we detect the end
                    Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
                    // reset the state
                    tag = null;
                    chunkTokenCount = 0;
                    chunkProps = 0;
                }
                if (start) {
                    // create the new tag
                    tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), // skip 'B-'
                    language);
                }
                if (tag != null) {
                    // count this token for the current chunk
                    chunkProps = chunkProps + chunkProb[i];
                    chunkTokenCount++;
                }
            }
            if (tag != null) {
                // the last chunk reaches to the end of the sentence
                Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
                chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
            }
            // (4) clean up
            tokenList.clear();
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
    if (log.isTraceEnabled()) {
        logChunks(at);
    }
}
Aggregations