Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class CeliAnalyzedTextLemmatizerEngineTest, method testEngineDe.
@Test
public void testEngineDe() throws IOException, EngineException {
    // Build a ContentItem for the German sample text and attach an AnalysedText layer.
    ContentItem contentItem = ciFactory.createContentItem(new StringSource(de_text));
    Assert.assertNotNull(contentItem);
    AnalysedText analysedText = atFactory.createAnalysedText(contentItem, contentItem.getBlob());
    Assert.assertNotNull(analysedText);
    // Declare the language in the metadata so the engine can detect it.
    contentItem.getMetadata().add(new TripleImpl(contentItem.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
    Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(contentItem));
    // Pre-annotate three tokens with POS tags to test that the lemmatizer
    // reuses existing POS annotations.
    Token verbToken = analysedText.addToken(de_verbStart, de_verbStart + de_verb.length());
    verbToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb), de_verbProb));
    Token adjectiveToken = analysedText.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
    adjectiveToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
    Token nounToken = analysedText.addToken(de_nounStart, de_nounStart + de_noun.length());
    nounToken.addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NC", LexicalCategory.Noun), de_nounProb));
    Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    // Run the engine; skip the test gracefully when the remote CELI service is unavailable.
    try {
        engine.computeEnhancements(contentItem);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        //deactivate test
        return;
    }
    // Validate that each pre-annotated token received MorphoFeatures with a lemma.
    boolean verbFound = false;
    boolean adjectiveFound = false;
    boolean nounFound = false;
    Iterator<Token> tokenIt = analysedText.getTokens();
    while (tokenIt.hasNext()) {
        Token token = tokenIt.next();
        log.info("Token: {}", token);
        List<Value<MorphoFeatures>> morphoValues = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
        if (de_verb.equals(token.getSpan())) {
            verbFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Verb, de_verbProb);
        } else if (de_adjective.equals(token.getSpan())) {
            adjectiveFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Adjective, de_adjectiveProb);
        } else if (de_noun.equals(token.getSpan())) {
            nounFound = !morphoValues.isEmpty();
            validateMorphFeatureProbability(morphoValues, LexicalCategory.Noun, de_nounProb);
        }
        for (Value<MorphoFeatures> morphoValue : morphoValues) {
            log.info(" - {}", morphoValue);
            Assert.assertNotNull(morphoValue.value().getLemma());
        }
    }
    Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", verbFound);
    Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", adjectiveFound);
    Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", nounFound);
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class CeliAnalyzedTextLemmatizerEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Required pre-conditions: an AnalysedText layer and a configured language.
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        // fixed typo in the error message ("wile" -> "while")
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            //ignore terms without readings
            continue;
        }
        //Add the LexicalEntry as Token to the Text. NOTE that if a
        //Token with the same start/end positions already exist this
        //Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        //Collect the lexical categories of the existing POS annotations of
        //THIS token. NOTE: the map is (re)created per term so that categories
        //and probabilities of previously processed tokens do not leak into
        //the probability lookup for the current one.
        Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        //do not override with lower prob (annotations are
                        //sorted by descending probability)
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            //add the readings (MorphoFeatures)
            if (mf != null) {
                //use the POS tags of the morpho analysis and compare it
                //with existing POS tags: take the highest probability of a
                //matching lexical category as the annotation probability.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                //add the morpho features with the posProbability (or UNKNOWN
                //if no existing POS annotation matched)
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class TestKuromojiNlpEngine, method testEngine.
@Test
public void testEngine() throws EngineException {
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    // Validate the TextAnnotations written to the ContentItem metadata.
    Map<IRI, RDFTerm> expectedProperties = new HashMap<IRI, RDFTerm>();
    expectedProperties.put(Properties.DC_CREATOR, literalFactory.createTypedLiteral(engine.getClass().getName()));
    expectedProperties.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    Assert.assertEquals(16, EnhancementStructureHelper.validateAllTextAnnotations(contentItem.getMetadata(), text, expectedProperties));
    // Validate the AnalysedText content part created by the engine.
    AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(at);
    List<Sentence> sentences = AnalysedTextUtils.asList(at.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());
    //TODO: values in the following arrays are based on the first run of the
    // engine. So this is only to detect changes in results. It can not validate
    // that the tokenization and NER detections are correct - sorry I do not
    // speak Japanese ...
    int[] expectedChunks = new int[] { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = new int[] { 25, 25, 25, 24, 33, 17, 32 };
    for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
        Sentence sentence = sentences.get(sentIndex);
        // Every NER chunk must carry a typed NER annotation.
        List<Chunk> nerChunks = AnalysedTextUtils.asList(sentence.getChunks());
        Assert.assertEquals(expectedChunks[sentIndex], nerChunks.size());
        for (Chunk chunk : nerChunks) {
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(nerValue);
            Assert.assertNotNull(nerValue.value().getType());
        }
        // Every token must carry a POS annotation.
        List<Token> sentenceTokens = AnalysedTextUtils.asList(sentence.getTokens());
        Assert.assertEquals(expectedTokens[sentIndex], sentenceTokens.size());
        for (Token token : sentenceTokens) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(posValue);
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class NounPhraseFilterer, method filter.
/**
 * Removes from the given list all noun phrases that either lack a
 * language specific determiner (looked up in the configured determiner
 * set for the given language) or contain fewer noun/adjective tokens
 * than the configured minimum. TODO: should this be configurable so that
 * single word noun phrases can also be included?
 *
 * @param nounPhrases the noun phrases to filter in place
 * @param language the language used to look up the determiner set
 */
public void filter(List<NounPhrase> nounPhrases, String language) {
    Set<String> determiners = withinTextRefDeterminers.get(language);
    Iterator<NounPhrase> phraseIt = nounPhrases.iterator();
    while (phraseIt.hasNext()) {
        NounPhrase phrase = phraseIt.next();
        boolean determinerFound = false;
        short posCount = 0;
        for (Span token : phrase.getTokens()) {
            Value<PosTag> posValue = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if (posValue == null) {
                continue; // token without POS annotation is ignored
            }
            PosTag posTag = posValue.value();
            // count nouns and adjectives towards the minimum POS number
            if (posTag.hasCategory(LexicalCategory.Noun) || posTag.hasCategory(LexicalCategory.Adjective)) {
                posCount++;
            }
            // a determiner only counts if its (lower cased) span is in the
            // language specific determiner set
            if (!determinerFound && posTag.hasPos(Pos.Determiner) && determiners.contains(token.getSpan().toLowerCase())) {
                determinerFound = true;
            }
        }
        if (!determinerFound || posCount < MIN_POS_NUMBER) {
            phraseIt.remove();
        }
    }
}
Example use of org.apache.stanbol.enhancer.nlp.pos.PosTag in the Apache Stanbol project: class NIFHelper, method writePos.
/**
 * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the parsed
 * RDF graph by using the parsed segmentUri as subject. If the annotated
 * element has no POS annotation nothing is written.
 * @param graph the graph to add the triples to
 * @param annotated the annotated element (e.g. a {@link Token})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PosTag> posAnnotation = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posAnnotation == null) {
        return; // no POS annotation present - nothing to write
    }
    PosTag tag = posAnnotation.value();
    if (tag.isMapped()) {
        // link the mapped OLiA Pos individuals and lexical categories
        for (Pos pos : tag.getPos()) {
            graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), pos.getUri()));
        }
        for (LexicalCategory category : tag.getCategories()) {
            graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), category.getUri()));
        }
    }
    // always write the raw tag string and the annotation confidence
    graph.add(new TripleImpl(segmentUri, SsoOntology.posTag.getUri(), lf.createTypedLiteral(tag.getTag())));
    graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, lf.createTypedLiteral(posAnnotation.probability())));
}
Aggregations