Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by apache: class AnalyzedTextSerializerAndParserTest, method setup.
@BeforeClass
public static final void setup() throws IOException {
    // Build the shared test fixture: a ContentItem over the test text plus an
    // AnalysedText populated with one sentence, several tokens and one chunk.
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    // The first sentence ends directly after the first '.' of the text.
    int sentenceEnd = text.indexOf('.') + 1;
    Sentence firstSentence = analysedTextWithData.addSentence(0, sentenceEnd);
    expectedSentences.put(firstSentence, "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
    Token theToken = firstSentence.addToken(0, 3);
    expectedTokens.put(theToken, "The");
    theToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token stanbolToken = firstSentence.addToken(4, 11);
    expectedTokens.put(stanbolToken, "Stanbol");
    stanbolToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbolToken.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    // Locate "enhancer" by searching the sentence span instead of hard-coding offsets.
    int enhancerOffset = firstSentence.getSpan().indexOf("enhancer");
    Token enhancerToken = firstSentence.addToken(enhancerOffset, enhancerOffset + "enhancer".length());
    expectedTokens.put(enhancerToken, "enhancer");
    // Two competing POS annotations on the same token (exercises multi-value support).
    enhancerToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancerToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    // Morphological features covering every supported category once.
    MorphoFeatures features = new MorphoFeatures("enhance");
    features.addCase(new CaseTag("test-case-1", Case.Comitative));
    features.addCase(new CaseTag("test-case-2", Case.Abessive));
    features.addDefinitness(Definitness.Definite);
    features.addPerson(Person.First);
    features.addPos(new PosTag("PN", Pos.ProperNoun));
    features.addGender(new GenderTag("test-gender", Gender.Masculine));
    features.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    features.addTense(new TenseTag("test-tense", Tense.Present));
    features.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancerToken.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(features));
    // A chunk spanning "Stanbol enhancer" carrying NER and phrase annotations.
    Chunk stanbolEnhancerChunk = analysedTextWithData.addChunk(stanbolToken.getStart(), enhancerToken.getEnd());
    expectedChunks.put(stanbolEnhancerChunk, "Stanbol enhancer");
    stanbolEnhancerChunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancerChunk.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by apache: class MorphoFeaturesSupport, method serialize.
@Override
public ObjectNode serialize(ObjectMapper mapper, MorphoFeatures morpho) {
    // Serializes a MorphoFeatures instance to JSON.
    // - "lemma" is always written.
    // - tag-based features (case, gender, number, pos, tense, verb-mood) are
    //   written as arrays of {tag, type} objects; "type" is omitted when the
    //   tag carries no type enum.
    // - plain enum features (definitness, person) are written as a single
    //   string when exactly one value is present, otherwise as a string array.
    // The original five hand-rolled {tag, type} loops and the two duplicated
    // single-vs-array writers are factored into typedTag/putEnumValues.
    ObjectNode jMorpho = mapper.createObjectNode();
    jMorpho.put("lemma", morpho.getLemma());
    List<CaseTag> caseList = morpho.getCaseList();
    if (!caseList.isEmpty()) {
        ArrayNode jCases = mapper.createArrayNode();
        for (CaseTag caseTag : caseList) {
            jCases.add(typedTag(mapper, caseTag.getTag(), caseTag.getCase()));
        }
        jMorpho.put("case", jCases);
    }
    putEnumValues(mapper, jMorpho, "definitness", morpho.getDefinitnessList());
    List<GenderTag> genderList = morpho.getGenderList();
    if (!genderList.isEmpty()) {
        ArrayNode jGenders = mapper.createArrayNode();
        for (GenderTag genderTag : genderList) {
            jGenders.add(typedTag(mapper, genderTag.getTag(), genderTag.getGender()));
        }
        jMorpho.put("gender", jGenders);
    }
    List<NumberTag> numberList = morpho.getNumberList();
    if (!numberList.isEmpty()) {
        ArrayNode jNumbers = mapper.createArrayNode();
        for (NumberTag numberTag : numberList) {
            jNumbers.add(typedTag(mapper, numberTag.getTag(), numberTag.getNumber()));
        }
        jMorpho.put("number", jNumbers);
    }
    putEnumValues(mapper, jMorpho, "person", morpho.getPersonList());
    List<PosTag> posList = morpho.getPosList();
    if (!posList.isEmpty()) {
        // POS tags have their own dedicated serializer (they carry more
        // structure than a plain {tag, type} pair).
        ArrayNode jPosTags = mapper.createArrayNode();
        for (PosTag posTag : posList) {
            jPosTags.add(getPosTagSerializer().serialize(mapper, posTag));
        }
        jMorpho.put("pos", jPosTags);
    }
    List<TenseTag> tenseList = morpho.getTenseList();
    if (!tenseList.isEmpty()) {
        ArrayNode jTenses = mapper.createArrayNode();
        for (TenseTag tenseTag : tenseList) {
            jTenses.add(typedTag(mapper, tenseTag.getTag(), tenseTag.getTense()));
        }
        jMorpho.put("tense", jTenses);
    }
    List<VerbMoodTag> verbMoodList = morpho.getVerbMoodList();
    if (!verbMoodList.isEmpty()) {
        ArrayNode jMoods = mapper.createArrayNode();
        for (VerbMoodTag verbMoodTag : verbMoodList) {
            jMoods.add(typedTag(mapper, verbMoodTag.getTag(), verbMoodTag.getVerbForm()));
        }
        jMorpho.put("verb-mood", jMoods);
    }
    return jMorpho;
}

/**
 * Builds a {tag, type} object node. The "type" field is omitted when the
 * given type enum is {@code null} (matching the original per-feature loops).
 */
private static ObjectNode typedTag(ObjectMapper mapper, String tag, Enum<?> type) {
    ObjectNode jTag = mapper.createObjectNode();
    jTag.put("tag", tag);
    if (type != null) {
        jTag.put("type", type.name());
    }
    return jTag;
}

/**
 * Writes a list of enum values under the given field: a single value is
 * written as a plain string, multiple values as an array of strings, and an
 * empty list writes nothing at all.
 */
private static void putEnumValues(ObjectMapper mapper, ObjectNode node, String field, List<? extends Enum<?>> values) {
    if (values.isEmpty()) {
        return;
    }
    if (values.size() == 1) {
        node.put(field, values.get(0).name());
    } else {
        ArrayNode jValues = mapper.createArrayNode();
        for (Enum<?> value : values) {
            jValues.add(value.name());
        }
        node.put(field, jValues);
    }
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by apache: class EntityLinkingEngineTest, method setUpServices.
@BeforeClass
public static void setUpServices() throws IOException {
    searcher = new TestSearcherImpl(TEST_REFERENCED_SITE_NAME, NAME, new SimpleLabelTokenizer());
    // Populate the searcher with the entities the test text refers to.
    // NOTE: all entities share one graph instance, as in the original fixture.
    Graph data = new IndexedGraph();
    IRI entityId = new IRI("urn:test:PatrickMarshall");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Patrick Marshall")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_PERSON));
    searcher.addEntity(new Entity(entityId, data));
    // A concept that redirects to another entity.
    entityId = new IRI("urn:test:Geologist");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Geologist")));
    data.add(new TripleImpl(entityId, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    data.add(new TripleImpl(entityId, REDIRECT, new IRI("urn:test:redirect:Geologist")));
    searcher.addEntity(new Entity(entityId, data));
    // The redirect target itself.
    entityId = new IRI("urn:test:redirect:Geologist");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Geologe (redirect)")));
    data.add(new TripleImpl(entityId, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(entityId, data));
    entityId = new IRI("urn:test:NewZealand");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("New Zealand")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(entityId, data));
    entityId = new IRI("urn:test:UniversityOfOtago");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("University of Otago")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(entityId, data));
    entityId = new IRI("urn:test:University");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("University")));
    data.add(new TripleImpl(entityId, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(entityId, data));
    entityId = new IRI("urn:test:Otago");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Otago")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(entityId, data));
    // A second, ambiguous "Otago" (place) plus its university, to exercise
    // disambiguation between candidates sharing a label.
    entityId = new IRI("urn:test:Otago_Texas");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Otago (Texas)")));
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("Otago")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(entityId, data));
    entityId = new IRI("urn:test:UniversityOfOtago_Texas");
    data.add(new TripleImpl(entityId, NAME, new PlainLiteralImpl("University of Otago (Texas)")));
    data.add(new TripleImpl(entityId, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(entityId, data));
    // Build AnalysedTexts for both test documents and annotate the name
    // tokens of "Dr. Patrick Marshall" / "Dr. Marshall Patrick".
    TEST_ANALYSED_TEXT = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT)));
    TEST_ANALYSED_TEXT_WO = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
    initAnalyzedText(TEST_ANALYSED_TEXT);
    TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    initAnalyzedText(TEST_ANALYSED_TEXT_WO);
    TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by apache: class EntityCoReferenceEngineTest, method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
    // Build a two-sentence ContentItem where the noun phrase
    // "The German politician" in sentence 2 co-refers with "Angela Merkel"
    // in sentence 1, then verify the engine links the two mentions.
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph metadata = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText analysedText = atFactory.createAnalysedText(ci, textBlob.getValue());
    int firstSentenceEnd = SPATIAL_SENTENCE_1.indexOf(".") + 1;
    Sentence firstSentence = analysedText.addSentence(0, firstSentenceEnd);
    Chunk merkel = firstSentence.addChunk(0, "Angela Merkel".length());
    merkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence secondSentence = analysedText.addSentence(firstSentenceEnd, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    // Offsets of the mention tokens within the second sentence's span.
    int theIdx = secondSentence.getSpan().indexOf("The");
    int germanIdx = secondSentence.getSpan().indexOf("German");
    int politicianIdx = secondSentence.getSpan().indexOf("politician");
    Token theToken = secondSentence.addToken(theIdx, theIdx + "The".length());
    theToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token germanToken = secondSentence.addToken(germanIdx, germanIdx + "German".length());
    germanToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politicianToken = secondSentence.addToken(politicianIdx, politicianIdx + "politician".length());
    politicianToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanPolitician = secondSentence.addChunk(theIdx, politicianIdx + "politician".length());
    theGermanPolitician.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // The NER chunk must carry the representative coref mention ...
    Value<CorefFeature> representativeCorefValue = merkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanPolitician));
    // ... and the noun-phrase chunk the subordinate one pointing back at it.
    Value<CorefFeature> subordinateCorefValue = theGermanPolitician.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(merkel));
}
Use of org.apache.stanbol.enhancer.nlp.pos.PosTag in project stanbol by apache: class SentimentEngine, method computeEnhancements.
/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 * if the underlying process failed to work as
 * expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText analysedText = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    SentimentClassifier classifier = classifiers.get(language);
    if (classifier == null) {
        throw new IllegalStateException("Sentiment Classifier for language '" + language + "' not available. As this is also checked in " + " canEnhance this may indicate an Bug in the used " + "EnhancementJobManager!");
    }
    //TODO: locking for AnalysedText not yet defined
    // ci.getLock().writeLock().lock();
    // try {
    Iterator<Token> tokens = analysedText.getTokens();
    while (tokens.hasNext()) {
        Token token = tokens.next();
        // Lexical categories of the token's POS tag(s); empty set means
        // "classify without category", null means "token was not processed".
        Set<LexicalCategory> cats = null;
        boolean process = false;
        if (!adjectivesOnly) {
            process = true;
            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            // BUGFIX: the original condition was
            //     posTag != null && a || b
            // so when posTag was null the right-hand disjunct still
            // dereferenced it -> NullPointerException. The null check must
            // guard BOTH probability tests.
            if (posTag != null && (posTag.probability() == Value.UNKNOWN_PROBABILITY || posTag.probability() >= (minPOSConfidence / 2.0))) {
                cats = classifier.getCategories(posTag.value());
            } else {
                //no POS tags or probability to low
                cats = Collections.emptySet();
            }
        } else {
            //check PosTags if we need to lookup this word
            Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
            boolean ignore = false;
            while (!ignore && !process && posTags.hasNext()) {
                Value<PosTag> value = posTags.next();
                PosTag tag = value.value();
                cats = classifier.getCategories(tag);
                boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun);
                // A confident non-adjective/non-noun tag ends the search;
                // a sufficiently confident adjective/noun tag selects the token.
                ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence);
                process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence / 2.0));
            }
        }
        //else process all tokens ... no POS tag checking needed
        if (process) {
            String word = token.getSpan();
            double sentiment = 0.0;
            if (cats.isEmpty()) {
                sentiment = classifier.classifyWord(null, word);
            } else {
                //in case of multiple Lexical Cats
                //we build the average over NOT NULL sentiments for the word
                int catSentNum = 0;
                for (LexicalCategory cat : cats) {
                    double catSent = classifier.classifyWord(cat, word);
                    if (catSent != 0.0) {
                        catSentNum++;
                        sentiment = sentiment + catSent;
                    }
                }
                if (catSentNum > 0) {
                    sentiment = sentiment / (double) catSentNum;
                }
            }
            if (sentiment != 0.0) {
                token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
            }
            //else do not set sentiments with 0.0
        }
        // else do not process
    }
    // } finally {
    // ci.getLock().writeLock().unlock();
    // }
}
Aggregations