Use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.
The class EntityCoReferenceEngineTest, method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    // Sentence 1 contains the NER "Angela Merkel".
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION,
        Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    // Sentence 2 contains the anaphoric noun phrase "The German politician".
    // Note that sentence offsets are absolute within the analysed text.
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1,
        SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int politicianStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(politicianStartIdx, politicianStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION,
        Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanPolitician = sentence2.addChunk(theStartIdx, politicianStartIdx + "politician".length());
    theGermanPolitician.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION,
        Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // The NER must carry the representative coref annotation listing the noun phrase ...
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanPolitician));
    // ... and the noun phrase the subordinate annotation pointing back at the NER.
    Value<CorefFeature> subordinateCorefValue = theGermanPolitician.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
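The assertions above come in mirrored pairs: the NER carries the representative CorefFeature and the noun phrase the subordinate one, each listing the other as a mention. A small hypothetical helper (an illustration only, not part of Stanbol) could make that contract explicit:

// Hypothetical helper (an assumption for illustration, not Stanbol API):
// checks that a representative/subordinate pair of CorefFeature annotations
// points at each other.
static boolean isSymmetricCorefLink(Span representative, Span mention) {
    Value<CorefFeature> repValue = representative.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Value<CorefFeature> menValue = mention.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    if (repValue == null || menValue == null) {
        return false;
    }
    return repValue.value().isRepresentative()
            && !menValue.value().isRepresentative()
            && repValue.value().getMentions().contains(mention)
            && menValue.value().getMentions().contains(representative);
}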
Use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.
The class CoreferenceFinder, method extractCorefs.
/**
 * Performs the actual coreference resolution by iterating through all the NERs and all the
 * {@link NounPhrase}s which appear after the given NER in the text. If any coreferences are found
 * they are written as {@link NlpAnnotation}s to the NER and noun phrase {@link Span}s.
 *
 * @param ners
 *            the NER {@link Span}s, grouped by the number of the sentence in which they appear
 * @param nounPhrases
 *            the noun phrases detected in the entire text
 * @param language
 *            the language of the analysed text
 * @throws EngineException
 */
public void extractCorefs(Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException {
    for (Map.Entry<Integer, List<Span>> entry : ners.entrySet()) {
        int nerSentenceNo = entry.getKey();
        List<Span> nerSpans = entry.getValue();
        int maxDistance = this.config.getMaxDistance();
        for (Span ner : nerSpans) {
            Entity entity = null;
            Set<String> typeLabels = null;
            Set<Span> corefs = new HashSet<Span>();
            for (NounPhrase nounPhrase : nounPhrases) {
                int nounPhraseSentenceNo = nounPhrase.getSentenceNo();
                // Only consider noun phrases that follow the NER and, unless no
                // distance constraint is configured, lie within maxDistance sentences.
                if (nounPhrase.getChunk().getStart() > ner.getStart()
                    && (maxDistance == Constants.MAX_DISTANCE_NO_CONSTRAINT
                        || (nounPhraseSentenceNo > nerSentenceNo
                            && nounPhraseSentenceNo - nerSentenceNo <= maxDistance))) {
                    if (entity == null) {
                        entity = lookupEntity(ner, language);
                        /*
                         * If the entity is still null there's nothing to do but go to the next NER.
                         */
                        if (entity == null)
                            break;
                        if (typeLabels == null) {
                            typeLabels = buildEntityTypeLabels(entity, language);
                        }
                    }
                    if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) {
                        // Annotate the noun phrase as a subordinate mention of this NER ...
                        Set<Span> coreferencedNer = new HashSet<Span>();
                        coreferencedNer.add(ner);
                        Span chunk = nounPhrase.getChunk();
                        chunk.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(false, coreferencedNer)));
                        corefs.add(chunk);
                    }
                }
            }
            // ... and annotate the NER as the representative mention of all its corefs.
            if (!corefs.isEmpty()) {
                ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs)));
            }
        }
    }
}
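A minimal invocation sketch, assuming a configured CoreferenceFinder instance and pre-extracted spans (the names finder, angelaMerkelNer and nounPhrases below are illustrative assumptions), showing the expected input shape with NER spans keyed by sentence number:

// Assumed, illustrative setup: 'finder' is a configured CoreferenceFinder,
// 'angelaMerkelNer' a NER Span from sentence 1, 'nounPhrases' the noun
// phrases of the whole text.
Map<Integer, List<Span>> ners = new HashMap<Integer, List<Span>>();
ners.put(1, Collections.singletonList(angelaMerkelNer));
finder.extractCorefs(ners, nounPhrases, "en");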
Use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.
The class CorefFeatureSupport, method parse.
@Override
public CorefFeature parse(ObjectNode jCoref, AnalysedText at) {
    JsonNode jIsRepresentative = jCoref.path(IS_REPRESENTATIVE_TAG);
    if (!jIsRepresentative.isBoolean()) {
        throw new IllegalStateException("Field 'isRepresentative' must have a true/false format");
    }
    JsonNode node = jCoref.path(MENTIONS_TAG);
    Set<Span> mentions = new HashSet<Span>();
    if (node.isArray()) {
        ArrayNode jMentions = (ArrayNode) node;
        for (int i = 0; i < jMentions.size(); i++) {
            JsonNode member = jMentions.get(i);
            if (member.isObject()) {
                ObjectNode jMention = (ObjectNode) member;
                SpanTypeEnum spanType = SpanTypeEnum.valueOf(jMention.path(MENTION_TYPE_TAG).getTextValue());
                int spanStart = jMention.path(MENTION_START_TAG).asInt();
                int spanEnd = jMention.path(MENTION_END_TAG).asInt();
                Span mentionedSpan = null;
                switch (spanType) {
                    case Chunk:
                        mentionedSpan = at.addChunk(spanStart, spanEnd);
                        break;
                    case Sentence:
                    case Text:
                    case TextSection:
                        // Sentence, Text and TextSection spans are not valid coref mentions.
                        break;
                    case Token:
                        mentionedSpan = at.addToken(spanStart, spanEnd);
                        break;
                }
                // Guard against adding null for the unsupported span types above.
                if (mentionedSpan != null) {
                    mentions.add(mentionedSpan);
                }
            }
        }
    }
    return new CorefFeature(jIsRepresentative.asBoolean(), mentions);
}
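For orientation, this is a sketch of the JSON shape the parser above expects; the literal key names below are assumptions standing in for the *_TAG constants defined elsewhere in CorefFeatureSupport:

// Assumed serialized form of a CorefFeature, matching the fields read above
// (literal key names are illustrative; the real ones come from the *_TAG constants):
// {
//   "isRepresentative" : true,
//   "mentions" : [
//     { "type" : "Chunk", "start" : 21, "end" : 42 },
//     { "type" : "Token", "start" : 50, "end" : 52 }
//   ]
// }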
Use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.
The class CorefFeatureSupportTest, method initCorefAnnotations.
private static void initCorefAnnotations() {
    Sentence sentence1 = at.addSentence(0, sentenceText1.indexOf(".") + 1);
    Token obama = sentence1.addToken(0, "Obama".length());
    // Sentence offsets are absolute within the analysed text, so the end of the
    // second sentence is offset by the length of the first.
    Sentence sentence2 = at.addSentence(sentenceText1.indexOf(".") + 2,
        sentenceText1.length() + sentenceText2.indexOf(".") + 1);
    int heStartIdx = sentence2.getSpan().indexOf("He");
    Token he = sentence2.addToken(heStartIdx, heStartIdx + "He".length());
    // "Obama" is the representative mention; "He" is a subordinate mention pointing back at it.
    Set<Span> obamaMentions = new HashSet<Span>();
    obamaMentions.add(he);
    obama.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(true, obamaMentions)));
    Set<Span> heMentions = new HashSet<Span>();
    heMentions.add(obama);
    he.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(false, heMentions)));
}
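A hedged read-back sketch (assumed verification code mirroring the assertions in EntityCoReferenceEngineTest; it presumes the obama and he tokens are kept accessible to the test) showing how the mirrored annotations set up above could be checked:

// Assumed verification sketch: the representative/subordinate pair must point at each other.
Value<CorefFeature> obamaCoref = obama.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
Assert.assertTrue(obamaCoref.value().isRepresentative());
Assert.assertTrue(obamaCoref.value().getMentions().contains(he));
Value<CorefFeature> heCoref = he.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
Assert.assertFalse(heCoref.value().isRepresentative());
Assert.assertTrue(heCoref.value().getMentions().contains(obama));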