Search in sources :

Example 1 with CorefFeature

use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.

From the class EntityCoReferenceEngineTest, method testSpatialCoref:

@Test
public void testSpatialCoref() throws EngineException, IOException {
    // Build a ContentItem whose metadata carries an English TextEnhancement
    // with full confidence, so the engine accepts and processes it.
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    // Sentence 1 contains the NER "Angela Merkel" (the representative mention).
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    // Sentence 2 contains the noun phrase "The German politician" expected to
    // corefer with "Angela Merkel".
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    // NOTE: locals renamed from chancellorStartIdx/theGermanChancellor — the
    // phrase under test is "The German politician", not "chancellor".
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int politicianStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(politicianStartIdx, politicianStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanPolitician = sentence2.addChunk(theStartIdx, politicianStartIdx + "politician".length());
    theGermanPolitician.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // The NER must be marked as the representative mention, referencing the
    // noun-phrase chunk ...
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanPolitician));
    // ... and the noun phrase must be a subordinate mention pointing back at
    // the NER.
    Value<CorefFeature> subordinateCorefValue = theGermanPolitician.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 2 with CorefFeature

use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.

From the class CoreferenceFinder, method extractCorefs:

/**
 * Performs the actual coreference resolution by iterating through all the NERs and all the
 * {@link NounPhrase}s which are after the given Ner in the text. If any coreferences are found they are
 * written as {@link NlpAnnotation}s in the NER and noun phrase {@link Span}s.
 *
 * @param ners map from sentence number to the NER {@link Span}s detected in that sentence
 * @param nounPhrases the noun phrase candidates to match against each NER
 * @param language the language of the analysed text, used for entity lookup and type labels
 * @throws EngineException if the entity lookup fails
 */
public void extractCorefs(Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException {
    for (Map.Entry<Integer, List<Span>> entry : ners.entrySet()) {
        int nerSentenceNo = entry.getKey();
        List<Span> nerSpans = entry.getValue();
        int maxDistance = this.config.getMaxDistance();
        for (Span ner : nerSpans) {
            // Entity and type labels are looked up lazily, only once per NER,
            // and only if at least one candidate noun phrase is in range.
            Entity entity = null;
            Set<String> typeLabels = null;
            Set<Span> corefs = new HashSet<Span>();
            for (NounPhrase nounPhrase : nounPhrases) {
                int nounPhraseSentenceNo = nounPhrase.getSentenceNo();
                /*
                 * Only consider noun phrases that start after the NER and, unless the
                 * sentence-distance constraint is disabled, lie within maxDistance
                 * sentences after the NER's sentence.
                 *
                 * BUGFIX: the previous condition used
                 * "maxDistance != Constants.MAX_DISTANCE_NO_CONSTRAINT && ..." which made
                 * the whole test false when the constraint was disabled, so no corefs
                 * were ever found in that configuration. Disabling the constraint must
                 * instead accept every following noun phrase.
                 */
                if (nounPhrase.getChunk().getStart() > ner.getStart()
                        && (maxDistance == Constants.MAX_DISTANCE_NO_CONSTRAINT
                                || (nounPhraseSentenceNo > nerSentenceNo
                                        && nounPhraseSentenceNo - nerSentenceNo <= maxDistance))) {
                    if (entity == null) {
                        entity = lookupEntity(ner, language);
                        /*
                         * If the entity is still null there's nothing to do but go to the next ner.
                         */
                        if (entity == null)
                            break;
                        // typeLabels is always null here (it is only set together with
                        // entity), so the former "if (typeLabels == null)" check was
                        // redundant and has been removed.
                        typeLabels = buildEntityTypeLabels(entity, language);
                    }
                    if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) {
                        // Mark the noun phrase as a subordinate mention pointing at the NER.
                        Set<Span> coreferencedNer = new HashSet<Span>();
                        coreferencedNer.add(ner);
                        Span chunk = nounPhrase.getChunk();
                        chunk.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(false, coreferencedNer)));
                        corefs.add(chunk);
                    }
                }
            }
            // Mark the NER as the representative mention of all matched noun phrases.
            if (corefs.size() > 0) {
                ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs)));
            }
        }
    }
}
Also used : Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) NounPhrase(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase) Span(org.apache.stanbol.enhancer.nlp.model.Span) Constraint(org.apache.stanbol.entityhub.servicesapi.query.Constraint) TextConstraint(org.apache.stanbol.entityhub.servicesapi.query.TextConstraint) ReferenceConstraint(org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint) QueryResultList(org.apache.stanbol.entityhub.servicesapi.query.QueryResultList) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet)

Example 3 with CorefFeature

use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.

From the class CorefFeatureSupport, method parse:

/**
 * Deserializes a {@link CorefFeature} from its JSON representation, resolving each
 * serialized mention back to a {@link Span} in the given {@link AnalysedText}.
 * Only {@code Chunk} and {@code Token} mention types are supported; other span
 * types are skipped.
 *
 * @param jCoref the JSON object holding the coref feature
 * @param at the analysed text the mentions refer to
 * @return the parsed {@link CorefFeature}
 * @throws IllegalStateException if the representative flag is not a boolean
 */
@Override
public CorefFeature parse(ObjectNode jCoref, AnalysedText at) {
    JsonNode jIsRepresentative = jCoref.path(IS_REPRESENTATIVE_TAG);
    if (!jIsRepresentative.isBoolean()) {
        throw new IllegalStateException("Field 'isRepresentative' must have a true/false format");
    }
    JsonNode node = jCoref.path(MENTIONS_TAG);
    Set<Span> mentions = new HashSet<Span>();
    if (node.isArray()) {
        ArrayNode jMentions = (ArrayNode) node;
        for (int i = 0; i < jMentions.size(); i++) {
            JsonNode member = jMentions.get(i);
            if (member.isObject()) {
                ObjectNode jMention = (ObjectNode) member;
                SpanTypeEnum spanType = SpanTypeEnum.valueOf(jMention.path(MENTION_TYPE_TAG).getTextValue());
                int spanStart = jMention.path(MENTION_START_TAG).asInt();
                int spanEnd = jMention.path(MENTION_END_TAG).asInt();
                Span mentionedSpan = null;
                switch(spanType) {
                    case Chunk:
                        mentionedSpan = at.addChunk(spanStart, spanEnd);
                        break;
                    case Sentence:
                    case Text:
                    case TextSection:
                        // Unsupported mention types: leave mentionedSpan null and skip.
                        break;
                    case Token:
                        mentionedSpan = at.addToken(spanStart, spanEnd);
                        break;
                }
                // BUGFIX: previously mentionedSpan was added unconditionally, so
                // unsupported span types inserted a null into the mentions set.
                if (mentionedSpan != null) {
                    mentions.add(mentionedSpan);
                }
            }
        }
    }
    return new CorefFeature(jIsRepresentative.asBoolean(), mentions);
}
Also used : CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) ObjectNode(org.codehaus.jackson.node.ObjectNode) JsonNode(org.codehaus.jackson.JsonNode) ArrayNode(org.codehaus.jackson.node.ArrayNode) Span(org.apache.stanbol.enhancer.nlp.model.Span) HashSet(java.util.HashSet)

Example 4 with CorefFeature

use of org.apache.stanbol.enhancer.nlp.coref.CorefFeature in project stanbol by apache.

From the class CorefFeatureSupportTest, method initCorefAnnotations:

/**
 * Seeds the shared {@code at} fixture with two sentences and a pair of
 * cross-referencing coref annotations: the token "Obama" (sentence 1) is the
 * representative mention, and the pronoun "He" (sentence 2) is its
 * subordinate mention pointing back at it.
 */
private static void initCorefAnnotations() {
    // Sentence 1 starts the text; "Obama" is its very first token.
    Sentence firstSentence = at.addSentence(0, sentenceText1.indexOf(".") + 1);
    Token obamaToken = firstSentence.addToken(0, "Obama".length());
    // Sentence 2 follows; locate and tokenize the pronoun "He" within it.
    Sentence secondSentence = at.addSentence(sentenceText1.indexOf(".") + 2, sentenceText2.indexOf(".") + 1);
    int pronounOffset = secondSentence.getSpan().indexOf("He");
    Token pronounToken = secondSentence.addToken(pronounOffset, pronounOffset + "He".length());
    // "Obama" is representative and mentions "He" ...
    Set<Span> representativeMentions = new HashSet<Span>();
    representativeMentions.add(pronounToken);
    obamaToken.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(true, representativeMentions)));
    // ... while "He" is subordinate and mentions "Obama".
    Set<Span> subordinateMentions = new HashSet<Span>();
    subordinateMentions.add(obamaToken);
    pronounToken.addAnnotation(NlpAnnotations.COREF_ANNOTATION, Value.value(new CorefFeature(false, subordinateMentions)));
}
Also used : CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Token(org.apache.stanbol.enhancer.nlp.model.Token) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Span(org.apache.stanbol.enhancer.nlp.model.Span) HashSet(java.util.HashSet)

Aggregations

CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature)4 HashSet (java.util.HashSet)3 Span (org.apache.stanbol.enhancer.nlp.model.Span)3 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)2 Token (org.apache.stanbol.enhancer.nlp.model.Token)2 List (java.util.List)1 Map (java.util.Map)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 NounPhrase (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase)1 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)1 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)1 SpanTypeEnum (org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum)1 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)1 PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)1 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)1