Search in sources :

Example 1 with NounPhrase

Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in the Apache Stanbol project.

In the class CoreferenceFinder, method extractCorefs.

/**
 * Performs the actual coreference resolution by iterating through all the NERs and, for each of them,
 * through all the {@link NounPhrase}s which start after that NER in the text and belong to a later
 * sentence. If any coreferences are found they are written as {@link NlpAnnotation}s in the NER and
 * noun phrase {@link Span}s.
 * <p>
 * The number of sentences between a NER and a candidate noun phrase is limited by the configured max
 * distance. When that value is {@link Constants#MAX_DISTANCE_NO_CONSTRAINT} no upper limit is applied
 * (previously this setting caused every noun phrase to be rejected).
 *
 * @param ners
 *            the NER spans, keyed by the number of the sentence they were found in
 * @param nounPhrases
 *            the candidate noun phrases; matching ones get a coreference annotation on their chunk
 * @param language
 *            the language handed to the entity lookup, the type label builder and the coreference check
 * @throws EngineException
 *             declared by this method; presumably raised by the entity lookup helpers - confirm
 */
public void extractCorefs(Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException {
    // The configuration does not change while iterating, so read it only once.
    int maxDistance = this.config.getMaxDistance();
    boolean unlimitedDistance = maxDistance == Constants.MAX_DISTANCE_NO_CONSTRAINT;
    for (Map.Entry<Integer, List<Span>> entry : ners.entrySet()) {
        int nerSentenceNo = entry.getKey();
        for (Span ner : entry.getValue()) {
            // The entity and its type labels are resolved lazily, on the first candidate noun phrase.
            Entity entity = null;
            Set<String> typeLabels = null;
            Set<Span> corefs = new HashSet<Span>();
            for (NounPhrase nounPhrase : nounPhrases) {
                Span chunk = nounPhrase.getChunk();
                int sentenceDistance = nounPhrase.getSentenceNo() - nerSentenceNo;
                // Candidate: starts after the NER, lies in a later sentence and - only if a
                // constraint is configured - is not further away than maxDistance sentences.
                boolean candidate = chunk.getStart() > ner.getStart() && sentenceDistance > 0
                        && (unlimitedDistance || sentenceDistance <= maxDistance);
                if (!candidate) {
                    continue;
                }
                if (entity == null) {
                    entity = lookupEntity(ner, language);
                    /*
                     * If the entity is still null there's nothing to do but go to the next ner.
                     */
                    if (entity == null) {
                        break;
                    }
                    typeLabels = buildEntityTypeLabels(entity, language);
                }
                if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) {
                    // The noun phrase points back to the NER (first CorefFeature argument false) ...
                    Set<Span> coreferencedNer = new HashSet<Span>();
                    coreferencedNer.add(ner);
                    chunk.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(false, coreferencedNer)));
                    corefs.add(chunk);
                }
            }
            // ... and the NER (first CorefFeature argument true) points to all of its noun phrases.
            if (!corefs.isEmpty()) {
                ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs)));
            }
        }
    }
}
Also used : Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) NounPhrase(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase) Span(org.apache.stanbol.enhancer.nlp.model.Span) Constraint(org.apache.stanbol.entityhub.servicesapi.query.Constraint) TextConstraint(org.apache.stanbol.entityhub.servicesapi.query.TextConstraint) ReferenceConstraint(org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint) QueryResultList(org.apache.stanbol.entityhub.servicesapi.query.QueryResultList) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet)

Example 2 with NounPhrase

Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in the Apache Stanbol project.

In the class EntityCoReferenceEngine, method extractNersAndNounPhrases.

/**
 * Collects, sentence by sentence, the NER chunks and the noun phrases of the analysed text belonging to
 * the given content item. Sentences are numbered starting with 1; a text without sentences is handled
 * as one single sentence.
 * <p>
 * Every noun phrase is additionally populated with the tokens and the NER chunks of its sentence that
 * it contains.
 *
 * @param ci
 *            the content item whose analysed text is read
 * @param ners
 *            output map: sentence number to the NER chunks of that sentence (only sentences having at
 *            least one NER get an entry)
 * @param nounPhrases
 *            output list receiving all noun phrases found
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer, List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText analysedText = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sentenceIt = analysedText.getSentences();
    if (!sentenceIt.hasNext()) {
        // no sentences available: treat the whole text as a single one
        sentenceIt = Collections.singleton(analysedText).iterator();
    }
    for (int sentenceNo = 1; sentenceIt.hasNext(); sentenceNo++) {
        Section sentence = sentenceIt.next();
        List<NounPhrase> phrasesInSentence = new ArrayList<NounPhrase>();
        List<Span> nersInSentence = new ArrayList<Span>();

        // First pass: classify the chunks of the sentence as NERs and/or noun phrases.
        for (Iterator<Span> chunkIt = sentence.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk)); chunkIt.hasNext();) {
            Span chunk = chunkIt.next();
            Value<NerTag> nerValue = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (nerValue != null) {
                nersInSentence.add(chunk);
            }
            Value<PhraseTag> phraseValue = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            boolean isNounPhrase = phraseValue != null && phraseValue.value().getCategory() == LexicalCategory.Noun;
            if (isNounPhrase) {
                phrasesInSentence.add(new NounPhrase(chunk, sentenceNo));
            }
        }

        // Second pass: all NERs of the sentence are known now, so the noun phrases can be populated.
        for (NounPhrase phrase : phrasesInSentence) {
            for (Iterator<Span> tokenIt = sentence.getEnclosed(EnumSet.of(SpanTypeEnum.Token)); tokenIt.hasNext();) {
                Span token = tokenIt.next();
                if (phrase.containsSpan(token)) {
                    phrase.addToken(token);
                }
            }
            for (Span nerChunk : nersInSentence) {
                if (phrase.containsSpan(nerChunk)) {
                    phrase.addNerChunk(nerChunk);
                }
            }
        }

        nounPhrases.addAll(phrasesInSentence);
        if (!nersInSentence.isEmpty()) {
            ners.put(sentenceNo, nersInSentence);
        }
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) NounPhrase(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase) ArrayList(java.util.ArrayList) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) Section(org.apache.stanbol.enhancer.nlp.model.Section) Span(org.apache.stanbol.enhancer.nlp.model.Span) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText)

Example 3 with NounPhrase

Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in the Apache Stanbol project.

In the class NounPhraseFilterer, method filter.

/**
 * Filters the given list in place: a noun phrase is kept only if it contains a determiner configured
 * for the given language AND has at least {@code MIN_POS_NUMBER} tokens tagged as noun or adjective.
 * All other noun phrases are removed. TODO : should the minimum be configurable to be able to also
 * include 1 word noun phrases?
 * <p>
 * If no determiners are configured for the language, no noun phrase can qualify and the list ends up
 * empty (instead of failing with a {@link NullPointerException}).
 *
 * @param nounPhrases
 *            the noun phrases to filter; modified by this method
 * @param language
 *            the language used to select the set of accepted determiners
 */
public void filter(List<NounPhrase> nounPhrases, String language) {
    Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
    if (langDeterminerSet == null) {
        // Unconfigured language: behave as if there were no accepted determiners.
        langDeterminerSet = java.util.Collections.emptySet();
    }
    Iterator<NounPhrase> it = nounPhrases.iterator();
    while (it.hasNext()) {
        NounPhrase nounPhrase = it.next();
        boolean hasGoodDeterminer = false;
        int nounNo = 0;
        for (Span token : nounPhrase.getTokens()) {
            Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if (pos == null) {
                // tokens without a POS annotation can neither be counted nor be a determiner
                continue;
            }
            PosTag posTag = pos.value();
            if (posTag.hasCategory(LexicalCategory.Noun) || posTag.hasCategory(LexicalCategory.Adjective)) {
                nounNo++;
            }
            // Lower-case independently of the JVM default locale so matching is deterministic.
            if (!hasGoodDeterminer && posTag.hasPos(Pos.Determiner)
                    && langDeterminerSet.contains(token.getSpan().toLowerCase(java.util.Locale.ROOT))) {
                hasGoodDeterminer = true;
            }
        }
        if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) {
            it.remove();
        }
    }
}
Also used : PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) NounPhrase(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase) Span(org.apache.stanbol.enhancer.nlp.model.Span)

Aggregations

NounPhrase (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase)3 Span (org.apache.stanbol.enhancer.nlp.model.Span)3 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 CorefFeature (org.apache.stanbol.enhancer.nlp.coref.CorefFeature)1 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)1 Section (org.apache.stanbol.enhancer.nlp.model.Section)1 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)1 PhraseTag (org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)1 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)1 Entity (org.apache.stanbol.entityhub.servicesapi.model.Entity)1 Constraint (org.apache.stanbol.entityhub.servicesapi.query.Constraint)1 QueryResultList (org.apache.stanbol.entityhub.servicesapi.query.QueryResultList)1 ReferenceConstraint (org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint)1 TextConstraint (org.apache.stanbol.entityhub.servicesapi.query.TextConstraint)1