Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in project stanbol by apache.
The class CoreferenceFinder, method extractCorefs.
/**
 * Performs the actual coreference resolution by iterating through all the NERs and all the
 * {@link NounPhrase}s which appear after the given NER in the text. If any coreferences are found they are
 * written as {@link NlpAnnotation}s in the NER and noun phrase {@link Span}s.
 *
 * @param ners the NER {@link Span}s grouped by the number of the sentence they appear in
 * @param nounPhrases the {@link NounPhrase}s detected in the text
 * @param language the language of the analyzed text
 * @throws EngineException if the entity lookup fails
 */
public void extractCorefs(Map<Integer,List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException {
    for (Map.Entry<Integer,List<Span>> entry : ners.entrySet()) {
        int nerSentenceNo = entry.getKey();
        List<Span> nerSpans = entry.getValue();
        int maxDistance = this.config.getMaxDistance();
        for (Span ner : nerSpans) {
            Entity entity = null;
            Set<String> typeLabels = null;
            Set<Span> corefs = new HashSet<Span>();
            for (NounPhrase nounPhrase : nounPhrases) {
                int nounPhraseSentenceNo = nounPhrase.getSentenceNo();
                /*
                 * Only consider noun phrases that start after the NER and - unless no distance
                 * constraint is configured - lie at most maxDistance sentences further on.
                 */
                if (nounPhrase.getChunk().getStart() > ner.getStart()
                        && (maxDistance == Constants.MAX_DISTANCE_NO_CONSTRAINT
                            || (nounPhraseSentenceNo > nerSentenceNo
                                && nounPhraseSentenceNo - nerSentenceNo <= maxDistance))) {
                    if (entity == null) {
                        entity = lookupEntity(ner, language);
                        /*
                         * If the entity is still null there's nothing to do but go to the next ner.
                         */
                        if (entity == null)
                            break;
                        if (typeLabels == null) {
                            typeLabels = buildEntityTypeLabels(entity, language);
                        }
                    }
                    if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) {
                        Set<Span> coreferencedNer = new HashSet<Span>();
                        coreferencedNer.add(ner);
                        Span chunk = nounPhrase.getChunk();
                        chunk.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(false, coreferencedNer)));
                        corefs.add(chunk);
                    }
                }
            }
            if (corefs.size() > 0) {
                ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs)));
            }
        }
    }
}
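The inner condition above implements a sentence-distance window: a noun phrase is only a coreference candidate if it starts after the NER and, when a maximum distance is configured, occurs within that many sentences of it. Below is a minimal standalone sketch of that check, independent of the Stanbol types; the isCandidate method, its parameter names and the sentinel value -1 for MAX_DISTANCE_NO_CONSTRAINT are illustrative assumptions, not the engine's actual API.

// Minimal sketch of the sentence-distance candidate test (not the Stanbol API).
public class CorefCandidateCheck {

    /** Sentinel meaning "no sentence-distance constraint" (assumed value). */
    static final int MAX_DISTANCE_NO_CONSTRAINT = -1;

    /**
     * @param nerStart         character offset at which the NER starts
     * @param nerSentenceNo    sentence index of the NER
     * @param phraseStart      character offset at which the noun phrase starts
     * @param phraseSentenceNo sentence index of the noun phrase
     * @param maxDistance      maximum sentence distance, or MAX_DISTANCE_NO_CONSTRAINT
     */
    static boolean isCandidate(int nerStart, int nerSentenceNo,
                               int phraseStart, int phraseSentenceNo,
                               int maxDistance) {
        if (phraseStart <= nerStart) {
            return false; // only noun phrases that follow the NER in the text are considered
        }
        if (maxDistance == MAX_DISTANCE_NO_CONSTRAINT) {
            return true;  // no window configured: every following noun phrase qualifies
        }
        return phraseSentenceNo > nerSentenceNo
                && phraseSentenceNo - nerSentenceNo <= maxDistance;
    }

    public static void main(String[] args) {
        // NER in sentence 1 at offset 10; noun phrase at offset 120 in various sentences.
        System.out.println(isCandidate(10, 1, 120, 3, 2));                          // true  (distance 2 <= 2)
        System.out.println(isCandidate(10, 1, 120, 5, 2));                          // false (distance 4 > 2)
        System.out.println(isCandidate(10, 1, 120, 5, MAX_DISTANCE_NO_CONSTRAINT)); // true  (no constraint)
    }
}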
Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in project stanbol by apache.
The class EntityCoReferenceEngine, method extractNersAndNounPhrases.
/**
 * Extracts the NERs and the noun phrases from the given text and puts them into the given collections.
 *
 * @param ci the {@link ContentItem} whose analyzed text is processed
 * @param ners the map to fill with NER {@link Span}s, keyed by sentence number
 * @param nounPhrases the list to fill with the detected {@link NounPhrase}s
 */
private void extractNersAndNounPhrases(ContentItem ci, Map<Integer,List<Span>> ners, List<NounPhrase> nounPhrases) {
    AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
    Iterator<? extends Section> sections = at.getSentences();
    if (!sections.hasNext()) {
        // process as single sentence
        sections = Collections.singleton(at).iterator();
    }
    int sentenceCnt = 0;
    while (sections.hasNext()) {
        sentenceCnt++;
        Section section = sections.next();
        List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
        List<Span> sectionNers = new ArrayList<Span>();
        Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
        while (chunks.hasNext()) {
            Span chunk = chunks.next();
            Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            if (ner != null) {
                sectionNers.add(chunk);
            }
            Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
            if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
                sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
            }
        }
        for (NounPhrase nounPhrase : sectionNounPhrases) {
            Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
            while (tokens.hasNext()) {
                Span token = tokens.next();
                if (nounPhrase.containsSpan(token)) {
                    nounPhrase.addToken(token);
                }
            }
            for (Span sectionNer : sectionNers) {
                if (nounPhrase.containsSpan(sectionNer)) {
                    nounPhrase.addNerChunk(sectionNer);
                }
            }
        }
        nounPhrases.addAll(sectionNounPhrases);
        if (!sectionNers.isEmpty()) {
            ners.put(sentenceCnt, sectionNers);
        }
    }
}
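The method above walks each sentence once, collects NER chunks into a per-sentence map and attaches to every noun phrase the tokens and NER chunks whose offsets it covers. The following standalone sketch mirrors that bookkeeping with plain offset-based spans; the SimpleSpan record and its containment rule are illustrative assumptions and do not reproduce the Stanbol Span API.

// Minimal sketch of the per-sentence NER/noun-phrase bookkeeping (not the Stanbol API).
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class NerNounPhraseGrouping {

    /** A chunk or token reduced to its character offsets (illustrative stand-in for Span). */
    record SimpleSpan(int start, int end, String text) {
        boolean contains(SimpleSpan other) {
            return other.start >= start && other.end <= end;
        }
    }

    public static void main(String[] args) {
        // Sentence 1: one NER chunk that is also a noun phrase chunk, plus one of its tokens.
        SimpleSpan nerChunk = new SimpleSpan(0, 12, "Barack Obama");
        SimpleSpan nounPhrase = new SimpleSpan(0, 12, "Barack Obama");
        SimpleSpan token = new SimpleSpan(0, 6, "Barack");

        // Group NER chunks by the sentence they occur in.
        Map<Integer, List<SimpleSpan>> ners = new HashMap<>();
        int sentenceCnt = 1;
        ners.computeIfAbsent(sentenceCnt, k -> new ArrayList<>()).add(nerChunk);

        // Attach tokens and NER chunks that the noun phrase covers.
        List<SimpleSpan> nounPhraseTokens = new ArrayList<>();
        if (nounPhrase.contains(token)) {
            nounPhraseTokens.add(token);
        }
        List<SimpleSpan> nounPhraseNers = new ArrayList<>();
        for (SimpleSpan ner : ners.get(sentenceCnt)) {
            if (nounPhrase.contains(ner)) {
                nounPhraseNers.add(ner);
            }
        }

        System.out.println("NERs in sentence 1: " + ners.get(sentenceCnt));
        System.out.println("Tokens inside the noun phrase: " + nounPhraseTokens);
        System.out.println("NER chunks inside the noun phrase: " + nounPhraseNers);
    }
}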
Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase in project stanbol by apache.
The class NounPhraseFilterer, method filter.
/**
 * Filters out noun phrases which do not contain a determiner from the given config or do not have at
 * least {@code MIN_POS_NUMBER} noun/adjective tokens - TODO : should this be configurable to be able
 * to also include 1 word noun phrases?
 *
 * @param nounPhrases the {@link NounPhrase}s to filter in place
 * @param language the language of the text, used to select the determiner set
 */
public void filter(List<NounPhrase> nounPhrases, String language) {
    Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
    Iterator<NounPhrase> it = nounPhrases.iterator();
    while (it.hasNext()) {
        NounPhrase nounPhrase = it.next();
        boolean hasGoodDeterminer = false;
        short nounNo = 0;
        for (Span token : nounPhrase.getTokens()) {
            Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if (pos != null) {
                PosTag posTag = pos.value();
                if (posTag.hasCategory(LexicalCategory.Noun)
                        || posTag.hasCategory(LexicalCategory.Adjective)) {
                    nounNo++;
                }
                if (!hasGoodDeterminer && posTag.hasPos(Pos.Determiner)
                        && langDeterminerSet.contains(token.getSpan().toLowerCase())) {
                    hasGoodDeterminer = true;
                }
            }
        }
        if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) {
            it.remove();
        }
    }
}
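The filter keeps a noun phrase only if it contains a determiner from the language-specific set and enough noun or adjective tokens. Here is a minimal standalone sketch of that rule; the PosTagged record, the example determiner set and the value MIN_POS_NUMBER = 2 are illustrative assumptions, not the engine's configuration.

// Minimal sketch of the determiner/POS-count filter (not the Stanbol API).
import java.util.List;
import java.util.Set;

public class NounPhraseFilterSketch {

    enum Category { NOUN, ADJECTIVE, DETERMINER, OTHER }

    /** A token reduced to its surface form and coarse POS category. */
    record PosTagged(String text, Category category) {}

    static final int MIN_POS_NUMBER = 2; // assumed value

    static boolean keep(List<PosTagged> tokens, Set<String> determiners) {
        boolean hasGoodDeterminer = false;
        int nounNo = 0;
        for (PosTagged token : tokens) {
            if (token.category() == Category.NOUN || token.category() == Category.ADJECTIVE) {
                nounNo++; // count nouns and adjectives toward the minimum
            }
            if (!hasGoodDeterminer
                    && token.category() == Category.DETERMINER
                    && determiners.contains(token.text().toLowerCase())) {
                hasGoodDeterminer = true; // determiner from the language-specific set found
            }
        }
        return hasGoodDeterminer && nounNo >= MIN_POS_NUMBER;
    }

    public static void main(String[] args) {
        // Example within-text reference determiners for English (assumed set).
        Set<String> en = Set.of("this", "that", "these", "those");
        List<PosTagged> phrase = List.of(
                new PosTagged("this", Category.DETERMINER),
                new PosTagged("young", Category.ADJECTIVE),
                new PosTagged("senator", Category.NOUN));
        System.out.println(keep(phrase, en)); // true: determiner + 2 noun/adjective tokens
    }
}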