use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival in project stanbol by apache.
the class CoreferenceFinder method isCoreferent.
/**
 * Decides whether the given {@link NounPhrase} is a coreference of the given entity by applying the
 * following rules in order:
 * <ol>
 * <li>Match the entity type: at least one of the type labels must occur as a whole word (or sequence
 * of words) in the lower-cased noun phrase text. The label with the most words wins. If no label
 * matches, the noun phrase is not a coreference.</li>
 * <li>If the {@link NounPhrase} contains any NERs, look each of them up and test whether the looked-up
 * entity is referenced from the spatial / organisation-membership attributes of the given entity.
 * NERs that overlap the matched class text are ignored.</li>
 * <li>If the {@link NounPhrase} contains a place adjectival that does not overlap the matched class
 * text, perform spatial coreference based on the entity's spatial attributes.</li>
 * <li>Otherwise accept the coreference only if the matched class label has more than one word.</li>
 * </ol>
 *
 * @param typeLabels
 *            - the set of type (class) labels that the given entity has. Labels are matched literally
 *            against the lower-cased noun phrase text; null or empty labels are ignored.
 * @param entity
 *            - the entity for which we want to do the coref.
 * @param ner
 *            - the ner in the text for which we want to do the coref. Must carry a
 *            {@code NlpAnnotations.NER_ANNOTATION}.
 * @param nounPhrase
 *            - the {@link NounPhrase} which we want to test for coref.
 * @param language
 *            - the language of the text.
 * @return {@code true} if the noun phrase is considered a coreference of the entity, {@code false}
 *         otherwise.
 * @throws EngineException
 *             presumably propagated from the entity lookup of the noun phrase NERs - confirm against
 *             {@code lookupEntity}.
 */
private boolean isCoreferent(Set<String> typeLabels, Entity entity, Span ner, NounPhrase nounPhrase, String language) throws EngineException {
    /*
     * 1. Try to match the entity class to the noun phrase.
     */
    String matchedClass = null;
    String nounPhraseText = nounPhrase.getChunk().getSpan().toLowerCase();
    int classStart = 0;
    int classEnd = 0;
    for (String label : typeLabels) {
        // An empty label would match at any word boundary and a null label cannot be matched at all.
        if (label == null || label.isEmpty()) {
            continue;
        }
        // Only a label with more words than the current best match can replace it.
        if (matchedClass != null && label.split("\\s").length <= matchedClass.split("\\s").length) {
            continue;
        }
        /*
         * The label is plain text, not a regular expression, so it is quoted before being embedded in
         * the pattern. Fully qualified names are used so that this method does not depend on
         * additional imports.
         */
        java.util.regex.Matcher labelMatcher = java.util.regex.Pattern
                .compile("\\b" + java.util.regex.Pattern.quote(label) + "\\b")
                .matcher(nounPhraseText);
        if (labelMatcher.find()) {
            matchedClass = label;
            // Use the offset of the word-bounded match; a plain indexOf could hit an earlier
            // occurrence of the label inside another word.
            classStart = nounPhrase.getChunk().getStart() + labelMatcher.start();
            classEnd = classStart + label.length();
        }
    }
    if (matchedClass == null) {
        return false;
    }
    /*
     * TODO - devise a coref confidence scheme?
     */
    /*
     * 2. Match the NERs of the noun phrase against the spatial / org membership attributes of the
     * entity.
     */
    if (nounPhrase.hasNers()) {
        List<Span> npNers = nounPhrase.getNerChunks();
        IRI nerType = ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
        for (Span npNer : npNers) {
            /*
             * Don't go any further if for some reason it turns out that the ner text is the same as
             * (or overlaps with, or encloses) the entity class text. This is the same interval
             * overlap test that is applied to place adjectivals below.
             */
            if (npNer.getStart() <= classEnd && npNer.getEnd() >= classStart) {
                continue;
            }
            Entity npEntity = lookupEntity(npNer, language);
            if (npEntity != null) {
                IRI npNerType = npNer.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
                // The type of the NER inside the noun phrase selects which rule attributes apply;
                // for any other type the attribute set stays empty.
                Set<String> rulesOntologyAttr = new HashSet<String>();
                if (OntologicalClasses.DBPEDIA_PLACE.equals(npNerType)) {
                    rulesOntologyAttr = this.config.getSpatialAttributes(nerType);
                } else if (OntologicalClasses.DBPEDIA_ORGANISATION.equals(npNerType)) {
                    rulesOntologyAttr = this.config.getOrgMembershipAttributes(nerType);
                }
                if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, npEntity.getId())) {
                    return true;
                }
            }
        }
    }
    /*
     * 3. Detect any place adjectivals in noun phrases and use them for spatial coreference. Any place
     * adjectivals found should be separate words from the class matches from point 1.
     */
    PlaceAdjectival placeAdjectival = this.dictionaries.findPlaceAdjectival(language, nounPhrase);
    if (placeAdjectival != null && (placeAdjectival.getEnd() < classStart || placeAdjectival.getStart() > classEnd)) {
        /*
         * We use the same spatial rules ontology attributes as before.
         */
        Set<String> rulesOntologyAttr = this.config.getSpatialAttributes(ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType());
        if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, placeAdjectival.getPlaceUri().getUnicodeString())) {
            return true;
        }
    }
    /*
     * 4. If there was no additional info to do the coref and if the entity class matched and has more
     * than 1 word then we consider this a good enough coreference.
     */
    return matchedClass.split("\\s").length > 1;
}
use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival in project stanbol by apache.
the class Dictionaries method findPlaceAdjectival.
/**
 * Checks whether a {@link NounPhrase} contains a place adjectival and returns it.
 * <p>
 * The tokens of the noun phrase are scanned from left to right. For each token the lower-cased
 * 1-gram is looked up first, then the lower-cased 2-gram formed with the following token (joined by
 * a single space). The first hit is returned.
 *
 * @param language
 *            the language whose place adjectivals dictionary is used.
 * @param nounPhrase
 *            the noun phrase whose tokens are searched.
 * @return the {@link PlaceAdjectival} (start and end offsets of the matched token(s) plus the mapped
 *         place IRI) if the {@link NounPhrase} contains one, or null if not or if there is no
 *         dictionary for the given language.
 */
public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) {
    List<Span> tokens = nounPhrase.getTokens();
    Map<String, IRI> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language);
    // No dictionary for this language: there is nothing to match against.
    if (langPlaceAdjectivalsMap == null) {
        return null;
    }
    /*
     * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map. 2-grams
     * should be good enough since there are no 3-gram places at least from what I saw.
     *
     * NOTE(review): a 1-gram hit on a token returns before the 2-gram starting at that same token is
     * tried - confirm that this precedence is intended.
     */
    for (int i = 0; i < tokens.size(); i++) {
        Span currentToken = tokens.get(i);
        String currentTokenString = currentToken.getSpan().toLowerCase();
        // First the current 1-gram
        if (langPlaceAdjectivalsMap.containsKey(currentTokenString)) {
            return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), langPlaceAdjectivalsMap.get(currentTokenString));
        }
        /*
         * Then the 2-gram with the token after it. The 2-gram with the token before it does not need
         * a separate check: that pair was already tested as the "token after" 2-gram of the previous
         * iteration, which would have returned on a hit.
         */
        if (i < tokens.size() - 1) {
            Span nextToken = tokens.get(i + 1);
            String twoGram = currentTokenString + " " + nextToken.getSpan().toLowerCase();
            if (langPlaceAdjectivalsMap.containsKey(twoGram)) {
                return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), langPlaceAdjectivalsMap.get(twoGram));
            }
        }
    }
    return null;
}
Aggregations