Example 1 with PlaceAdjectival

Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival in the Apache Stanbol project.

From class CoreferenceFinder, method isCoreferent:

/**
 * Performs the coreference matching rules: 1. Match the entity type. 2. If the {@link NounPhrase}
 * contains any NERs, match the NER to any spatial / org membership / functional Entity properties from the
 * {@link Site}. 3. If the {@link NounPhrase} contains any place adjectivals, perform spatial co-reference
 * based on the entity's spatial properties.
 *
 * @param typeLabels
 *            - a list of types (classes) that the given entity has.
 * @param entity
 *            - the entity for which we want to do the coref.
 * @param ner
 *            - the ner in the text for which we want to do the coref.
 * @param nounPhrase
 *            - the {@link NounPhrase} which we want to test for coref.
 * @param language
 *            - the language of the text.
 * @return true if the entity is considered coreferent with the noun phrase, false otherwise.
 * @throws EngineException
 */
private boolean isCoreferent(Set<String> typeLabels, Entity entity, Span ner, NounPhrase nounPhrase, String language) throws EngineException {
    /*
     * 1. Try to match the entity class to the noun phrase.
     */
    String matchedClass = null;
    String nounPhraseText = nounPhrase.getChunk().getSpan().toLowerCase();
    int classStart = 0;
    int classEnd = 0;
    for (String label : typeLabels) {
        if (nounPhraseText.matches(".*\\b" + label + "\\b.*") && (matchedClass == null || label.split("\\s").length > matchedClass.split("\\s").length)) {
            matchedClass = label;
            classStart = nounPhrase.getChunk().getStart() + nounPhraseText.indexOf(label);
            classEnd = classStart + label.length();
        }
    }
    if (matchedClass == null)
        return false;
    /*
     * TODO - devise a coref confidence scheme?
     */
    if (nounPhrase.hasNers()) {
        List<Span> npNers = nounPhrase.getNerChunks();
        IRI nerType = ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
        for (Span npNer : npNers) {
            /*
             * Don't go any further if for some reason it turns out that the ner text is the same as the
             * entity class text.
             */
            if ((npNer.getStart() >= classStart && npNer.getStart() <= classEnd) || (npNer.getEnd() >= classStart && npNer.getEnd() <= classEnd))
                continue;
            Entity npEntity = lookupEntity(npNer, language);
            if (npEntity != null) {
                IRI npNerType = npNer.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
                Set<String> rulesOntologyAttr = new HashSet<String>();
                if (OntologicalClasses.DBPEDIA_PLACE.equals(npNerType)) {
                    rulesOntologyAttr = this.config.getSpatialAttributes(nerType);
                } else if (OntologicalClasses.DBPEDIA_ORGANISATION.equals(npNerType)) {
                    rulesOntologyAttr = this.config.getOrgMembershipAttributes(nerType);
                }
                if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, npEntity.getId())) {
                    return true;
                }
            }
        }
    }
    /*
     * 3. Detect any place adjectivals in noun phrases and use them for spatial coreference. Any place
     * adjectivals found should be separate words from the class matches from point 1.
     */
    PlaceAdjectival placeAdjectival = this.dictionaries.findPlaceAdjectival(language, nounPhrase);
    if (placeAdjectival != null && (placeAdjectival.getEnd() < classStart || placeAdjectival.getStart() > classEnd)) {
        /*
         * We use the same spatial rules ontology attributes as before.
         */
        Set<String> rulesOntologyAttr = this.config.getSpatialAttributes(ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType());
        if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, placeAdjectival.getPlaceUri().getUnicodeString())) {
            return true;
        }
    }
    /*
     * If there was no additional info to do the coref, but the matched entity class has more than one
     * word, we consider this a good enough coreference.
     */
    if (matchedClass.split("\\s").length > 1)
        return true;
    return false;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) PlaceAdjectival(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival) Span(org.apache.stanbol.enhancer.nlp.model.Span) Constraint(org.apache.stanbol.entityhub.servicesapi.query.Constraint) TextConstraint(org.apache.stanbol.entityhub.servicesapi.query.TextConstraint) ReferenceConstraint(org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint) HashSet(java.util.HashSet)
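
The core of rule 1 is choosing the longest type label that appears as a whole word or phrase inside the noun phrase text. The following stand-alone sketch illustrates just that step; the class name, sample labels, and sample noun phrase are hypothetical, and the Pattern.quote call is an added safety measure that the original code does not use.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

// Hypothetical stand-alone sketch of the "match the longest type label" step
// from isCoreferent() above. Only the matching idea is taken from the original.
public class ClassMatchSketch {

    public static void main(String[] args) {
        String nounPhraseText = "the american automobile manufacturer";
        List<String> typeLabels = Arrays.asList("company", "automobile manufacturer", "manufacturer");

        String matchedClass = null;
        for (String label : typeLabels) {
            // \b anchors restrict the match to whole words; Pattern.quote guards
            // against regex metacharacters in labels (an addition, not in the original).
            boolean found = Pattern
                    .compile(".*\\b" + Pattern.quote(label) + "\\b.*")
                    .matcher(nounPhraseText)
                    .matches();
            // Prefer the label with the most words, exactly as the original loop does.
            if (found && (matchedClass == null
                    || label.split("\\s").length > matchedClass.split("\\s").length)) {
                matchedClass = label;
            }
        }
        // Prints "automobile manufacturer": the two-word label wins over "manufacturer".
        System.out.println("matched class: " + matchedClass);
    }
}

If no label matches, isCoreferent gives up immediately; when one does match, its character offsets within the chunk are recorded so that the later NER and place-adjectival checks can be required to fall outside the matched class text.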

Example 2 with PlaceAdjectival

Use of org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival in the Apache Stanbol project.

From class Dictionaries, method findPlaceAdjectival:

/**
 * Checks whether a {@link NounPhrase} contains a place adjectival and returns it.
 *
 * @param language
 *            - the language of the text.
 * @param nounPhrase
 *            - the {@link NounPhrase} to inspect.
 * @return the {@link PlaceAdjectival} if the {@link NounPhrase} contains one, or null if not.
 */
public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) {
    List<Span> tokens = nounPhrase.getTokens();
    Map<String, IRI> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language);
    /*
     * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map. 2-grams
     * should be good enough since there are no 3-gram places at least from what I saw.
     */
    for (int i = 0; i < tokens.size(); i++) {
        Span currentToken = tokens.get(i);
        String currentTokenString = currentToken.getSpan().toLowerCase();
        // First the current 1-gram
        if (langPlaceAdjectivalsMap.containsKey(currentTokenString)) {
            return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), langPlaceAdjectivalsMap.get(currentTokenString));
        }
        // Then use the 2-gram with the token before it
        StringBuilder concatTokens = new StringBuilder();
        String concatTokensString = null;
        if (i > 0) {
            Span previousToken = tokens.get(i - 1);
            String previousTokenString = previousToken.getSpan().toLowerCase();
            concatTokens = new StringBuilder();
            concatTokens.append(previousTokenString);
            concatTokens.append(" ");
            concatTokens.append(currentTokenString);
            concatTokensString = concatTokens.toString();
            if (langPlaceAdjectivalsMap.containsKey(concatTokensString.toLowerCase())) {
                return new PlaceAdjectival(previousToken.getStart(), currentToken.getEnd(), langPlaceAdjectivalsMap.get(concatTokensString));
            }
        }
        // Now use the 2-gram with the token after it
        if (i < tokens.size() - 1) {
            Span nextToken = tokens.get(i + 1);
            String nextTokenString = nextToken.getSpan().toLowerCase();
            concatTokens = new StringBuilder();
            concatTokens.append(currentTokenString);
            concatTokens.append(" ");
            concatTokens.append(nextTokenString);
            concatTokensString = concatTokens.toString();
            if (langPlaceAdjectivalsMap.containsKey(concatTokens.toString())) {
                return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), langPlaceAdjectivalsMap.get(concatTokensString));
            }
        }
    }
    return null;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlaceAdjectival(org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival) Span(org.apache.stanbol.enhancer.nlp.model.Span)
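
Stripped of Stanbol's Span and IRI types, the lookup is a dictionary check over 1-grams and 2-grams of the noun phrase tokens. The sketch below shows the idea with plain strings; for brevity it only forms the 2-gram with the following token (the original also checks the preceding one), and the dictionary entries are hypothetical.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical sketch of the 1-gram / 2-gram place adjectival lookup, with
// plain strings standing in for Stanbol's Span and IRI types.
public class PlaceAdjectivalLookupSketch {

    public static void main(String[] args) {
        // Assumed dictionary contents: place adjectival -> place URI.
        Map<String, String> placeAdjectivals = new HashMap<>();
        placeAdjectivals.put("french", "http://dbpedia.org/resource/France");
        placeAdjectivals.put("new zealand", "http://dbpedia.org/resource/New_Zealand");

        List<String> tokens = Arrays.asList("The", "New", "Zealand", "striker");
        // Prints the New Zealand URI: the 2-gram "new zealand" hits the dictionary.
        System.out.println(lookup(tokens, placeAdjectivals));
    }

    // Returns the URI of the first 1-gram or 2-gram found in the dictionary, or null.
    static String lookup(List<String> tokens, Map<String, String> dict) {
        for (int i = 0; i < tokens.size(); i++) {
            String current = tokens.get(i).toLowerCase();
            // 1-gram: the token on its own.
            if (dict.containsKey(current)) {
                return dict.get(current);
            }
            // 2-gram: the token together with the one that follows it.
            if (i < tokens.size() - 1) {
                String bigram = current + " " + tokens.get(i + 1).toLowerCase();
                if (dict.containsKey(bigram)) {
                    return dict.get(bigram);
                }
            }
        }
        return null;
    }
}

As in the original, matching is made case-insensitive by lowercasing the tokens before probing the map.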

Aggregations

IRI (org.apache.clerezza.commons.rdf.IRI): 2
PlaceAdjectival (org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival): 2
Span (org.apache.stanbol.enhancer.nlp.model.Span): 2
HashSet (java.util.HashSet): 1
Entity (org.apache.stanbol.entityhub.servicesapi.model.Entity): 1
Constraint (org.apache.stanbol.entityhub.servicesapi.query.Constraint): 1
ReferenceConstraint (org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint): 1
TextConstraint (org.apache.stanbol.entityhub.servicesapi.query.TextConstraint): 1