Search in sources :

Example 1 with LinkedEntity

use of org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity in project stanbol by apache.

the class EntityCoMentionEngine method writeComentions.

/**
 * Writes the parsed co-mentions to the metadata graph of the content item.
 * <p>
 * For every occurrence of a co-mention an existing fise:TextAnnotation sharing
 * the start or the end offset is looked up. Occurrences that are covered by a
 * bigger existing TextAnnotation are ignored. Otherwise the existing
 * TextAnnotation is reused (this engine is added as contributor) or a new one
 * is created. The suggestions of the initial mention(s) are then linked to the
 * TextAnnotation of the co-mention, the confidence of other existing
 * suggestions is scaled by {@code confidenceAdjustmentFactor} (at most once per
 * suggestion and only if the factor is smaller than 1) and finally the
 * dc:type and confidence values of the TextAnnotation are adapted.
 *
 * @param ci the content item; its metadata graph is read and modified
 * @param comentions the co-mentions to write. Suggestions whose entity URI is
 *        contained in {@code textAnnotations} are treated as initial mentions
 * @param language the language used for the selected text and selection
 *        context literals; {@code null} or empty for no language
 * @param textAnnotations the IRIs of the known fise:TextAnnotations. Newly
 *        created TextAnnotations are added to this set
 */
private void writeComentions(ContentItem ci, Collection<LinkedEntity> comentions, String language, Set<IRI> textAnnotations) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    // we MUST adjust the confidence level of existing annotations only once
    // so we need to keep track of those
    Set<BlankNodeOrIRI> adjustedSuggestions = new HashSet<BlankNodeOrIRI>();
    log.debug("Write Co-Mentions:");
    for (LinkedEntity comention : comentions) {
        log.debug(" > {}", comention);
        // URIs of TextAnnotations for the initial mention of this co-mention
        Collection<IRI> initialMentions = new ArrayList<IRI>(comention.getSuggestions().size());
        for (Suggestion suggestion : comention.getSuggestions()) {
            Entity entity = suggestion.getEntity();
            if (textAnnotations.contains(entity.getUri())) {
                // this is a textAnnotation
                initialMentions.add(entity.getUri());
            }
            // else TODO support also Entities!!
        }
        // create the TextAnnotations for the co-mention
        for (Occurrence occurrence : comention.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            // set to true if an existing TextAnnotation selects a bigger span
            boolean ignore = false;
            IRI textAnnotation = null;
            // search for textAnnotations with the same start
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
                if (end != null && textAnnotations.contains(t.getSubject())) {
                    // safe cast: textAnnotations only contains IRIs
                    textAnnotation = (IRI) t.getSubject();
                    if (end > occurrence.getEnd()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            // search for textAnnotations with the same end
            it = metadata.filter(null, ENHANCER_END, endLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
                if (start != null && textAnnotations.contains(t.getSubject())) {
                    // safe cast: textAnnotations only contains IRIs
                    textAnnotation = (IRI) t.getSubject();
                    if (start < occurrence.getStart()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            if (!ignore) {
                // collect confidence values of co-mentions
                // maximum confidence of suggestions of the initial mention
                Double maxConfidence = null;
                // maximum confidence of existing suggestions
                Double maxExistingConfidence = null;
                if (textAnnotation == null) {
                    // not found ... create a new TextAnnotation for the co-mention
                    textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    // add it to the set of TextAnnotations
                    textAnnotations.add(textAnnotation);
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                } else {
                    // if existing add this engine as contributor
                    metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
                }
                // now process initial mention(s) for the co-mention
                Set<IRI> dcTypes = new HashSet<IRI>();
                for (IRI initialMention : initialMentions) {
                    // get the dc:type(s) of the initial mentions
                    Iterator<IRI> dcTypesIt = getReferences(metadata, initialMention, DC_TYPE);
                    while (dcTypesIt.hasNext()) {
                        dcTypes.add(dcTypesIt.next());
                    }
                    // check confidence of the initial mention (fise:TextAnnotation)
                    Double mentionConfidence = EnhancementEngineHelper.get(metadata, initialMention, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                    if (mentionConfidence != null) {
                        if (maxConfidence == null) {
                            maxConfidence = mentionConfidence;
                        } else if (maxConfidence.compareTo(mentionConfidence) <= 0) {
                            maxConfidence = mentionConfidence;
                        }
                    }
                    // else nothing to do
                    // now we need to compare the suggestions of the initial
                    // mention(s) with the existing one.
                    // Get information about the suggestions of the initial mention
                    // (suggestion -> confidence and suggested entity -> suggestion)
                    Map<RDFTerm, Double> initialSuggestions = new HashMap<RDFTerm, Double>();
                    Map<RDFTerm, RDFTerm> initialSuggestedEntities = new HashMap<RDFTerm, RDFTerm>();
                    for (Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext(); ) {
                        // always consume the triple; test the subject (not the
                        // iterator) against the known TextAnnotations
                        BlankNodeOrIRI suggestion = suggestions.next().getSubject();
                        if (!textAnnotations.contains(suggestion)) {
                            RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, suggestion, ENHANCER_ENTITY_REFERENCE);
                            if (suggestedEntity != null) {
                                // it has a suggestion
                                Double confidence = EnhancementEngineHelper.get(metadata, suggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                if (maxConfidence == null) {
                                    maxConfidence = confidence;
                                } else if (confidence != null && maxConfidence.compareTo(confidence) <= 0) {
                                    // the confidence of the suggestion (not the one of
                                    // the initial mention) needs to be considered here
                                    maxConfidence = confidence;
                                }
                                // else nothing to do
                                initialSuggestions.put(suggestion, confidence);
                                initialSuggestedEntities.put(suggestedEntity, suggestion);
                            }
                            // else no suggestion (dc:relation to some other resource)
                        }
                        // else ignore dc:relation to other fise:TextAnnotations
                    }
                    // now we collect existing Suggestions for this TextAnnotation where we need
                    // to adjust the confidence (quite some things to check ....)
                    Map<BlankNodeOrIRI, Double> existingSuggestions = new HashMap<BlankNodeOrIRI, Double>();
                    if (maxConfidence != null && confidenceAdjustmentFactor < 1) {
                        // suggestions are defined by incoming dc:relation
                        for (Iterator<Triple> esIt = metadata.filter(null, DC_RELATION, textAnnotation); esIt.hasNext(); ) {
                            BlankNodeOrIRI existingSuggestion = esIt.next().getSubject();
                            // but not all of them are suggestions
                            if (!textAnnotations.contains(existingSuggestion)) {
                                // ignore fise:TextAnnotations
                                Double existingConfidence = EnhancementEngineHelper.get(metadata, existingSuggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                // check if this suggestion is also suggested for the initial mention
                                if (!initialSuggestions.containsKey(existingSuggestion)) {
                                    RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, existingSuggestion, ENHANCER_ENTITY_REFERENCE);
                                    // check if the same entity is suggested for the initial mention
                                    if (!initialSuggestedEntities.containsKey(suggestedEntity)) {
                                        // finally make sure that we adjust confidences only once
                                        if (!adjustedSuggestions.contains(existingSuggestion)) {
                                            existingSuggestions.put(existingSuggestion, existingConfidence);
                                        }
                                        // else confidence already adjusted
                                    } else {
                                        // different fise:EntityAnnotation, but same reference Entity
                                        // we need to check confidences to decide what to do
                                        RDFTerm initialSuggestion = initialSuggestedEntities.get(suggestedEntity);
                                        Double initialConfidence = initialSuggestions.get(initialSuggestion);
                                        if (initialConfidence == null || (existingConfidence != null && existingConfidence.compareTo(initialConfidence) >= 0)) {
                                            // existing confidence >= initial .. keep existing
                                            initialSuggestions.remove(initialSuggestion);
                                            if (maxExistingConfidence == null) {
                                                maxExistingConfidence = existingConfidence;
                                            } else if (existingConfidence != null && maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                                // existingConfidence may be null if initialConfidence is null
                                                maxExistingConfidence = existingConfidence;
                                            }
                                        } else {
                                            // adjust this one (if not yet adjusted)
                                            if (!adjustedSuggestions.contains(existingSuggestion)) {
                                                existingSuggestions.put(existingSuggestion, existingConfidence);
                                            }
                                        }
                                    }
                                } else {
                                    // a initial mention already present
                                    // no need to process initial mention
                                    initialSuggestions.remove(existingSuggestion);
                                    if (maxExistingConfidence == null) {
                                        maxExistingConfidence = existingConfidence;
                                    } else if (existingConfidence != null && maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                        maxExistingConfidence = existingConfidence;
                                    }
                                    // else maxExistingConfidence == null (undefined)
                                }
                            }
                            // else ignore dc:relations to other fise:TextAnnotations
                        }
                        // the graph is only modified after the iteration over it has finished
                        for (Entry<BlankNodeOrIRI, Double> entry : existingSuggestions.entrySet()) {
                            if (entry.getValue() != null) {
                                double adjustedConfidence = entry.getValue() * confidenceAdjustmentFactor;
                                if (maxExistingConfidence == null || adjustedConfidence > maxExistingConfidence) {
                                    maxExistingConfidence = adjustedConfidence;
                                }
                                EnhancementEngineHelper.set(metadata, entry.getKey(), ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
                                // mark as adjusted
                                adjustedSuggestions.add(entry.getKey());
                            }
                        }
                    }
                    // add the suggestions of the initial mention to this one
                    for (RDFTerm suggestion : initialSuggestions.keySet()) {
                        metadata.add(new TripleImpl((BlankNodeOrIRI) suggestion, DC_RELATION, textAnnotation));
                    }
                    // finally link the co-mention with the initial one
                    metadata.add(new TripleImpl(textAnnotation, DC_RELATION, initialMention));
                }
                // Adapt the dc:type values of the fise:TextAnnotation
                // - if Suggestions added by this engine do have the max confidence
                //   use the dc:type values of the initial mention
                // - if the original suggestions do have a higher confidence keep the
                //   existing
                // - in case both do have the same confidence we add all dc:types
                // Existing dc:types are therefore only removed if the co-mention has a
                // strictly higher confidence, while the dc:types of the co-mention are
                // added if its confidence is higher or equal.
                boolean removeExistingDcTypes = maxConfidence != null && (maxExistingConfidence == null || maxConfidence.compareTo(maxExistingConfidence) > 0);
                boolean addCoMentionDcTypes = maxExistingConfidence == null || (maxConfidence != null && maxConfidence.compareTo(maxExistingConfidence) >= 0);
                Iterator<IRI> existingDcTypesIt = getReferences(metadata, textAnnotation, DC_TYPE);
                while (existingDcTypesIt.hasNext()) {
                    // dc:types already present do not need to be added later on, so
                    // they are always removed from the set of dc:types to add
                    boolean sharedWithCoMention = dcTypes.remove(existingDcTypesIt.next());
                    if (removeExistingDcTypes && (!sharedWithCoMention || !addCoMentionDcTypes)) {
                        // remove the dcType
                        existingDcTypesIt.remove();
                    }
                }
                if (addCoMentionDcTypes) {
                    for (IRI dcType : dcTypes) {
                        // add missing
                        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, dcType));
                    }
                }
                // TODO: support also Entities
                if (maxConfidence != null) {
                    // set the confidence value (if known)
                    EnhancementEngineHelper.set(metadata, textAnnotation, ENHANCER_CONFIDENCE, maxConfidence, literalFactory);
                }
            }
            // else ignore this occurrence
        }
    }
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Suggestion(org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Occurrence(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence) HashSet(java.util.HashSet) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph)

Example 2 with LinkedEntity

use of org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity in project stanbol by apache.

the class EntityLinkingEngine method writeEnhancements.

/**
 * Writes the enhancements for the parsed {@link LinkedEntity LinkedEntities}
 * to the metadata graph of the parsed ContentItem.
 * <p>
 * For every occurrence of a linked entity a fise:TextAnnotation with the same
 * start and end is reused (this engine is added as contributor) or newly
 * created. For every suggestion an entity annotation is created and related
 * to all TextAnnotations of the linked entity.
 *
 * @param ci the content item; its metadata graph is modified
 * @param linkedEntities the linked entities to write
 * @param language the language used for text literals and for selecting the
 *        label of suggestions; {@code null} or empty for no language
 * @param writeRankings if {@code true} the ranking of suggested entities is
 *        written (if available)
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language, boolean writeRankings) {
    final Language literalLanguage = (language == null || language.isEmpty()) ? null : new Language(language);
    // URIs of entities whose RDF data were already copied to the metadata
    final Set<IRI> copiedEntities = new HashSet<IRI>();
    final Graph graph = ci.getMetadata();
    for (LinkedEntity linked : linkedEntities) {
        Collection<IRI> occurrenceAnnotations = new ArrayList<IRI>(linked.getOccurrences().size());
        // step 1: one TextAnnotation per Occurrence
        for (Occurrence occ : linked.getOccurrences()) {
            Literal startValue = literalFactory.createTypedLiteral(occ.getStart());
            Literal endValue = literalFactory.createTypedLiteral(occ.getEnd());
            // look for an existing TextAnnotation selecting exactly this span
            IRI annotation = null;
            for (Iterator<Triple> sameStart = graph.filter(null, ENHANCER_START, startValue); annotation == null && sameStart.hasNext(); ) {
                Triple candidate = sameStart.next();
                boolean sameEnd = graph.filter(candidate.getSubject(), ENHANCER_END, endValue).hasNext();
                if (sameEnd && graph.filter(candidate.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                    annotation = (IRI) candidate.getSubject();
                }
            }
            if (annotation != null) {
                // reuse the existing one and register this engine as contributor
                graph.add(new TripleImpl(annotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
            } else {
                // none present ... create and describe a new one
                annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                graph.add(new TripleImpl(annotation, Properties.ENHANCER_START, startValue));
                graph.add(new TripleImpl(annotation, Properties.ENHANCER_END, endValue));
                graph.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occ.getContext(), literalLanguage)));
                graph.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occ.getSelectedText(), literalLanguage)));
                graph.add(new TripleImpl(annotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(linked.getScore())));
            }
            // dc:types are written for new as well as for existing annotations
            for (IRI type : linked.getTypes()) {
                graph.add(new TripleImpl(annotation, Properties.DC_TYPE, type));
            }
            occurrenceAnnotations.add(annotation);
        }
        // step 2: one EntityAnnotation per Suggestion
        for (Suggestion candidate : linked.getSuggestions()) {
            IRI entityEnhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // NOTE: this currently uses the best label for the configured name
            // field; searching the representation would be an alternative
            Literal bestLabel = candidate.getBestLabel(linkerConfig.getNameField(), language);
            Entity suggested = candidate.getEntity();
            graph.add(new TripleImpl(entityEnhancement, Properties.ENHANCER_ENTITY_LABEL, bestLabel));
            graph.add(new TripleImpl(entityEnhancement, ENHANCER_ENTITY_REFERENCE, suggested.getUri()));
            for (Iterator<IRI> types = suggested.getReferences(linkerConfig.getTypeField()); types.hasNext(); ) {
                graph.add(new TripleImpl(entityEnhancement, Properties.ENHANCER_ENTITY_TYPE, types.next()));
            }
            graph.add(new TripleImpl(entityEnhancement, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(candidate.getScore())));
            // relate the suggestion to all TextAnnotations of the linked entity
            for (IRI annotation : occurrenceAnnotations) {
                graph.add(new TripleImpl(entityEnhancement, Properties.DC_RELATION, annotation));
            }
            // origin information provided by the EntitySearcher
            for (Entry<IRI, Collection<RDFTerm>> origin : entitySearcher.getOriginInformation().entrySet()) {
                IRI originProperty = origin.getKey();
                for (RDFTerm originValue : origin.getValue()) {
                    graph.add(new TripleImpl(entityEnhancement, originProperty, originValue));
                }
            }
            if (writeRankings) {
                Float entityRanking = candidate.getEntity().getEntityRanking();
                if (entityRanking != null) {
                    // the float ranking is written as xsd:double
                    graph.add(new TripleImpl(entityEnhancement, ENHANCER_ENTITY_RANKING, new TypedLiteralImpl(entityRanking.toString(), XSD_DOUBLE)));
                }
            }
            // copy the RDF data of the entity (only once per entity)
            if (linkerConfig.isDereferenceEntitiesEnabled() && copiedEntities.add(suggested.getUri())) {
                // NOTE: only triples with the entity as subject are copied, as
                // there might be other data in the graph
                Iterator<Triple> entityData = suggested.getData().filter(suggested.getUri(), null, null);
                while (entityData.hasNext()) {
                    graph.add(entityData.next());
                }
            }
        }
    }
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) IRI(org.apache.clerezza.commons.rdf.IRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TypedLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.TypedLiteralImpl) Triple(org.apache.clerezza.commons.rdf.Triple) Suggestion(org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) Collection(java.util.Collection) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Occurrence(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence) HashSet(java.util.HashSet)

Example 3 with LinkedEntity

use of org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity in project stanbol by apache.

the class EntityLinkingEngineTest method validateEntityLinkerResults.

/**
 * Validates the linked entities of the parsed linker against the expected
 * results.
 * <p>
 * For every linked entity the expected suggestions are looked up (and removed)
 * by its selected text. The ids of the suggested entities need to match the
 * expected ids in the same order and the scores must not increase, starting
 * with the score of the linked entity. Finally it is asserted that no
 * expected result is left over.
 *
 * @param linker the linker providing the linked entities to validate
 * @param expectedResults the expected suggestion ids by selected text. NOTE
 *        that entries are removed from this map during validation
 */
private void validateEntityLinkerResults(EntityLinker linker, Map<String, List<String>> expectedResults) {
    log.info("---------------------");
    log.info("- Validating Results-");
    log.info("---------------------");
    for (LinkedEntity linkedEntity : linker.getLinkedEntities().values()) {
        log.info("> LinkedEntity {}", linkedEntity);
        // removing the entry also detects selections that are found twice
        List<String> expectedSuggestions = expectedResults.remove(linkedEntity.getSelectedText());
        assertNotNull("LinkedEntity '" + linkedEntity.getSelectedText() + "' is not an expected Result (or was found twice)", expectedSuggestions);
        // expected value first, actual value second
        assertEquals("Number of suggestions " + linkedEntity.getSuggestions().size() + " != number of expected suggestions " + expectedSuggestions.size() + "for selection " + linkedEntity.getSelectedText() + "(Expected: " + expectedSuggestions + ")", expectedSuggestions.size(), linkedEntity.getSuggestions().size());
        // upper bound for the score of the next suggestion
        double score = linkedEntity.getScore();
        for (int i = 0; i < expectedSuggestions.size(); i++) {
            Suggestion suggestion = linkedEntity.getSuggestions().get(i);
            assertEquals("Expecced Suggestion at Rank " + i + " expected: " + expectedSuggestions.get(i) + " suggestion: " + suggestion.getEntity().getId(), expectedSuggestions.get(i), suggestion.getEntity().getId());
            assertTrue("Score of suggestion " + i + "(" + suggestion.getScore() + " > as of the previous one (" + score + ")", score >= suggestion.getScore());
            score = suggestion.getScore();
        }
    }
    assertTrue("The expected Result(s) " + expectedResults + " wehre not found", expectedResults.isEmpty());
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Suggestion(org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion)

Aggregations

LinkedEntity (org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity)3 Suggestion (org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion)3 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 Graph (org.apache.clerezza.commons.rdf.Graph)2 IRI (org.apache.clerezza.commons.rdf.IRI)2 Language (org.apache.clerezza.commons.rdf.Language)2 Literal (org.apache.clerezza.commons.rdf.Literal)2 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)2 Triple (org.apache.clerezza.commons.rdf.Triple)2 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 Entity (org.apache.stanbol.enhancer.engines.entitylinking.Entity)2 Occurrence (org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence)2 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)2 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)1 TypedLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.TypedLiteralImpl)1