Search in sources :

Example 11 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

From the class Nlp2RdfMetadataEngine, method computeEnhancements.

/**
 * Creates the NIF/SSO RDF representation for the AnalysedText content part of the
 * parsed ContentItem and adds it to the ContentItem's metadata graph.
 * <p>
 * Iterates once over the enclosed spans (sentences, chunks and tokens) and
 * (1) writes generic span information, (2) links the spans to each other
 * (next sentence, first word of a sentence, parent phrase, previous/next word)
 * and (3) adds span-specific annotations such as POS tags, phrase types and
 * sentiment values.
 *
 * @param ci the content item to enhance; MUST have an AnalysedText content part
 * @throws EngineException if the AnalysedText content part is missing
 *         (getAnalysedText is called with <code>true</code>)
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // 'true' -> throw an EngineException if no AnalysedText is present
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    // language tag used for plain literals; null -> literals without a language tag
    Language language = lang == null ? null : new Language(lang);
    // now iterate over the AnalysedText data and create the RDF representation
    // TODO: make configureable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    // only span types enabled above are requested from the AnalysedText
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    // lock the ContentItem while writing the RDF data
    ci.getLock().writeLock().lock();
    try {
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        // state of the iteration: the most recently seen span of each type
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            // TODO: filter Spans based on additional requirements
            // (1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            // (2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    // link consecutive sentences and reset the first-word flag
                    if (sentence != null) {
                        metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
                        if (word != null) {
                            // NOTE(review): this marks the previous word as sso:lastWord of the
                            // current sentence whenever a Chunk starts, and the subject/object
                            // order differs from the firstWord handling below — confirm against
                            // the SSO ontology that this is intended
                            metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
                        }
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
                            firstWordInSentence = false;
                        }
                    }
                    // link the token to its enclosing phrase (if any)
                    if (phrase != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
                    }
                    // doubly link consecutive words
                    if (word != null) {
                        metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            // (3) add specific information such as POS, chunk type ...
            writePos(metadata, span, current);
            writePhrase(metadata, span, current);
            // OlIA does not include Sentiments, so a dedicated property is used
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) NIFHelper.writeSpan(org.apache.stanbol.enhancer.nlp.utils.NIFHelper.writeSpan) Span(org.apache.stanbol.enhancer.nlp.model.Span) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 12 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

From the class NEREngineCore, method findNamedEntities.

/**
 * Extracts Named Entities from the parsed content and writes a fise:TextAnnotation
 * for every {@link NameOccurrence} to the metadata of the {@link ContentItem}.
 * Later occurrences of a name are linked via dc:relation to the most specific
 * first occurrence of that name.
 *
 * @param ci the content item the annotations are written to; MUST NOT be <code>null</code>
 * @param at the AnalysedText content part; may be <code>null</code> if text is parsed
 * @param text the plain text; may be <code>null</code> if at is parsed
 * @param lang the language of the text or <code>null</code>/empty if unknown
 * @param nameFinderModel the OpenNLP TokenNameFinderModel used for the extraction
 * @throws IllegalArgumentException if <code>ci</code> is <code>null</code>
 */
protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text, final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    // one of AnalysedText or plain text must be present; otherwise this is a no-op
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    // language tag for plain literals written to the graph; null -> no language tag
    final Language language;
    if (lang != null && !lang.isEmpty()) {
        language = new Language(lang);
    } else {
        language = null;
    }
    if (log.isDebugEnabled()) {
        // fixed: the format string was missing the '{}' placeholder for the third
        // argument, so the abbreviated text was never included in the log message
        log.debug("findNamedEntities model={},  language={}, text={}", new Object[] { nameFinderModel, language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Graph g = ci.getMetadata();
    // name -> all occurrences of that name found by the NameFinder
    Map<String, List<NameOccurrence>> entityNames;
    if (at != null) {
        entityNames = extractNameOccurrences(nameFinderModel, at, lang);
    } else {
        entityNames = extractNameOccurrences(nameFinderModel, text, lang);
    }
    // lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        // name -> TextAnnotation of its most specific first occurrence
        // (LinkedHashMap to keep insertion order for the contains() lookup below)
        Map<String, IRI> previousAnnotations = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
            String name = nameInContext.getKey();
            List<NameOccurrence> occurrences = nameInContext.getValue();
            IRI firstOccurrenceAnnotation = null;
            for (NameOccurrence occurrence : occurrences) {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(name, language)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.context, language)));
                if (occurrence.type != null) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                }
                if (occurrence.confidence != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(occurrence.confidence)));
                }
                // start/end offsets are only written if both are known
                if (occurrence.start != null && occurrence.end != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occurrence.start)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occurrence.end)));
                }
                // name
                if (firstOccurrenceAnnotation == null) {
                    // specific occurrence: search previously annotated names that
                    // contain this name (e.g. "Barack Obama" for "Obama")
                    for (Map.Entry<String, IRI> entry : previousAnnotations.entrySet()) {
                        if (entry.getKey().contains(name)) {
                            // we have found a most specific previous
                            // occurrence, use it as subsumption target
                            firstOccurrenceAnnotation = entry.getValue();
                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                            break;
                        }
                    }
                    if (firstOccurrenceAnnotation == null) {
                        // no most specific previous occurrence, I am the first,
                        // most specific occurrence to be later used as a target
                        firstOccurrenceAnnotation = textAnnotation;
                        previousAnnotations.put(name, textAnnotation);
                    }
                } else {
                    // I am referring to a most specific first occurrence of the
                    // same name
                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) LinkedHashMap(java.util.LinkedHashMap) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) List(java.util.List) ArrayList(java.util.ArrayList) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)

Example 13 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

From the class IndexedGraphTest, method createGraph.

/**
 * Fills the parsed collection with randomly generated test triples.
 * <p>
 * Subjects are IRIs or BlankNodes, predicates come from a fixed list of well-known
 * properties, and objects are typed literals (int/double), plain literals (without
 * language, English or German), BlankNodes or IRIs — all chosen based on a single
 * random draw in the range [0..3] per triple.
 *
 * @param tc the collection the generated triples are added to; generation stops as
 *           soon as it holds {@code triples} elements
 * @param triples the target number of triples
 * @param seed optional seed for the random number generator (for reproducible
 *             graphs) or <code>null</code> to use a random seed
 */
private static void createGraph(Collection<Triple> tc, int triples, Long seed) {
    Random rnd = new Random();
    if (seed != null) {
        rnd.setSeed(seed);
    }
    LiteralFactory lf = LiteralFactory.getInstance();
    // thresholds for the random value in the range [0..3]:
    // literal object if random <= l
    double l = 1.0;
    // typed int literal if random <= i
    double i = l / 3;
    // typed double literal if i < random <= d (plain literals otherwise)
    double d = l * 2 / 3;
    // bNode object if l < random <= b (IRI object otherwise)
    double b = 2.0;
    // create a new bNode (instead of reusing one) if random <= nb
    double nb = b - (l * 2 / 3);
    double random;
    BlankNodeOrIRI subject = null;
    IRI predicate = null;
    List<IRI> predicateList = new ArrayList<IRI>();
    predicateList.add(RDF.first);
    predicateList.add(RDF.rest);
    predicateList.add(RDF.type);
    predicateList.add(RDFS.label);
    predicateList.add(RDFS.comment);
    predicateList.add(RDFS.range);
    predicateList.add(RDFS.domain);
    predicateList.add(FOAF.name);
    predicateList.add(FOAF.nick);
    predicateList.add(FOAF.homepage);
    predicateList.add(FOAF.age);
    predicateList.add(FOAF.depiction);
    String URI_PREFIX = "http://www.test.org/bigGraph/ref";
    Language DE = new Language("de");
    Language EN = new Language("en");
    Iterator<IRI> predicates = predicateList.iterator();
    List<BlankNode> bNodes = new ArrayList<BlankNode>();
    bNodes.add(new BlankNode());
    for (int count = 0; tc.size() < triples; count++) {
        random = rnd.nextDouble() * 3;
        // choose a new subject for roughly 1/6 of the triples (and for the first)
        if (random >= 2.5 || count == 0) {
            if (random <= 2.75) {
                subject = new IRI(URI_PREFIX + count);
            } else {
                // reuse a random previously created BlankNode as subject
                int rndIndex = (int) ((random - 2.75) * bNodes.size() / (3.0 - 2.75));
                subject = bNodes.get(rndIndex);
            }
        }
        // choose a new predicate for roughly 1/3 of the triples (and for the first)
        if (random > 2.0 || count == 0) {
            if (!predicates.hasNext()) {
                Collections.shuffle(predicateList, rnd);
                predicates = predicateList.iterator();
            }
            predicate = predicates.next();
        }
        if (random <= l) {
            // literal object
            if (random <= i) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(count)));
            } else if (random <= d) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(random)));
            } else {
                Literal text;
                // random is in (d .. l] here. The original code re-checked
                // 'random <= i' and 'random <= d', which can never be true at this
                // point, so only German literals were ever generated. Subdivide the
                // remaining interval into thirds instead, so plain, English and
                // German literals all actually occur.
                double step = (l - d) / 3;
                if (random <= d + step) {
                    text = new PlainLiteralImpl("Literal for " + count);
                } else if (random <= d + 2 * step) {
                    text = new PlainLiteralImpl("An English literal for " + count, EN);
                } else {
                    text = new PlainLiteralImpl("Ein Deutsches Literal für " + count, DE);
                }
                tc.add(new TripleImpl(subject, predicate, text));
            }
        } else if (random <= b) {
            // bnode object
            BlankNode bnode;
            if (random <= nb) {
                // create and remember a new BlankNode
                bnode = new BlankNode();
                bNodes.add(bnode);
            } else {
                // >nb <b : reuse a random previously created BlankNode
                int rndIndex = (int) ((random - nb) * bNodes.size() / (b - nb));
                bnode = bNodes.get(rndIndex);
            }
            tc.add(new TripleImpl(subject, predicate, bnode));
        } else {
            // IRI object
            tc.add(new TripleImpl(subject, predicate, new IRI(URI_PREFIX + count * random)));
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Random(java.util.Random) Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 14 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

From the class EntityCoMentionEngine, method writeComentions.

/**
 * Writes fise:TextAnnotations for co-mentions (later, typically shorter mentions of an
 * already linked entity) to the metadata of the ContentItem and links them — via
 * dc:relation — with the TextAnnotation(s) of the initial mention.
 * <p>
 * For every co-mention occurrence this (1) looks for an existing TextAnnotation with
 * the same start or end offset (ignoring the occurrence if a bigger span is already
 * selected), (2) creates a new TextAnnotation if none exists, (3) copies/compares the
 * suggestions of the initial mention with any existing suggestions — lowering the
 * confidence of superseded existing suggestions by {@code confidenceAdjustmentFactor}
 * (each suggestion is adjusted at most once) — and (4) adapts the dc:type values and
 * the fise:confidence of the TextAnnotation based on the maximum confidences found.
 * <p>
 * NOTE(review): the caller is expected to hold the write lock on the ContentItem —
 * no lock is acquired here; confirm against callers.
 *
 * @param ci the content item the annotations are written to
 * @param comentions the co-mentions to write
 * @param language the language of the text or <code>null</code> if unknown
 * @param textAnnotations the URIs of all fise:TextAnnotations of the ContentItem;
 *        newly created TextAnnotations are added to this set
 */
private void writeComentions(ContentItem ci, Collection<LinkedEntity> comentions, String language, Set<IRI> textAnnotations) {
    // language tag for plain literals; null -> literals without a language tag
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    // we MUST adjust the confidence level of existing annotations only once
    // so we need to keep track of those
    Set<BlankNodeOrIRI> adjustedSuggestions = new HashSet<BlankNodeOrIRI>();
    log.debug("Write Co-Mentions:");
    for (LinkedEntity comention : comentions) {
        log.debug(" > {}", comention);
        // URIs of TextAnnotations for the initial mention of this co-mention
        Collection<IRI> initialMentions = new ArrayList<IRI>(comention.getSuggestions().size());
        for (Suggestion suggestion : comention.getSuggestions()) {
            Entity entity = suggestion.getEntity();
            if (textAnnotations.contains(entity.getUri())) {
                // if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
                // this is a textAnnotation
                initialMentions.add(entity.getUri());
            }
        // else TODO support also Entities!!
        }
        // create the TextAnnotations for the co-mention
        for (Occurrence occurrence : comention.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            // search for existing text annotation
            boolean ignore = false;
            // search for textAnnotations with the same start offset first ...
            IRI textAnnotation = null;
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
                if (end != null && textAnnotations.contains(t.getSubject())) {
                    // metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
                    textAnnotation = (IRI) t.getSubject();
                    if (end > occurrence.getEnd()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            // ... then for textAnnotations with the same end offset
            it = metadata.filter(null, ENHANCER_END, endLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
                if (start != null && textAnnotations.contains(t.getSubject())) {
                    // metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
                    textAnnotation = (IRI) t.getSubject();
                    if (start < occurrence.getStart()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            if (!ignore) {
                // collect confidence values of co-mentions
                // maximum confidence of suggestions of the initial mention
                Double maxConfidence = null;
                // maximum confidence of existing suggestions
                Double maxExistingConfidence = null;
                if (textAnnotation == null) {
                    // not found ... create a new TextAnnotation for the co-mention
                    textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    // add it to the set of TextAnnotations
                    textAnnotations.add(textAnnotation);
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                } else {
                    // if existing add this engine as contributor
                    metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
                // maxConfidence = EnhancementEngineHelper.get(metadata, textAnnotation,
                // ENHANCER_CONFIDENCE, Double.class, literalFactory);
                }
                // now process initial mention(s) for the co-mention
                Set<IRI> dcTypes = new HashSet<IRI>();
                for (IRI initialMention : initialMentions) {
                    // get the dc:type(s) of the initial mentions
                    Iterator<IRI> dcTypesIt = getReferences(metadata, initialMention, DC_TYPE);
                    while (dcTypesIt.hasNext()) {
                        dcTypes.add(dcTypesIt.next());
                    }
                    // check confidence of the initial mention (fise:TextAnnotation)
                    // NOTE(review): variable name 'confidnece' is a typo kept for
                    // byte-compatibility; see the note where it is reused below
                    Double confidnece = EnhancementEngineHelper.get(metadata, initialMention, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                    if (confidnece != null) {
                        if (maxConfidence == null) {
                            maxConfidence = confidnece;
                        } else if (maxConfidence.compareTo(confidnece) <= 0) {
                            maxConfidence = confidnece;
                        }
                    }
                    // else nothing to do
                    // now we need to compare the suggestions of the initial
                    // mention(s) with the existing one.
                    // Get information about the suggestions of the initial mention
                    Map<RDFTerm, Double> initialSuggestions = new HashMap<RDFTerm, Double>();
                    Map<RDFTerm, RDFTerm> initialSuggestedEntities = new HashMap<RDFTerm, RDFTerm>();
                    for (Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext(); ) {
                        // NOTE(review): contains() is called with the Iterator itself, not
                        // with the triple's subject — a Set<IRI> can never contain an
                        // Iterator, so this guard is always true and dc:relations pointing
                        // from other fise:TextAnnotations are also processed as
                        // suggestions. Presumably the subject was meant here — confirm.
                        if (!textAnnotations.contains(suggestions)) {
                            BlankNodeOrIRI suggestion = suggestions.next().getSubject();
                            RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, suggestion, ENHANCER_ENTITY_REFERENCE);
                            if (suggestedEntity != null) {
                                // it has a suggestion
                                Double confidence = EnhancementEngineHelper.get(metadata, suggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                if (maxConfidence == null) {
                                    maxConfidence = confidence;
                                // NOTE(review): the next two lines compare/assign 'confidnece'
                                // (the initial mention's confidence) — 'confidence' (this
                                // suggestion's value) looks intended; confirm
                                } else if (confidnece != null && maxConfidence.compareTo(confidnece) <= 0) {
                                    maxConfidence = confidnece;
                                }
                                // else nothing to do
                                initialSuggestions.put(suggestion, confidence);
                                initialSuggestedEntities.put(suggestedEntity, suggestion);
                            }
                        // no suggestion (dc:relation to some other resource)
                        }
                    // else ignore dc:relation to other fise:TextAnnotations
                    }
                    // now we collect existing Suggestions for this TextAnnoation where we need
                    // to adjust the confidence (quite some things to check ....)
                    Map<BlankNodeOrIRI, Double> existingSuggestions = new HashMap<BlankNodeOrIRI, Double>();
                    if (maxConfidence != null && confidenceAdjustmentFactor < 1) {
                        // suggestions are defined by incoming dc:releation
                        for (Iterator<Triple> esIt = metadata.filter(null, DC_RELATION, textAnnotation); esIt.hasNext(); ) {
                            BlankNodeOrIRI existingSuggestion = esIt.next().getSubject();
                            // but not all of them are suggestions
                            if (!textAnnotations.contains(existingSuggestion)) {
                                // ignore fise:TextAnnotations
                                Double existingConfidence = EnhancementEngineHelper.get(metadata, existingSuggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                // ignore fise:TextAnnotations also suggested for the initial mention
                                if (!initialSuggestions.containsKey(existingSuggestion)) {
                                    RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, existingSuggestion, ENHANCER_ENTITY_REFERENCE);
                                    // suggestions for the initial mention
                                    if (!initialSuggestedEntities.containsKey(suggestedEntity)) {
                                        // finally make sure that we adjust confidences only once
                                        if (!adjustedSuggestions.contains(existingSuggestion)) {
                                            existingSuggestions.put(existingSuggestion, existingConfidence);
                                        }
                                    // else confidence already adjusted
                                    } else {
                                        // different fise:EntityAnnotation, but same reference Entity
                                        // we need to check confidences to decide what to do
                                        RDFTerm initialSuggestion = initialSuggestedEntities.get(suggestedEntity);
                                        Double initialConfidence = initialSuggestions.get(initialSuggestion);
                                        if (initialConfidence == null || (existingConfidence != null && existingConfidence.compareTo(initialConfidence) >= 0)) {
                                            // existing confidence >= initial .. keep existing
                                            initialSuggestions.remove(initialSuggestion);
                                            if (maxExistingConfidence == null) {
                                                maxExistingConfidence = existingConfidence;
                                            } else if (maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                                maxExistingConfidence = existingConfidence;
                                            }
                                        } else {
                                            // adjust this one (if not yet adjusted)
                                            if (!adjustedSuggestions.contains(existingSuggestion)) {
                                                existingSuggestions.put(existingSuggestion, existingConfidence);
                                            }
                                        }
                                    }
                                } else {
                                    // a initial mention already present
                                    // no need to process initial mention
                                    initialSuggestions.remove(existingSuggestion);
                                    if (maxExistingConfidence == null) {
                                        maxExistingConfidence = existingConfidence;
                                    } else if (existingConfidence != null && maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                        maxExistingConfidence = existingConfidence;
                                    }
                                // else maxExistingConfidence == null (undefined)
                                }
                            }
                        // else ignore dc:relations to other fise:TextAnnotations
                        }
                        // lower the confidence of all superseded existing suggestions
                        for (Entry<BlankNodeOrIRI, Double> entry : existingSuggestions.entrySet()) {
                            if (entry.getValue() != null) {
                                double adjustedConfidence = entry.getValue() * confidenceAdjustmentFactor;
                                if (maxExistingConfidence == null || adjustedConfidence > maxExistingConfidence) {
                                    maxExistingConfidence = adjustedConfidence;
                                }
                                EnhancementEngineHelper.set(metadata, entry.getKey(), ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
                                // mark as adjusted
                                adjustedSuggestions.add(entry.getKey());
                            }
                        }
                    }
                    // add the suggestions of the initial mention to this one
                    for (RDFTerm suggestion : initialSuggestions.keySet()) {
                        metadata.add(new TripleImpl((BlankNodeOrIRI) suggestion, DC_RELATION, textAnnotation));
                    }
                    // finally link the co-mentation with the initial one
                    metadata.add(new TripleImpl(textAnnotation, DC_RELATION, initialMention));
                // metadata.add(new TripleImpl(initialMention, DC_RELATION, textAnnotation));
                }
                // Adapt the dc:type values of the fise:TextAnnotation
                // - if Suggestions added by this engine do have the max confidence
                // use the dc:type values of the initial mention
                // - if the original suggestions do have a higher confidence keep the
                // existing
                // - in case both do have the same confidence we add all dc:types
                boolean removeExistingDcTypes = maxConfidence != null && (maxExistingConfidence == null || maxConfidence.compareTo(maxExistingConfidence) >= 0);
                // NOTE(review): 'compareTo(...) >= 1' relies on the magnitude of the
                // compareTo result; the Comparable contract only guarantees the sign.
                // Double.compareTo happens to return -1/0/1, so this works, but
                // '> 0' would be the contract-safe spelling.
                boolean addCoMentionDcTypes = maxExistingConfidence == null || (maxConfidence != null && maxConfidence.compareTo(maxExistingConfidence) >= 1);
                Iterator<IRI> existingDcTypesIt = getReferences(metadata, textAnnotation, DC_TYPE);
                while (existingDcTypesIt.hasNext()) {
                    // removeExistingDcTypes == true
                    if ((!dcTypes.remove(existingDcTypesIt.next()) || !addCoMentionDcTypes) && removeExistingDcTypes) {
                        // remove the dcType
                        existingDcTypesIt.remove();
                    }
                }
                if (addCoMentionDcTypes) {
                    for (IRI dcType : dcTypes) {
                        // add missing
                        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, dcType));
                    }
                }
                // TODO: support also Entities
                if (maxConfidence != null) {
                    // set the confidence value (if known)
                    EnhancementEngineHelper.set(metadata, textAnnotation, ENHANCER_CONFIDENCE, maxConfidence, literalFactory);
                }
            }
        // else ignore this occurrence
        }
    }
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Suggestion(org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Occurrence(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence) HashSet(java.util.HashSet) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph)

Example 15 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

From the class EnhancementRDFUtils, method writeEntityAnnotation.

/**
 * Creates a fise:EntityAnnotation for the parsed {@link Suggestion}, writes it to
 * the graph and relates it to the parsed enhancements.
 *
 * @param engine
 *            the engine creating the annotation (used as creator)
 * @param literalFactory
 *            the LiteralFactory to use
 * @param graph
 *            the Graph to use
 * @param contentItemId
 *            the contentItemId the enhancement is extracted from
 * @param relatedEnhancements
 *            enhancements this textAnnotation is related to
 * @param suggestion
 *            the entity suggestion
 * @param nameField the field used to extract the name
 * @param lang the preferred language to include or <code>null</code> if none
 * @return the IRI of the created fise:EntityAnnotation
 */
public static IRI writeEntityAnnotation(EnhancementEngine engine, LiteralFactory literalFactory, Graph graph, IRI contentItemId, Collection<BlankNodeOrIRI> relatedEnhancements, Suggestion suggestion, String nameField, String lang) {
    Representation rep = suggestion.getEntity().getRepresentation();
    // 1. extract the "best label"
    // Start with the matched one
    Text label = suggestion.getMatchedLabel();
    // if the matched label is not in the requested language
    // fixed: guard 'lang != null' added — String.startsWith(null) throws a
    // NullPointerException, so the original failed whenever no preferred
    // language was requested but the matched label carried a language tag
    boolean langMatch = (lang == null && label.getLanguage() == null) || (lang != null && label.getLanguage() != null && label.getLanguage().startsWith(lang));
    // search if a better label is available for this Entity
    if (!langMatch) {
        Iterator<Text> labels = rep.getText(nameField);
        while (labels.hasNext() && !langMatch) {
            Text actLabel = labels.next();
            // same null-safe language check as above
            langMatch = (lang == null && actLabel.getLanguage() == null) || (lang != null && actLabel.getLanguage() != null && actLabel.getLanguage().startsWith(lang));
            if (langMatch) {
                // if the language matches ->
                // override the matched label
                label = actLabel;
            }
        }
    }
    // else the matched label will be the best to use
    // only labels with a language tag are written as language-tagged literals
    Literal literal;
    if (label.getLanguage() == null) {
        literal = new PlainLiteralImpl(label.getText());
    } else {
        literal = new PlainLiteralImpl(label.getText(), new Language(label.getLanguage()));
    }
    // Now create the entityAnnotation
    IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(graph, engine, contentItemId);
    // first relate this entity annotation to the text annotation(s)
    for (BlankNodeOrIRI enhancement : relatedEnhancements) {
        graph.add(new TripleImpl(entityAnnotation, DC_RELATION, enhancement));
    }
    IRI entityUri = new IRI(rep.getId());
    // add the link to the referred entity
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entityUri));
    // add the label parsed above
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, literal));
    if (suggestion.getScore() != null) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
    }
    // add all rdf:type values of the entity as fise:entity-type
    Iterator<Reference> types = rep.getReferences(RDF_TYPE.getUnicodeString());
    while (types.hasNext()) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(types.next().getReference())));
    }
    // add the name of the ReferencedSite that manages the Entity
    if (suggestion.getEntity().getSite() != null) {
        graph.add(new TripleImpl(entityAnnotation, new IRI(RdfResourceEnum.site.getUri()), new PlainLiteralImpl(suggestion.getEntity().getSite())));
    }
    return entityAnnotation;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Reference(org.apache.stanbol.entityhub.servicesapi.model.Reference) Literal(org.apache.clerezza.commons.rdf.Literal) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language)32 IRI (org.apache.clerezza.commons.rdf.IRI)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)19 Graph (org.apache.clerezza.commons.rdf.Graph)17 Literal (org.apache.clerezza.commons.rdf.Literal)12 ArrayList (java.util.ArrayList)8 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)8 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)8 IOException (java.io.IOException)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 HashSet (java.util.HashSet)5 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 HashMap (java.util.HashMap)4 SOAPException (javax.xml.soap.SOAPException)4 Triple (org.apache.clerezza.commons.rdf.Triple)4