Search in sources :

Example 1 with Literal

use of org.apache.clerezza.commons.rdf.Literal in project stanbol by apache.

the class FstLinkingEngine method writeEnhancements.

/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem.
 * <p>
 * For every {@link Tag} an existing fise:TextAnnotation with the same
 * start/end offsets is looked up in the metadata of the ContentItem. If one
 * is found this engine is added as contributor, otherwise a new
 * TextAnnotation is created. After that one EntityAnnotation is written for
 * each suggestion of the tag and linked to the TextAnnotation.
 *
 * @param ci the ContentItem; triples are added to {@code ci.getMetadata()}
 * @param text the text used to build the selection context of newly
 * created TextAnnotations
 * @param tags the tags (with their suggestions) to write
 * @param language the language used for plain literals. If {@code null} or
 * empty the literals are created without a language
 * @param writeRankings if {@code true} the ranking of a suggestion is
 * written in case {@code Match#getRanking()} is not {@code null}
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        //first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        //search for existing text annotation: same start, same end and
        //typed as fise:TextAnnotation
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            //not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            //if existing add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
        }
        //add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        //now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //should we use the label used for the match, or search the
            //representation for the best label ... currently its the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
            //add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            //write origin information
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
                }
            }
            //TODO: dereferencing of the suggested entities is not yet implemented
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with Literal

use of org.apache.clerezza.commons.rdf.Literal in project stanbol by apache.

the class FstLinkingEngine method match.

/**
 * Processes the matches of the parsed tags and selects the suggestions
 * for each of them.
 * <p>
 * For every tag the anchor (the substring of the text covered by the tag)
 * is set. Matches are first filtered by their types. For the remaining
 * ones the label with the smallest Levenshtein distance to the anchor is
 * selected and the match score is set to
 * {@code 1 - distance / max(anchorLength, labelLength)}. Matches with a
 * score lower than the configured minimum are ignored. Tags without any
 * suggestion are removed from the parsed collection. Suggestions are
 * sorted by score and cut to the configured maximum number.
 *
 * @param text the text the tags refer to
 * @param tags the tags to process. Tags without suggestions are removed
 * from this collection, so its iterator needs to support removal
 * @param emTypes the named entity types keyed by {@code [start, end]}
 * spans. Only used if the linking mode is {@code LinkingModeEnum.NER}
 * @return the number of matches processed (including filtered ones)
 */
private int match(String text, Collection<Tag> tags, Map<int[], Set<String>> emTypes) {
    log.trace("  ... process matches for {} extracted Tags:", tags.size());
    int matchCount = 0;
    Iterator<Tag> tagIt = tags.iterator();
    while (tagIt.hasNext()) {
        Tag tag = tagIt.next();
        String anchor = text.substring(tag.getStart(), tag.getEnd());
        log.trace(" {}: '{}'", tag, anchor);
        //NOTE: the tag keeps the anchor in its original case
        tag.setAnchor(anchor);
        if (!elConfig.isCaseSensitiveMatching()) {
            anchor = anchor.toLowerCase(Locale.ROOT);
        }
        int alength = anchor.length();
        List<Match> suggestions = new ArrayList<Match>(tag.getMatches().size());
        //only for trace level debugging
        int i = 1;
        for (Match match : tag.getMatches()) {
            if (log.isTraceEnabled()) {
                log.trace(" {}. {}", i++, match.getUri());
            }
            matchCount++;
            final boolean filterType;
            if (linkingMode == LinkingModeEnum.NER) {
                //NOTE(review): int[] uses identity equals/hashCode, so this
                //lookup with a new array can only succeed if emTypes is a
                //sorted map using a comparator over the span values -
                //confirm against the caller
                Set<String> types = emTypes.get(new int[] { tag.getStart(), tag.getEnd() });
                if (types == null) {
                    log.warn(" - missing NE types for Named Entity [{},{}] {}!", new Object[] { tag.getStart(), tag.getEnd(), tag.getAnchor() });
                    filterType = true;
                } else {
                    filterType = filterByNamedEntityType(match.getTypes().iterator(), types);
                }
            } else {
                filterType = filterEntityByType(match.getTypes().iterator());
            }
            if (!filterType) {
                //search the label with the smallest edit distance to the
                //anchor; stop as soon as an exact match is found
                int distance = Integer.MAX_VALUE;
                Literal matchLabel = null;
                for (Iterator<Literal> it = match.getLabels().iterator(); it.hasNext() && distance > 0; ) {
                    Literal literal = it.next();
                    String label = literal.getLexicalForm();
                    int d;
                    if (!elConfig.isCaseSensitiveMatching()) {
                        label = label.toLowerCase(Locale.ROOT);
                    }
                    d = StringUtils.getLevenshteinDistance(anchor, label);
                    if (d < distance) {
                        distance = d;
                        matchLabel = literal;
                    }
                }
                if (matchLabel == null) {
                    //the match has no labels: no score can be calculated.
                    //Skip it instead of failing with a NullPointerException
                    log.trace(" ... filtered because the match has no labels");
                    continue;
                }
                if (distance == 0) {
                    match.setMatch(1.0, matchLabel);
                } else {
                    double length = Math.max(alength, matchLabel.getLexicalForm().length());
                    match.setMatch(1d - ((double) distance / length), matchLabel);
                }
                if (match.getScore() >= elConfig.getMinMatchScore()) {
                    log.trace(" ... add suggestion: label: '{}'; conf: {}", matchLabel, match.getScore());
                    suggestions.add(match);
                } else {
                    log.trace(" ... filtered because match score < {}", elConfig.getMinMatchScore());
                }
            } else {
                //the type of the current Entity is blacklisted
                log.trace("  ... filtered because of entity types");
            }
        }
        if (suggestions.isEmpty()) {
            // remove this tag as no match is left
            tagIt.remove();
        } else if (suggestions.size() > 1) {
            //if we have multiple suggestions
            //sort based on score
            Collections.sort(suggestions, Match.SCORE_COMPARATOR);
            int maxSuggestions = elConfig.getMaxSuggestions();
            if ((suggestions.size() > maxSuggestions + 1) && elConfig.isIncludeSuggestionsWithSimilarScore()) {
                //include suggestions with similar score
                //NOTE(review): the reference score is taken from the element
                //at index maxSuggestions (the first one beyond the limit) and
                //the comparison starts with the element after it. So this
                //element is always included. In addition, if the loop ends
                //because the end of the list is reached, the last element is
                //cut off even if its score is >= the reference score -
                //confirm that this is intended
                double minIncludeScore = suggestions.get(maxSuggestions).getScore();
                //the next element
                int numInclude = maxSuggestions + 1;
                double actScore;
                do {
                    actScore = suggestions.get(numInclude).getScore();
                    //increase for the next iteration
                    numInclude++;
                } while (numInclude < suggestions.size() && actScore >= minIncludeScore);
                maxSuggestions = numInclude - 1;
            }
            //adapt score based on entity ranking
            if (elConfig.isRankEqualScoresBasedOnEntityRankings()) {
                adaptScoresForEntityRankings(suggestions);
            }
            if (log.isTraceEnabled()) {
                //log the suggestion information
                log.trace("Suggestions:");
                int si = 1;
                for (Match m : suggestions) {
                    log.trace(" {}. {} - {} ({})", new Object[] { si <= maxSuggestions ? si : "--", m.getScore(), m.getMatchLabel(), m.getUri() });
                    si++;
                }
            }
            //remove all suggestions > maxSuggestions
            if (suggestions.size() > maxSuggestions) {
                suggestions.subList(maxSuggestions, suggestions.size()).clear();
            }
        }
        tag.setSuggestions(suggestions);
    }
    return matchCount;
}
Also used : ArrayList(java.util.ArrayList) Literal(org.apache.clerezza.commons.rdf.Literal) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag)

Example 3 with Literal

use of org.apache.clerezza.commons.rdf.Literal in project stanbol by apache.

the class ResultSetToXml method createValueElement.

/**
 * Creates the XML element representing the parsed RDF term.
 * <ul>
 * <li>IRIs are written as {@code <uri>} elements containing the unicode
 * string of the IRI
 * <li>Literals are written as {@code <literal>} elements containing the
 * lexical form. The {@code datatype} attribute is always set, the
 * {@code xml:lang} attribute only if the literal has a language
 * <li>all other terms are written as {@code <bnode>} elements containing
 * the {@code toString()} value of the term
 * </ul>
 *
 * @param resource the RDF term to convert
 * @param doc the document used to create the element and text nodes
 * @return the created element (not yet attached to the document)
 */
private Element createValueElement(RDFTerm resource, Document doc) {
    if (resource instanceof IRI) {
        Element uriElement = doc.createElement("uri");
        String unicode = ((IRI) resource).getUnicodeString();
        uriElement.appendChild(doc.createTextNode(unicode));
        return uriElement;
    }
    if (resource instanceof Literal) {
        Literal literal = (Literal) resource;
        Element literalElement = doc.createElement("literal");
        literalElement.appendChild(doc.createTextNode(literal.getLexicalForm()));
        literalElement.setAttribute("datatype", literal.getDataType().getUnicodeString());
        Language language = literal.getLanguage();
        if (language != null) {
            literalElement.setAttribute("xml:lang", language.toString());
        }
        return literalElement;
    }
    //neither IRI nor Literal
    Element bnodeElement = doc.createElement("bnode");
    bnodeElement.appendChild(doc.createTextNode(resource.toString()));
    return bnodeElement;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Language(org.apache.clerezza.commons.rdf.Language) Element(org.w3c.dom.Element) Literal(org.apache.clerezza.commons.rdf.Literal)

Example 4 with Literal

use of org.apache.clerezza.commons.rdf.Literal in project stanbol by apache.

the class ContentItemBackendTest method testEnhancements.

/**
 * Tests the {@code fn:enhancement(.)} LDPath function: without a filter,
 * filtered for fise:TextAnnotations and followed by dc:language.
 * The expected result sizes depend on the enhancements of the ContentItem
 * used by this test.
 */
@Test
public void testEnhancements() throws LDPathParseException {
    String path = "fn:enhancement(.)";
    Collection<RDFTerm> result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    //assertEquals reports the actual size in case of a failure
    assertEquals(7, result.size());
    for (RDFTerm r : result) {
        assertTrue(r instanceof IRI);
        log.info("Entity: {}", r);
    }
    //and with a filter
    path = "fn:enhancement(.)[rdf:type is fise:TextAnnotation]";
    result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(3, result.size());
    //        assertTrue(result.contains(new IRI("http://dbpedia.org/resource/Bob_Marley")));
    //the language of the enhancements
    path = "fn:enhancement(.)/dc:language";
    result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(1, result.size());
    RDFTerm r = result.iterator().next();
    assertTrue(r instanceof Literal);
    assertEquals("en", ((Literal) r).getLexicalForm());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Literal(org.apache.clerezza.commons.rdf.Literal) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Test(org.junit.Test)

Example 5 with Literal

use of org.apache.clerezza.commons.rdf.Literal in project stanbol by apache.

the class ContentItemBackendTest method testContent.

/**
 * Tests the {@code fn:content(..)} LDPath function for the
 * "text/plain" and the "text/html" content of the ContentItem. In both
 * cases a single Literal with the expected content must be returned.
 */
@Test
public void testContent() throws LDPathParseException {
    Collection<RDFTerm> result = ldpath.pathQuery(ci.getUri(), "fn:content(\"text/plain\")", null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    //assertEquals reports the actual size in case of a failure
    assertEquals(1, result.size());
    RDFTerm r = result.iterator().next();
    assertTrue(r instanceof Literal);
    String content = ((Literal) r).getLexicalForm();
    //expected value first, actual value second
    assertEquals(textContent, content);
    result = ldpath.pathQuery(ci.getUri(), "fn:content(\"text/html\")", null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(1, result.size());
    r = result.iterator().next();
    assertTrue(r instanceof Literal);
    content = ((Literal) r).getLexicalForm();
    assertEquals(htmlContent, content);
}
Also used : Literal(org.apache.clerezza.commons.rdf.Literal) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Test(org.junit.Test)

Aggregations

Literal (org.apache.clerezza.commons.rdf.Literal)71 IRI (org.apache.clerezza.commons.rdf.IRI)35 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)35 Triple (org.apache.clerezza.commons.rdf.Triple)30 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)22 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 ArrayList (java.util.ArrayList)16 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)16 Language (org.apache.clerezza.commons.rdf.Language)12 Graph (org.apache.clerezza.commons.rdf.Graph)11 Test (org.junit.Test)10 HashSet (java.util.HashSet)9 Date (java.util.Date)8 Lock (java.util.concurrent.locks.Lock)6 Entity (org.apache.stanbol.enhancer.engines.entitylinking.Entity)5 HashMap (java.util.HashMap)4 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)4 NoConvertorException (org.apache.clerezza.rdf.core.NoConvertorException)4 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)4 Collection (java.util.Collection)3