Use of org.apache.clerezza.commons.rdf.Literal in project stanbol by Apache.
Class FstLinkingEngine, method writeEnhancements.
/**
 * Writes the enhancements for the parsed {@link Tag}s to the metadata of the
 * parsed ContentItem.
 * <p>
 * For every tag the metadata is searched for a text annotation with the same
 * start and end offset. If one is found this engine is added to it as a
 * contributor, otherwise a new text annotation is created. Afterwards one
 * entity annotation is written for every suggestion of the tag and linked to
 * the text annotation via {@code Properties.DC_RELATION}.
 *
 * @param ci the content item; enhancements are added to its metadata graph
 * @param text the text passed to {@code getSelectionContext(..)} together
 *        with the anchor and start offset of a tag
 * @param tags the tags to write enhancements for
 * @param language the language used for the plain literals of newly created
 *        text annotations. If <code>null</code> or empty those literals are
 *        written without a language
 * @param writeRankings if <code>true</code> the ranking of a match is written
 *        whenever the match provides one
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        //first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        //search for an existing text annotation with the same start and end
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                //NOTE(review): assumes text annotations are always identified
                //by an IRI (never a blank node) - confirm
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            //not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            //if existing add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
        }
        //add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        //now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //should we use the label used for the match, or search the
            //representation for the best label ... currently its the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
            //add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            //write origin information
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            //the ranking is optional: only written if requested and available
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
                }
            }
            //TODO: dereferencing
            // if(linkerConfig.isDereferenceEntitiesEnabled() &&
            // dereferencedEntitis.add(entity.getUri())){ //not yet dereferenced
            // //add all outgoing triples for this entity
            // //NOTE: do not add all triples as there might be other data in the graph
            // for(Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
            // triples.hasNext();metadata.add(triples.next()));
            // }
        }
    }
}
Use of org.apache.clerezza.commons.rdf.Literal in project stanbol by Apache.
Class FstLinkingEngine, method match.
/**
 * Scores and filters the matches of the parsed tags and sets the remaining
 * ones as suggestions of the tag.
 * <p>
 * For every tag the anchor text is cut out of the parsed text and set on the
 * tag. Every match of the tag is first filtered by its types and then scored
 * by the Levenshtein distance between the anchor and the closest label of
 * the match. Matches without any label or with a score below the configured
 * minimum are dropped. Tags that end up without suggestions are removed from
 * the parsed collection; otherwise the suggestions are sorted and cut to the
 * configured maximum.
 *
 * @param text the text the start/end offsets of the tags refer to
 * @param tags the tags to process. Tags without suggestions are removed
 *        from this collection via its iterator
 * @param emTypes the types of Named Entities keyed by <code>[start,end]</code>.
 *        Only used if the linking mode is {@link LinkingModeEnum#NER}
 * @return the number of matches looked at (counted before any filtering)
 */
private int match(String text, Collection<Tag> tags, Map<int[], Set<String>> emTypes) {
    log.trace(" ... process matches for {} extracted Tags:", tags.size());
    int matchCount = 0;
    Iterator<Tag> tagIt = tags.iterator();
    while (tagIt.hasNext()) {
        Tag tag = tagIt.next();
        String anchor = text.substring(tag.getStart(), tag.getEnd());
        log.trace(" {}: '{}'", tag, anchor);
        //the tag keeps the anchor as it appears in the text; only the local
        //copy used for the comparison is lower cased
        tag.setAnchor(anchor);
        if (!elConfig.isCaseSensitiveMatching()) {
            anchor = anchor.toLowerCase(Locale.ROOT);
        }
        int alength = anchor.length();
        List<Match> suggestions = new ArrayList<Match>(tag.getMatches().size());
        //only for trace level debugging
        int i = 1;
        for (Match match : tag.getMatches()) {
            if (log.isTraceEnabled()) {
                log.trace(" {}. {}", i++, match.getUri());
            }
            matchCount++;
            final boolean filterType;
            if (linkingMode == LinkingModeEnum.NER) {
                //NOTE(review): an int[] key can only be found if emTypes
                //compares keys by content (e.g. a sorted map with a span
                //comparator) - verify against the caller
                Set<String> types = emTypes.get(new int[] { tag.getStart(), tag.getEnd() });
                if (types == null) {
                    log.warn(" - missing NE types for Named Entity [{},{}] {}!", new Object[] { tag.getStart(), tag.getEnd(), tag.getAnchor() });
                    filterType = true;
                } else {
                    filterType = filterByNamedEntityType(match.getTypes().iterator(), types);
                }
            } else {
                filterType = filterEntityByType(match.getTypes().iterator());
            }
            if (filterType) {
                //the type of the current Entity is blacklisted
                log.trace(" ... filtered because of entity types");
                continue;
            }
            //search the label with the smallest distance to the anchor
            //(stops early as soon as an exact match is found)
            int distance = Integer.MAX_VALUE;
            Literal matchLabel = null;
            for (Iterator<Literal> it = match.getLabels().iterator(); it.hasNext() && distance > 0; ) {
                Literal literal = it.next();
                String label = literal.getLexicalForm();
                if (!elConfig.isCaseSensitiveMatching()) {
                    label = label.toLowerCase(Locale.ROOT);
                }
                int d = StringUtils.getLevenshteinDistance(anchor, label);
                if (d < distance) {
                    distance = d;
                    matchLabel = literal;
                }
            }
            if (matchLabel == null) {
                //a match without labels can not be scored. Filter it instead
                //of failing with a NullPointerException below
                log.trace(" ... filtered because the entity has no labels");
                continue;
            }
            if (distance == 0) {
                match.setMatch(1.0, matchLabel);
            } else {
                //distance > 0 implies that at least one of the two strings
                //is not empty, so length is never zero here
                double length = Math.max(alength, matchLabel.getLexicalForm().length());
                match.setMatch(1d - ((double) distance / length), matchLabel);
            }
            if (match.getScore() >= elConfig.getMinMatchScore()) {
                log.trace(" ... add suggestion: label: '{}'; conf: {}", matchLabel, match.getScore());
                suggestions.add(match);
            } else {
                log.trace(" ... filtered because match score < {}", elConfig.getMinMatchScore());
            }
        }
        if (suggestions.isEmpty()) {
            // remove this tag as no match is left
            tagIt.remove();
        } else if (suggestions.size() > 1) {
            //if we have multiple suggestions
            //sort based on score
            Collections.sort(suggestions, Match.SCORE_COMPARATOR);
            int maxSuggestions = elConfig.getMaxSuggestions();
            if ((suggestions.size() > maxSuggestions + 1) && elConfig.isIncludeSuggestionsWithSimilarScore()) {
                //include suggestions with similar score
                //NOTE(review): the threshold is taken from index
                //maxSuggestions (the first suggestion beyond the limit) and
                //the loop always advances at least once - confirm that this
                //is the intended cut-off
                double minIncludeScore = suggestions.get(maxSuggestions).getScore();
                //the next element
                int numInclude = maxSuggestions + 1;
                double actScore;
                do {
                    actScore = suggestions.get(numInclude).getScore();
                    //increase for the next iteration
                    numInclude++;
                } while (numInclude < suggestions.size() && actScore >= minIncludeScore);
                maxSuggestions = numInclude - 1;
            }
            //adapt score based on entity ranking
            if (elConfig.isRankEqualScoresBasedOnEntityRankings()) {
                adaptScoresForEntityRankings(suggestions);
            }
            if (log.isTraceEnabled()) {
                //log the suggestion information
                log.trace("Suggestions:");
                int si = 1;
                for (Match m : suggestions) {
                    log.trace(" {}. {} - {} ({})", new Object[] { si <= maxSuggestions ? si : "--", m.getScore(), m.getMatchLabel(), m.getUri() });
                    si++;
                }
            }
            //remove all suggestions > maxSuggestions
            if (suggestions.size() > maxSuggestions) {
                suggestions.subList(maxSuggestions, suggestions.size()).clear();
            }
        }
        tag.setSuggestions(suggestions);
    }
    return matchCount;
}
Use of org.apache.clerezza.commons.rdf.Literal in project stanbol by Apache.
Class ResultSetToXml, method createValueElement.
/**
 * Creates the XML element representing the parsed RDF term.
 * <ul>
 * <li> an {@link IRI} becomes an <code>uri</code> element holding its unicode string
 * <li> a {@link Literal} becomes a <code>literal</code> element holding the
 * lexical form, with a <code>datatype</code> attribute and - if the literal
 * has a language - an <code>xml:lang</code> attribute
 * <li> everything else becomes a <code>bnode</code> element holding the
 * <code>toString()</code> value of the term
 * </ul>
 *
 * @param resource the RDF term to convert
 * @param doc the document used to create the element and its text node
 * @return the created element (not yet attached to the document tree)
 */
private Element createValueElement(RDFTerm resource, Document doc) {
    if (resource instanceof IRI) {
        final Element uriElement = doc.createElement("uri");
        final String unicode = ((IRI) resource).getUnicodeString();
        uriElement.appendChild(doc.createTextNode(unicode));
        return uriElement;
    }
    if (resource instanceof Literal) {
        final Literal literal = (Literal) resource;
        final Element literalElement = doc.createElement("literal");
        literalElement.appendChild(doc.createTextNode(literal.getLexicalForm()));
        literalElement.setAttribute("datatype", literal.getDataType().getUnicodeString());
        //the language attribute is only written for literals that have one
        final Language language = literal.getLanguage();
        if (language != null) {
            literalElement.setAttribute("xml:lang", language.toString());
        }
        return literalElement;
    }
    //neither IRI nor Literal
    final Element bnodeElement = doc.createElement("bnode");
    bnodeElement.appendChild(doc.createTextNode(resource.toString()));
    return bnodeElement;
}
Use of org.apache.clerezza.commons.rdf.Literal in project stanbol by Apache.
Class ContentItemBackendTest, method testEnhancements.
/**
 * Tests the <code>fn:enhancement(.)</code> LDPath function: without a filter,
 * restricted to text annotations and followed by <code>dc:language</code>.
 * The expected result sizes (7, 3 and 1) depend on the enhancements of the
 * content item set up by this test class.
 */
@Test
public void testEnhancements() throws LDPathParseException {
    String path = "fn:enhancement(.)";
    Collection<RDFTerm> result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    //assertEquals reports the actual size on failure
    assertEquals(7, result.size());
    for (RDFTerm r : result) {
        assertTrue(r instanceof IRI);
        log.info("Entity: {}", r);
    }
    //and with a filter
    path = "fn:enhancement(.)[rdf:type is fise:TextAnnotation]";
    result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(3, result.size());
    // assertTrue(result.contains(new IRI("http://dbpedia.org/resource/Bob_Marley")));
    //the language of the content: a single literal with the value "en"
    path = "fn:enhancement(.)/dc:language";
    result = ldpath.pathQuery(ci.getUri(), path, null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(1, result.size());
    RDFTerm r = result.iterator().next();
    assertTrue(r instanceof Literal);
    assertEquals("en", ((Literal) r).getLexicalForm());
}
Use of org.apache.clerezza.commons.rdf.Literal in project stanbol by Apache.
Class ContentItemBackendTest, method testContent.
/**
 * Tests the <code>fn:content(..)</code> LDPath function for the
 * <code>text/plain</code> and the <code>text/html</code> media type. Each
 * query is expected to return a single literal whose lexical form equals
 * the corresponding content of the test fixture.
 */
@Test
public void testContent() throws LDPathParseException {
    Collection<RDFTerm> result = ldpath.pathQuery(ci.getUri(), "fn:content(\"text/plain\")", null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    //assertEquals reports the actual size on failure
    assertEquals(1, result.size());
    RDFTerm r = result.iterator().next();
    assertTrue(r instanceof Literal);
    String content = ((Literal) r).getLexicalForm();
    //expected value first, so that failure messages are not reversed
    assertEquals(textContent, content);
    result = ldpath.pathQuery(ci.getUri(), "fn:content(\"text/html\")", null);
    assertNotNull(result);
    assertFalse(result.isEmpty());
    assertEquals(1, result.size());
    r = result.iterator().next();
    assertTrue(r instanceof Literal);
    content = ((Literal) r).getLexicalForm();
    assertEquals(htmlContent, content);
}
Aggregations