use of org.apache.clerezza.commons.rdf.Triple in project stanbol by apache.
the class TestEntityLinkingEnhancementEngine method validateAllEntityAnnotations.
private static int validateAllEntityAnnotations(NamedEntityTaggingEngine entityLinkingEngine, ContentItem ci) {
Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(entityLinkingEngine.getClass().getName()));
// adding null as expected for confidence makes it a required property
expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
Iterator<Triple> entityAnnotationIterator = ci.getMetadata().filter(null, RDF_TYPE, ENHANCER_ENTITYANNOTATION);
int entityAnnotationCount = 0;
while (entityAnnotationIterator.hasNext()) {
IRI entityAnnotation = (IRI) entityAnnotationIterator.next().getSubject();
// validate the EntityAnnotation against the expected values
validateEntityAnnotation(ci.getMetadata(), entityAnnotation, expectedValues);
// fise:confidence now checked by EnhancementStructureHelper (STANBOL-630)
// Iterator<Triple> confidenceIterator = ci.getMetadata().filter(entityAnnotation, ENHANCER_CONFIDENCE, null);
// assertTrue("Expected fise:confidence value is missing (entityAnnotation "
// +entityAnnotation+")",confidenceIterator.hasNext());
// Double confidence = LiteralFactory.getInstance().createObject(Double.class,
// (TypedLiteral)confidenceIterator.next().getObject());
// assertTrue("fise:confidence MUST BE <= 1 (value= '"+confidence
// + "',entityAnnotation " +entityAnnotation+")",
// 1.0 >= confidence.doubleValue());
// assertTrue("fise:confidence MUST BE >= 0 (value= '"+confidence
// +"',entityAnnotation "+entityAnnotation+")",
// 0.0 <= confidence.doubleValue());
// Test the entityhub:site property (STANBOL-625)
IRI ENTITYHUB_SITE = new IRI(RdfResourceEnum.site.getUri());
Iterator<Triple> entitySiteIterator = ci.getMetadata().filter(entityAnnotation, ENTITYHUB_SITE, null);
assertTrue("Expected entityhub:site value is missing (entityAnnotation " + entityAnnotation + ")", entitySiteIterator.hasNext());
RDFTerm siteResource = entitySiteIterator.next().getObject();
assertTrue("entityhub:site values MUST BE Literals", siteResource instanceof Literal);
assertEquals("'dbpedia' is expected as entityhub:site value", "dbpedia", ((Literal) siteResource).getLexicalForm());
assertFalse("entityhub:site MUST HAVE only a single value", entitySiteIterator.hasNext());
entityAnnotationCount++;
}
return entityAnnotationCount;
}
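validateEntityAnnotation comes from Stanbol's EnhancementStructureHelper test utilities; the expected-values convention it implements can be sketched as follows. checkExpected is a hypothetical helper (not part of Stanbol) shown only to illustrate the convention; Graph.filter(..) with a null object matches any value, so a null expected value merely requires the property to be present, while a non-null value must also match.

// hypothetical sketch of the expected-values convention used above
private static void checkExpected(Graph metadata, IRI subject, Map<IRI, RDFTerm> expectedValues) {
    for (Map.Entry<IRI, RDFTerm> expected : expectedValues.entrySet()) {
        // a null value only requires presence; a non-null value must match
        Iterator<Triple> it = metadata.filter(subject, expected.getKey(), expected.getValue());
        assertTrue("missing expected value for property " + expected.getKey() + " on " + subject, it.hasNext());
    }
}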
use of org.apache.clerezza.commons.rdf.Triple in project stanbol by apache.
the class EntityLinkingEngine method writeEnhancements.
/**
* Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
* extracted from the parsed ContentItem.
* @param ci the ContentItem to write the Enhancements to
* @param linkedEntities the LinkedEntities extracted from the content
* @param language the language of the analysed text, or <code>null</code> if unknown
* @param writeRankings if <code>true</code> the entity rankings of the suggested entities are written
*/
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language, boolean writeRankings) {
Language languageObject = null;
if (language != null && !language.isEmpty()) {
languageObject = new Language(language);
}
Set<IRI> dereferencedEntities = new HashSet<IRI>();
Graph metadata = ci.getMetadata();
for (LinkedEntity linkedEntity : linkedEntities) {
Collection<IRI> textAnnotations = new ArrayList<IRI>(linkedEntity.getOccurrences().size());
// first create the TextAnnotations for the Occurrences
for (Occurrence occurrence : linkedEntity.getOccurrences()) {
Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
// search for existing text annotation
Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
IRI textAnnotation = null;
while (it.hasNext()) {
Triple t = it.next();
if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
textAnnotation = (IRI) t.getSubject();
break;
}
}
if (textAnnotation == null) {
// not found ... create a new one
textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(linkedEntity.getScore())));
} else {
// if existing add this engine as contributor
metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
}
// add dc:types (even to existing)
for (IRI dcType : linkedEntity.getTypes()) {
metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
}
textAnnotations.add(textAnnotation);
}
// now the EntityAnnotations for the Suggestions
for (Suggestion suggestion : linkedEntity.getSuggestions()) {
IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
// should we use the label used for the match, or search the
// representation for the best label ... currently it's the matched one
Literal label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
Entity entity = suggestion.getEntity();
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, label));
metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entity.getUri()));
Iterator<IRI> suggestionTypes = entity.getReferences(linkerConfig.getTypeField());
while (suggestionTypes.hasNext()) {
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, suggestionTypes.next()));
}
metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
for (IRI textAnnotation : textAnnotations) {
metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
}
// add origin information of the EntitySearcher
for (Entry<IRI, Collection<RDFTerm>> originInfo : entitySearcher.getOriginInformation().entrySet()) {
for (RDFTerm value : originInfo.getValue()) {
metadata.add(new TripleImpl(entityAnnotation, originInfo.getKey(), value));
}
}
if (writeRankings) {
Float ranking = suggestion.getEntity().getEntityRanking();
if (ranking != null) {
// write the float ranking as an xsd:double typed literal
metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, new TypedLiteralImpl(ranking.toString(), XSD_DOUBLE)));
}
}
// add the RDF data for entities
if (linkerConfig.isDereferenceEntitiesEnabled() && dereferencedEntities.add(entity.getUri())) {
// NOTE: do not add all triples as there might be other data in the graph
// copy only the statements about the entity itself into the metadata
Iterator<Triple> entityData = entity.getData().filter(entity.getUri(), null, null);
while (entityData.hasNext()) {
metadata.add(entityData.next());
}
}
}
}
}
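The lookup at the top of the occurrence loop avoids duplicate TextAnnotations: an existing annotation is reused only if it matches fise:start, fise:end and the rdf:type. Pulled out into a hypothetical helper (not part of the engine; the constants are the statically imported ones used in the snippet above), the lookup might read:

// hypothetical helper illustrating the duplicate check: reuse an existing
// fise:TextAnnotation only if start offset, end offset and rdf:type all
// match; otherwise the caller creates a new one
private IRI findExistingTextAnnotation(Graph metadata, Literal start, Literal end) {
    Iterator<Triple> it = metadata.filter(null, ENHANCER_START, start);
    while (it.hasNext()) {
        BlankNodeOrIRI candidate = it.next().getSubject();
        if (metadata.filter(candidate, ENHANCER_END, end).hasNext()
                && metadata.filter(candidate, RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
            return (IRI) candidate;
        }
    }
    return null; // no TextAnnotation with matching offsets found
}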
use of org.apache.clerezza.commons.rdf.Triple in project stanbol by apache.
the class EntityCoMentionEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = getAnalysedText(this, ci, true);
String language = getLanguage(this, ci, true);
LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
if (languageConfig == null) {
throw new IllegalStateException("The language '" + language + "' is not configured " + "to be processed by this Engine. As this is already checked within the " + "canEnhance(..) method this may indicate a bug in the used " + "EnhancementJobManager implementation!");
}
if (log.isDebugEnabled()) {
log.debug("compute co-mentions for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100) });
}
LabelTokenizer labelTokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
if (labelTokenizer == null) {
throw new EngineException(this, ci, "No LabelTokenizer available!", null);
}
// create the in-memory database for the mentioned Entities
ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(labelTokenizer, language, linkerConfig.getDefaultLanguage());
Graph metadata = ci.getMetadata();
Set<IRI> textAnnotations = new HashSet<IRI>();
ci.getLock().readLock().lock();
try {
// iterate over all TextAnnotations (mentions of Entities)
for (Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
IRI ta = (IRI) it.next().getSubject();
entityMentionIndex.registerTextAnnotation(ta, metadata);
// store the registered text annotations
textAnnotations.add(ta);
}
} finally {
ci.getLock().readLock().unlock();
}
EntityLinker entityLinker = new EntityLinker(at, language, languageConfig, entityMentionIndex, linkerConfig, labelTokenizer, entityMentionIndex);
// process
try {
entityLinker.process();
} catch (EntitySearcherException e) {
log.error("Unable to link Entities with " + entityLinker, e);
throw new EngineException(this, ci, "Unable to link Entities with " + entityLinker, e);
}
// write the results (requires the write lock)
ci.getLock().writeLock().lock();
try {
writeComentions(ci, entityLinker.getLinkedEntities().values(), language, textAnnotations);
} finally {
ci.getLock().writeLock().unlock();
}
}
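Note the locking discipline: the metadata graph is read under the ContentItem's read lock and modified under its write lock, never while linking runs. Reduced to a sketch, the pattern is:

// sketch of the ContentItem locking pattern used above
ci.getLock().readLock().lock();
try {
    // read from ci.getMetadata(), e.g. filter(..) over existing annotations
} finally {
    ci.getLock().readLock().unlock();
}
ci.getLock().writeLock().lock();
try {
    // add new triples to ci.getMetadata()
} finally {
    ci.getLock().writeLock().unlock();
}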
use of org.apache.clerezza.commons.rdf.Triple in project stanbol by apache.
the class RdfSerializingWriter method getAdditionalExpansionResources.
private Set<RDFTerm> getAdditionalExpansionResources(Graph tc, GraphNode recipe) {
final Set<IRI> subjectExpansionProperties = getSubjectExpansionProperties(recipe);
final Set<IRI> objectExpansionProperties = getObjectExpansionProperties(recipe);
final Set<RDFTerm> result = new HashSet<RDFTerm>();
if ((subjectExpansionProperties.size() > 0) || (objectExpansionProperties.size() > 0)) {
for (Triple triple : tc) {
final IRI predicate = triple.getPredicate();
if (subjectExpansionProperties.contains(predicate)) {
result.add(triple.getSubject());
}
if (objectExpansionProperties.contains(predicate)) {
result.add(triple.getObject());
}
}
}
return result;
}
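Since Graph extends Collection&lt;Triple&gt;, the method can scan the graph with a plain for-each. A hedged usage sketch against an in-memory SimpleGraph (the example resources are invented, and recipe is assumed to be a GraphNode obtained elsewhere):

// hypothetical usage: with foaf:knows configured as an object expansion
// property in the recipe, the object of the statement below would be
// returned for additional expansion
Graph tc = new SimpleGraph();
IRI alice = new IRI("http://example.org/alice");
IRI knows = new IRI("http://xmlns.com/foaf/0.1/knows");
IRI bob = new IRI("http://example.org/bob");
tc.add(new TripleImpl(alice, knows, bob));
Set<RDFTerm> additional = getAdditionalExpansionResources(tc, recipe);
// additional would contain bob if the recipe lists foaf:knows for object expansion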
use of org.apache.clerezza.commons.rdf.Triple in project stanbol by apache.
the class EventJobManagerImpl method logExecutionMetadata.
/**
* Logs the ExecutionMetadata
* @param logger the logger to log the execution metadata to
* @param job the enhancement job to log the execution metadata for
* @param isWarn if <code>true</code> the data are logged with <code>WARN</code> level.
* If <code>false</code> the <code>DEBUG</code> level is used
*/
protected void logExecutionMetadata(Logger logger, EnhancementJob job, boolean isWarn) {
if (log.isDebugEnabled() || (isWarn && log.isWarnEnabled())) {
StringBuilder message = new StringBuilder(1024);
message.append("ExecutionMetadata for ContentItem ").append(job.getContentItem().getUri()).append(" and Chain ").append(job.getChainName()).append(": \n");
boolean serialized = false;
if (serializer != null) {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
try {
serializer.serialize(bout, job.getExecutionMetadata(), SupportedFormat.TURTLE);
message.append(bout.toString("utf-8"));
serialized = true;
} catch (RuntimeException | UnsupportedEncodingException e) {
log.warn(" ... unable to serialize Execution Metadata | {}: {}", e.getClass(), e.getMessage());
}
}
if (!serialized) {
// TURTLE serialization not possible ... fall back to the toString() of each Triple
for (Triple t : job.getExecutionMetadata()) {
message.append(t.toString()).append('\n');
}
}
// finally write the serialized graph to the logger
if (isWarn) {
logger.warn(message.toString());
} else {
logger.debug(message.toString());
}
}
}
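The Turtle-with-fallback pattern is useful on its own; as a standalone sketch (serializeOrDump is a hypothetical helper, not part of EventJobManagerImpl):

// hypothetical helper: serialize a graph as TURTLE if a Serializer is
// available, otherwise fall back to a line-per-triple toString() dump
private String serializeOrDump(Serializer serializer, Graph graph) {
    if (serializer != null) {
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        try {
            serializer.serialize(bout, graph, SupportedFormat.TURTLE);
            return bout.toString("UTF-8");
        } catch (Exception e) {
            // serialization failed, fall through to the plain dump
        }
    }
    StringBuilder sb = new StringBuilder();
    for (Triple t : graph) {
        sb.append(t).append('\n');
    }
    return sb.toString();
}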