use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class EnhancementEngineHelperTest method testTextAnnotationNewModel.
/**
 * Checks the properties written by {@code EnhancementEngineHelper.setOccurrence(..)}
 * for three text annotations created on the same metadata graph:
 * a short selection (selected text expected, no head/tail), a long selection with
 * head/tail enabled (head and tail expected, no selected text) and the same long
 * selection with head/tail disabled (selected text expected, no head/tail).
 */
@Test
public void testTextAnnotationNewModel() {
    final Language english = new Language("en");
    final IRI contentItemUri = new IRI("http://www.example.org/contentItem#1");
    final Graph graph = new IndexedGraph();

    // --- short selection: "Stanbol Enhancer" ---
    final String shortText = "The Stanbol Enhancer can extract Entities form parsed Text.";
    final int shortStart = shortText.indexOf("Stanbol");
    final int shortEnd = shortStart + "Stanbol Enhancer".length();
    final IRI shortAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, shortAnnotation, shortText, shortStart, shortEnd, english, -1, true);
    Assert.assertEquals("The ",
        EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals("Stanbol Enhancer",
        EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertEquals(" can extra",
        EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(shortStart),
        EnhancementEngineHelper.get(graph, shortAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(shortEnd),
        EnhancementEngineHelper.get(graph, shortAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // neither head nor tail is expected for the short selection
    Assert.assertNull(
        EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(
        EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- long selection with head/tail enabled ---
    final String longText = "Ich habe den Schlüssel fürs Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss verlohren.";
    final int longStart = longText.indexOf("Donaudampfschi");
    final int longEnd = longText.indexOf(" verlohren");
    final IRI headTailAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, headTailAnnotation, longText, longStart, longEnd, english, -1, true);
    Assert.assertEquals("ssel fürs ",
        EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals(" verlohren",
        EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(longStart),
        EnhancementEngineHelper.get(graph, headTailAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(longEnd),
        EnhancementEngineHelper.get(graph, headTailAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // the selected text must be absent ...
    Assert.assertNull(
        EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    // ... while head and tail must be present
    Assert.assertEquals("Donaudampf",
        EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertEquals("tenschloss",
        EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- same long selection, head/tail switched off ---
    final IRI plainAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, plainAnnotation, longText, longStart, longEnd, english, -1, false);
    Assert.assertEquals("Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss",
        EnhancementEngineHelper.getString(graph, plainAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertNull(
        EnhancementEngineHelper.getString(graph, plainAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(
        EnhancementEngineHelper.getString(graph, plainAnnotation, Properties.ENHANCER_SELECTION_TAIL));
}
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class RdfEntityFactoryTest method testTypeStatements.
/**
 * Creates a proxy for {@code TestRdfEntity} that additionally implements
 * {@code TestRdfEntity2} and asserts that the proxy is an instance of both
 * interfaces and that the {@code Rdf} annotation ids of both interfaces are
 * contained in the set returned by {@code getRdfTypes(..)} for the node.
 */
@Test
public void testTypeStatements() throws Exception {
    final Graph data = new SimpleGraph();
    final RdfEntityFactory entityFactory = RdfEntityFactory.createInstance(data);
    final IRI subject = new IRI("urn:RdfEntityFactoryTest:TestEntity");
    final TestRdfEntity proxy =
        entityFactory.getProxy(subject, TestRdfEntity.class, new Class[] { TestRdfEntity2.class });

    // the proxy has to implement the primary as well as the additional interface
    assertTrue(proxy instanceof TestRdfEntity);
    assertTrue(proxy instanceof TestRdfEntity2);

    // both annotated type ids have to show up as types of the node
    final Set<String> rdfTypes = getRdfTypes(data, subject);
    final String primaryTypeId = TestRdfEntity.class.getAnnotation(Rdf.class).id();
    assertTrue(rdfTypes.contains(primaryTypeId));
    final String additionalTypeId = TestRdfEntity2.class.getAnnotation(Rdf.class).id();
    assertTrue(rdfTypes.contains(additionalTypeId));
}
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class ClerezzaYard method getRepresentation.
/**
 * Internal helper that creates the Representation for a URI.
 * <p>
 * The lock returned by {@code readLockGraph()} is released in a
 * {@code finally} block once the Representation has been created.
 *
 * @param uri the URI of the Representation
 * @param check if <code>false</code> then it is not checked whether the URI
 * refers to a RDFTerm in the graph that is of type {@link #REPRESENTATION}
 * @return the Representation, or <code>null</code> if <code>check</code> is
 * <code>true</code> and {@code isRepresentation(uri)} returns <code>false</code>
 */
protected final Representation getRepresentation(IRI uri, boolean check) {
    final Lock lock = readLockGraph();
    try {
        if (check && !isRepresentation(uri)) {
            // not found
            return null;
        }
        final Graph representationGraph = createRepresentationGraph(uri, graph);
        // Drop the marker triple that is used internally to represent an empty
        // Representation. The remove call only has an effect for empty
        // Representations; calling it unconditionally avoids a separate check
        // that would take longer than the call itself.
        representationGraph.remove(new TripleImpl(uri, MANAGED_REPRESENTATION, TRUE_LITERAL));
        final RdfValueFactory valueFactory = (RdfValueFactory) getValueFactory();
        return valueFactory.createRdfRepresentation(uri, representationGraph);
    } finally {
        if (lock != null) {
            lock.unlock();
        }
    }
}
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class DisambiguatorEngine method computeEnhancements.
/**
 * Disambiguates the entity suggestions of the text annotations found in the
 * metadata of the parsed content item.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>read the plain text, the content language and the
 * {@code DisambiguationData} from the content item (the latter two while
 * holding the read lock),</li>
 * <li>for every saved entity with more than one suggestion: build a
 * disambiguation context from the other selected texts found in a window
 * around the mention, run a similarity query against the site of the entity
 * and match the results with the existing suggestions,</li>
 * <li>apply the disambiguation results to the metadata while holding the
 * write lock.</li>
 * </ol>
 * Keeping step (2) outside of any lock reduces the time the write lock needs
 * to be held on the content item.
 *
 * @param ci the content item to process
 * @throws EngineException if the similarity query against a site fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    String textContent;
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (textBlob != null) {
        try {
            textContent = ContentItemHelper.getText(textBlob.getValue());
        } catch (IOException e) {
            log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e);
            textContent = null;
        }
    } else {
        textContent = null;
    }
    // NOTE(review): textContent may be null at this point (no text blob or
    // an IOException) and is later passed to getDisambiguationContext(..);
    // confirm that helper tolerates null.
    Graph graph = ci.getMetadata();
    // (1) read the data from the content item
    String contentLanguage;
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        contentLanguage = EnhancementEngineHelper.getLanguage(ci);
        // NOTE (rwesten): moved the parsing of the information from the
        // contentItem to static method of the Class holding those information
        // (similar as it already was for SavedEntity)
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            // nothing to disambiguate if only one suggestion is present
            continue;
        }
        // NOTE: the site is determined from the
        // fise:TextAnnotation <-- dc:relation --
        // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        // data.
        // TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        if (site == null) {
            // The referenced site is not (or no longer) available. Skip this
            // mention instead of failing with a NullPointerException on
            // site.getId() further down.
            log.warn("Unable to disambiguate '{}' because the Entityhub Site '{}' is not available",
                savedEntity.getName(), savedEntity.getSite());
            continue;
        }
        // TODO: make configurable
        boolean casesensitive = false;
        String savedEntityLabel = casesensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase();
        // Determine the context used for disambiguation
        // TODO: make this configurable options
        String disambiguationContext;
        // (0.a) The easiest way is to just use the selection context
        // disambiguationContext = savedEntity.getContext();
        // (0.b) Calculate a context based on a moving window
        String window = getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100);
        log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
        // (1) The contextSelections:
        // All other selected text within the selection context
        List<String> contextSelections = getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window);
        disambiguationContext = unionString(false, contextSelections);
        // (2) I do not understand this variant (see comment for the
        // EntitiesInRange(..) method
        // List<String> L = EntitiesInRange(disData.directoryTextAnotation,
        // (savedEntity.getStart() + savedEntity.getEnd()) / 2);
        // disambiguationContext = unionString(false,contextSelections);
        // (3) one can build a combination of the above
        // disambiguationContext = unionString(true, //unique adds
        // Collections.singleton(savedEntity.getName()), //the selected text
        // Collections.singleton(context), //the context
        // contextSelections); //other selected parsed in the context
        // or just the name of the entity AND the context
        // disambiguationContext = unionString(false,
        // Collections.singleton(savedEntity.getName()),
        // contextSelections);
        // (4) TODO: I would also like to have the possibility to disambiguate
        // using URIs of Entities suggested for other TextAnnotations
        // within the context.
        // make the similarity query on the Entityhub using the collected
        // information
        QueryResultList<Entity> results;
        log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] { site.getId(), savedEntityLabel, contentLanguage, disambiguationContext });
        if (!StringUtils.isBlank(disambiguationContext)) {
            try {
                results = query(site, savedEntityLabel, contentLanguage, disambiguationContext);
            } catch (SiteException e) {
                // TODO we could also try to catch those errors ...
                throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() + "' on Entityhub Site '" + site.getId() + "!", e);
            }
            log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
            // match the results with the suggestions
            disambiguateSuggestions(results, savedEntity);
        } else {
            log.debug(" - not disambiguated because of empty context!");
        }
    }
    // (3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In the original version of Kritarth this was done as
    // part of (2) - disambiguation. This is now changed as in (2) the
    // disambiguation results are stored in the Suggestions and only
    // applied to the EnhancementStructure in (3). This allows to reduce the
    // coverage of the write lock needed to be applied to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class EntityCoMentionEngine method computeEnhancements.
/**
 * Computes co-mentions for the parsed content item.
 * <p>
 * All subjects typed as {@code ENHANCER_TEXTANNOTATION} in the metadata are
 * registered with a {@code ContentItemMentionBuilder} while holding the read
 * lock. An {@code EntityLinker} is then run over the analysed text using that
 * mention index, and the linked entities are passed to
 * {@code writeComentions(..)} while holding the write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if no LabelTokenizer is available or the entity
 * linking fails
 * @throws IllegalStateException if no language processing configuration is
 * present for the language of the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText analysedText = getAnalysedText(this, ci, true);
    final String lang = getLanguage(this, ci, true);
    final LanguageProcessingConfig langConfig = textProcessingConfig.getConfiguration(lang);
    if (langConfig == null) {
        throw new IllegalStateException("The language '" + lang
            + "' is not configured to be processed by this Engine. As this is already checked within the canEnhance(..) method this may indicate an bug in the used EnhanceemntJobManager implementation!");
    }
    if (log.isDebugEnabled()) {
        final Object[] logArgs = new Object[] {
            ci.getUri().getUnicodeString(),
            lang,
            StringUtils.abbreviate(analysedText.getSpan(), 100) };
        log.debug("compute co-mentions for ContentItem {} language {} text={}", logArgs);
    }
    final LabelTokenizer tokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
    if (tokenizer == null) {
        throw new EngineException(this, ci, "No LabelTokenizer available!", null);
    }
    // in-memory index holding the Entities mentioned in the content item
    final ContentItemMentionBuilder mentionIndex =
        new ContentItemMentionBuilder(tokenizer, lang, linkerConfig.getDefaultLanguage());
    final Graph enhancements = ci.getMetadata();
    final Set<IRI> registeredAnnotations = new HashSet<IRI>();
    ci.getLock().readLock().lock();
    try {
        // walk over every TextAnnotation (mention of an Entity)
        final Iterator<Triple> typeTriples = enhancements.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (typeTriples.hasNext()) {
            final IRI textAnnotation = (IRI) typeTriples.next().getSubject();
            mentionIndex.registerTextAnnotation(textAnnotation, enhancements);
            // remember which text annotations have been registered
            registeredAnnotations.add(textAnnotation);
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    // the mention index is passed twice, exactly as the constructor call requires
    final EntityLinker linker = new EntityLinker(analysedText, lang, langConfig, mentionIndex, linkerConfig, tokenizer, mentionIndex);
    // run the linking
    try {
        linker.process();
    } catch (EntitySearcherException e) {
        log.error("Unable to link Entities with " + linker, e);
        throw new EngineException(this, ci, "Unable to link Entities with " + linker, e);
    }
    // write the linked entities back to the content item
    ci.getLock().writeLock().lock();
    try {
        writeComentions(ci, linker.getLinkedEntities().values(), lang, registeredAnnotations);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Aggregations