use of org.apache.stanbol.entityhub.servicesapi.site.Site in project stanbol by apache.
the class SiteManagerImpl method findIds.
/**
 * Executes the parsed query on every referenced Site that supports search
 * and returns the union of the entity ids found.
 * <p>
 * Sites that throw a {@link SiteException} are logged and skipped, so a
 * single unavailable Site does not fail the whole request.
 * <p>
 * The query reported in the result is chosen in this order: the query as
 * executed by the first Site that contributed results, otherwise the query
 * as executed by the first Site that answered at all, otherwise the parsed
 * query.
 *
 * @param query the query to execute on the referenced Sites
 * @return the ids collected from all searched Sites (without duplicates)
 */
@Override
public QueryResultList<String> findIds(FieldQuery query) {
    log.debug("findIds for query{}", query);
    // We need to search all referenced Sites
    Set<String> entityIds = new HashSet<String>();
    // TODO: The QueryResultList expects that the query as executed is added
    // to the response. However when executing queries on multiple sites they
    // might support a different set of features and therefore execute
    // different variants. For now simply return the query as executed by
    // the first Site that contributes results.
    FieldQuery processedQuery = null;
    FieldQuery queryWithResults = null;
    for (Site site : referencedSites) {
        if (!site.supportsSearch()) {
            log.debug(" > Site {} does not support queries", site.getId());
            continue;
        }
        log.debug(" > query site {}", site.getId());
        try {
            QueryResultList<String> results = site.findReferences(query);
            if (processedQuery == null) {
                // remember the first query that was executed at all
                processedQuery = results.getQuery();
            }
            if (!results.isEmpty() && queryWithResults == null) {
                // remember the first query that actually produced results
                // (previously this wrongly overwrote processedQuery, so
                // queryWithResults was never set)
                queryWithResults = results.getQuery();
            }
            for (String entityId : results) {
                entityIds.add(entityId);
            }
        } catch (SiteException e) {
            log.warn("Unable to access Site " + site.getConfiguration().getName() + " (id = " + site.getId() + ")", e);
        }
    }
    final FieldQuery executedQuery;
    if (queryWithResults != null) {
        // use the query with results
        executedQuery = queryWithResults;
    } else if (processedQuery != null) {
        // if not, a processed one
        executedQuery = processedQuery;
    } else {
        // else the parsed one
        executedQuery = query;
    }
    return new QueryResultListImpl<String>(executedQuery, entityIds.iterator(), String.class);
}
use of org.apache.stanbol.entityhub.servicesapi.site.Site in project stanbol by apache.
the class SiteManagerImpl method getContent.
/**
 * Looks up the content of an entity on the referenced Sites selected by
 * {@code getSitesByEntityPrefix(entityId)} and returns the first non-null
 * stream.
 * <p>
 * Sites that throw a {@link SiteException} are logged and skipped.
 *
 * @param entityId    the id of the entity
 * @param contentType the requested content type
 * @return the content provided by the first Site that returns one, or
 *         {@code null} if no Site matches the id or none provides content
 */
@Override
public InputStream getContent(String entityId, String contentType) {
    final Collection<Site> candidates = getSitesByEntityPrefix(entityId);
    if (candidates.isEmpty()) {
        log.info("No Referenced Site registered for Entity {}", entityId);
        log.debug("Registered Prefixes {}", prefixList);
        return null;
    }
    for (Site candidate : candidates) {
        try {
            final InputStream stream = candidate.getContent(entityId, contentType);
            if (stream == null) {
                // this Site has nothing for the entity -> try the next one
                continue;
            }
            final Object[] logArgs = new Object[] { contentType, entityId, candidate.getConfiguration().getName() };
            log.debug("Return Content of type {} for Entity {} from referenced site {}", logArgs);
            return stream;
        } catch (SiteException e) {
            final String message = "Unable to access Site " + candidate.getConfiguration().getName()
                    + " (id = " + candidate.getId() + ")";
            log.warn(message, e);
        }
    }
    log.debug("Entity {} not found on any of the following Sites {}", entityId, candidates);
    return null;
}
use of org.apache.stanbol.entityhub.servicesapi.site.Site in project stanbol by apache.
the class SiteManagerImpl method findEntities.
/**
 * Executes the parsed query on every referenced Site that supports search
 * and returns the union of the entities found.
 * <p>
 * If an equal entity is returned by more than one Site, the one added first
 * is kept and the later ones are ignored (and logged). Sites that throw a
 * {@link SiteException} are logged and skipped.
 * <p>
 * The query reported in the result is chosen in this order: the query as
 * executed by the first Site that contributed results, otherwise the query
 * as executed by the first Site that answered at all, otherwise the parsed
 * query.
 *
 * @param query the query to execute on the referenced Sites
 * @return the entities collected from all searched Sites
 */
@Override
public QueryResultList<Entity> findEntities(FieldQuery query) {
    log.debug("findEntities for query{}", query);
    // TODO: The QueryResultList expects that the query as executed is added
    // to the response. However when executing queries on multiple sites they
    // might support a different set of features and therefore execute
    // different variants. For now simply return the query as executed by
    // the first Site that contributes results.
    FieldQuery processedQuery = null;
    FieldQuery queryWithResults = null;
    Set<Entity> entities = new HashSet<Entity>();
    for (Site site : referencedSites) {
        if (!site.supportsSearch()) {
            // do not search on sites that do not support it
            log.debug(" > Site {} does not support queries", site.getId());
            continue;
        }
        log.debug(" > query site {}", site.getId());
        try {
            QueryResultList<Entity> results = site.findEntities(query);
            if (processedQuery == null) {
                // remember the first query that was executed at all
                processedQuery = results.getQuery();
            }
            if (!results.isEmpty() && queryWithResults == null) {
                // remember the first query that actually produced results
                // (previously this wrongly overwrote processedQuery, so
                // queryWithResults was never set)
                queryWithResults = results.getQuery();
            }
            for (Entity rep : results) {
                // Set.add(..) does not override an already present element
                if (!entities.add(rep)) {
                    // TODO: find a solution for this problem
                    // e.g. allow to add the site for entities
                    log.info("Entity {} found on more than one Referenced Site -> Representation of Site {} is ignored", rep.getId(), site.getConfiguration().getName());
                }
            }
        } catch (SiteException e) {
            log.warn("Unable to access Site " + site.getConfiguration().getName() + " (id = " + site.getId() + ")", e);
        }
    }
    final FieldQuery executedQuery;
    if (queryWithResults != null) {
        // use the query with results
        executedQuery = queryWithResults;
    } else if (processedQuery != null) {
        // if not, a processed one
        executedQuery = processedQuery;
    } else {
        // else the parsed one
        executedQuery = query;
    }
    return new QueryResultListImpl<Entity>(executedQuery, entities, Entity.class);
}
use of org.apache.stanbol.entityhub.servicesapi.site.Site in project stanbol by apache.
the class DisambiguatorEngine method computeEnhancements.
/**
 * Disambiguates the entity suggestions of the text annotations of the parsed
 * content item.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>read the language and the disambiguation data from the content item
 * (under the read lock),</li>
 * <li>for every text annotation with more than one suggestion, build a
 * context from the other selected texts within a window around the mention,
 * query the Site of the annotation with it and match the results against
 * the suggestions,</li>
 * <li>write the disambiguation results back to the metadata (under the
 * write lock).</li>
 * </ol>
 *
 * @param ci the content item to process
 * @throws EngineException if the query on the Entityhub Site fails with a
 *         {@link SiteException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    String textContent;
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (textBlob != null) {
        try {
            textContent = ContentItemHelper.getText(textBlob.getValue());
        } catch (IOException e) {
            log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e);
            textContent = null;
        }
    } else {
        textContent = null;
    }
    Graph graph = ci.getMetadata();
    // (1) read the data from the content item
    String contentLangauge;
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        contentLangauge = EnhancementEngineHelper.getLanguage(ci);
        // NOTE (rwesten): moved the parsing of the information from the
        // contentItem to static method of the Class holding those information
        // (similar as it already was for SavedEntity)
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            // we need not to disambiguate if only one suggestion is present
            continue;
        }
        // NOTE: the site is determined from the
        // fise:TextAnnotation <-- dc:relation --
        // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        // data.
        // TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        if (site == null) {
            // Without this guard the site.getId() calls below fail with a
            // NullPointerException.
            // NOTE(review): assumes getSite(..) returns null for ids that
            // are not registered - confirm against the SiteManager contract.
            log.warn(" - unable to disambiguate '{}' because Site '{}' is not available", savedEntity.getName(), savedEntity.getSite());
            continue;
        }
        // TODO: make configurable
        boolean casesensitive = false;
        String savedEntityLabel = casesensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase();
        // Determine the context used for disambiguation
        // TODO: make this configurable options
        String disambiguationContext;
        // (0.a) The easiest way is to just use the selection context
        // disambiguationContext = savedEntity.getContext();
        // (0.b) Calculate a context based on a moving window
        // NOTE(review): textContent is null when no text blob is present or
        // reading it failed - verify that getDisambiguationContext(..)
        // accepts null.
        String window = getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100);
        log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
        // (1) The contextSelections:
        // All other selected text within the selection context
        List<String> contextSelections = getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window);
        disambiguationContext = unionString(false, contextSelections);
        // (2) I do not understand this variant (see comment for the
        // EntitiesInRange(..) method
        // List<String> L = EntitiesInRange(disData.directoryTextAnotation,
        // (savedEntity.getStart() + savedEntity.getEnd()) / 2);
        // disambiguationContext = unionString(false,contextSelections);
        // (3) one can build a combination of the above
        // disambiguationContext = unionString(true, //unique adds
        // Collections.singleton(savedEntity.getName()), //the selected text
        // Collections.singleton(context), //the context
        // contextSelections); //other selected parsed in the context
        // or just the name of the entity AND the context
        // disambiguationContext = unionString(false,
        // Collections.singleton(savedEntity.getName()),
        // contextSelections);
        // (4) TODO: I would also like to have the possibility to disambiguate
        // using URIs of Entities suggested for other TextAnnotations
        // within the context.
        // make the similarity query on the Entityhub using the collected
        // information
        QueryResultList<Entity> results;
        log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] { site.getId(), savedEntityLabel, contentLangauge, disambiguationContext });
        if (!StringUtils.isBlank(disambiguationContext)) {
            try {
                results = query(site, savedEntityLabel, contentLangauge, disambiguationContext);
            } catch (SiteException e) {
                // TODO we could also try to catch those errors ...
                throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() + "' on Entityhub Site '" + site.getId() + "!", e);
            }
            log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
            // match the results with the suggestions
            disambiguateSuggestions(results, savedEntity);
        } else {
            log.debug(" - not disambiguated because of empty context!");
        }
    }
    // (3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In the original version of Kritarth this was done as
    // part of (2) - disambiguation. This is now changed as in (2) the
    // disambiguation results are stored in the Suggestions and only
    // applied to the EnhancementStructure in (3). This allows to reduce the
    // coverage of the write lock needed to be applied to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.stanbol.entityhub.servicesapi.site.Site in project stanbol by apache.
the class CoreferenceFinder method buildEntityTypeLabels.
/**
 * Collects the labels of all types ({@code RDF_TYPE} values) of the parsed
 * entity for the given language.
 * <p>
 * Type URIs rejected by {@code config.shouldExcludeClass(..)} are skipped.
 * For the remaining ones the labels are first looked up in the
 * {@code entityTypeIndex}. If that lookup returns {@code null}, the type
 * is loaded as an entity from the referenced Site (or from the Entityhub
 * when {@code getReferencedSite()} returns {@code null}), its
 * {@code RDFS_LABEL} texts are collected and the result is added to the
 * {@code entityTypeIndex}.
 *
 * @param entity   the entity whose type labels are collected
 * @param language the language of the labels to collect
 * @return the labels of all non-excluded types; empty if none were found
 * @throws EngineException declared by the signature; no statement in this
 *         method throws it directly
 */
private Set<String> buildEntityTypeLabels(Entity entity, String language) throws EngineException {
    final Set<String> collectedLabels = new HashSet<String>();
    final Iterator<Object> typeIterator = entity.getRepresentation().get(RDF_TYPE.getUnicodeString());
    while (typeIterator.hasNext()) {
        final String typeUri = typeIterator.next().toString();
        if (!this.config.shouldExcludeClass(typeUri)) {
            // First try the in memory index
            Set<String> typeLabels = this.entityTypeIndex.lookupEntityType(new IRI(typeUri), language);
            if (typeLabels == null) {
                // not indexed yet -> resolve the type entity itself
                final Site referencedSite = getReferencedSite();
                final Entity typeEntity;
                if (referencedSite != null) {
                    typeEntity = referencedSite.getEntity(typeUri);
                } else {
                    typeEntity = this.entityHub.getEntity(typeUri);
                }
                if (typeEntity != null) {
                    typeLabels = new HashSet<String>();
                    final Iterator<Text> texts = typeEntity.getRepresentation().get(RDFS_LABEL.getUnicodeString(), language);
                    while (texts.hasNext()) {
                        final Text label = texts.next();
                        typeLabels.add(label.getText());
                    }
                    // add the labels to the index for later lookups
                    this.entityTypeIndex.addEntityType(new IRI(typeUri), language, typeLabels);
                }
            }
            if (typeLabels != null) {
                collectedLabels.addAll(typeLabels);
            }
        }
    }
    return collectedLabels;
}
Aggregations