Usage example of org.apache.stanbol.enhancer.topic.api.ClassifierException in the Apache Stanbol project.
From the class TopicClassificationEngine, method addConcept:
@Override
public void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts) throws ClassifierException {
// Make the registration idempotent: drop any topic previously stored under this id.
removeConcept(conceptUri);
SolrInputDocument metaDoc = new SolrInputDocument();
String metaId = UUID.randomUUID().toString();
String modelId = UUID.randomUUID().toString();
// The metadata entry describes the concept (hierarchy links, primary topic, ...).
metaDoc.addField(conceptUriField, conceptUri);
metaDoc.addField(entryIdField, metaId);
metaDoc.addField(modelEntryIdField, modelId);
metaDoc.addField(entryTypeField, METADATA_ENTRY);
if (broaderField != null && broaderConcepts != null) {
metaDoc.addField(broaderField, broaderConcepts);
}
if (primaryTopicUriField != null && primaryTopicUri != null) {
metaDoc.addField(primaryTopicUriField, primaryTopicUri);
}
// The model entry is a separate document linked to the metadata entry by modelId.
SolrInputDocument modelDoc = new SolrInputDocument();
modelDoc.addField(entryIdField, modelId);
modelDoc.addField(conceptUriField, conceptUri);
modelDoc.addField(entryTypeField, MODEL_ENTRY);
if (broaderConcepts != null) {
// A new narrower concept makes the broader concepts' models stale.
invalidateModelFields(broaderConcepts, modelUpdateDateField, modelEvaluationDateField);
}
SolrServer server = getActiveSolrServer();
try {
UpdateRequest update = new UpdateRequest();
update.add(metaDoc);
update.add(modelDoc);
server.request(update);
server.commit();
} catch (Exception e) {
throw new ClassifierException(String.format("Error adding topic with id '%s' on Solr Core '%s'", conceptUri, solrCoreId), e);
}
}
Usage example of org.apache.stanbol.enhancer.topic.api.ClassifierException in the Apache Stanbol project.
From the class TopicClassificationEngine, method updatePerformanceMetadata:
/**
 * Accumulate cross-validation performance statistics into the metadata entry of a topic
 * (all metadata fields are assumed to be stored). It is the responsibility of the caller
 * to commit.
 */
protected void updatePerformanceMetadata(String conceptId, float precision, float recall, int positiveSupport, int negativeSupport, List<String> falsePositiveExamples, List<String> falseNegativeExamples) throws ClassifierException {
SolrServer server = getActiveSolrServer();
try {
SolrQuery metadataQuery = new SolrQuery("*:*");
metadataQuery.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
metadataQuery.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
// At most one entry is expected to match; a missing entry is silently tolerated.
for (SolrDocument doc : server.query(metadataQuery).getResults()) {
// Snapshot all stored field values so the new statistics can be merged in.
Map<String, Collection<Object>> merged = new HashMap<String, Collection<Object>>();
for (String field : doc.getFieldNames()) {
merged.put(field, doc.getFieldValues(field));
}
addToList(merged, precisionField, precision);
addToList(merged, recallField, recall);
increment(merged, positiveSupportField, positiveSupport);
increment(merged, negativeSupportField, negativeSupport);
addToList(merged, falsePositivesField, falsePositiveExamples);
addToList(merged, falseNegativesField, falseNegativeExamples);
// Re-index the merged document, stamped with the evaluation date, in place of the old entry.
SolrInputDocument replacement = new SolrInputDocument();
for (Map.Entry<String, Collection<Object>> field : merged.entrySet()) {
replacement.addField(field.getKey(), field.getValue());
}
replacement.setField(modelEvaluationDateField, UTCTimeStamper.nowUtcDate());
server.add(replacement);
}
log.info(String.format("Performance for concept '%s': precision=%f, recall=%f," + " positiveSupport=%d, negativeSupport=%d", conceptId, precision, recall, positiveSupport, negativeSupport));
} catch (Exception e) {
throw new ClassifierException(String.format("Error updating performance metadata for topic '%s' on Solr Core '%s'", conceptId, solrCoreId), e);
}
}
Usage example of org.apache.stanbol.enhancer.topic.api.ClassifierException in the Apache Stanbol project.
From the class TopicClassificationEngine, method updatePerformanceEstimates:
/**
 * Re-estimate the performance metadata (precision / recall) of the registered topics by
 * cross validation against the training set.
 *
 * @param incremental forwarded to {@code performCVFold}; presumably restricts the evaluation
 *            to topics whose model changed since the last run -- TODO confirm
 * @return the number of topics whose performance estimates were updated
 */
public synchronized int updatePerformanceEstimates(boolean incremental) throws ClassifierException, TrainingSetException {
checkTrainingSet();
// Only one evaluation may run at a time on this engine instance.
if (evaluationRunning) {
throw new ClassifierException("Another evaluation is already running");
}
int updatedTopics = 0;
// any embedded evaluation server data is now created within the #embeddedSolrServerDir
// (deleted in the finally block below)
try {
evaluationRunning = true;
// 3-folds CV is hardcoded for now
int cvFoldCount = 3;
// make it possible to limit the number of folds to use
int cvIterationCount = 3;
// We will use the training set quite intensively, ensure that the index is packed and its
// statistics are up to date
getTrainingSet().optimize();
for (int cvFoldIndex = 0; cvFoldIndex < cvIterationCount; cvFoldIndex++) {
// NOTE(review): updatedTopics is overwritten (not accumulated) on each fold --
// presumably performCVFold returns a cumulative or idempotent count; confirm.
updatedTopics = performCVFold(cvFoldIndex, cvFoldCount, cvIterationCount, incremental);
}
SolrServer solrServer = getActiveSolrServer();
solrServer.optimize();
} catch (ConfigurationException e) {
throw new ClassifierException(e);
} catch (IOException e) {
throw new ClassifierException(e);
} catch (SolrServerException e) {
throw new ClassifierException(e);
} finally {
// Always clean up the temporary evaluation server and release the running flag.
FileUtils.deleteQuietly(__evaluationServerDir);
evaluationRunning = false;
}
return updatedTopics;
}
Usage example of org.apache.stanbol.enhancer.topic.api.ClassifierException in the Apache Stanbol project.
From the class TopicClassificationEngine, method computeEnhancements:
/**
 * Classify the textual content of the ContentItem and register the suggested SKOS concepts
 * as fise:TopicAnnotation enhancements, enriched with confidence, performance estimates and
 * (when resolvable through the Entityhub) a preferred label.
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
// Fetch the first content part holding text in a supported mime type.
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
}
String language = EnhancementEngineHelper.getLanguage(ci);
// An empty accepted-language set, or one containing the empty string, acts as a wildcard.
if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
}
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
}
// Blank content is not an error: there is simply nothing to classify.
if (text.trim().isEmpty()) {
log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
return;
}
Graph metadata = ci.getMetadata();
List<TopicSuggestion> topics;
try {
topics = suggestTopics(text);
if (topics.isEmpty()) {
return;
}
} catch (ClassifierException e) {
throw new EngineException(e);
}
// Custom fise properties used to attach the classifier performance estimates.
IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
LiteralFactory lf = LiteralFactory.getInstance();
// All writes to the metadata graph happen under the ContentItem write lock.
ci.getLock().writeLock().lock();
try {
// Global text annotation to attach all the topic annotation to it.
IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
for (TopicSuggestion topic : topics) {
// One entity enhancement per suggested concept, linked back to the text annotation.
IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
// add link to entity
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
// add confidence information
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
// add performance estimates of the classifier if available
ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
if (perf.uptodate) {
metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
}
// fetch concept label from the entityhub or a referenced site if available
Entity entity = entityhub.getEntity(topic.conceptUri);
if (entity == null) {
entity = referencedSiteManager.getEntity(topic.conceptUri);
}
if (entity != null) {
Representation representation = entity.getRepresentation();
// TODO: extract all languages based on some configuration instead of hardcoding English
Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
if (label == null) {
// fall back to the generic rdfs:label when no SKOS preferred label exists
label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
}
if (label != null) {
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
}
}
}
} catch (ClassifierException e) {
throw new EngineException(e);
} catch (IllegalArgumentException e) {
throw new EngineException(e);
} catch (EntityhubException e) {
throw new EngineException(e);
} finally {
ci.getLock().writeLock().unlock();
}
}
Usage example of org.apache.stanbol.enhancer.topic.api.ClassifierException in the Apache Stanbol project.
From the test class TopicEngineTest, method testUpdatePerformanceEstimates:
/**
 * Check that performance metadata can be written and re-read for a registered concept, and
 * that successive fold updates are accumulated into mean precision/recall/f1 and summed
 * support counts.
 */
@Test
public void testUpdatePerformanceEstimates() throws Exception {
log.info(" --- testUpdatePerformanceEstimates --- ");
ClassificationReport performanceEstimates;
// no registered topic: asking for estimates must fail
try {
classifier.getPerformanceEstimates("urn:t/001");
fail("Should have raised ClassifierException");
} catch (ClassifierException e) {
// expected
}
// register some topics (urn:t/002 declares urn:t/001 as broader)
classifier.addConcept("urn:t/001", null);
classifier.addConcept("urn:t/002", Arrays.asList("urn:t/001"));
// freshly registered concepts have no evaluation data yet
performanceEstimates = classifier.getPerformanceEstimates("urn:t/002");
assertFalse(performanceEstimates.uptodate);
// update the performance metadata manually
classifier.updatePerformanceMetadata("urn:t/002", 0.76f, 0.60f, 34, 32, Arrays.asList("ex14", "ex78"), Arrays.asList("ex34", "ex23", "ex89"));
classifier.getActiveSolrServer().commit();
performanceEstimates = classifier.getPerformanceEstimates("urn:t/002");
assertTrue(performanceEstimates.uptodate);
assertEquals(0.76f, performanceEstimates.precision, 0.01);
assertEquals(0.60f, performanceEstimates.recall, 0.01);
// f1 is the harmonic mean of precision and recall: 2*0.76*0.60/(0.76+0.60) ~= 0.67
assertEquals(0.67f, performanceEstimates.f1, 0.01);
assertEquals(34, performanceEstimates.positiveSupport);
assertEquals(32, performanceEstimates.negativeSupport);
assertTrue(classifier.getBroaderConcepts("urn:t/002").contains("urn:t/001"));
// accumulate other folds statistics and compute means of statistics
classifier.updatePerformanceMetadata("urn:t/002", 0.79f, 0.63f, 10, 10, Arrays.asList("ex1", "ex5"), Arrays.asList("ex3", "ex10", "ex11"));
classifier.getActiveSolrServer().commit();
performanceEstimates = classifier.getPerformanceEstimates("urn:t/002");
assertTrue(performanceEstimates.uptodate);
// precision/recall/f1 are means of the two folds: (0.76+0.79)/2, (0.60+0.63)/2, ...
assertEquals(0.775f, performanceEstimates.precision, 0.01);
assertEquals(0.615f, performanceEstimates.recall, 0.01);
assertEquals(0.695f, performanceEstimates.f1, 0.01);
// support counts are summed, not averaged: 34+10 and 32+10
assertEquals(44, performanceEstimates.positiveSupport);
assertEquals(42, performanceEstimates.negativeSupport);
}
Aggregations