use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.
the class TopicClassificationEngine method computeEnhancements.
/**
 * Classifies the textual content of the given content item and records every suggested topic
 * as a topic annotation in the item's metadata graph.
 * <p>
 * A single text annotation typed as SKOS concept is created first. Each topic annotation is
 * related to it and carries the concept reference, the confidence score, the precision /
 * recall / f1 of the classifier when the stored report is flagged as up to date, and an
 * English label when the entityhub or a referenced site returns an entity for the concept.
 * <p>
 * Nothing is written when the extracted text is blank or when no topic is suggested.
 *
 * @param ci the content item to enhance
 * @throws EngineException if suggesting topics, reading the performance report or looking up
 *         the entity fails
 * @throws IllegalStateException if the item has no content part with a supported mime type or
 *         its language is not accepted by this engine
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type found for ContentItem "
                + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES
                + "') -> this indicates that canEnhance was NOT called and indicates a bug"
                + " in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    // an empty set or the "" entry both mean that every language is accepted
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language
                + "' of the ContentItem is not configured as active for this Engine (active: "
                + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format(
            "Unable to extract textual content from ContentPart %s of ContentItem %s!",
            contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    // NOTE(review): the performance report and entity lookups below run while the write lock
    // is held; if they turn out to be slow, consider fetching them before locking.
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    // fall back to rdfs:label when no SKOS preferred label is present
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.
the class TopicEngineTest method testUpdatePerformanceEstimates.
/**
 * Verifies that performance metadata pushed with {@code updatePerformanceMetadata} is reported
 * by {@code getPerformanceEstimates}: first after a single update, then after a second update
 * whose statistics are combined with the first.
 */
@Test
public void testUpdatePerformanceEstimates() throws Exception {
    log.info(" --- testUpdatePerformanceEstimates --- ");
    final String broaderId = "urn:t/001";
    final String narrowerId = "urn:t/002";

    // asking for a concept before anything is registered must be rejected
    try {
        classifier.getPerformanceEstimates(broaderId);
        fail("Should have raised ClassifierException");
    } catch (ClassifierException expected) {
        // expected
    }

    // register a broader concept and a narrower one pointing to it
    classifier.addConcept(broaderId, null);
    classifier.addConcept(narrowerId, Arrays.asList(broaderId));
    ClassificationReport initial = classifier.getPerformanceEstimates(narrowerId);
    assertFalse(initial.uptodate);

    // first manual update of the performance metadata
    classifier.updatePerformanceMetadata(narrowerId, 0.76f, 0.60f, 34, 32,
        Arrays.asList("ex14", "ex78"), Arrays.asList("ex34", "ex23", "ex89"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport afterFirstFold = classifier.getPerformanceEstimates(narrowerId);
    assertTrue(afterFirstFold.uptodate);
    assertEquals(0.76f, afterFirstFold.precision, 0.01);
    assertEquals(0.60f, afterFirstFold.recall, 0.01);
    assertEquals(0.67f, afterFirstFold.f1, 0.01);
    assertEquals(34, afterFirstFold.positiveSupport);
    assertEquals(32, afterFirstFold.negativeSupport);
    assertTrue(classifier.getBroaderConcepts(narrowerId).contains(broaderId));

    // second update: precision / recall are expected as means, supports as sums
    classifier.updatePerformanceMetadata(narrowerId, 0.79f, 0.63f, 10, 10,
        Arrays.asList("ex1", "ex5"), Arrays.asList("ex3", "ex10", "ex11"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport afterSecondFold = classifier.getPerformanceEstimates(narrowerId);
    assertTrue(afterSecondFold.uptodate);
    assertEquals(0.775f, afterSecondFold.precision, 0.01);
    assertEquals(0.615f, afterSecondFold.recall, 0.01);
    assertEquals(0.695f, afterSecondFold.f1, 0.01);
    assertEquals(44, afterSecondFold.positiveSupport);
    assertEquals(42, afterSecondFold.negativeSupport);
}
use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.
the class TopicEngineTest method testCrossValidation.
/**
 * Builds an artificial training set, checks that no performance report is flagged up to date
 * beforehand, then runs {@code updatePerformanceEstimates} and checks the report of every
 * topic against loose lower bounds.
 */
@Test
public void testCrossValidation() throws Exception {
    log.info(" --- testCrossValidation --- ");
    // fixed seed so that the generated data set is the same on every run
    Random random = new Random(0);

    // artificial data set used both for training and for evaluation
    int topicCount = 10;
    int documentCount = 100;
    int minVocabSize = 20;
    int maxVocabSize = 30;
    initArtificialTrainingSet(topicCount, documentCount, minVocabSize, maxVocabSize, random);

    // before any evaluation the reports exist but are not up to date
    for (String registeredId : new String[] {"urn:t/001", "urn:t/002", "urn:t/003"}) {
        ClassificationReport stale = classifier.getPerformanceEstimates(registeredId);
        assertFalse(stale.uptodate);
    }
    try {
        classifier.getPerformanceEstimates("urn:doesnotexist");
        fail("Should have raised a ClassifierException");
    } catch (ClassifierException expected) {
        // expected
    }

    // evaluate the classifier against the current state of the training set
    assertEquals(topicCount, classifier.updatePerformanceEstimates(true));
    for (int topicIndex = 1; topicIndex <= topicCount; topicIndex++) {
        ClassificationReport report = classifier.getPerformanceEstimates(String.format("urn:t/%03d", topicIndex));
        assertTrue(report.uptodate);
        assertGreater(report.precision, 0.45f);
        assertNotNull(report.falsePositiveExampleIds);
        assertNotNull(report.falseNegativeExampleIds);
        // an imperfect score implies that at least one offending example was recorded
        if (report.precision < 1) {
            assertFalse(report.falsePositiveExampleIds.isEmpty());
        }
        if (report.recall < 1) {
            assertFalse(report.falseNegativeExampleIds.isEmpty());
        }
        assertGreater(report.recall, 0.45f);
        assertGreater(report.f1, 0.55f);
        // the support is tiny so the estimates are unstable; the thresholds are deliberately
        // low to keep the test reasonably fast
        assertGreater(report.positiveSupport, 4);
        assertGreater(report.negativeSupport, 4);
        assertNotNull(report.evaluationDate);
    }
    // TODO: test model invalidation by registering a sub topic manually
}
use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.
the class TopicClassificationEngine method getPerformanceEstimates.
/**
 * Reads the metadata entry stored for a concept on the active Solr server and turns it into
 * a {@link ClassificationReport}.
 * <p>
 * Precision and recall are taken from {@code computeMeanValue}, the supports from
 * {@code computeSumValue}. The report is flagged up to date exactly when the entry carries a
 * model evaluation date. Stored false positive / false negative example ids are copied into
 * the report as strings; a missing field simply contributes no ids.
 *
 * @param conceptId the id of the concept whose report is requested
 * @return the report built from the first matching metadata entry
 * @throws ClassifierException if no metadata entry matches the concept or if the Solr query
 *         fails (the Solr failure is attached as the cause)
 */
@Override
public ClassificationReport getPerformanceEstimates(String conceptId) throws ClassifierException {
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery("*:*");
    query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
    query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
    try {
        SolrDocumentList results = solrServer.query(query).getResults();
        if (results.isEmpty()) {
            throw new ClassifierException(String.format("'%s' is not a registered topic", conceptId));
        }
        SolrDocument metadata = results.get(0);
        Float precision = computeMeanValue(metadata, precisionField);
        Float recall = computeMeanValue(metadata, recallField);
        int positiveSupport = computeSumValue(metadata, positiveSupportField);
        int negativeSupport = computeSumValue(metadata, negativeSupportField);
        Date evaluationDate = (Date) metadata.getFirstValue(modelEvaluationDateField);
        boolean uptodate = evaluationDate != null;
        ClassificationReport report = new ClassificationReport(precision, recall, positiveSupport, negativeSupport, uptodate, evaluationDate);
        // a missing field is treated as "no examples" without touching the fetched document
        Iterable<?> falsePositiveIds = metadata.getFieldValues(falsePositivesField);
        if (falsePositiveIds != null) {
            for (Object falsePositiveId : falsePositiveIds) {
                report.falsePositiveExampleIds.add(falsePositiveId.toString());
            }
        }
        Iterable<?> falseNegativeIds = metadata.getFieldValues(falseNegativesField);
        if (falseNegativeIds != null) {
            for (Object falseNegativeId : falseNegativeIds) {
                report.falseNegativeExampleIds.add(falseNegativeId.toString());
            }
        }
        return report;
    } catch (SolrServerException e) {
        // The id goes in as a format argument: concatenated into the format string, a '%'
        // inside it (e.g. a percent-encoded URI) would make String.format itself throw.
        ClassifierException failure = new ClassifierException(
            String.format("Error fetching the performance report for topic %s", conceptId));
        // keep the Solr failure reachable for whoever diagnoses the problem
        failure.initCause(e);
        throw failure;
    }
}
Aggregations