Search in sources :

Example 1 with ClassificationReport

use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.

the class TopicClassificationEngine method computeEnhancements.

/**
 * Classifies the textual content of the given content item and writes one topic annotation per
 * suggested topic into the content item's metadata graph.
 * <p>
 * The method returns without modifying the metadata when the extracted text is blank or when
 * {@code suggestTopics} yields no suggestion. Otherwise it creates a single text annotation typed
 * as SKOS concept and, for every suggestion, an entity annotation linked to it that carries the
 * concept reference, the confidence score, the classifier performance estimates (only when the
 * report is flagged up to date) and an English label when the entity can be resolved.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part (raised as
 *         {@link InvalidContentException}), or if a {@link ClassifierException},
 *         {@link IllegalArgumentException} or {@link EntityhubException} occurs while suggesting
 *         topics or writing the annotations
 * @throws IllegalStateException if the content item has no content part with a supported mime
 *         type, or if its language is not accepted by this engine
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type found for ContentItem " + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    // an empty accepted set, or one containing the empty string, accepts every language
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            // nothing to annotate: leave the metadata untouched
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    // all writes to the metadata graph happen under the content item's write lock
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    // fall back to rdfs:label when no skos:prefLabel is available
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) IOException(java.io.IOException) TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 2 with ClassificationReport

use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.

the class TopicEngineTest method testUpdatePerformanceEstimates.

/**
 * Exercises manual updates of the performance metadata of a concept: an unregistered concept is
 * rejected, a freshly registered concept has no up-to-date report, and successive updates are
 * reflected in the returned report (precision/recall/f1 within a 0.01 tolerance, supports as
 * exact totals).
 */
@Test
public void testUpdatePerformanceEstimates() throws Exception {
    log.info(" --- testUpdatePerformanceEstimates --- ");
    final String broaderId = "urn:t/001";
    final String conceptId = "urn:t/002";

    // asking for the report of a concept that was never registered must fail
    try {
        classifier.getPerformanceEstimates(broaderId);
        fail("Should have raised ClassifierException");
    } catch (ClassifierException expected) {
        // expected
    }

    // register a root concept and a second concept that declares it as broader
    classifier.addConcept(broaderId, null);
    classifier.addConcept(conceptId, Arrays.asList("urn:t/001"));
    ClassificationReport initialReport = classifier.getPerformanceEstimates(conceptId);
    assertFalse(initialReport.uptodate);

    // first fold: push performance metadata by hand and read it back
    classifier.updatePerformanceMetadata(conceptId, 0.76f, 0.60f, 34, 32,
        Arrays.asList("ex14", "ex78"), Arrays.asList("ex34", "ex23", "ex89"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport firstFold = classifier.getPerformanceEstimates(conceptId);
    assertTrue(firstFold.uptodate);
    assertEquals(0.76f, firstFold.precision, 0.01);
    assertEquals(0.60f, firstFold.recall, 0.01);
    assertEquals(0.67f, firstFold.f1, 0.01);
    assertEquals(34, firstFold.positiveSupport);
    assertEquals(32, firstFold.negativeSupport);
    assertTrue(classifier.getBroaderConcepts(conceptId).contains(broaderId));

    // second fold: statistics of further folds are accumulated with the previous ones
    classifier.updatePerformanceMetadata(conceptId, 0.79f, 0.63f, 10, 10,
        Arrays.asList("ex1", "ex5"), Arrays.asList("ex3", "ex10", "ex11"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport secondFold = classifier.getPerformanceEstimates(conceptId);
    assertTrue(secondFold.uptodate);
    assertEquals(0.775f, secondFold.precision, 0.01);
    assertEquals(0.615f, secondFold.recall, 0.01);
    assertEquals(0.695f, secondFold.f1, 0.01);
    assertEquals(44, secondFold.positiveSupport);
    assertEquals(42, secondFold.negativeSupport);
}
Also used : ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) Test(org.junit.Test)

Example 3 with ClassificationReport

use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.

the class TopicEngineTest method testCrossValidation.

/**
 * Builds an artificial training set, checks that no performance report is up to date before an
 * evaluation has been run, then triggers an evaluation and verifies minimum expectations on the
 * report of every topic.
 */
@Test
public void testCrossValidation() throws Exception {
    log.info(" --- testCrossValidation --- ");
    // fixed seed so that the generated data set is the same on every run
    Random rng = new Random(0);

    // artificial data set used both for training the models and for evaluating them
    int numberOfTopics = 10;
    int numberOfDocuments = 100;
    int vocabSizeMin = 20;
    int vocabSizeMax = 30;
    initArtificialTrainingSet(numberOfTopics, numberOfDocuments, vocabSizeMin, vocabSizeMax, rng);

    // before any evaluation the reports exist but are not flagged as up to date
    for (String conceptId : new String[] {"urn:t/001", "urn:t/002", "urn:t/003"}) {
        assertFalse(classifier.getPerformanceEstimates(conceptId).uptodate);
    }
    // an unknown concept is rejected
    try {
        classifier.getPerformanceEstimates("urn:doesnotexist");
        fail("Should have raised a ClassifierException");
    } catch (ClassifierException expected) {
        // expected
    }

    // evaluate the classifier against the current state of the training set
    assertEquals(numberOfTopics, classifier.updatePerformanceEstimates(true));
    for (int topicIndex = 1; topicIndex <= numberOfTopics; topicIndex++) {
        String conceptId = String.format("urn:t/%03d", topicIndex);
        ClassificationReport report = classifier.getPerformanceEstimates(conceptId);
        assertTrue(report.uptodate);
        assertGreater(report.precision, 0.45f);
        assertNotNull(report.falsePositiveExampleIds);
        assertNotNull(report.falseNegativeExampleIds);
        // imperfect precision / recall must come with at least one offending example id
        if (report.precision < 1) {
            assertFalse(report.falsePositiveExampleIds.isEmpty());
        }
        if (report.recall < 1) {
            assertFalse(report.falseNegativeExampleIds.isEmpty());
        }
        assertGreater(report.recall, 0.45f);
        assertGreater(report.f1, 0.55f);
        // the support is very small, which makes the estimates unstable: the minimum
        // expectations are kept low so that the test still runs reasonably fast
        assertGreater(report.positiveSupport, 4);
        assertGreater(report.negativeSupport, 4);
        assertNotNull(report.evaluationDate);
    }
    // TODO: test model invalidation by registering a sub topic manually
}
Also used : Random(java.util.Random) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) Test(org.junit.Test)

Example 4 with ClassificationReport

use of org.apache.stanbol.enhancer.topic.api.ClassificationReport in project stanbol by apache.

the class TopicClassificationEngine method getPerformanceEstimates.

/**
 * Fetches the metadata entry of the given concept from the active Solr server and turns it into
 * a {@link ClassificationReport}.
 * <p>
 * Precision and recall are read through {@code computeMeanValue} and the supports through
 * {@code computeSumValue}. The report is flagged up to date exactly when the metadata entry
 * carries a model evaluation date. False positive / false negative example ids stored on the
 * entry are copied into the report as strings.
 *
 * @param conceptId identifier of the concept whose performance report is requested
 * @return the report built from the first matching metadata entry
 * @throws ClassifierException if no metadata entry matches {@code conceptId}, or if the Solr
 *         query fails (the {@link SolrServerException} is attached as the cause)
 */
@Override
public ClassificationReport getPerformanceEstimates(String conceptId) throws ClassifierException {
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery("*:*");
    query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
    query.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
    try {
        SolrDocumentList results = solrServer.query(query).getResults();
        if (results.isEmpty()) {
            throw new ClassifierException(String.format("'%s' is not a registered topic", conceptId));
        }
        SolrDocument metadata = results.get(0);
        Float precision = computeMeanValue(metadata, precisionField);
        Float recall = computeMeanValue(metadata, recallField);
        int positiveSupport = computeSumValue(metadata, positiveSupportField);
        int negativeSupport = computeSumValue(metadata, negativeSupportField);
        Date evaluationDate = (Date) metadata.getFirstValue(modelEvaluationDateField);
        boolean uptodate = evaluationDate != null;
        ClassificationReport report = new ClassificationReport(precision, recall, positiveSupport, negativeSupport, uptodate, evaluationDate);
        // a missing field simply means there is no example id to report
        if (metadata.getFieldValues(falsePositivesField) != null) {
            for (Object falsePositiveId : metadata.getFieldValues(falsePositivesField)) {
                report.falsePositiveExampleIds.add(falsePositiveId.toString());
            }
        }
        if (metadata.getFieldValues(falseNegativesField) != null) {
            for (Object falseNegativeId : metadata.getFieldValues(falseNegativesField)) {
                report.falseNegativeExampleIds.add(falseNegativeId.toString());
            }
        }
        return report;
    } catch (SolrServerException e) {
        // The concept id is concatenated rather than used as a format string: an id containing
        // '%' would otherwise make String.format throw instead of reporting the Solr failure.
        ClassifierException failure = new ClassifierException("Error fetching the performance report for topic " + conceptId);
        // keep the underlying Solr failure; initCause is used so that only the String
        // constructor already used in this method is relied upon
        failure.initCause(e);
        throw failure;
    }
}
Also used : SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrDocumentList(org.apache.solr.common.SolrDocumentList) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrQuery(org.apache.solr.client.solrj.SolrQuery) Date(java.util.Date) SolrDocument(org.apache.solr.common.SolrDocument) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Aggregations

ClassificationReport (org.apache.stanbol.enhancer.topic.api.ClassificationReport)4 ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException)4 Test (org.junit.Test)2 IOException (java.io.IOException)1 Date (java.util.Date)1 Random (java.util.Random)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)1 SolrQuery (org.apache.solr.client.solrj.SolrQuery)1 SolrServer (org.apache.solr.client.solrj.SolrServer)1 SolrServerException (org.apache.solr.client.solrj.SolrServerException)1 EmbeddedSolrServer (org.apache.solr.client.solrj.embedded.EmbeddedSolrServer)1 SolrDocument (org.apache.solr.common.SolrDocument)1 SolrDocumentList (org.apache.solr.common.SolrDocumentList)1 ManagedSolrServer (org.apache.stanbol.commons.solr.managed.ManagedSolrServer)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1