Search in sources :

Example 1 with ClassifierException

use of org.apache.stanbol.enhancer.topic.api.ClassifierException in project stanbol by apache.

the class TopicClassificationEngine method addConcept.

/**
 * Registers a concept in the classifier index.
 * <p>
 * Any concept previously registered under the same URI is removed first. Two Solr documents are
 * then written and committed in a single update request: a metadata entry (concept URI, optional
 * broader concepts and primary topic) and a model entry. The metadata entry references the model
 * entry through {@code modelEntryIdField}.
 *
 * @param conceptUri      URI identifying the concept to register
 * @param primaryTopicUri optional primary topic URI; skipped when {@code null} or when no
 *                        primary topic field is configured
 * @param broaderConcepts optional URIs of broader concepts; skipped when {@code null} or when no
 *                        broader field is configured
 * @throws ClassifierException if the Solr update or commit fails
 */
@Override
public void addConcept(String conceptUri, String primaryTopicUri, Collection<String> broaderConcepts) throws ClassifierException {
    // Drop whatever is already registered under this URI so the concept is never indexed twice.
    removeConcept(conceptUri);

    final String metadataId = UUID.randomUUID().toString();
    final String modelId = UUID.randomUUID().toString();
    final boolean hasBroaderConcepts = broaderConcepts != null;

    // Metadata document: describes the concept and points at its model document.
    SolrInputDocument metadataDoc = new SolrInputDocument();
    metadataDoc.addField(conceptUriField, conceptUri);
    metadataDoc.addField(entryIdField, metadataId);
    metadataDoc.addField(modelEntryIdField, modelId);
    metadataDoc.addField(entryTypeField, METADATA_ENTRY);
    if (hasBroaderConcepts && broaderField != null) {
        metadataDoc.addField(broaderField, broaderConcepts);
    }
    if (primaryTopicUriField != null && primaryTopicUri != null) {
        metadataDoc.addField(primaryTopicUriField, primaryTopicUri);
    }

    // Model document: starts out with identification fields only.
    SolrInputDocument modelDoc = new SolrInputDocument();
    modelDoc.addField(entryIdField, modelId);
    modelDoc.addField(conceptUriField, conceptUri);
    modelDoc.addField(entryTypeField, MODEL_ENTRY);

    if (hasBroaderConcepts) {
        // NOTE(review): presumably marks the models of the broader concepts as stale so that they
        // get retrained / re-evaluated -- confirm against invalidateModelFields.
        invalidateModelFields(broaderConcepts, modelUpdateDateField, modelEvaluationDateField);
    }

    SolrServer activeServer = getActiveSolrServer();
    try {
        // Send both documents in one request, then make them visible with a commit.
        UpdateRequest update = new UpdateRequest();
        update.add(metadataDoc);
        update.add(modelDoc);
        activeServer.request(update);
        activeServer.commit();
    } catch (Exception e) {
        throw new ClassifierException(
                String.format("Error adding topic with id '%s' on Solr Core '%s'", conceptUri, solrCoreId), e);
    }
}
Also used : SolrInputDocument(org.apache.solr.common.SolrInputDocument) UpdateRequest(org.apache.solr.client.solrj.request.UpdateRequest) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigurationException(org.osgi.service.cm.ConfigurationException) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) ChainException(org.apache.stanbol.enhancer.servicesapi.ChainException) IOException(java.io.IOException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 2 with ClassifierException

use of org.apache.stanbol.enhancer.topic.api.ClassifierException in project stanbol by apache.

the class TopicClassificationEngine method updatePerformanceMetadata.

/**
 * Folds a new set of evaluation measurements into the metadata entry of a concept.
 * <p>
 * The stored metadata entry is read back, the new measurements are merged into its field values,
 * and the result is re-added to the index with a fresh evaluation date. This method does
 * <strong>not</strong> commit: the caller is responsible for committing.
 *
 * @param conceptId             URI of the concept whose metadata entry is updated
 * @param precision             precision measurement added to the precision field
 * @param recall                recall measurement added to the recall field
 * @param positiveSupport       amount by which the positive support field is incremented
 * @param negativeSupport       amount by which the negative support field is incremented
 * @param falsePositiveExamples example ids added to the false positives field
 * @param falseNegativeExamples example ids added to the false negatives field
 * @throws ClassifierException if querying or updating the Solr index fails
 */
protected void updatePerformanceMetadata(String conceptId, float precision, float recall, int positiveSupport, int negativeSupport, List<String> falsePositiveExamples, List<String> falseNegativeExamples) throws ClassifierException {
    SolrServer activeServer = getActiveSolrServer();
    try {
        // Select the metadata entry (not the model entry) of the requested concept.
        SolrQuery metadataQuery = new SolrQuery("*:*");
        metadataQuery.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
        metadataQuery.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));

        // At most one hit is expected; zero hits is tolerated and simply updates nothing.
        for (SolrDocument storedEntry : activeServer.query(metadataQuery).getResults()) {
            // Copy the existing values: every metadata field is assumed to be stored, so the
            // document can be rebuilt in full from the query result.
            Map<String, Collection<Object>> merged = new HashMap<String, Collection<Object>>();
            for (String name : storedEntry.getFieldNames()) {
                merged.put(name, storedEntry.getFieldValues(name));
            }

            // Merge the new measurements into the copied values.
            addToList(merged, precisionField, precision);
            addToList(merged, recallField, recall);
            increment(merged, positiveSupportField, positiveSupport);
            increment(merged, negativeSupportField, negativeSupport);
            addToList(merged, falsePositivesField, falsePositiveExamples);
            addToList(merged, falseNegativesField, falseNegativeExamples);

            // Rebuild the document from the merged values and stamp the evaluation date.
            SolrInputDocument replacement = new SolrInputDocument();
            for (Map.Entry<String, Collection<Object>> field : merged.entrySet()) {
                replacement.addField(field.getKey(), field.getValue());
            }
            replacement.setField(modelEvaluationDateField, UTCTimeStamper.nowUtcDate());
            activeServer.add(replacement);
        }

        String summary = String.format(
                "Performance for concept '%s': precision=%f, recall=%f, positiveSupport=%d, negativeSupport=%d",
                conceptId, precision, recall, positiveSupport, negativeSupport);
        log.info(summary);
    } catch (Exception e) {
        throw new ClassifierException(String.format(
                "Error updating performance metadata for topic '%s' on Solr Core '%s'", conceptId, solrCoreId), e);
    }
}
Also used : HashMap(java.util.HashMap) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrQuery(org.apache.solr.client.solrj.SolrQuery) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigurationException(org.osgi.service.cm.ConfigurationException) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) ChainException(org.apache.stanbol.enhancer.servicesapi.ChainException) IOException(java.io.IOException) SolrInputDocument(org.apache.solr.common.SolrInputDocument) SolrDocument(org.apache.solr.common.SolrDocument) Collection(java.util.Collection) Map(java.util.Map) HashMap(java.util.HashMap) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 3 with ClassifierException

use of org.apache.stanbol.enhancer.topic.api.ClassifierException in project stanbol by apache.

the class TopicClassificationEngine method updatePerformanceEstimates.

/**
 * Runs a cross-validation pass over the training set to refresh the performance estimates.
 * <p>
 * Only one evaluation may run at a time: if one is already flagged as running, the call fails
 * immediately. The flag is always cleared and the evaluation server directory is always
 * deleted (best effort) when the method exits.
 *
 * @param incremental forwarded unchanged to each cross-validation fold
 * @return the value returned by the last executed fold. NOTE(review): the per-fold results are
 *         overwritten rather than accumulated -- confirm this is the intended count.
 * @throws ClassifierException  if another evaluation is running, or if a configuration, I/O or
 *                              Solr error occurs during the evaluation
 * @throws TrainingSetException declared for failures related to the training set
 */
public synchronized int updatePerformanceEstimates(boolean incremental) throws ClassifierException, TrainingSetException {
    checkTrainingSet();
    if (evaluationRunning) {
        throw new ClassifierException("Another evaluation is already running");
    }

    // 3-fold cross-validation is hardcoded for now.
    final int cvFoldCount = 3;
    // Number of folds actually evaluated; kept separate so it can later be limited.
    final int cvIterationCount = 3;

    int updatedTopics = 0;
    try {
        evaluationRunning = true;
        // The training set is used intensively below: make sure its index is packed and its
        // statistics are up to date before starting.
        getTrainingSet().optimize();

        int foldIndex = 0;
        while (foldIndex < cvIterationCount) {
            updatedTopics = performCVFold(foldIndex, cvFoldCount, cvIterationCount, incremental);
            foldIndex++;
        }

        getActiveSolrServer().optimize();
    } catch (ConfigurationException e) {
        throw new ClassifierException(e);
    } catch (IOException e) {
        throw new ClassifierException(e);
    } catch (SolrServerException e) {
        throw new ClassifierException(e);
    } finally {
        // Remove the evaluation server directory without raising on failure, then release the
        // "evaluation running" flag so that a later call can proceed.
        FileUtils.deleteQuietly(__evaluationServerDir);
        evaluationRunning = false;
    }
    return updatedTopics;
}
Also used : ConfigurationException(org.osgi.service.cm.ConfigurationException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) IOException(java.io.IOException) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 4 with ClassifierException

use of org.apache.stanbol.enhancer.topic.api.ClassifierException in project stanbol by apache.

the class TopicClassificationEngine method computeEnhancements.

/**
 * Classifies the textual content of a content item and records the suggested topics as
 * enhancements in the item's metadata graph.
 * <p>
 * Steps:
 * <ol>
 * <li>locate a content part with a supported mime type and check that the item's language is
 * accepted by this engine;</li>
 * <li>extract the text and ask the classifier for topic suggestions (nothing is written when the
 * text is blank or no topic is suggested);</li>
 * <li>under the content item's write lock, create one global text annotation plus one topic
 * annotation per suggestion, carrying the concept reference, the confidence, the classifier
 * performance estimates (only when up to date) and, when it can be resolved, an English
 * label.</li>
 * </ol>
 *
 * @param ci the content item to enhance
 * @throws IllegalStateException   if no supported content part is found, or if the language of
 *                                 the item is not accepted by this engine
 * @throws InvalidContentException if the text cannot be read from the content part
 * @throws EngineException         if classification, performance lookup or entity lookup fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        // The concatenated fragments previously lacked separating spaces, which produced
        // "Mime Typefound", "<uri>(supported:" and "wasNOT called" in the message.
        throw new IllegalStateException("No ContentPart with a supported Mime Type found for ContentItem " + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    // An empty accepted set, or a set containing the empty string, accepts every language.
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        // Classification is done before taking the write lock.
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    // Properties used to attach the classifier performance estimates to each topic annotation.
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier, but only when they are flagged as up to date
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                // skos:prefLabel is tried first, rdfs:label is the fallback.
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        // Always release the write lock, including when an exception is being propagated.
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) IOException(java.io.IOException) TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 5 with ClassifierException

use of org.apache.stanbol.enhancer.topic.api.ClassifierException in project stanbol by apache.

the class TopicEngineTest method testUpdatePerformanceEstimates.

/**
 * Checks that performance estimates are unavailable for unknown concepts, start out as not up
 * to date for freshly registered concepts, and are accumulated across successive calls to
 * {@code updatePerformanceMetadata}.
 */
@Test
public void testUpdatePerformanceEstimates() throws Exception {
    log.info(" --- testUpdatePerformanceEstimates --- ");

    // Asking for the estimates of a concept that was never registered must fail.
    try {
        classifier.getPerformanceEstimates("urn:t/001");
        fail("Should have raised ClassifierException");
    } catch (ClassifierException expected) {
        // expected: nothing is registered yet
    }

    // Register two concepts, the second one having the first as broader concept.
    classifier.addConcept("urn:t/001", null);
    classifier.addConcept("urn:t/002", Arrays.asList("urn:t/001"));
    ClassificationReport beforeEvaluation = classifier.getPerformanceEstimates("urn:t/002");
    assertFalse(beforeEvaluation.uptodate);

    // First manual update of the performance metadata; the commit is done by the caller.
    classifier.updatePerformanceMetadata("urn:t/002", 0.76f, 0.60f, 34, 32, Arrays.asList("ex14", "ex78"), Arrays.asList("ex34", "ex23", "ex89"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport afterFirstUpdate = classifier.getPerformanceEstimates("urn:t/002");
    assertTrue(afterFirstUpdate.uptodate);
    assertEquals(0.76f, afterFirstUpdate.precision, 0.01);
    assertEquals(0.60f, afterFirstUpdate.recall, 0.01);
    assertEquals(0.67f, afterFirstUpdate.f1, 0.01);
    assertEquals(34, afterFirstUpdate.positiveSupport);
    assertEquals(32, afterFirstUpdate.negativeSupport);
    assertTrue(classifier.getBroaderConcepts("urn:t/002").contains("urn:t/001"));

    // Second update: precision and recall are expected to be averaged with the first
    // measurements, while the supports are expected to add up.
    classifier.updatePerformanceMetadata("urn:t/002", 0.79f, 0.63f, 10, 10, Arrays.asList("ex1", "ex5"), Arrays.asList("ex3", "ex10", "ex11"));
    classifier.getActiveSolrServer().commit();
    ClassificationReport afterSecondUpdate = classifier.getPerformanceEstimates("urn:t/002");
    assertTrue(afterSecondUpdate.uptodate);
    assertEquals(0.775f, afterSecondUpdate.precision, 0.01);
    assertEquals(0.615f, afterSecondUpdate.recall, 0.01);
    assertEquals(0.695f, afterSecondUpdate.f1, 0.01);
    assertEquals(44, afterSecondUpdate.positiveSupport);
    assertEquals(42, afterSecondUpdate.negativeSupport);
}
Also used : ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) Test(org.junit.Test)

Aggregations

ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException)16 SolrServerException (org.apache.solr.client.solrj.SolrServerException)13 SolrServer (org.apache.solr.client.solrj.SolrServer)12 EmbeddedSolrServer (org.apache.solr.client.solrj.embedded.EmbeddedSolrServer)12 ManagedSolrServer (org.apache.stanbol.commons.solr.managed.ManagedSolrServer)12 IOException (java.io.IOException)9 SolrDocument (org.apache.solr.common.SolrDocument)8 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)8 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)8 EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException)8 ConfigurationException (org.osgi.service.cm.ConfigurationException)8 SolrQuery (org.apache.solr.client.solrj.SolrQuery)7 ChainException (org.apache.stanbol.enhancer.servicesapi.ChainException)7 TrainingSetException (org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)7 InvalidSyntaxException (org.osgi.framework.InvalidSyntaxException)7 SolrInputDocument (org.apache.solr.common.SolrInputDocument)4 ClassificationReport (org.apache.stanbol.enhancer.topic.api.ClassificationReport)4 LinkedHashSet (java.util.LinkedHashSet)3 UpdateRequest (org.apache.solr.client.solrj.request.UpdateRequest)3 SolrDocumentList (org.apache.solr.common.SolrDocumentList)3