Search in sources :

Example 1 with TrainingSetException

use of org.apache.stanbol.enhancer.topic.api.training.TrainingSetException in project stanbol by apache.

the class TopicClassificationEngine method performCVFold.

/**
 * Runs one cross-validation fold: trains a throw-away evaluation classifier on the training folds and
 * scores it on the held-out test fold, storing per-topic precision / recall via
 * {@code updatePerformanceMetadata}.
 * <p>
 * The evaluation classifier is a fresh {@code TopicClassificationEngine} that is bound either to the
 * managed Solr server (OSGi setup) or to a lazily created embedded Solr server. It is populated with the
 * same concepts as this engine and shares this engine's training set.
 *
 * @param cvFoldIndex
 *            zero-based index of the fold used as test set
 * @param cvFoldCount
 *            total number of folds the examples are split into
 * @param cvIterations
 *            number of CV iterations planned by the caller (used for progress logging only); a value
 *            {@code <= 0} means "one iteration per fold"
 * @param incremental
 *            currently not read by this method (the evaluation model is always rebuilt with
 *            {@code updateModel(false)})
 * @return the number of topics whose performance metadata was updated
 * @throws ConfigurationException
 *             declared for the configuration helpers used during setup
 * @throws TrainingSetException
 *             if the training set or the topic index cannot be read
 * @throws ClassifierException
 *             if the evaluation classifier cannot be set up, trained or queried
 */
protected int performCVFold(int cvFoldIndex, int cvFoldCount, int cvIterations, boolean incremental) throws ConfigurationException, TrainingSetException, ClassifierException {
    // keep the caller supplied iteration count; only fall back to the fold count when none was given
    cvIterations = cvIterations <= 0 ? cvFoldCount : cvIterations;
    log.info(String.format("Performing evaluation %d-fold CV iteration %d/%d on classifier %s", cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
    long start = System.currentTimeMillis();
    final TopicClassificationEngine classifier = new TopicClassificationEngine();
    try {
        if (managedSolrServer != null) {
            // OSGi setup: the evaluation server will be generated automatically using the
            // managedSolrServer
            classifier.bindManagedSolrServer(managedSolrServer);
            // TODO: maybe we should use the SolrCoreName instead
            classifier.activate(context, getCanonicalConfiguration(engineName + "-evaluation", solrCoreConfig));
        } else {
            if (__evaluationServer == null) {
                __evaluationServerDir = new File(embeddedSolrServerDir, engineName + "-evaluation");
                if (!__evaluationServerDir.exists()) {
                    FileUtils.forceMkdir(__evaluationServerDir);
                }
                __evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir, "evaluationclassifierserver", "default-topic-model", "default-topic-model");
            }
            classifier.configure(getCanonicalConfiguration(__evaluationServer, solrCoreConfig));
        }
    } catch (Exception e) {
        throw new ClassifierException(e);
    }
    int updatedTopics;
    try {
        // clean all previous concepts from the evaluation classifier in case we are reusing an existing
        // solr index from OSGi.
        classifier.removeAllConcepts();
        // iterate over all the topics to register them in the evaluation classifier
        batchOverTopics(new BatchProcessor<SolrDocument>() {

            @Override
            public int process(List<SolrDocument> batch) throws ClassifierException {
                for (SolrDocument topicEntry : batch) {
                    String conceptId = topicEntry.getFirstValue(conceptUriField).toString();
                    Collection<Object> broader = topicEntry.getFieldValues(broaderField);
                    if (broader == null) {
                        classifier.addConcept(conceptId, null, null);
                    } else {
                        List<String> broaderConcepts = new ArrayList<String>();
                        for (Object broaderConcept : broader) {
                            broaderConcepts.add(broaderConcept.toString());
                        }
                        classifier.addConcept(conceptId, null, broaderConcepts);
                    }
                }
                return batch.size();
            }
        });
        // build the model for the current train CV folds
        classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
        // bind our new classifier to the same training set as the parent
        classifier.setTrainingSet(getTrainingSet());
        classifier.updateModel(false);
        final int foldCount = cvFoldCount;
        final int foldIndex = cvFoldIndex;
        // iterate over the topics again to compute scores on the test fold
        updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() {

            @Override
            public int process(List<SolrDocument> batch) throws TrainingSetException, ClassifierException {
                int offset;
                int updated = 0;
                for (SolrDocument topicMetadata : batch) {
                    String topic = topicMetadata.getFirstValue(conceptUriField).toString();
                    List<String> topics = Arrays.asList(topic);
                    List<String> falseNegativeExamples = new ArrayList<String>();
                    int truePositives = 0;
                    int falseNegatives = 0;
                    int positiveSupport = 0;
                    offset = 0;
                    Batch<Example> examples = Batch.emtpyBatch(Example.class);
                    boolean skipTopic = false;
                    do {
                        examples = getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
                        if (offset == 0 && examples.items.size() < MIN_EVALUATION_SAMPLES) {
                            // we need a minimum amount of examples otherwise it's really not
                            // worth computing statistics
                            skipTopic = true;
                            break;
                        }
                        for (Example example : examples.items) {
                            if (!(offset % foldCount == foldIndex)) {
                                // this example is not part of the test fold, skip it
                                offset++;
                                continue;
                            }
                            positiveSupport++;
                            offset++;
                            List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                            boolean match = false;
                            for (TopicSuggestion suggestedTopic : suggestedTopics) {
                                if (topic.equals(suggestedTopic.conceptUri)) {
                                    match = true;
                                    truePositives++;
                                    break;
                                }
                            }
                            if (!match) {
                                falseNegatives++;
                                // only keep a bounded sample of misclassified example ids per fold
                                if (falseNegativeExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                    falseNegativeExamples.add(example.id);
                                }
                            }
                        }
                    } while (!skipTopic && examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                    List<String> falsePositiveExamples = new ArrayList<String>();
                    int falsePositives = 0;
                    int negativeSupport = 0;
                    offset = 0;
                    examples = Batch.emtpyBatch(Example.class);
                    do {
                        if (skipTopic) {
                            break;
                        }
                        examples = getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
                        for (Example example : examples.items) {
                            if (!(offset % foldCount == foldIndex)) {
                                // this example is not part of the test fold, skip it
                                offset++;
                                continue;
                            }
                            negativeSupport++;
                            offset++;
                            List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                            for (TopicSuggestion suggestedTopic : suggestedTopics) {
                                if (topic.equals(suggestedTopic.conceptUri)) {
                                    falsePositives++;
                                    if (falsePositiveExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                        falsePositiveExamples.add(example.id);
                                    }
                                    break;
                                }
                            }
                            // we don't need to collect true negatives
                        }
                    } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                    if (skipTopic) {
                        log.debug("Skipping evaluation of {} because too few positive examples.", topic);
                    } else {
                        // compute precision and recall for the current test fold and topic;
                        // both default to 0 when their denominator would be 0
                        float precision = 0;
                        if (truePositives != 0 || falsePositives != 0) {
                            precision = truePositives / (float) (truePositives + falsePositives);
                        }
                        float recall = 0;
                        if (truePositives != 0 || falseNegatives != 0) {
                            recall = truePositives / (float) (truePositives + falseNegatives);
                        }
                        updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport, falsePositiveExamples, falseNegativeExamples);
                        updated += 1;
                    }
                }
                try {
                    getActiveSolrServer().commit();
                } catch (Exception e) {
                    throw new ClassifierException(e);
                }
                return updated;
            }
        });
    } finally {
        if (context != null) {
            // close open trackers, even when training or evaluation failed
            classifier.deactivate(context);
        }
    }
    long stop = System.currentTimeMillis();
    log.info(String.format("Finished CV iteration %d/%d on classifier %s in %fs.", cvFoldIndex + 1, cvIterations, engineName, (stop - start) / 1000.0));
    return updatedTopics;
}
Also used : TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigurationException(org.osgi.service.cm.ConfigurationException) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) ChainException(org.apache.stanbol.enhancer.servicesapi.ChainException) IOException(java.io.IOException) SolrDocument(org.apache.solr.common.SolrDocument) Batch(org.apache.stanbol.enhancer.topic.api.Batch) Example(org.apache.stanbol.enhancer.topic.api.training.Example) Collection(java.util.Collection) SolrDocumentList(org.apache.solr.common.SolrDocumentList) List(java.util.List) ArrayList(java.util.ArrayList) File(java.io.File) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 2 with TrainingSetException

use of org.apache.stanbol.enhancer.topic.api.training.TrainingSetException in project stanbol by apache.

the class TopicClassificationEngine method batchOverTopics.

/**
 * Iterates over all topic metadata entries of the active Solr core in pages of 1000 documents, sorted
 * by concept URI, and hands each page to the given processor. A commit is issued after each page and
 * the index is optimized once all pages have been processed.
 * <p>
 * Paging works by requesting one row more than the page size: the concept URI of that extra row is
 * used as the inclusive lower bound of the next page, and its absence marks the last page.
 *
 * @param processor
 *            callback invoked once per page (also for an empty last page)
 * @return the sum of the values returned by {@code processor.process(...)}
 * @throws TrainingSetException
 *             wrapping any exception raised while querying, processing or committing
 */
protected int batchOverTopics(BatchProcessor<SolrDocument> processor) throws TrainingSetException {
    // TODO: implement incremental update by using the date informations
    int processedCount = 0;
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery("*:*");
    String metadataFilter = entryTypeField + ":" + METADATA_ENTRY;
    query.addFilterQuery(metadataFilter);
    String offset = null;
    boolean done = false;
    int batchSize = 1000;
    query.addSortField(conceptUriField, SolrQuery.ORDER.asc);
    // fetch one extra row to learn where the next page starts (and whether there is one)
    query.setRows(batchSize + 1);
    try {
        while (!done) {
            // batch over all the indexed topics
            if (offset != null) {
                // replace (instead of accumulate) the range filter of the previous page
                query.setFilterQueries(metadataFilter, conceptUriField + ":[" + ClientUtils.escapeQueryChars(offset) + " TO *]");
            }
            QueryResponse response = solrServer.query(query);
            String nextOffset = null;
            List<SolrDocument> batchDocuments = new ArrayList<SolrDocument>();
            for (SolrDocument result : response.getResults()) {
                if (batchDocuments.size() == batchSize) {
                    // look-ahead row: not processed now, it is the first row of the next page
                    nextOffset = result.getFirstValue(conceptUriField).toString();
                    break;
                }
                batchDocuments.add(result);
            }
            processedCount += processor.process(batchDocuments);
            solrServer.commit();
            // stop when there was no look-ahead row; a page of exactly batchSize rows without a
            // successor must terminate the loop instead of being fetched again
            offset = nextOffset;
            done = nextOffset == null;
        }
        solrServer.optimize();
    } catch (Exception e) {
        String msg = String.format("Error while updating topics on Solr Core '%s'.", solrCoreId);
        throw new TrainingSetException(msg, e);
    }
    return processedCount;
}
Also used : SolrDocument(org.apache.solr.common.SolrDocument) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) ArrayList(java.util.ArrayList) EmbeddedSolrServer(org.apache.solr.client.solrj.embedded.EmbeddedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrQuery(org.apache.solr.client.solrj.SolrQuery) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigurationException(org.osgi.service.cm.ConfigurationException) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) ChainException(org.apache.stanbol.enhancer.servicesapi.ChainException) IOException(java.io.IOException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)

Example 3 with TrainingSetException

use of org.apache.stanbol.enhancer.topic.api.training.TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method registerExample.

/**
 * Registers, updates or removes a training example in the Solr index and commits the change.
 * <ul>
 * <li>{@code text == null}: the example identified by {@code exampleId} is deleted.</li>
 * <li>otherwise: a document holding the id, the text, the optional topics and the current UTC
 * modification date is added. A random UUID is generated when {@code exampleId} is null or empty.</li>
 * </ul>
 *
 * @param exampleId
 *            id of the example; must not be null when {@code text} is null
 * @param text
 *            text content of the example, or null to request removal
 * @param topics
 *            topic URIs the example is labeled with; may be null
 * @return the id of the example that was removed or registered
 * @throws IllegalArgumentException
 *             if both {@code exampleId} and {@code text} are null
 * @throws TrainingSetException
 *             wrapping any exception raised by the Solr server
 */
@Override
public String registerExample(String exampleId, String text, List<String> topics) throws TrainingSetException {
    if (text == null) {
        // special case: example removal
        if (exampleId == null) {
            throw new IllegalArgumentException("exampleId and text should not be null simultaneously");
        }
        SolrServer solrServer = getActiveSolrServer();
        try {
            // escape the id: it is caller supplied and may contain Solr query syntax characters
            solrServer.deleteByQuery(exampleIdField + ":" + ClientUtils.escapeQueryChars(exampleId));
            solrServer.commit();
            return exampleId;
        } catch (Exception e) {
            String msg = String.format("Error deleting example with id '%s' on Solr Core '%s'", exampleId, solrCoreId);
            throw new TrainingSetException(msg, e);
        }
    }
    if (exampleId == null || exampleId.isEmpty()) {
        exampleId = UUID.randomUUID().toString();
    }
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField(exampleIdField, exampleId);
    doc.addField(exampleTextField, text);
    if (topics != null) {
        doc.addField(topicUrisField, topics);
    }
    doc.addField(modificationDateField, UTCTimeStamper.nowUtcDate());
    SolrServer server = getActiveSolrServer();
    try {
        server.add(doc);
        server.commit();
    } catch (Exception e) {
        String msg = String.format("Could not register example '%s' with topics: ['%s']", exampleId, StringUtils.join(topics, "', '"));
        throw new TrainingSetException(msg, e);
    }
    return exampleId;
}
Also used : SolrInputDocument(org.apache.solr.common.SolrInputDocument) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigurationException(org.osgi.service.cm.ConfigurationException) InvalidSyntaxException(org.osgi.framework.InvalidSyntaxException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)

Example 4 with TrainingSetException

use of org.apache.stanbol.enhancer.topic.api.training.TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method hasChangedSince.

/**
 * Tells whether at least one example has a modification date at or after the reference date,
 * optionally restricted to examples labeled with any of the given topics.
 *
 * @param topics
 *            topic URIs to restrict the check to; null or empty means no restriction
 * @param referenceDate
 *            lower bound (inclusive) of the modification date range
 * @return true if the query matches at least one example
 * @throws TrainingSetException
 *             if the Solr query fails
 */
@Override
public boolean hasChangedSince(List<String> topics, Date referenceDate) throws TrainingSetException {
    final String utcIsoDate = UTCTimeStamper.utcIsoString(referenceDate);
    String queryString = modificationDateField + ":[" + utcIsoDate + " TO *]";
    if (topics != null && !topics.isEmpty()) {
        List<String> topicClauses = new ArrayList<String>();
        for (String topic : topics) {
            // escape the topic URI so that special solr chars are matched literally
            topicClauses.add(topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        }
        queryString += " AND (" + StringUtils.join(topicClauses, " OR ") + ")";
    }
    SolrQuery query = new SolrQuery(queryString);
    // a single hit with only its id is enough to answer the question
    query.setRows(1);
    query.setFields(exampleIdField);
    try {
        return !getActiveSolrServer().query(query).getResults().isEmpty();
    } catch (SolrServerException e) {
        String msg = String.format("Error while fetching topics for examples modified after '%s' on Solr Core '%s'.", utcIsoDate, solrCoreId);
        throw new TrainingSetException(msg, e);
    }
}
Also used : SolrServerException(org.apache.solr.client.solrj.SolrServerException) ArrayList(java.util.ArrayList) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) SolrQuery(org.apache.solr.client.solrj.SolrQuery) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)

Example 5 with TrainingSetException

use of org.apache.stanbol.enhancer.topic.api.training.TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method getExamples.

/**
 * Fetches one page of examples, sorted by example id.
 * <ul>
 * <li>no topics (null or empty): all examples are candidates;</li>
 * <li>{@code positive}: examples labeled with at least one of the topics;</li>
 * <li>otherwise: examples labeled with none of the topics.</li>
 * </ul>
 * One row more than {@code batchSize} is requested: the id of that extra row becomes the offset of the
 * next page and its absence marks the last page. Documents without a text field are skipped.
 *
 * @param topics
 *            topic URIs used to select the examples; null is treated like an empty list
 * @param offset
 *            example id (inclusive) to start from, as returned in the previous batch, or null for the
 *            first page
 * @param positive
 *            true to fetch positive examples, false to fetch negative ones
 * @return the page of examples together with the has-more flag and the next offset
 * @throws TrainingSetException
 *             if the Solr query fails
 */
protected Batch<Example> getExamples(List<String> topics, Object offset, boolean positive) throws TrainingSetException {
    List<Example> items = new ArrayList<Example>();
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery();
    List<String> parts = new ArrayList<String>();
    String q = "";
    if (topics == null || topics.isEmpty()) {
        q += "*:*";
    } else if (positive) {
        for (String topic : topics) {
            parts.add(topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        }
        // group the OR clauses so that the offset clause appended below applies to all of them
        if (offset != null) {
            q += "(";
        }
        q += StringUtils.join(parts, " OR ");
        if (offset != null) {
            q += ")";
        }
    } else {
        for (String topic : topics) {
            parts.add("-" + topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        }
        q += StringUtils.join(parts, " AND ");
    }
    if (offset != null) {
        // escape the id: example ids are arbitrary strings and may contain Solr syntax characters
        q += " AND " + exampleIdField + ":[" + ClientUtils.escapeQueryChars(offset.toString()) + " TO *]";
    }
    query.setQuery(q);
    query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
    query.set("rows", batchSize + 1);
    String nextExampleId = null;
    try {
        int count = 0;
        QueryResponse response = solrServer.query(query);
        for (SolrDocument result : response.getResults()) {
            if (count == batchSize) {
                // look-ahead row: only remember where the next page starts
                nextExampleId = result.getFirstValue(exampleIdField).toString();
            } else {
                count++;
                String exampleId = result.getFirstValue(exampleIdField).toString();
                Collection<Object> labelValues = result.getFieldValues(topicUrisField);
                Collection<Object> textValues = result.getFieldValues(exampleTextField);
                if (textValues == null) {
                    continue;
                }
                items.add(new Example(exampleId, labelValues, textValues));
            }
        }
    } catch (SolrServerException e) {
        String msg = String.format("Error while fetching %s examples for topics ['%s'] on Solr Core '%s'.", positive ? "positive" : "negative", StringUtils.join(topics, "', '"), solrCoreId);
        throw new TrainingSetException(msg, e);
    }
    return new Batch<Example>(items, nextExampleId != null, nextExampleId);
}
Also used : SolrServerException(org.apache.solr.client.solrj.SolrServerException) ArrayList(java.util.ArrayList) ManagedSolrServer(org.apache.stanbol.commons.solr.managed.ManagedSolrServer) SolrServer(org.apache.solr.client.solrj.SolrServer) SolrQuery(org.apache.solr.client.solrj.SolrQuery) SolrDocument(org.apache.solr.common.SolrDocument) Batch(org.apache.stanbol.enhancer.topic.api.Batch) Example(org.apache.stanbol.enhancer.topic.api.training.Example) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) TrainingSetException(org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)

Aggregations

SolrServerException (org.apache.solr.client.solrj.SolrServerException)6 TrainingSetException (org.apache.stanbol.enhancer.topic.api.training.TrainingSetException)6 SolrServer (org.apache.solr.client.solrj.SolrServer)5 ManagedSolrServer (org.apache.stanbol.commons.solr.managed.ManagedSolrServer)5 ArrayList (java.util.ArrayList)4 InvalidSyntaxException (org.osgi.framework.InvalidSyntaxException)4 ConfigurationException (org.osgi.service.cm.ConfigurationException)4 IOException (java.io.IOException)3 SolrQuery (org.apache.solr.client.solrj.SolrQuery)3 SolrDocument (org.apache.solr.common.SolrDocument)3 ChainException (org.apache.stanbol.enhancer.servicesapi.ChainException)3 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)3 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)3 ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException)3 Example (org.apache.stanbol.enhancer.topic.api.training.Example)3 EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException)3 EmbeddedSolrServer (org.apache.solr.client.solrj.embedded.EmbeddedSolrServer)2 QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)2 SolrInputDocument (org.apache.solr.common.SolrInputDocument)2 Batch (org.apache.stanbol.enhancer.topic.api.Batch)2