Example 1 with TrainingSetException

Use of TrainingSetException in project stanbol by apache.

the class TopicClassificationEngine method performCVFold.

/**
 * Runs one cross-validation fold: a separate TopicClassificationEngine instance is set up on an
 * evaluation Solr server, every topic of this engine is registered on it, and precision and recall
 * are then computed per topic on the examples of the test fold and stored through
 * updatePerformanceMetadata.
 *
 * NOTE(review): this listing appears truncated (closing braces and several statement bodies are
 * missing), so the comments below only describe what the visible lines establish.
 *
 * @param cvFoldIndex index of the fold used as test fold; an example is evaluated when its offset
 *        modulo cvFoldCount equals this index
 * @param cvFoldCount total number of folds
 * @param cvIterations overwritten on the first line of the method (see the note there)
 * @param incremental not referenced in the lines visible here
 * @return the value returned by the second batchOverTopics pass, i.e. the sum of the per-batch
 *         counts of topics whose performance metadata was updated
 * @throws ConfigurationException declared; no visible line throws it directly
 * @throws TrainingSetException declared by batchOverTopics and by the evaluation batch processor
 * @throws ClassifierException wrapping any exception raised while setting up the evaluation classifier
 */
protected int performCVFold(int cvFoldIndex, int cvFoldCount, int cvIterations, boolean incremental) throws ConfigurationException, TrainingSetException, ClassifierException {
    // NOTE(review): both branches of the ternary below yield cvFoldCount, so cvIterations always
    // ends up equal to cvFoldCount whatever value was passed in - confirm whether the else branch
    // was meant to keep cvIterations. The string literal and arguments trailing the statement look
    // like the remains of a logging call whose prefix is missing from this listing.
    cvIterations = cvIterations <= 0 ? cvFoldCount : cvFoldCount;"Performing evaluation %d-fold CV iteration %d/%d on classifier %s", cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
    long start = System.currentTimeMillis();
    // the fold is evaluated on a separate engine instance, not on this one
    final TopicClassificationEngine classifier = new TopicClassificationEngine();
    try {
        if (managedSolrServer != null) {
            // OSGi setup: the evaluation server will be generated automatically using the
            // managedSolrServer
            classifier.activate(context, getCanonicalConfiguration(// TODO: maybe we should use the SolrCoreName instead
            engineName + "-evaluation", solrCoreConfig));
        } else {
            // no managed Solr server: create the embedded evaluation server once and reuse it
            if (__evaluationServer == null) {
                __evaluationServerDir = new File(embeddedSolrServerDir, engineName + "-evaluation");
                if (!__evaluationServerDir.exists()) {
                __evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir, "evaluationclassifierserver", "default-topic-model", "default-topic-model");
            classifier.configure(getCanonicalConfiguration(__evaluationServer, solrCoreConfig));
    } catch (Exception e) {
        throw new ClassifierException(e);
    // clean all previous concepts from the evaluation classifier in case we are reusing an existing solr
    // index from OSGi.
    // iterate over all the topics to register them in the evaluation classifier
    batchOverTopics(new BatchProcessor<SolrDocument>() {

        public int process(List<SolrDocument> batch) throws ClassifierException {
            for (SolrDocument topicEntry : batch) {
                String conceptId = topicEntry.getFirstValue(conceptUriField).toString();
                Collection<Object> broader = topicEntry.getFieldValues(broaderField);
                if (broader == null) {
                    classifier.addConcept(conceptId, null, null);
                } else {
                    List<String> broaderConcepts = new ArrayList<String>();
                    for (Object broaderConcept : broader) {
                    classifier.addConcept(conceptId, null, broaderConcepts);
            return batch.size();
    // build the model for the current train CV folds
    classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
    // bind our new classifier to the same training set at the parent
    // final copies so that the anonymous batch processor below can read the fold settings
    final int foldCount = cvFoldCount;
    final int foldIndex = cvFoldIndex;
    // iterate over the topics again to compute scores on the test fold
    int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() {

        public int process(List<SolrDocument> batch) throws TrainingSetException, ClassifierException {
            int offset;
            int updated = 0;
            for (SolrDocument topicMetadata : batch) {
                String topic = topicMetadata.getFirstValue(conceptUriField).toString();
                List<String> topics = Arrays.asList(topic);
                List<String> falseNegativeExamples = new ArrayList<String>();
                int truePositives = 0;
                int falseNegatives = 0;
                int positiveSupport = 0;
                offset = 0;
                Batch<Example> examples = Batch.emtpyBatch(Example.class);
                boolean skipTopic = false;
                // first pass: positive examples of the topic, fetched batch by batch
                do {
                    examples = getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
                    if (offset == 0 && examples.items.size() < MIN_EVALUATION_SAMPLES) {
                        // we need a minimum amount of examples otherwise it's really not
                        // worth computing statistics
                        skipTopic = true;
                    for (Example example : examples.items) {
                        if (!(offset % foldCount == foldIndex)) {
                            // this example is not part of the test fold, skip it
                        List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                        boolean match = false;
                        for (TopicSuggestion suggestedTopic : suggestedTopics) {
                            if (topic.equals(suggestedTopic.conceptUri)) {
                                match = true;
                        if (!match) {
                            // the number of collected examples is capped per fold
                            if (falseNegativeExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                } while (!skipTopic && examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                List<String> falsePositiveExamples = new ArrayList<String>();
                int falsePositives = 0;
                int negativeSupport = 0;
                offset = 0;
                examples = Batch.emtpyBatch(Example.class);
                // second pass: negative examples of the topic, fetched batch by batch
                do {
                    if (skipTopic) {
                    examples = getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
                    for (Example example : examples.items) {
                        if (!(offset % foldCount == foldIndex)) {
                            // this example is not part of the test fold, skip it
                        List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                        for (TopicSuggestion suggestedTopic : suggestedTopics) {
                            if (topic.equals(suggestedTopic.conceptUri)) {
                                // the number of collected examples is capped per fold
                                if (falsePositiveExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                    // we don't need to collect true negatives
                } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                if (skipTopic) {
                    log.debug("Skipping evaluation of {} because too few positive examples.", topic);
                } else {
                    // compute precision, recall and f1 score for the current test fold and topic
                    float precision = 0;
                    // precision stays 0 when the denominator would be zero
                    if (truePositives != 0 || falsePositives != 0) {
                        precision = truePositives / (float) (truePositives + falsePositives);
                    float recall = 0;
                    // recall stays 0 when the denominator would be zero
                    if (truePositives != 0 || falseNegatives != 0) {
                        recall = truePositives / (float) (truePositives + falseNegatives);
                    updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport, falsePositiveExamples, falseNegativeExamples);
                    updated += 1;
            // NOTE(review): the try block below is empty in this listing; its body appears to be missing
            try {
            } catch (Exception e) {
                throw new ClassifierException(e);
            return updated;
    // NOTE(review): as on the first line of the method, the logging call prefix is missing here
    long stop = System.currentTimeMillis();"Finished CV iteration %d/%d on classifier %s in %fs.", cvFoldIndex + 1, cvFoldCount, engineName, (stop - start) / 1000.0));
    if (context != null) {
        // close open trackers
    return updatedTopics;
Example 2 with TrainingSetException

Use of TrainingSetException in project stanbol by apache.

the class TopicClassificationEngine method batchOverTopics.

/**
 * Queries the active Solr server for the entries whose entryTypeField is METADATA_ENTRY, sorted by
 * conceptUriField in ascending order, and hands them to the given processor batch by batch.
 *
 * NOTE(review): this listing appears truncated (closing braces and some statements are missing),
 * so the comments below only describe what the visible lines establish.
 *
 * @param processor callback invoked once per batch with the documents collected for that batch
 * @return the sum of the values returned by processor.process over all batches
 * @throws TrainingSetException wrapping any exception raised while querying or processing
 */
protected int batchOverTopics(BatchProcessor<SolrDocument> processor) throws TrainingSetException {
    // TODO: implement incremental update by using the date informations
    int processedCount = 0;
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery("*:*");
    query.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
    String offset = null;
    boolean done = false;
    int batchSize = 1000;
    query.addSortField(conceptUriField, SolrQuery.ORDER.asc);
    // one row more than the batch size is requested: the concept id of the extra row becomes the
    // offset of the next query
    query.setRows(batchSize + 1);
    try {
        while (!done) {
            // batch over all the indexed topics
            if (offset != null) {
                // NOTE(review): addFilterQuery is called on the same query object on every
                // iteration - confirm whether the range filters of earlier iterations accumulate
                query.addFilterQuery(conceptUriField + ":[" + ClientUtils.escapeQueryChars(offset) + " TO *]");
            QueryResponse response = solrServer.query(query);
            int count = 0;
            List<SolrDocument> batchDocuments = new ArrayList<SolrDocument>();
            for (SolrDocument result : response.getResults()) {
                String conceptId = result.getFirstValue(conceptUriField).toString();
                if (count == batchSize) {
                    offset = conceptId;
                } else {
                    // NOTE(review): the body of this branch is missing from the listing; presumably
                    // it adds result to batchDocuments and increments count - TODO confirm
            processedCount += processor.process(batchDocuments);
            // stop once a query yielded fewer than batchSize counted documents
            if (count < batchSize) {
                done = true;
    } catch (Exception e) {
        String msg = String.format("Error while updating topics on Solr Core '%s'.", solrCoreId);
        throw new TrainingSetException(msg, e);
    return processedCount;
Example 3 with TrainingSetException

Use of TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method registerExample.

/**
 * Registers, updates or removes a training example on the active Solr server.
 *
 * When text is null the call is treated as a removal: the entries matching exampleId are deleted
 * and exampleId is returned. Otherwise a Solr input document is built from the id, the text, the
 * topics (when not null) and the current UTC date.
 *
 * NOTE(review): this listing appears truncated (closing braces and some statements are missing),
 * so the comments below only describe what the visible lines establish.
 *
 * @param exampleId id of the example; when null or empty (and text is not null) a random UUID is
 *        generated and used instead
 * @param text content of the example, or null to request the removal of exampleId
 * @param topics topic URIs stored in topicUrisField; the field is left out when this is null
 * @return the id of the example, either the one passed in or the generated one
 * @throws IllegalArgumentException when both exampleId and text are null
 * @throws TrainingSetException wrapping any exception raised by the deletion or the registration
 */
public String registerExample(String exampleId, String text, List<String> topics) throws TrainingSetException {
    if (text == null) {
        // special case: example removal
        if (exampleId == null) {
            throw new IllegalArgumentException("exampleId and text should not be null simultaneously");
        SolrServer solrServer = getActiveSolrServer();
        try {
            // NOTE(review): exampleId is concatenated into the query without going through
            // ClientUtils.escapeQueryChars - confirm that ids cannot contain Solr special characters
            solrServer.deleteByQuery(exampleIdField + ":" + exampleId);
            return exampleId;
        } catch (Exception e) {
            String msg = String.format("Error deleting example with id '%s' on Solr Core '%s'", exampleId, solrCoreId);
            throw new TrainingSetException(msg, e);
    // no usable id was supplied: generate one
    if (exampleId == null || exampleId.isEmpty()) {
        exampleId = UUID.randomUUID().toString();
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField(exampleIdField, exampleId);
    doc.addField(exampleTextField, text);
    if (topics != null) {
        doc.addField(topicUrisField, topics);
    doc.addField(modificationDateField, UTCTimeStamper.nowUtcDate());
    SolrServer server = getActiveSolrServer();
    // NOTE(review): the try block below is empty in this listing; presumably it sends doc to
    // server - TODO confirm against the full source
    try {
    } catch (Exception e) {
        String msg = String.format("Could not register example '%s' with topics: ['%s']", exampleId, StringUtils.join(topics, "', '"));
        throw new TrainingSetException(msg, e);
    return exampleId;
Example 4 with TrainingSetException

Use of TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method hasChangedSince.

/**
 * Tells whether a Solr query built from the reference date and the given topics returns at least
 * one result on the active Solr server.
 *
 * NOTE(review): this listing appears truncated (closing braces and some statements are missing),
 * so the comments below only describe what the visible lines establish.
 *
 * @param topics topic URIs to restrict the query to; no topic clause is added when null or empty
 * @param referenceDate converted to a string with UTCTimeStamper.utcIsoString; in the visible
 *        lines that string is only used in the error message
 * @return true when the query yields at least one result
 * @throws TrainingSetException wrapping a SolrServerException raised by the query
 */
public boolean hasChangedSince(List<String> topics, Date referenceDate) throws TrainingSetException {
    String utcIsoDate = UTCTimeStamper.utcIsoString(referenceDate);
    StringBuffer sb = new StringBuffer();
    // NOTE(review): only the end of a range clause is appended here; the opening part, presumably
    // the date field followed by utcIsoDate, is missing from this listing - TODO confirm
    sb.append(" TO *]");
    if (topics != null && topics.size() > 0) {
        sb.append(" AND (");
        List<String> parts = new ArrayList<String>();
        for (String topic : topics) {
            // escape the topic so that special Solr characters do not break the query
            parts.add(topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        sb.append(StringUtils.join(parts, " OR "));
        // NOTE(review): no visible line closes the parenthesis opened above
    SolrQuery query = new SolrQuery(sb.toString());
    try {
        SolrServer solrServer = getActiveSolrServer();
        return solrServer.query(query).getResults().size() > 0;
    } catch (SolrServerException e) {
        String msg = String.format("Error while fetching topics for examples modified after '%s' on Solr Core '%s'.", utcIsoDate, solrCoreId);
        throw new TrainingSetException(msg, e);
Example 5 with TrainingSetException

Use of TrainingSetException in project stanbol by apache.

the class SolrTrainingSet method getExamples.

/**
 * Fetches one batch of examples from the active Solr server, sorted by exampleIdField in
 * ascending order.
 *
 * The query string is "*:*" when topics is empty, an OR of the escaped topic terms when positive
 * is true, and an AND of the negated topic terms otherwise. A non-null offset adds an inclusive
 * lower bound on exampleIdField.
 *
 * NOTE(review): this listing appears truncated (closing braces and some statements are missing),
 * so the comments below only describe what the visible lines establish.
 *
 * @param topics topic URIs used to select the examples; must not be null since isEmpty is called
 *        on it without a null check
 * @param offset lower bound for exampleIdField, or null for no lower bound
 * @param positive true to select examples tagged with any of the topics, false to select examples
 *        tagged with none of them
 * @return a batch holding the collected examples, a flag telling whether a next example id was
 *         found, and that next example id (null when there is none)
 * @throws TrainingSetException wrapping a SolrServerException raised by the query
 */
protected Batch<Example> getExamples(List<String> topics, Object offset, boolean positive) throws TrainingSetException {
    List<Example> items = new ArrayList<Example>();
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery();
    List<String> parts = new ArrayList<String>();
    String q = "";
    if (topics.isEmpty()) {
        q += "*:*";
    } else if (positive) {
        for (String topic : topics) {
            parts.add(topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        // the OR group is parenthesised only when an offset clause is appended after it
        if (offset != null) {
            q += "(";
        q += StringUtils.join(parts, " OR ");
        if (offset != null) {
            q += ")";
    } else {
        for (String topic : topics) {
            parts.add("-" + topicUrisField + ":" + ClientUtils.escapeQueryChars(topic));
        q += StringUtils.join(parts, " AND ");
    if (offset != null) {
        // NOTE(review): offset is inserted without ClientUtils.escapeQueryChars - confirm that
        // example ids cannot contain Solr special characters
        q += " AND " + exampleIdField + ":[" + offset.toString() + " TO *]";
    // NOTE(review): no visible line passes q to query; presumably a setQuery call is missing from
    // this listing - TODO confirm
    query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
    // one row more than the batch size is requested: the id of the extra row is returned as the
    // offset of the next batch
    query.set("rows", batchSize + 1);
    String nextExampleId = null;
    try {
        // NOTE(review): count is never incremented in the visible lines - TODO confirm
        int count = 0;
        QueryResponse response = solrServer.query(query);
        for (SolrDocument result : response.getResults()) {
            if (count == batchSize) {
                nextExampleId = result.getFirstValue(exampleIdField).toString();
            } else {
                String exampleId = result.getFirstValue(exampleIdField).toString();
                Collection<Object> labelValues = result.getFieldValues(topicUrisField);
                Collection<Object> textValues = result.getFieldValues(exampleTextField);
                if (textValues == null) {
                items.add(new Example(exampleId, labelValues, textValues));
    } catch (SolrServerException e) {
        // NOTE(review): the message says positive examples even when positive is false
        String msg = String.format("Error while fetching positive examples for topics ['%s'] on Solr Core '%s'.", StringUtils.join(topics, "', '"), solrCoreId);
        throw new TrainingSetException(msg, e);
    return new Batch<Example>(items, nextExampleId != null, nextExampleId);
