
Example 1 with TopicSuggestion

Use of org.apache.stanbol.enhancer.topic.api.TopicSuggestion in project stanbol by apache.

From class TopicClassificationEngine, method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract "
                + "textual content from ContentPart %s of ContentItem %s!",
                contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotations to.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used: IRI (org.apache.clerezza.commons.rdf.IRI), Entity (org.apache.stanbol.entityhub.servicesapi.model.Entity), Blob (org.apache.stanbol.enhancer.servicesapi.Blob), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation), Text (org.apache.stanbol.entityhub.servicesapi.model.Text), IOException (java.io.IOException), TopicSuggestion (org.apache.stanbol.enhancer.topic.api.TopicSuggestion), LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory), InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException), ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph), Graph (org.apache.clerezza.commons.rdf.Graph), EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), ClassificationReport (org.apache.stanbol.enhancer.topic.api.ClassificationReport), ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException)
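
For orientation, the triples written by computeEnhancements can be read back from the ContentItem metadata. The sketch below is not part of the Stanbol sources; it only reuses the Clerezza RDF API and the property/class constants visible in the example above, and the class and method names (TopicAnnotationReader, printTopicAnnotations) are hypothetical.

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;

public class TopicAnnotationReader {

    // Hypothetical helper: lists the fise:TopicAnnotation resources created by
    // computeEnhancements, together with the concepts they reference.
    public static void printTopicAnnotations(ContentItem ci) {
        Graph metadata = ci.getMetadata();
        ci.getLock().readLock().lock();
        try {
            Iterator<Triple> annotations = metadata.filter(null,
                Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION);
            while (annotations.hasNext()) {
                Triple annotation = annotations.next();
                // each topic annotation carries an enhancer:entity-reference triple
                Iterator<Triple> references = metadata.filter(annotation.getSubject(),
                    Properties.ENHANCER_ENTITY_REFERENCE, null);
                while (references.hasNext()) {
                    System.out.println(annotation.getSubject() + " -> " + references.next().getObject());
                }
            }
        } finally {
            ci.getLock().readLock().unlock();
        }
    }
}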

Example 2 with TopicSuggestion

Use of org.apache.stanbol.enhancer.topic.api.TopicSuggestion in project stanbol by apache.

From class TopicClassificationEngine, method performCVFold.

protected int performCVFold(int cvFoldIndex, int cvFoldCount, int cvIterations, boolean incremental) throws ConfigurationException, TrainingSetException, ClassifierException {
    cvIterations = cvIterations <= 0 ? cvFoldCount : cvIterations;
    log.info(String.format("Performing evaluation %d-fold CV iteration %d/%d on classifier %s", cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
    long start = System.currentTimeMillis();
    final TopicClassificationEngine classifier = new TopicClassificationEngine();
    try {
        if (managedSolrServer != null) {
            // OSGi setup: the evaluation server will be generated automatically using the
            // managedSolrServer
            classifier.bindManagedSolrServer(managedSolrServer);
            // TODO: maybe we should use the SolrCoreName instead
            classifier.activate(context, getCanonicalConfiguration(engineName + "-evaluation", solrCoreConfig));
        } else {
            if (__evaluationServer == null) {
                __evaluationServerDir = new File(embeddedSolrServerDir, engineName + "-evaluation");
                if (!__evaluationServerDir.exists()) {
                    FileUtils.forceMkdir(__evaluationServerDir);
                }
                __evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir, "evaluationclassifierserver", "default-topic-model", "default-topic-model");
            }
            classifier.configure(getCanonicalConfiguration(__evaluationServer, solrCoreConfig));
        }
    } catch (Exception e) {
        throw new ClassifierException(e);
    }
    // clean all previous concepts from the evaluation classifier in case we are reusing an existing solr
    // index from OSGi.
    classifier.removeAllConcepts();
    // iterate over all the topics to register them in the evaluation classifier
    batchOverTopics(new BatchProcessor<SolrDocument>() {

        @Override
        public int process(List<SolrDocument> batch) throws ClassifierException {
            for (SolrDocument topicEntry : batch) {
                String conceptId = topicEntry.getFirstValue(conceptUriField).toString();
                Collection<Object> broader = topicEntry.getFieldValues(broaderField);
                if (broader == null) {
                    classifier.addConcept(conceptId, null, null);
                } else {
                    List<String> broaderConcepts = new ArrayList<String>();
                    for (Object broaderConcept : broader) {
                        broaderConcepts.add(broaderConcept.toString());
                    }
                    classifier.addConcept(conceptId, null, broaderConcepts);
                }
            }
            return batch.size();
        }
    });
    // build the model for the current training CV folds
    classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
    // bind our new classifier to the same training set as the parent
    classifier.setTrainingSet(getTrainingSet());
    classifier.updateModel(false);
    final int foldCount = cvFoldCount;
    final int foldIndex = cvFoldIndex;
    // iterate over the topics again to compute scores on the test fold
    int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() {

        @Override
        public int process(List<SolrDocument> batch) throws TrainingSetException, ClassifierException {
            int offset;
            int updated = 0;
            for (SolrDocument topicMetadata : batch) {
                String topic = topicMetadata.getFirstValue(conceptUriField).toString();
                List<String> topics = Arrays.asList(topic);
                List<String> falseNegativeExamples = new ArrayList<String>();
                int truePositives = 0;
                int falseNegatives = 0;
                int positiveSupport = 0;
                offset = 0;
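                // (emtpyBatch spelling matches the actual method name in the Batch API)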
                Batch<Example> examples = Batch.emtpyBatch(Example.class);
                boolean skipTopic = false;
                do {
                    examples = getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
                    if (offset == 0 && examples.items.size() < MIN_EVALUATION_SAMPLES) {
                        // we need a minimum amount of examples, otherwise it's really not
                        // worth computing statistics
                        skipTopic = true;
                        break;
                    }
                    for (Example example : examples.items) {
                        if (!(offset % foldCount == foldIndex)) {
                            // this example is not part of the test fold, skip it
                            offset++;
                            continue;
                        }
                        positiveSupport++;
                        offset++;
                        List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                        boolean match = false;
                        for (TopicSuggestion suggestedTopic : suggestedTopics) {
                            if (topic.equals(suggestedTopic.conceptUri)) {
                                match = true;
                                truePositives++;
                                break;
                            }
                        }
                        if (!match) {
                            falseNegatives++;
                            if (falseNegativeExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                falseNegativeExamples.add(example.id);
                            }
                        }
                    }
                } while (!skipTopic && examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                List<String> falsePositiveExamples = new ArrayList<String>();
                int falsePositives = 0;
                int negativeSupport = 0;
                offset = 0;
                examples = Batch.emtpyBatch(Example.class);
                do {
                    if (skipTopic) {
                        break;
                    }
                    examples = getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
                    for (Example example : examples.items) {
                        if (!(offset % foldCount == foldIndex)) {
                            // this example is not part of the test fold, skip it
                            offset++;
                            continue;
                        }
                        negativeSupport++;
                        offset++;
                        List<TopicSuggestion> suggestedTopics = classifier.suggestTopics(example.contents);
                        for (TopicSuggestion suggestedTopic : suggestedTopics) {
                            if (topic.equals(suggestedTopic.conceptUri)) {
                                falsePositives++;
                                if (falsePositiveExamples.size() < MAX_COLLECTED_EXAMPLES / foldCount) {
                                    falsePositiveExamples.add(example.id);
                                }
                                break;
                            }
                        }
                        // we don't need to collect true negatives
                    }
                } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                if (skipTopic) {
                    log.debug("Skipping evaluation of {} because too few positive examples.", topic);
                } else {
                    // compute precision, recall and f1 score for the current test fold and topic
                    float precision = 0;
                    if (truePositives != 0 || falsePositives != 0) {
                        precision = truePositives / (float) (truePositives + falsePositives);
                    }
                    float recall = 0;
                    if (truePositives != 0 || falseNegatives != 0) {
                        recall = truePositives / (float) (truePositives + falseNegatives);
                    }
                    updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport, falsePositiveExamples, falseNegativeExamples);
                    updated += 1;
                }
            }
            try {
                getActiveSolrServer().commit();
            } catch (Exception e) {
                throw new ClassifierException(e);
            }
            return updated;
        }
    });
    long stop = System.currentTimeMillis();
    log.info(String.format("Finished CV iteration %d/%d on classifier %s in %fs.", cvFoldIndex + 1, cvFoldCount, engineName, (stop - start) / 1000.0));
    if (context != null) {
        // close open trackers
        classifier.deactivate(context);
    }
    return updatedTopics;
}
Also used: TopicSuggestion (org.apache.stanbol.enhancer.topic.api.TopicSuggestion), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), SolrServerException (org.apache.solr.client.solrj.SolrServerException), ConfigurationException (org.osgi.service.cm.ConfigurationException), InvalidSyntaxException (org.osgi.framework.InvalidSyntaxException), TrainingSetException (org.apache.stanbol.enhancer.topic.api.training.TrainingSetException), ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException), InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException), EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException), ChainException (org.apache.stanbol.enhancer.servicesapi.ChainException), IOException (java.io.IOException), SolrDocument (org.apache.solr.common.SolrDocument), Batch (org.apache.stanbol.enhancer.topic.api.Batch), Example (org.apache.stanbol.enhancer.topic.api.training.Example), Collection (java.util.Collection), SolrDocumentList (org.apache.solr.common.SolrDocumentList), List (java.util.List), ArrayList (java.util.ArrayList), File (java.io.File)
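
A side note on the arithmetic in performCVFold: the test offset % foldCount == foldIndex assigns every foldCount-th example to the test fold, and precision and recall are then computed from the collected counts. The standalone sketch below uses made-up numbers (not Stanbol output) to make the fold split and the metric formulas concrete.

public class CvFoldMath {

    public static void main(String[] args) {
        int foldCount = 3;
        int foldIndex = 1;
        // offset % foldCount == foldIndex puts offsets 1, 4, 7, ... into the test fold
        for (int offset = 0; offset < 9; offset++) {
            String role = (offset % foldCount == foldIndex) ? "test" : "train";
            System.out.printf("offset %d -> %s%n", offset, role);
        }
        // with, say, 8 true positives, 2 false positives and 4 false negatives:
        int tp = 8, fp = 2, fn = 4;
        float precision = tp / (float) (tp + fp); // 8/10 = 0.8
        float recall = tp / (float) (tp + fn);    // 8/12 ~ 0.667
        float f1 = 2 * precision * recall / (precision + recall); // ~0.727
        System.out.printf("precision=%.3f recall=%.3f f1=%.3f%n", precision, recall, f1);
    }
}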

Example 3 with TopicSuggestion

Use of org.apache.stanbol.enhancer.topic.api.TopicSuggestion in project stanbol by apache.

From class TopicClassificationEngine, method suggestTopics.

public List<TopicSuggestion> suggestTopics(String text) throws ClassifierException {
    List<TopicSuggestion> suggestedTopics = new ArrayList<TopicSuggestion>(MAX_SUGGESTIONS * 3);
    SolrServer solrServer = getActiveSolrServer();
    SolrQuery query = new SolrQuery();
    query.setRequestHandler("/" + MoreLikeThisParams.MLT);
    query.setFilterQueries(entryTypeField + ":" + MODEL_ENTRY);
    query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
    query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
    query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
    query.set(MoreLikeThisParams.MAX_QUERY_TERMS, 30);
    query.set(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, 10000);
    // TODO: find a way to parse the interesting terms and report them
    // for debugging / explanation in dedicated RDF data structure.
    // query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
    query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
    query.set(CommonParams.STREAM_BODY, text);
    // over-query the number of suggestions to find a statistical cut based on the
    // curve of the scores of the top suggestions
    query.setRows(MAX_SUGGESTIONS * 3);
    query.setFields(conceptUriField);
    query.setIncludeScore(true);
    try {
        StreamQueryRequest request = new StreamQueryRequest(query);
        QueryResponse response = request.process(solrServer);
        SolrDocumentList results = response.getResults();
        for (SolrDocument result : results.toArray(new SolrDocument[0])) {
            String conceptUri = (String) result.getFirstValue(conceptUriField);
            if (conceptUri == null) {
                throw new ClassifierException(String.format("Solr Core '%s' is missing required field '%s'.", solrCoreId, conceptUriField));
            }
            Float score = (Float) result.getFirstValue("score");
            // fetch metadata
            SolrQuery metadataQuery = new SolrQuery("*:*");
            // use filter queries to leverage the Solr cache explicitly
            metadataQuery.addFilterQuery(entryTypeField + ":" + METADATA_ENTRY);
            metadataQuery.addFilterQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptUri));
            metadataQuery.setFields(conceptUriField, broaderField, primaryTopicUriField);
            SolrDocument metadata = solrServer.query(metadataQuery).getResults().get(0);
            String primaryTopicUri = (String) metadata.getFirstValue(primaryTopicUriField);
            suggestedTopics.add(new TopicSuggestion(conceptUri, primaryTopicUri, metadata.getFieldValues(broaderField), score));
        }
    } catch (SolrServerException e) {
        if ("unknown handler: /mlt".equals(e.getCause().getMessage())) {
            String message = String.format("SolrServer with id '%s' for topic engine '%s' lacks" + " configuration for the MoreLikeThisHandler", solrCoreId, engineName);
            throw new ClassifierException(message, e);
        } else {
            throw new ClassifierException(e);
        }
    }
    if (suggestedTopics.size() <= 1) {
        // no need to apply the cutting heuristic
        return suggestedTopics;
    }
    // filter out suggestions that are less than some threshold based on the mean of the top scores
    float mean = 0.0f;
    for (TopicSuggestion suggestion : suggestedTopics) {
        mean += suggestion.score / suggestedTopics.size();
    }
    float threshold = 0.25f * suggestedTopics.get(0).score + 0.75f * mean;
    List<TopicSuggestion> filteredSuggestions = new ArrayList<TopicSuggestion>();
    for (TopicSuggestion suggestion : suggestedTopics) {
        if (filteredSuggestions.size() >= MAX_SUGGESTIONS) {
            return filteredSuggestions;
        }
        if (filteredSuggestions.isEmpty() || suggestion.score > threshold) {
            filteredSuggestions.add(suggestion);
        } else {
            break;
        }
    }
    return filteredSuggestions;
}
Also used: StreamQueryRequest (org.apache.stanbol.commons.solr.utils.StreamQueryRequest), SolrServerException (org.apache.solr.client.solrj.SolrServerException), ArrayList (java.util.ArrayList), SolrDocumentList (org.apache.solr.common.SolrDocumentList), TopicSuggestion (org.apache.stanbol.enhancer.topic.api.TopicSuggestion), EmbeddedSolrServer (org.apache.solr.client.solrj.embedded.EmbeddedSolrServer), SolrServer (org.apache.solr.client.solrj.SolrServer), ManagedSolrServer (org.apache.stanbol.commons.solr.managed.ManagedSolrServer), SolrQuery (org.apache.solr.client.solrj.SolrQuery), SolrDocument (org.apache.solr.common.SolrDocument), QueryResponse (org.apache.solr.client.solrj.response.QueryResponse), ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException)
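
Two details of suggestTopics are worth spelling out. First, the query requires the Solr core to expose the MoreLikeThisHandler, as the error handling above indicates. Second, the cutting heuristic blends the best score (25%) with the mean of all returned scores (75%) and, since results arrive sorted by score, stops at the first suggestion below that threshold. The standalone sketch below replays the heuristic with made-up scores; the real method additionally caps the result at MAX_SUGGESTIONS.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ScoreCutDemo {

    public static void main(String[] args) {
        // scores as returned by the MoreLikeThis query, sorted by decreasing similarity
        List<Float> scores = Arrays.asList(4.0f, 3.8f, 3.5f, 1.2f, 1.1f, 0.9f);
        float mean = 0.0f;
        for (float score : scores) {
            mean += score / scores.size(); // mean = 14.5 / 6 ~ 2.417
        }
        // blend of the top score (25%) and the mean (75%)
        float threshold = 0.25f * scores.get(0) + 0.75f * mean; // ~2.813
        List<Float> kept = new ArrayList<Float>();
        for (float score : scores) {
            if (kept.isEmpty() || score > threshold) {
                kept.add(score);
            } else {
                break; // sorted input: the first score below the threshold ends the list
            }
        }
        System.out.println("threshold=" + threshold + " kept=" + kept); // keeps 4.0, 3.8, 3.5
    }
}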

Example 4 with TopicSuggestion

Use of org.apache.stanbol.enhancer.topic.api.TopicSuggestion in project stanbol by apache.

From class TopicEngineTest, method testTopicClassification.

// @Test
// TODO: update to work with the new Solr schema and move the CSV import directly
// to the classifier or training set API
public void testTopicClassification() throws Exception {
    log.info(" --- testTopicClassification --- ");
    loadSampleTopicsFromTSV();
    List<TopicSuggestion> suggestedTopics = classifier.suggestTopics("The Man Who Shot Liberty Valance is a 1962" + " American Western film directed by John Ford," + " narrated by Charlton Heston and starring James" + " Stewart, John Wayne and Vivien Leigh.");
    assertNotNull(suggestedTopics);
    assertEquals(10, suggestedTopics.size());
    TopicSuggestion bestSuggestion = suggestedTopics.get(0);
    assertEquals("Category:American_films", bestSuggestion.conceptUri);
}
Also used: TopicSuggestion (org.apache.stanbol.enhancer.topic.api.TopicSuggestion)
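
As the examples show, TopicSuggestion exposes its data as public fields (conceptUri and score are read directly above). A minimal, hypothetical consumer of the list returned by suggestTopics:

import java.util.List;
import org.apache.stanbol.enhancer.topic.api.TopicSuggestion;

public class SuggestionPrinter {

    // Prints a suggestion list such as the one produced by
    // classifier.suggestTopics(...) in the test above.
    public static void print(List<TopicSuggestion> suggestions) {
        for (TopicSuggestion suggestion : suggestions) {
            System.out.printf("%s -> score %.3f%n", suggestion.conceptUri, suggestion.score);
        }
    }
}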

Aggregations

TopicSuggestion (org.apache.stanbol.enhancer.topic.api.TopicSuggestion): 4 usages
ClassifierException (org.apache.stanbol.enhancer.topic.api.ClassifierException): 3 usages
IOException (java.io.IOException): 2 usages
ArrayList (java.util.ArrayList): 2 usages
SolrServerException (org.apache.solr.client.solrj.SolrServerException): 2 usages
SolrDocument (org.apache.solr.common.SolrDocument): 2 usages
SolrDocumentList (org.apache.solr.common.SolrDocumentList): 2 usages
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 2 usages
InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException): 2 usages
EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException): 2 usages
File (java.io.File): 1 usage
Collection (java.util.Collection): 1 usage
List (java.util.List): 1 usage
Graph (org.apache.clerezza.commons.rdf.Graph): 1 usage
IRI (org.apache.clerezza.commons.rdf.IRI): 1 usage
ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph): 1 usage
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 1 usage
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 1 usage
LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory): 1 usage
SolrQuery (org.apache.solr.client.solrj.SolrQuery): 1 usage