Search in sources :

Example 16 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TopicClassificationEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) IOException(java.io.IOException) TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 17 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class NEREngineCore method findNamedEntities.

protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text, final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    final Language language;
    if (lang != null && !lang.isEmpty()) {
        language = new Language(lang);
    } else {
        language = null;
    }
    if (log.isDebugEnabled()) {
        log.debug("findNamedEntities model={},  language={}, text=", new Object[] { nameFinderModel, language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Graph g = ci.getMetadata();
    Map<String, List<NameOccurrence>> entityNames;
    if (at != null) {
        entityNames = extractNameOccurrences(nameFinderModel, at, lang);
    } else {
        entityNames = extractNameOccurrences(nameFinderModel, text, lang);
    }
    //lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        Map<String, IRI> previousAnnotations = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
            String name = nameInContext.getKey();
            List<NameOccurrence> occurrences = nameInContext.getValue();
            IRI firstOccurrenceAnnotation = null;
            for (NameOccurrence occurrence : occurrences) {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(name, language)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.context, language)));
                if (occurrence.type != null) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                }
                if (occurrence.confidence != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(occurrence.confidence)));
                }
                if (occurrence.start != null && occurrence.end != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occurrence.start)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occurrence.end)));
                }
                // name
                if (firstOccurrenceAnnotation == null) {
                    // specific occurrence
                    for (Map.Entry<String, IRI> entry : previousAnnotations.entrySet()) {
                        if (entry.getKey().contains(name)) {
                            // we have found a most specific previous
                            // occurrence, use it as subsumption target
                            firstOccurrenceAnnotation = entry.getValue();
                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                            break;
                        }
                    }
                    if (firstOccurrenceAnnotation == null) {
                        // no most specific previous occurrence, I am the first,
                        // most specific occurrence to be later used as a target
                        firstOccurrenceAnnotation = textAnnotation;
                        previousAnnotations.put(name, textAnnotation);
                    }
                } else {
                    // I am referring to a most specific first occurrence of the
                    // same name
                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) LinkedHashMap(java.util.LinkedHashMap) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) List(java.util.List) ArrayList(java.util.ArrayList) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)

Example 18 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TestNamedEntityExtractionEnhancementEngine method testCustomModel.

@Test
public void testCustomModel() throws EngineException, IOException {
    ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", EHEALTH, "en");
    //this test does not use default models
    nerEngine.config.getDefaultModelTypes().clear();
    //but instead a custom model provided by the test data
    nerEngine.config.addCustomNameFinderModel("en", "bionlp2004-DNA-en.bin");
    nerEngine.config.setMappedType("DNA", new IRI("http://www.bootstrep.eu/ontology/GRO#DNA"));
    nerEngine.computeEnhancements(ci);
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(nerEngine.getClass().getName()));
    //adding null as expected for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    //and dc:type values MUST be the URI set as mapped type
    expectedValues.put(Properties.DC_TYPE, new IRI("http://www.bootstrep.eu/ontology/GRO#DNA"));
    Graph g = ci.getMetadata();
    int textAnnotationCount = validateAllTextAnnotations(g, EHEALTH, expectedValues);
    assertEquals(7, textAnnotationCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 19 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class Nlp2RdfMetadataEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    //now iterate over the AnalysedText data and create the RDF representation
    //TODO: make configureable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            //TODO: filter Spans based on additional requirements
            //(1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            //(2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
                        if (word != null) {
                            metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
                        }
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
                            firstWordInSentence = false;
                        }
                    }
                    if (phrase != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
                    }
                    if (word != null) {
                        metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            //(3) add specific information such as POS, chunk type ...
            writePos(metadata, span, current);
            writePhrase(metadata, span, current);
            //OlIA does not include Sentiments
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) NIFHelper.writeSpan(org.apache.stanbol.enhancer.nlp.utils.NIFHelper.writeSpan) Span(org.apache.stanbol.enhancer.nlp.model.Span) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 20 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TestOpenCalaisEngine method testEntityExtraction.

@Test
public void testEntityExtraction() throws IOException, EngineException {
    String testFile = "calaisresult.owl";
    String format = "application/rdf+xml";
    InputStream in = this.getClass().getClassLoader().getResourceAsStream(testFile);
    Assert.assertNotNull("failed to load resource " + testFile, in);
    Graph model = calaisExtractor.readModel(in, format);
    Assert.assertNotNull("model reader failed with format: " + format, model);
    Collection<CalaisEntityOccurrence> entities;
    try {
        entities = calaisExtractor.queryModel(model);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    LOG.info("Found entities: {}", entities.size());
    LOG.debug("Entities:\n{}", entities);
    Assert.assertFalse("No entities found!", entities.isEmpty());
    //test the generation of the Enhancements
    ContentItem ci = wrapAsContentItem(TEST_TEXT);
    calaisExtractor.createEnhancements(entities, ci);
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(calaisExtractor.getClass().getName()));
    //adding null as expected for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    validateAllTextAnnotations(ci.getMetadata(), TEST_TEXT, expectedValues);
    validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) HashMap(java.util.HashMap) InputStream(java.io.InputStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

Graph (org.apache.clerezza.commons.rdf.Graph)172 IRI (org.apache.clerezza.commons.rdf.IRI)110 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)66 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)57 Triple (org.apache.clerezza.commons.rdf.Triple)45 IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph)43 Test (org.junit.Test)38 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)36 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)34 IOException (java.io.IOException)27 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)26 HashSet (java.util.HashSet)24 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)24 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)24 InputStream (java.io.InputStream)21 HashMap (java.util.HashMap)20 Language (org.apache.clerezza.commons.rdf.Language)17 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)17 ArrayList (java.util.ArrayList)16 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)15