Use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
The class TopicClassificationEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type found for ContentItem "
                + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES
                + "') -> this indicates that canEnhance was NOT called, which is a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language)
            || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language
                + "' of the ContentItem is not configured as active for this Engine (active: "
                + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format(
                "Unable to extract textual content from ContentPart %s of ContentItem %s!",
                contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any text to extract topics from",
                contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotations to.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation,
                org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE,
                OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement,
                    org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE,
                    TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement,
                    org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to the entity
            metadata.add(new TripleImpl(enhancement,
                    org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE,
                    new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement,
                    org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE,
                    OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement,
                    org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE,
                    lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision,
                        lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall,
                        lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1,
                        lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch the concept label from the Entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement,
                            org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL,
                            new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
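The pattern above is the core of writing enhancement metadata: take the ContentItem's write lock, create an enhancement resource, and add one triple per property. Since org.apache.clerezza.commons.rdf.Graph extends java.util.Collection<Triple>, adding a triple is a plain add call. Below is a minimal standalone sketch of that API, assuming Clerezza's in-memory SimpleGraph and the impl.utils classes are available; the URIs are invented for illustration:

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;

public class GraphSketch {
    public static void main(String[] args) {
        // an in-memory, mutable Graph (Graph extends java.util.Collection<Triple>)
        Graph g = new SimpleGraph();
        IRI enhancement = new IRI("urn:example:enhancement-1"); // hypothetical URI
        IRI dcRelation = new IRI("http://purl.org/dc/terms/relation");
        IRI target = new IRI("urn:example:text-annotation-1"); // hypothetical URI
        // adding a triple is just Collection.add
        g.add(new TripleImpl(enhancement, dcRelation, target));
        g.add(new TripleImpl(enhancement,
                new IRI("http://www.w3.org/2000/01/rdf-schema#label"),
                new PlainLiteralImpl("example label")));
        System.out.println(g.size()); // prints 2
    }
}

In the engine itself the same calls run against ci.getMetadata() and are guarded by ci.getLock().writeLock(), because the metadata graph is shared between enhancement engines.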
Use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
The class NEREngineCore, method findNamedEntities.
protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text,
        final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri()
                + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    final Language language;
    if (lang != null && !lang.isEmpty()) {
        language = new Language(lang);
    } else {
        language = null;
    }
    if (log.isDebugEnabled()) {
        log.debug("findNamedEntities model={}, language={}, text={}", new Object[] { nameFinderModel,
                language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Graph g = ci.getMetadata();
    Map<String, List<NameOccurrence>> entityNames;
    if (at != null) {
        entityNames = extractNameOccurrences(nameFinderModel, at, lang);
    } else {
        entityNames = extractNameOccurrences(nameFinderModel, text, lang);
    }
    // lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        Map<String, IRI> previousAnnotations = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
            String name = nameInContext.getKey();
            List<NameOccurrence> occurrences = nameInContext.getValue();
            IRI firstOccurrenceAnnotation = null;
            for (NameOccurrence occurrence : occurrences) {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(name, language)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(occurrence.context, language)));
                if (occurrence.type != null) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                }
                if (occurrence.confidence != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE,
                            literalFactory.createTypedLiteral(occurrence.confidence)));
                }
                if (occurrence.start != null && occurrence.end != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START,
                            literalFactory.createTypedLiteral(occurrence.start)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END,
                            literalFactory.createTypedLiteral(occurrence.end)));
                }
                // link all occurrences of the same name
                if (firstOccurrenceAnnotation == null) {
                    // check if a previous annotation covers a more specific occurrence
                    for (Map.Entry<String, IRI> entry : previousAnnotations.entrySet()) {
                        if (entry.getKey().contains(name)) {
                            // we have found a most specific previous
                            // occurrence, use it as subsumption target
                            firstOccurrenceAnnotation = entry.getValue();
                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                            break;
                        }
                    }
                    if (firstOccurrenceAnnotation == null) {
                        // no most specific previous occurrence, I am the first,
                        // most specific occurrence to be later used as a target
                        firstOccurrenceAnnotation = textAnnotation;
                        previousAnnotations.put(name, textAnnotation);
                    }
                } else {
                    // I am referring to a most specific first occurrence of the
                    // same name
                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
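One detail that is easy to miss: an occurrence of a shorter name is linked via DC_RELATION to the first annotation of a longer name that contains it (for example, "Obama" to an earlier "Barack Obama" annotation), tracked through the previousAnnotations map. A reduced sketch of just that lookup, with plain strings instead of IRIs and invented values:

import java.util.LinkedHashMap;
import java.util.Map;

public class SubsumptionSketch {
    public static void main(String[] args) {
        // maps an already seen name to the id of its first annotation
        Map<String, String> previousAnnotations = new LinkedHashMap<>();
        previousAnnotations.put("Barack Obama", "urn:anno:1"); // hypothetical annotation id
        String name = "Obama";
        String target = null;
        for (Map.Entry<String, String> entry : previousAnnotations.entrySet()) {
            // a previously seen, longer name that contains the current one
            // is treated as the more specific occurrence
            if (entry.getKey().contains(name)) {
                target = entry.getValue();
                break;
            }
        }
        System.out.println(target); // urn:anno:1 -> "Obama" links to "Barack Obama"
    }
}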
Use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
The class TestNamedEntityExtractionEnhancementEngine, method testCustomModel.
@Test
public void testCustomModel() throws EngineException, IOException {
    ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", EHEALTH, "en");
    // this test does not use the default models
    nerEngine.config.getDefaultModelTypes().clear();
    // but instead a custom model provided by the test data
    nerEngine.config.addCustomNameFinderModel("en", "bionlp2004-DNA-en.bin");
    nerEngine.config.setMappedType("DNA", new IRI("http://www.bootstrep.eu/ontology/GRO#DNA"));
    nerEngine.computeEnhancements(ci);
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR,
            LiteralFactory.getInstance().createTypedLiteral(nerEngine.getClass().getName()));
    // adding null as the expected value for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    // and dc:type values MUST be the URI set as mapped type
    expectedValues.put(Properties.DC_TYPE, new IRI("http://www.bootstrep.eu/ontology/GRO#DNA"));
    Graph g = ci.getMetadata();
    int textAnnotationCount = validateAllTextAnnotations(g, EHEALTH, expectedValues);
    assertEquals(7, textAnnotationCount);
}
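The expectedValues map drives the shared test helpers: a null value (as for ENHANCER_CONFIDENCE) only requires the property to be present, while a non-null value must match exactly. Below is a hypothetical check along these lines, exploiting the fact that Graph.filter treats a null object as a wildcard; the real validateAllTextAnnotations is part of the Stanbol test utilities and does considerably more:

import java.util.Iterator;
import java.util.Map;
import org.apache.clerezza.commons.rdf.BlankNodeOrIRI;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;

public class ExpectedValuesSketch {
    /** Returns true if the annotation carries all expected properties. */
    static boolean matches(Graph g, BlankNodeOrIRI annotation, Map<IRI, RDFTerm> expectedValues) {
        for (Map.Entry<IRI, RDFTerm> expected : expectedValues.entrySet()) {
            // a null expected value only requires the property to be present,
            // because filter(..) treats a null object as a wildcard
            Iterator<Triple> it = g.filter(annotation, expected.getKey(), expected.getValue());
            if (!it.hasNext()) {
                return false;
            }
        }
        return true;
    }
}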
Use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
The class Nlp2RdfMetadataEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    // now iterate over the AnalysedText data and create the RDF representation
    // TODO: make configurable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            // TODO: filter Spans based on additional requirements
            // (1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            // (2) add the relations between the different spans
            switch (span.getType()) {
                case Sentence:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
                        if (word != null) {
                            metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
                        }
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
                            firstWordInSentence = false;
                        }
                    }
                    if (phrase != null) {
                        metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
                    }
                    if (word != null) {
                        metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            // (3) add specific information such as POS, chunk type ...
            writePos(metadata, span, current);
            writePhrase(metadata, span, current);
            // OLiA does not include Sentiments
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
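Note how a single pass over the spans is enough to emit all chain relations: the engine only remembers the previous sentence, phrase, and word IRIs and links each new span against them. A reduced sketch of the same single-pass linking over plain strings (the IRIs and property names are invented for illustration):

import java.util.Arrays;
import java.util.List;

public class ChainSketch {
    public static void main(String[] args) {
        List<String> words = Arrays.asList("urn:word:1", "urn:word:2", "urn:word:3"); // hypothetical IRIs
        String previous = null;
        for (String current : words) {
            if (previous != null) {
                // in the engine these become sso:nextWord / sso:previousWord triples
                System.out.println(previous + " nextWord " + current);
                System.out.println(current + " previousWord " + previous);
            }
            previous = current;
        }
    }
}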
Use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
The class TestOpenCalaisEngine, method testEntityExtraction.
@Test
public void testEntityExtraction() throws IOException, EngineException {
    String testFile = "calaisresult.owl";
    String format = "application/rdf+xml";
    InputStream in = this.getClass().getClassLoader().getResourceAsStream(testFile);
    Assert.assertNotNull("failed to load resource " + testFile, in);
    Graph model = calaisExtractor.readModel(in, format);
    Assert.assertNotNull("model reader failed with format: " + format, model);
    Collection<CalaisEntityOccurrence> entities;
    try {
        entities = calaisExtractor.queryModel(model);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    LOG.info("Found entities: {}", entities.size());
    LOG.debug("Entities:\n{}", entities);
    Assert.assertFalse("No entities found!", entities.isEmpty());
    // test the generation of the Enhancements
    ContentItem ci = wrapAsContentItem(TEST_TEXT);
    calaisExtractor.createEnhancements(entities, ci);
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR,
            LiteralFactory.getInstance().createTypedLiteral(calaisExtractor.getClass().getName()));
    // adding null as the expected value for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    validateAllTextAnnotations(ci.getMetadata(), TEST_TEXT, expectedValues);
    validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
}
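queryModel encapsulates the extraction of CalaisEntityOccurrence objects from the parsed model; for simpler lookups the Graph interface itself offers filter(subject, predicate, object), where null acts as a wildcard. A minimal sketch, again with invented URIs:

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;

public class FilterSketch {
    public static void main(String[] args) {
        Graph model = new SimpleGraph();
        IRI type = new IRI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
        IRI person = new IRI("urn:example:Person"); // hypothetical type
        model.add(new TripleImpl(new IRI("urn:example:alice"), type, person));
        // a null subject acts as a wildcard, matching any subject
        Iterator<Triple> it = model.filter(null, type, person);
        while (it.hasNext()) {
            System.out.println(it.next().getSubject()); // urn:example:alice
        }
    }
}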