use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class TikaEngineTest method testRtf.
@Test
public void testRtf() throws EngineException, IOException {
log.info(">>> testRtf <<<");
ContentItem ci = createContentItem("test.rtf", "application/rtf");
assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
engine.computeEnhancements(ci);
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
assertNotNull(contentPart);
Blob plainTextBlob = contentPart.getValue();
assertNotNull(plainTextBlob);
assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
//validate XHTML results
contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
assertNotNull(contentPart);
Blob xhtmlBlob = contentPart.getValue();
assertNotNull(xhtmlBlob);
assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<title>", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class TikaEngineTest method testContentTypeDetection.
@Test
public void testContentTypeDetection() throws EngineException, IOException {
log.info(">>> testContentTypeDetection <<<");
ContentItem ci = createContentItem("test.pdf", OCTET_STREAM.toString());
assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
engine.computeEnhancements(ci);
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
assertNotNull(contentPart);
Blob plainTextBlob = contentPart.getValue();
assertNotNull(plainTextBlob);
assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities");
//validate XHTML results
contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
assertNotNull(contentPart);
Blob xhtmlBlob = contentPart.getValue();
assertNotNull(xhtmlBlob);
assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<div class=\"page\">", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class TikaEngineTest method testUnsupported.
@Test
public void testUnsupported() throws EngineException, IOException {
log.info(">>> testUnsupported <<<");
ContentItem ci = createContentItem("test.pages", "application/x-iwork-pages-sffpages");
assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
engine.computeEnhancements(ci);
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
//it MUST NOT give an error but also not add a content part
assertNull(contentPart);
//only the original content
assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class TopicClassificationEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
}
String language = EnhancementEngineHelper.getLanguage(ci);
if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
}
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
}
if (text.trim().isEmpty()) {
log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
return;
}
Graph metadata = ci.getMetadata();
List<TopicSuggestion> topics;
try {
topics = suggestTopics(text);
if (topics.isEmpty()) {
return;
}
} catch (ClassifierException e) {
throw new EngineException(e);
}
IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
LiteralFactory lf = LiteralFactory.getInstance();
ci.getLock().writeLock().lock();
try {
// Global text annotation to attach all the topic annotation to it.
IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
for (TopicSuggestion topic : topics) {
IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
// add link to entity
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
// add confidence information
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
// add performance estimates of the classifier if available
ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
if (perf.uptodate) {
metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
}
// fetch concept label from the entityhub or a referenced site if available
Entity entity = entityhub.getEntity(topic.conceptUri);
if (entity == null) {
entity = referencedSiteManager.getEntity(topic.conceptUri);
}
if (entity != null) {
Representation representation = entity.getRepresentation();
// TODO: extract all languages based on some configuration instead of hardcoding English
Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
if (label == null) {
label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
}
if (label != null) {
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
}
}
}
} catch (ClassifierException e) {
throw new EngineException(e);
} catch (IllegalArgumentException e) {
throw new EngineException(e);
} catch (EntityhubException e) {
throw new EngineException(e);
} finally {
ci.getLock().writeLock().unlock();
}
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class RestfulLangidentEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
//get the plain text Blob
Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
Blob blob = textBlob.getValue();
//send the text to the server
final HttpPost request = new HttpPost(serviceUrl);
request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
//execute the request
List<LangSuggestion> detected;
try {
detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {
public List<LangSuggestion> run() throws ClientProtocolException, IOException {
return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
}
});
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if (e instanceof ClientProtocolException) {
throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
} else if (e instanceof IOException) {
throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
} else {
throw RuntimeException.class.cast(e);
}
}
Graph metadata = ci.getMetadata();
log.debug("Detected Languages for ContentItem {} and Blob {}");
ci.getLock().writeLock().lock();
try {
//write TextAnnotations for the detected languages
for (LangSuggestion suggestion : detected) {
// add a hypothesis
log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
if (suggestion.hasProbability()) {
metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
}
}
} finally {
ci.getLock().writeLock().unlock();
}
}
Aggregations