use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testPdf.
/**
 * Checks that the engine extracts a plain text and an XHTML content part
 * from two PDF fixtures: {@code test.pdf} (created by Apple Pages) and
 * {@code test2.pdf} (created by OpenOffice).
 *
 * @throws EngineException propagated from the engine calls
 * @throws IOException propagated from creating the content item or reading the content
 */
@Test
public void testPdf() throws EngineException, IOException {
    log.info(">>> testPdf <<<");
    //PDF created by Apple Pages
    //NOTE: this pattern deliberately keeps the trailing space used for this fixture
    assertPdfExtraction("test.pdf", "The Stanbol enhancer can detect famous cities ");
    //PDF created by OpenOffice
    assertPdfExtraction("test2.pdf", "The Stanbol enhancer can detect famous cities");
}

/**
 * Enhances the given PDF test resource and validates the "text/plain" and
 * "application/xhtml+xml" content parts of the resulting content item.
 *
 * @param resourceName name of the PDF test resource passed to {@code createContentItem}
 * @param plainTextCitiesPattern second pattern the plain text content is checked against
 * @throws EngineException propagated from the engine calls
 * @throws IOException propagated from creating the content item or reading the content
 */
private void assertPdfExtraction(String resourceName, String plainTextCitiesPattern)
        throws EngineException, IOException {
    ContentItem ci = createContentItem(resourceName, "application/pdf");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    //validate plain text results
    Blob plainTextBlob = requireContentPartBlob(ci, "text/plain");
    assertContentRegexp(plainTextBlob,
        "The Apache Stanbol Enhancer",
        plainTextCitiesPattern);
    //validate XHTML results
    Blob xhtmlBlob = requireContentPartBlob(ci, "application/xhtml+xml");
    assertContentRegexp(xhtmlBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<div class=\"page\">",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}

/**
 * Looks up the content part with the given media type and asserts that both
 * the returned entry and its Blob value are present.
 *
 * @param ci the content item to search
 * @param mimeType the media type of the requested content part
 * @return the Blob of the matching content part (never {@code null})
 * @throws EngineException declared for symmetry with the calling tests
 * @throws IOException declared for symmetry with the calling tests
 */
private Blob requireContentPartBlob(ContentItem ci, String mimeType)
        throws EngineException, IOException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton(mimeType));
    assertNotNull(contentPart);
    Blob blob = contentPart.getValue();
    assertNotNull(blob);
    return blob;
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testRtf.
/**
 * Checks that an RTF document is enhanced into a plain text and an XHTML
 * content part, and that both contain the expected patterns.
 *
 * @throws EngineException propagated from the engine calls
 * @throws IOException propagated from creating the content item or reading the content
 */
@Test
public void testRtf() throws EngineException, IOException {
    log.info(">>> testRtf <<<");
    final ContentItem rtfItem = createContentItem("test.rtf", "application/rtf");
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(rtfItem));
    engine.computeEnhancements(rtfItem);

    //validate plain text results
    final Entry<IRI, Blob> textPart =
            ContentItemHelper.getBlob(rtfItem, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");

    //validate XHTML results
    final Entry<IRI, Blob> markupPart =
            ContentItemHelper.getBlob(rtfItem, singleton("application/xhtml+xml"));
    assertNotNull(markupPart);
    final Blob markupBlob = markupPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(markupBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<title>",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testContentTypeDetection.
/**
 * Enhances the {@code test.pdf} fixture while declaring its media type as
 * {@code OCTET_STREAM} and expects the same plain text and XHTML content
 * parts as for a correctly typed PDF.
 * NOTE(review): judging by the test name this exercises the engine's
 * content type detection - confirm against the engine implementation.
 *
 * @throws EngineException propagated from the engine calls
 * @throws IOException propagated from creating the content item or reading the content
 */
@Test
public void testContentTypeDetection() throws EngineException, IOException {
    log.info(">>> testContentTypeDetection <<<");
    final ContentItem untypedPdf = createContentItem("test.pdf", OCTET_STREAM.toString());
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(untypedPdf));
    engine.computeEnhancements(untypedPdf);

    //validate plain text results
    final Entry<IRI, Blob> textPart =
            ContentItemHelper.getBlob(untypedPdf, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities");

    //validate XHTML results
    final Entry<IRI, Blob> markupPart =
            ContentItemHelper.getBlob(untypedPdf, singleton("application/xhtml+xml"));
    assertNotNull(markupPart);
    final Blob markupBlob = markupPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(markupBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<div class=\"page\">",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testUnsupported.
/**
 * Enhances an Apple Pages document ({@code test.pages}) and expects the
 * engine to neither fail nor add a "text/plain" content part, so that the
 * content item still holds exactly one Blob content part afterwards.
 *
 * @throws EngineException propagated from the engine calls
 * @throws IOException propagated from creating the content item
 */
@Test
public void testUnsupported() throws EngineException, IOException {
    log.info(">>> testUnsupported <<<");
    final ContentItem pagesItem =
            createContentItem("test.pages", "application/x-iwork-pages-sffpages");
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(pagesItem));
    engine.computeEnhancements(pagesItem);

    //the engine MUST NOT raise an error, but it also must not add a content part
    final Entry<IRI, Blob> textPart =
            ContentItemHelper.getBlob(pagesItem, singleton("text/plain"));
    assertNull(textPart);

    //the original content is the only Blob part left
    assertEquals(1, ContentItemHelper.getContentParts(pagesItem, Blob.class).size());
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TestNamedEntityExtractionEnhancementEngine method testCustomModel.
/**
 * Runs the NER engine with its default model types cleared and a custom
 * name finder model ({@code bionlp2004-DNA-en.bin}) registered for "en",
 * with the "DNA" type mapped to the GRO DNA IRI. All text annotations are
 * then validated against the expected property values and exactly seven
 * annotations are expected.
 *
 * @throws EngineException propagated from the engine call
 * @throws IOException declared by the test signature
 */
@Test
public void testCustomModel() throws EngineException, IOException {
    final String dnaTypeUri = "http://www.bootstrep.eu/ontology/GRO#DNA";
    final String customModel = "bionlp2004-DNA-en.bin";

    ContentItem item = wrapAsContentItem("urn:test:content-item:single:sentence", EHEALTH, "en");

    //no default models for this test ...
    nerEngine.config.getDefaultModelTypes().clear();
    //... only the custom model shipped with the test data
    nerEngine.config.addCustomNameFinderModel("en", customModel);
    nerEngine.config.setMappedType("DNA", new IRI(dnaTypeUri));
    nerEngine.computeEnhancements(item);

    Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(Properties.ENHANCER_EXTRACTED_FROM, item.getUri());
    LiteralFactory literals = LiteralFactory.getInstance();
    expected.put(Properties.DC_CREATOR, literals.createTypedLiteral(nerEngine.getClass().getName()));
    //a null expected value marks confidence as a required property
    expected.put(Properties.ENHANCER_CONFIDENCE, null);
    //dc:type values MUST be the URI configured as mapped type
    expected.put(Properties.DC_TYPE, new IRI(dnaTypeUri));

    Graph metadata = item.getMetadata();
    int annotationCount = validateAllTextAnnotations(metadata, EHEALTH, expected);
    assertEquals(7, annotationCount);
}
Aggregations