use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class ContentItemFactoryTest method testDefaultId.
/**
 * Checks that a created content item always carries a non-<code>null</code>
 * URI, both when no id is passed and when <code>null</code> is passed as id.
 */
@Test
public void testDefaultId() throws IOException {
    // typed null so the IRI based overloads are selected
    final IRI missingId = null;

    // variant 1: factory method without any id parameter
    final ContentItem withoutId = contentItemFactory.createContentItem(TEST_CS);
    assertNotNull(withoutId);
    assertNotNull(withoutId.getUri());

    // variant 2: explicit null id
    final ContentItem withNullId = contentItemFactory.createContentItem(missingId, TEST_CS);
    assertNotNull(withNullId);
    assertNotNull(withNullId.getUri());

    // variant 3: explicit null id together with a caller supplied graph
    final ContentItem withNullIdAndGraph =
            contentItemFactory.createContentItem(missingId, TEST_CS, new SimpleGraph());
    assertNotNull(withNullIdAndGraph);
    assertNotNull(withNullIdAndGraph.getUri());
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class DereferenceEngineTest method testAsyncOtherEntityReferenceDereferencing.
/**
 * Test for <a href="https://issues.apache.org/jira/browse/STANBOL-1334">STANBOL-1334</a>.
 * <p>
 * Configures an {@link EntityDereferenceEngine} named "async" on top of the
 * <code>asyncDereferencer</code>, with both language filters switched off and
 * {@code OTHER_ENTITY_REFERENCE} configured as the entity reference property.
 * The engine must accept the content item, and after enhancement the metadata
 * is validated against {@code OTHER_ENTITY_REFERENCE}.
 *
 * @throws Exception on any failure while configuring or running the engine
 */
@Test
public void testAsyncOtherEntityReferenceDereferencing() throws Exception {
    // NOTE(review): the id says "Sync" although this is the async variant -
    // looks like a copy/paste leftover; kept unchanged here, please confirm.
    ContentItem ci = getContentItem("urn:test:testSyncDereferencing");

    Dictionary<String, Object> dict = new Hashtable<String, Object>();
    dict.put(EnhancementEngine.PROPERTY_NAME, "async");
    dict.put(FILTER_CONTENT_LANGUAGES, false);
    dict.put(FILTER_ACCEPT_LANGUAGES, false);
    dict.put(ENTITY_REFERENCES, OTHER_ENTITY_REFERENCE.getUnicodeString());

    DereferenceEngineConfig config = new DereferenceEngineConfig(dict, null);
    EntityDereferenceEngine engine = new EntityDereferenceEngine(asyncDereferencer, config);

    // JUnit expects (unexpected, actual); with the arguments swapped a failure
    // message would label the values the wrong way round.
    Assert.assertNotEquals(EnhancementEngine.CANNOT_ENHANCE, engine.canEnhance(ci));

    engine.computeEnhancements(ci);
    validateDereferencedEntities(ci.getMetadata(), OTHER_ENTITY_REFERENCE);
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class DereferenceEngineTest method getContentItem.
/**
 * Creates a content item with the given id and a placeholder text content,
 * then copies all triples of <code>testMetadata</code> into its metadata.
 *
 * @param id the string used to build the {@link IRI} of the content item
 * @return the created content item with the test metadata added
 * @throws IOException if the content item can not be created
 */
public static ContentItem getContentItem(final String id) throws IOException {
    final IRI uri = new IRI(id);
    // the textual content is only a placeholder
    final StringSource placeholder = new StringSource("Not used");
    final ContentItem item = ciFactory.createContentItem(uri, placeholder);
    item.getMetadata().addAll(testMetadata);
    return item;
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testEMail.
/**
 * Runs the engine on the RFC822 test mail and checks the plain text blob,
 * the XHTML blob and the metadata values added to the content item.
 */
@Test
public void testEMail() throws EngineException, IOException, ParseException {
    log.info(">>> testEMail <<<");

    // expected values that are asserted more than once below
    final String mailType = "message/rfc822";
    final String author = "Julien Nioche (JIRA) <jira@apache.org>";
    final String created = "2010-09-06T09:25:34Z";

    ContentItem ci = createContentItem("test.email.txt", mailType);
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(ci));
    engine.computeEnhancements(ci);

    // validate the plain text results
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(textPart);
    Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "> RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");

    // validate the XHTML results
    Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    Blob markupBlob = xhtmlPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(markupBlob,
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
            "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
            "<body><p>",
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "> RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");

    // now check the extracted metadata

    // Dublin Core
    // STANBOL-757: Tika 1.2 no longer adds dc:date (dc:created is still
    // present) nor dc:subject (dc:title is used instead), so neither of the
    // two is asserted here.
    verifyValue(ci, new IRI(NamespaceEnum.dc + "format"), null, mailType);
    verifyValue(ci, new IRI(NamespaceEnum.dc + "title"), null,
            "[jira] Commented: (TIKA-461) RFC822 messages not parsed");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "creator"), null, author);
    verifyValue(ci, new IRI(NamespaceEnum.dc + "created"), XSD.dateTime, created);

    // Media Ontology
    // STANBOL-757: media:hasKeyword was only present with Tika 1.1 because of
    // its mapping from dc:subject, so it is not asserted here.
    verifyValue(ci, new IRI(NamespaceEnum.media + "creationDate"), XSD.dateTime, created);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasFormat"), null, mailType);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasCreator"), null, author);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasContributor"), null, author);

    // Nepomuk Message
    final String nmo = "http://www.semanticdesktop.org/ontologies/2007/03/22/nmo#";
    verifyValue(ci, new IRI(nmo + "from"), null, author);
    verifyValue(ci, new IRI(nmo + "to"), null, "dev@tika.apache.org");
}
use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.
the class TikaEngineTest method testXhtml.
/**
 * Runs the engine on the XHTML test file and checks the plain text blob
 * as well as the number of {@link Blob} content parts afterwards.
 */
@Test
public void testXhtml() throws EngineException, IOException {
    log.info(">>> testXhtml <<<");

    final String mediaType = XHTML.toString() + "; charset=UTF-8";
    ContentItem ci = createContentItem("test.xhtml", mediaType);
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(ci));
    engine.computeEnhancements(ci);

    // the plain text version must be present and contain the expected phrases
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(textPart);
    Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
            "The Apache Stanbol Enhancer",
            "The Stanbol enhancer can detect famous cities");

    // Only the original and the plain text blob are expected: this asserts
    // that no xhtml is parsed from the parsed xhtml content.
    assertEquals(2, ContentItemHelper.getContentParts(ci, Blob.class).size());
}
Aggregations