Search in sources :

Example 61 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class ContentItemFactoryTest method testDefaultId.

/**
 * Checks that a created content item always carries a
 * non-<code>null</code> URI when the caller either passes no id at all
 * or passes <code>null</code> as the id.
 */
@Test
public void testDefaultId() throws IOException {
    // variant 1: only the content source is supplied
    final ContentItem withoutId = contentItemFactory.createContentItem(TEST_CS);
    assertNotNull(withoutId);
    assertNotNull(withoutId.getUri());

    // variant 2: an explicit null id together with the content source
    final ContentItem withNullId = contentItemFactory.createContentItem((IRI) null, TEST_CS);
    assertNotNull(withNullId);
    assertNotNull(withNullId.getUri());

    // variant 3: an explicit null id, the content source and a fresh SimpleGraph
    final ContentItem withNullIdAndGraph =
            contentItemFactory.createContentItem((IRI) null, TEST_CS, new SimpleGraph());
    assertNotNull(withNullIdAndGraph);
    assertNotNull(withNullIdAndGraph.getUri());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 62 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class DereferenceEngineTest method testAsyncOtherEntityReferenceDereferencing.

/**
 * Test for <a href="https://issues.apache.org/jira/browse/STANBOL-1334">STANBOL-1334</a>:
 * builds an {@link EntityDereferenceEngine} on top of the asynchronous
 * dereferencer, configures <code>OTHER_ENTITY_REFERENCE</code> under the
 * <code>ENTITY_REFERENCES</code> key, runs the engine on a test content item
 * and validates the dereferenced entities in the item's metadata.
 *
 * @throws Exception if creating the content item, configuring the engine or
 *         computing the enhancements fails
 */
@Test
public void testAsyncOtherEntityReferenceDereferencing() throws Exception {
    // NOTE(review): the id says "Sync" although this is the async variant; it
    // looks like a copy/paste leftover. Left unchanged - confirm before renaming.
    ContentItem ci = getContentItem("urn:test:testSyncDereferencing");
    Dictionary<String, Object> dict = new Hashtable<String, Object>();
    dict.put(EnhancementEngine.PROPERTY_NAME, "async");
    // both language filters are switched off for this test
    dict.put(FILTER_CONTENT_LANGUAGES, false);
    dict.put(FILTER_ACCEPT_LANGUAGES, false);
    dict.put(ENTITY_REFERENCES, OTHER_ENTITY_REFERENCE.getUnicodeString());
    DereferenceEngineConfig config = new DereferenceEngineConfig(dict, null);
    EntityDereferenceEngine engine = new EntityDereferenceEngine(asyncDereferencer, config);
    // JUnit's contract is assertNotEquals(unexpected, actual): the value that
    // must not occur goes first, the value under test second.
    Assert.assertNotEquals(EnhancementEngine.CANNOT_ENHANCE, engine.canEnhance(ci));
    engine.computeEnhancements(ci);
    validateDereferencedEntities(ci.getMetadata(), OTHER_ENTITY_REFERENCE);
}
Also used : Hashtable(java.util.Hashtable) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 63 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class DereferenceEngineTest method getContentItem.

/**
 * Creates a content item for the given id and adds the shared
 * <code>testMetadata</code> to its metadata.
 *
 * @param id the string used to build the {@link IRI} of the content item
 * @return the newly created content item with the test metadata added
 * @throws IOException declared for the content item creation
 */
public static ContentItem getContentItem(final String id) throws IOException {
    final IRI uri = new IRI(id);
    // the textual content is a fixed placeholder string
    final StringSource placeholder = new StringSource("Not used");
    final ContentItem item = ciFactory.createContentItem(uri, placeholder);
    item.getMetadata().addAll(testMetadata);
    return item;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem)

Example 64 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testEMail.

/**
 * Runs the engine on the <code>test.email.txt</code> resource (an RFC 822
 * message) and checks the plain text blob, the XHTML blob and the metadata
 * values verified via <code>verifyValue</code>.
 */
@Test
public void testEMail() throws EngineException, IOException, ParseException {
    // values that are expected several times further down
    final String mimeType = "message/rfc822";
    final String sender = "Julien Nioche (JIRA) <jira@apache.org>";
    final String timestamp = "2010-09-06T09:25:34Z";
    final String subject = "[jira] Commented: (TIKA-461) RFC822 messages not parsed";

    log.info(">>> testEMail <<<");
    ContentItem ci = createContentItem("test.email.txt", mimeType);
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);

    // --- plain text result ---
    Entry<IRI, Blob> plainTextPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(plainTextPart);
    Blob plainTextBlob = plainTextPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob,
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "> RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");

    // --- XHTML result ---
    Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    Blob xhtmlBlob = xhtmlPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob,
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
            "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
            "<body><p>",
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "&gt; RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");

    // --- extracted metadata ---
    // Dublin Core
    // STANBOL-757: dc:date is no longer added by Tika 1.2 (dc:created is still
    // present), so the former check is disabled:
    //   verifyValue(ci, new IRI(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "format"), null, mimeType);
    // STANBOL-757: dc:subject is no longer added by Tika 1.2 (dc:title is used
    // instead), so the former check is disabled:
    //   verifyValue(ci, new IRI(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "title"), null, subject);
    verifyValue(ci, new IRI(NamespaceEnum.dc + "creator"), null, sender);
    verifyValue(ci, new IRI(NamespaceEnum.dc + "created"), XSD.dateTime, timestamp);

    // Media Ontology
    verifyValue(ci, new IRI(NamespaceEnum.media + "creationDate"), XSD.dateTime, timestamp);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasFormat"), null, mimeType);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasCreator"), null, sender);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasContributor"), null, sender);
    // STANBOL-757: media:hasKeyword was present with Tika 1.1 because of its
    // mapping from dc:subject; the former check is disabled:
    //   verifyValue(ci, new IRI(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");

    // Nepomuk Message Ontology
    final String nmoNamespace = "http://www.semanticdesktop.org/ontologies/2007/03/22/nmo#";
    verifyValue(ci, new IRI(nmoNamespace + "from"), null, sender);
    verifyValue(ci, new IRI(nmoNamespace + "to"), null, "dev@tika.apache.org");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 65 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testXhtml.

/**
 * Runs the engine on the <code>test.xhtml</code> resource and checks that a
 * plain text blob with the expected content is available and that the
 * content item ends up with exactly two {@link Blob} content parts.
 */
@Test
public void testXhtml() throws EngineException, IOException {
    log.info(">>> testXhtml <<<");
    final String mimeType = XHTML.toString() + "; charset=UTF-8";
    ContentItem ci = createContentItem("test.xhtml", mimeType);
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);

    // the plain text version must be available
    Entry<IRI, Blob> plainTextPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(plainTextPart);
    Blob plainTextBlob = plainTextPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob,
            "The Apache Stanbol Enhancer",
            "The Stanbol enhancer can detect famous cities");

    // Exactly two blobs are expected: the original content and the plain text.
    // This guards against an additional XHTML blob being created from content
    // that already is XHTML.
    final int blobCount = ContentItemHelper.getContentParts(ci, Blob.class).size();
    assertEquals(2, blobCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem) 73, Test (org.junit.Test) 62, IRI (org.apache.clerezza.commons.rdf.IRI) 46, BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI) 18, RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm) 18, HashMap (java.util.HashMap) 15, TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) 15, Blob (org.apache.stanbol.enhancer.servicesapi.Blob) 15, StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource) 13, EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException) 12, PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) 11, Graph (org.apache.clerezza.commons.rdf.Graph) 8, Date (java.util.Date) 6, SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) 6, Hashtable (java.util.Hashtable) 5, AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText) 4, IOException (java.io.IOException) 3, InputStream (java.io.InputStream) 3, MediaType (javax.ws.rs.core.MediaType) 3, Triple (org.apache.clerezza.commons.rdf.Triple) 3