Search in sources :

Example 66 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testOdt.

/**
 * Runs the engine on the {@code test.odt} fixture (declared as
 * {@code application/vnd.oasis.opendocument.text}) and checks that both a
 * {@code text/plain} and an {@code application/xhtml+xml} blob can be looked up on the
 * content item afterwards. Each blob is then handed to {@code assertContentRegexp}
 * together with the snippets expected in it.
 */
@Test
public void testOdt() throws EngineException, IOException {
    log.info(">>> testOdt <<<");
    final ContentItem item = createContentItem("test.odt", "application/vnd.oasis.opendocument.text");
    // the engine must not reject this media type
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(item));
    engine.computeEnhancements(item);

    // validate plain text results
    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(item, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(
        textBlob,
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");

    // validate XHTML results
    final Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(item, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    final Blob markupBlob = xhtmlPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(
        markupBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<title>",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 67 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testMp3.

/**
 * Runs the engine on the {@code testMP3id3v24.mp3} fixture (declared as {@code audio/mpeg}).
 * <p>
 * Checks that a {@code text/plain} blob exists and passes {@code assertContentRegexp} for
 * the title, artist and album strings, that an {@code application/xhtml+xml} blob exists,
 * and that the node found via the media {@code hasTrack} property carries the expected
 * types and the {@code hasFormat}, {@code samplingRate} and {@code hasCompression} values.
 */
@Test
public void testMp3() throws EngineException, IOException, ParseException {
    log.info(">>> testMp3 <<<");
    final ContentItem item = createContentItem("testMP3id3v24.mp3", "audio/mpeg");
    // the engine must not reject this media type
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(item));
    engine.computeEnhancements(item);

    // validate plain text results
    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(item, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob, "Test Title", "Test Artist", "Test Album");

    // validate XHTML results (only presence is checked here, not content)
    final Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(item, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    assertNotNull(xhtmlPart.getValue());

    // Test AudioTrack metadata; every IRI below shares the media namespace prefix
    final String mediaNs = String.valueOf(NamespaceEnum.media);
    final BlankNodeOrIRI trackNode = verifyBlankNodeOrIRI(item, new IRI(mediaNs + "hasTrack"));

    // types
    verifyValues(
        item,
        trackNode,
        RDF.type,
        new IRI(mediaNs + "MediaFragment"),
        new IRI(mediaNs + "Track"),
        new IRI(mediaNs + "AudioTrack"));

    // properties
    verifyValue(item, trackNode, new IRI(mediaNs + "hasFormat"), XSD.string, "Mono");
    verifyValue(item, trackNode, new IRI(mediaNs + "samplingRate"), XSD.int_, "44100");
    verifyValue(item, trackNode, new IRI(mediaNs + "hasCompression"), XSD.string, "MP3");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 68 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testMsWord.

/**
 * Runs the engine on the {@code test.doc} fixture (declared as {@code application/msword})
 * and checks that both a {@code text/plain} and an {@code application/xhtml+xml} blob can
 * be looked up on the content item afterwards. Each blob is then handed to
 * {@code assertContentRegexp} together with the snippets expected in it.
 */
@Test
public void testMsWord() throws EngineException, IOException {
    log.info(">>> testMsWord <<<");
    final ContentItem item = createContentItem("test.doc", "application/msword");
    // the engine must not reject this media type
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(item));
    engine.computeEnhancements(item);

    // validate plain text results
    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(item, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(
        textBlob,
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");

    // validate XHTML results
    final Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(item, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    final Blob markupBlob = xhtmlPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(
        markupBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<title>",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 69 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testHtml.

/**
 * Runs the engine on the {@code test.html} fixture (declared as
 * {@code text/html; charset=UTF-8}) and checks that both a {@code text/plain} and an
 * {@code application/xhtml+xml} blob can be looked up on the content item afterwards.
 * Each blob is then handed to {@code assertContentRegexp} together with the snippets
 * expected in it; for the XHTML blob this includes the complete {@code <title>} element.
 */
@Test
public void testHtml() throws EngineException, IOException {
    log.info(">>> testHtml <<<");
    final ContentItem item = createContentItem("test.html", "text/html; charset=UTF-8");
    // the engine must not reject this media type
    assertFalse(CANNOT_ENHANCE == engine.canEnhance(item));
    engine.computeEnhancements(item);

    // validate plain text results
    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(item, singleton("text/plain"));
    assertNotNull(textPart);
    final Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(
        textBlob,
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");

    // validate XHTML results
    final Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(item, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    final Blob markupBlob = xhtmlPart.getValue();
    assertNotNull(markupBlob);
    assertContentRegexp(
        markupBlob,
        "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
        "<head>",
        "<meta name=",
        "<title>The Apache Stanbol Enhancer</title>",
        "The Apache Stanbol Enhancer",
        "The Stanbol enhancer can detect famous cities",
        "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 70 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TestOpenCalaisEngine method testCalaisConnection.

/**
 * Requests an analysis of {@code TEST_TEXT} from the Calais extractor and asserts that
 * querying the returned model yields at least one entity occurrence.
 * <p>
 * The test is guarded by a JUnit assumption on the extractor's license key being non-null.
 * When either extractor call throws an {@link EngineException}, the exception is passed to
 * {@code RemoteServiceHelper.checkServiceUnavailable} and, if that call returns normally,
 * the test ends without further assertions.
 */
@Test
public void testCalaisConnection() throws IOException, EngineException {
    Assume.assumeNotNull(calaisExtractor.getLicenseKey());

    // NOTE(review): this content item is tagged with language "en" but is not passed to
    // the extractor calls below - confirm whether the setup is still needed.
    final ContentItem item = wrapAsContentItem(TEST_TEXT);
    item.getMetadata().add(
        new TripleImpl(
            item.getUri(),
            Properties.DC_LANGUAGE,
            LiteralFactory.getInstance().createTypedLiteral("en")));

    final Collection<CalaisEntityOccurrence> occurrences;
    try {
        final Graph analysis = calaisExtractor.getCalaisAnalysis(TEST_TEXT, "text/plain");
        // an assertion failure is not an EngineException, so it propagates past the catch
        Assert.assertNotNull("No model", analysis);
        occurrences = calaisExtractor.queryModel(analysis);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }

    LOG.info("Found entities: {}", occurrences.size());
    LOG.debug("Entities:\n{}", occurrences);
    Assert.assertFalse("No entities found!", occurrences.isEmpty());
}
Also used : Graph(org.apache.clerezza.commons.rdf.Graph) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)73 Test (org.junit.Test)62 IRI (org.apache.clerezza.commons.rdf.IRI)46 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)18 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)18 HashMap (java.util.HashMap)15 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)13 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)12 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 Graph (org.apache.clerezza.commons.rdf.Graph)8 Date (java.util.Date)6 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)6 Hashtable (java.util.Hashtable)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 IOException (java.io.IOException)3 InputStream (java.io.InputStream)3 MediaType (javax.ws.rs.core.MediaType)3 Triple (org.apache.clerezza.commons.rdf.Triple)3