Search in sources :

Example 36 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testOdt.

@Test
public void testOdt() throws EngineException, IOException {
    log.info(">>> testOdt <<<");
    ContentItem ci = createContentItem("test.odt", "application/vnd.oasis.opendocument.text");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<title>", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 37 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testMp3.

@Test
public void testMp3() throws EngineException, IOException, ParseException {
    log.info(">>> testMp3 <<<");
    ContentItem ci = createContentItem("testMP3id3v24.mp3", "audio/mpeg");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "Test Title", "Test Artist", "Test Album");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    //Test AudioTrack metadata
    BlankNodeOrIRI audioTrack = verifyBlankNodeOrIRI(ci, new IRI(NamespaceEnum.media + "hasTrack"));
    //types
    verifyValues(ci, audioTrack, RDF.type, new IRI(NamespaceEnum.media + "MediaFragment"), new IRI(NamespaceEnum.media + "Track"), new IRI(NamespaceEnum.media + "AudioTrack"));
    //properties
    verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "hasFormat"), XSD.string, "Mono");
    verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "samplingRate"), XSD.int_, "44100");
    verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "hasCompression"), XSD.string, "MP3");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 38 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testMsWord.

@Test
public void testMsWord() throws EngineException, IOException {
    log.info(">>> testMsWord <<<");
    ContentItem ci = createContentItem("test.doc", "application/msword");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<title>", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 39 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testHtml.

@Test
public void testHtml() throws EngineException, IOException {
    log.info(">>> testHtml <<<");
    ContentItem ci = createContentItem("test.html", "text/html; charset=UTF-8");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<title>The Apache Stanbol Enhancer</title>", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 40 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class OpenCalaisEngine method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with an supported Mimetype '" + SUPPORTED_MIMETYPES + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    Graph calaisModel = getCalaisAnalysis(text, contentPart.getValue().getMimeType());
    if (calaisModel != null) {
        //Acquire a write lock on the ContentItem when adding the enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(queryModel(calaisModel), ci);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("Calais Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Serializer(org.apache.clerezza.rdf.core.serializedform.Serializer)

Aggregations

Blob (org.apache.stanbol.enhancer.servicesapi.Blob)44 IRI (org.apache.clerezza.commons.rdf.IRI)36 Test (org.junit.Test)21 IOException (java.io.IOException)20 Graph (org.apache.clerezza.commons.rdf.Graph)17 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)15 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)15 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)15 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)14 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)11 SOAPException (javax.xml.soap.SOAPException)4 Language (org.apache.clerezza.commons.rdf.Language)4 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 MediaType (javax.ws.rs.core.MediaType)3 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)3 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)3 NoSuchPartException (org.apache.stanbol.enhancer.servicesapi.NoSuchPartException)3 StreamSource (org.apache.stanbol.enhancer.servicesapi.impl.StreamSource)3