Search in sources:

Example 1 with ContentSink

Use of org.apache.stanbol.enhancer.servicesapi.ContentSink in project stanbol by apache.

From class ContentItemFactoryTest, method testContentSink:

@Test
public void testContentSink() throws IOException {
    // Verifies that a ContentSink created with an explicit charset parameter
    // exposes the plain mime type, reports the charset via getParameter(),
    // returns a stable OutputStream and round-trips written content.
    String mt = "text/plain";
    Charset ISO8859_4 = Charset.forName("ISO-8859-4");
    ContentSink cs = contentItemFactory.createContentSink(mt + "; charset=" + ISO8859_4.name());
    assertNotNull(cs);
    assertNotNull(cs.getBlob());
    OutputStream out = cs.getOutputStream();
    // BUGFIX: assert the OutputStream itself (the original re-asserted cs,
    // leaving a null OutputStream undetected)
    assertNotNull(out);
    // multiple calls MUST return the same OutputStream!
    assertSame(out, cs.getOutputStream());
    //test mime type
    assertNotNull(cs.getBlob().getMimeType());
    //get MimeType MUST return the simple mime type
    assertEquals(mt, cs.getBlob().getMimeType());
    String charsetParam = cs.getBlob().getParameter().get("charset");
    assertNotNull("expected charset parameter is missing!", charsetParam);
    assertEquals(ISO8859_4.name(), charsetParam);
    //now write some data to the sink
    String TEST_CONTENT = "Thîs tésts wrîtîng to â ConténtSînk!";
    //note this uses the same charset as parsed as charset in the
    //constructor!
    IOUtils.write(TEST_CONTENT, cs.getOutputStream(), ISO8859_4.name());
    IOUtils.closeQuietly(cs.getOutputStream());
    //now read the data from the blob
    String content = IOUtils.toString(cs.getBlob().getStream(), charsetParam);
    assertEquals(TEST_CONTENT, content);
}
Also used : OutputStream(java.io.OutputStream) Charset(java.nio.charset.Charset) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) Test(org.junit.Test)

Example 2 with ContentSink

Use of org.apache.stanbol.enhancer.servicesapi.ContentSink in project stanbol by apache.

From class ContentItemFactoryTest, method testContentSinkDefaultMimeType:

@Test
public void testContentSinkDefaultMimeType() throws IOException {
    // Passing null as the mime type must fall back to the default
    // "application/octet-stream" and must not set any charset parameter.
    ContentSink sink = contentItemFactory.createContentSink(null);
    assertNotNull(sink);
    assertNotNull(sink.getBlob());
    String mimeType = sink.getBlob().getMimeType();
    assertNotNull(mimeType);
    // getMimeType() MUST return the plain type without parameters
    assertEquals("application/octet-stream", mimeType);
    assertNull(sink.getBlob().getParameter().get("charset"));
}
Also used : ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) Test(org.junit.Test)

Example 3 with ContentSink

Use of org.apache.stanbol.enhancer.servicesapi.ContentSink in project stanbol by apache.

From class MetaxaEngine, method computeEnhancements:

/**
 * Processes the parsed {@link ContentItem} with Metaxa: metadata statements
 * extracted from the content are added to the metadata graph of the
 * ContentItem and - if plain text was extracted - the text is added as an
 * additional content part under the URI "urn:metaxa:plain-text:{uuid}".
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException if the Metaxa extraction fails or the plain text
 *         Blob can not be created or written
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction (read lock: we only consume the stream)
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // a null model means Metaxa could not extract anything for this mime type
    if (null == m) {
        // BUGFIX: corrected typo "preocess" -> "process" in the log message
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        // BUGFIX: added the missing space between the concatenated message parts
        throw new EngineException("Unable to initialise Blob for storing "
                + "the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
    //used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        //first add to a temporary graph; it is copied to the ContentItem
        //under a write lock only after the whole model was consumed
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        //reuse the already computed text (the original called
                        //oneStmt.getObject().toString() a second time)
                        out.write(text);
                    } catch (IOException e) {
                        // BUGFIX: added the missing space between the
                        // concatenated message parts
                        throw new EngineException("Unable to write extracted "
                                + "plain text to Blob (blob impl: "
                                + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        //add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) Triple(org.apache.clerezza.commons.rdf.Triple) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Model(org.ontoware.rdf2go.model.Model) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 4 with ContentSink

Use of org.apache.stanbol.enhancer.servicesapi.ContentSink in project stanbol by apache.

From class TikaEngine, method computeEnhancements:

/**
 * Converts the parsed {@link ContentItem} to plain text - and, for non-XHTML
 * sources, additionally to XHTML - by using Apache Tika. The converted
 * versions are added as content parts ("urn:tika:text:{uuid}" and
 * "urn:tika:xhtml:{uuid}"). Extracted Tika metadata is mapped to the RDF
 * metadata of the ContentItem; unmapped properties are optionally added
 * under the Tika URN prefix.
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException if Blob creation fails or Tika is unable to parse
 *         the content
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //we need not to process plain text!
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    // FIX: renamed misspelled local "supproted" -> "supported"
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            // BUGFIX: added the missing space between the concatenated parts
            throw new EngineException("Error while initialising Blob for "
                    + "writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        final ContentHandler textHandler = new //only the Body
        BodyContentHandler(//skip ignoreable
        new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    // BUGFIX: added the missing space between the concatenated parts
                    throw new EngineException("Error while initialising Blob for "
                            + "writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                //NOTE: the redundant "xhtmlSink = null;" of the original was
                //removed - xhtmlSink is still null on this branch
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {

                    public Object run() throws IOException, SAXException, TikaException {
                        /* 
                             * We need to replace the context Classloader with the Bundle ClassLoader
                             * to ensure that Singleton instances of XML frameworks (such as node4j) 
                             * do not leak into the OSGI environment.
                             * 
                             * Most Java XML libs prefer to load implementations by using the 
                             * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                             * this {@link ClassLoader}. Because of that there can be situations where
                             * Interfaces are loaded via the Bundle Classloader and the implementations
                             * are taken from the context Classloader. What can cause 
                             * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                             * 
                             * Setting the context Classloader to the Bundle classloader helps to avoid
                             * those situations.
                             */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //add the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //only properties with a namespace prefix (or all, when
                    //includeAllUnmappedProperties is active) are added
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
//else not supported format
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) IRI(org.apache.clerezza.commons.rdf.IRI) Metadata(org.apache.tika.metadata.Metadata) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) SAXException(org.xml.sax.SAXException) MediaType(org.apache.tika.mime.MediaType) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) HashSet(java.util.HashSet) MultiHandler(org.apache.stanbol.enhancer.engines.tika.handler.MultiHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ConfigurationException(org.osgi.service.cm.ConfigurationException) SAXException(org.xml.sax.SAXException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) Graph(org.apache.clerezza.commons.rdf.Graph) PlainTextHandler(org.apache.stanbol.enhancer.engines.tika.handler.PlainTextHandler) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

ContentSink (org.apache.stanbol.enhancer.servicesapi.ContentSink)4 IOException (java.io.IOException)2 OutputStreamWriter (java.io.OutputStreamWriter)2 Graph (org.apache.clerezza.commons.rdf.Graph)2 IRI (org.apache.clerezza.commons.rdf.IRI)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)2 Test (org.junit.Test)2 BufferedInputStream (java.io.BufferedInputStream)1 BufferedWriter (java.io.BufferedWriter)1 InputStream (java.io.InputStream)1 OutputStream (java.io.OutputStream)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 Writer (java.io.Writer)1 Charset (java.nio.charset.Charset)1 PrivilegedActionException (java.security.PrivilegedActionException)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 BlankNode (org.apache.clerezza.commons.rdf.BlankNode)1 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)1