Search in sources :

Example 21 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class WebPParser.

/**
 * Extracts WebP image metadata into {@code metadata} and emits an empty XHTML document.
 *
 * <p>The stream is wrapped in a {@link TikaInputStream} so that it can be handed to the
 * metadata extractor as a file; the temporary resources backing that wrapper are disposed
 * before any SAX events are written.
 *
 * @param stream   the WebP content to read
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata populated by the image metadata extractor
 * @param context  parse context (not consulted by this method)
 * @throws IOException   declared by the signature; may come from obtaining the backing file
 * @throws SAXException  if the handler rejects the document start/end events
 * @throws TikaException declared by the signature; may come from metadata extraction
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream spooled = TikaInputStream.get(stream, resources);
        ImageMetadataExtractor extractor = new ImageMetadataExtractor(metadata);
        // The extractor works on a file rather than on the stream itself.
        extractor.parseWebP(spooled.getFile());
    } finally {
        // Always release whatever the wrapper registered, even if extraction failed.
        resources.dispose();
    }

    // No textual content is produced: just open and close the document.
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
Also used : TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 22 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class RFC822Parser.

/**
 * Parses an RFC 822 style email message with mime4j, forwarding content to the given handler.
 *
 * <p>A {@link MimeConfig} with raised line and header length limits is used unless the
 * parse context supplies its own configuration.
 *
 * @param stream   the message to parse
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object passed on to the mail content handler
 * @param context  parse context; may carry a {@link MimeConfig} override
 * @throws IOException   rethrown by {@code throwIfCauseOf} when applicable
 *                       (NOTE(review): exact semantics are defined by TikaInputStream - confirm)
 * @throws SAXException  if a SAXException was wrapped inside a mime4j {@link MimeException}
 * @throws TikaException if parsing fails for any other reason, or if a TikaException was
 *                       wrapped inside a mime4j {@link MimeException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Fallback configuration, used only when the context does not provide one.
    MimeConfig fallbackConfig = new MimeConfig();
    fallbackConfig.setMaxLineLen(100000);
    // Upper bound on the length of any single header.
    fallbackConfig.setMaxHeaderLen(100000);
    MimeConfig mimeConfig = context.get(MimeConfig.class, fallbackConfig);

    MimeStreamParser mimeParser = new MimeStreamParser(mimeConfig);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    mimeParser.setContentHandler(
            new MailContentHandler(xhtml, metadata, context, mimeConfig.isStrictParsing()));
    mimeParser.setContentDecoding(true);

    TikaInputStream tikaStream = TikaInputStream.get(stream);
    try {
        mimeParser.parse(tikaStream);
    } catch (IOException ioFailure) {
        tikaStream.throwIfCauseOf(ioFailure);
        throw new TikaException("Failed to parse an email message", ioFailure);
    } catch (MimeException mimeFailure) {
        // mime4j may merely be the carrier of an exception raised elsewhere: unwrap it.
        Throwable wrapped = mimeFailure.getCause();
        if (wrapped instanceof TikaException) {
            throw (TikaException) wrapped;
        }
        if (wrapped instanceof SAXException) {
            throw (SAXException) wrapped;
        }
        throw new TikaException("Failed to parse an email message", mimeFailure);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) MimeConfig(org.apache.james.mime4j.stream.MimeConfig) MimeStreamParser(org.apache.james.mime4j.parser.MimeStreamParser) TikaInputStream(org.apache.tika.io.TikaInputStream) MimeException(org.apache.james.mime4j.MimeException) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException)

Example 23 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class MatParser.

/**
 * Parses a MATLAB .mat file: records header information as metadata and writes each
 * variable (and the fields of structure variables) to the handler as XHTML.
 *
 * <p>The header description is expected to look like
 * {@code "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar  2 23:41:57 2014"}.
 * Each of the three comma-separated fields is optional: a field that is missing, or that
 * lacks its expected label, is skipped instead of causing an index error.
 *
 * @param stream   the .mat content to read
 * @param handler  receiver of the XHTML SAX events
 * @param metadata receives the content type plus {@code fileType}, {@code platform},
 *                 {@code createdOn} and {@code endian} entries when available
 * @param context  parse context (not consulted by this method)
 * @throws IOException   declared by the signature; IOExceptions raised inside the method
 *                       body are wrapped in a TikaException
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if reading the file fails with an IOException
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as Matlab
    metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
    // Only own (and later dispose) temporary resources when the caller did not already
    // hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    try {
        // Use TIS so we can spool a temp file for parsing.
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        //input .mat file
        MatFileReader mfr = new MatFileReader(tis.getFile());
        //.mat header information
        MatFileHeader hdr = mfr.getMatFileHeader();
        // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar  2 23:41:57 2014"
        // Break header information into its parts
        String[] parts = hdr.getDescription().split(",");

        // The description is free text, so never assume all three fields are present
        // and never slice on a label that was not actually found.
        if (parts.length > 2 && parts[2].contains("Created")) {
            final String createdLabel = "Created on:";
            int createdAt = parts[2].lastIndexOf(createdLabel);
            if (createdAt >= 0) {
                String dateCreated = parts[2].substring(createdAt + createdLabel.length()).trim();
                metadata.set("createdOn", dateCreated);
            }
        }
        if (parts.length > 1 && parts[1].contains("Platform")) {
            final String platformLabel = "Platform:";
            int platformAt = parts[1].lastIndexOf(platformLabel);
            if (platformAt >= 0) {
                String platform = parts[1].substring(platformAt + platformLabel.length()).trim();
                metadata.set("platform", platform);
            }
        }
        // split() always yields at least one element, so parts[0] is safe.
        if (parts[0].contains("MATLAB")) {
            metadata.set("fileType", parts[0]);
        }

        // Get endian indicator from header file and store it as a string
        String endianCode = new String(hdr.getEndianIndicator(), UTF_8);
        metadata.set("endian", endianCode);

        //Text output
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        //Loop through each variable
        for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
            String varName = entry.getKey();
            MLArray varData = entry.getValue();
            xhtml.element("p", varName + ":" + String.valueOf(varData));
            // If the variable is a structure, extract variable info from structure
            if (varData.isStruct()) {
                MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
                xhtml.startElement("ul");
                xhtml.newline();
                for (MLArray element : mlStructure.getAllFields()) {
                    xhtml.startElement("li");
                    xhtml.characters(String.valueOf(element));
                    // If there is an embedded structure, extract variable info.
                    if (element.isStruct()) {
                        xhtml.startElement("ul");
                        // Should this actually be a recursive call?
                        xhtml.element("li", element.contentToString());
                        xhtml.endElement("ul");
                    }
                    xhtml.endElement("li");
                }
                xhtml.endElement("ul");
            }
        }
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("Error parsing Matlab file with MatParser", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Also used : MatFileReader(com.jmatio.io.MatFileReader) MLArray(com.jmatio.types.MLArray) TikaException(org.apache.tika.exception.TikaException) MatFileHeader(com.jmatio.io.MatFileHeader) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) MLStructure(com.jmatio.types.MLStructure) Map(java.util.Map)

Example 24 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class JackcessParser.

/**
 * Opens the stream as a read-only Jackcess database and writes its extracted content
 * to the handler as XHTML.
 *
 * <p>If the parse context offers a {@link PasswordProvider}, the password it returns for
 * {@code metadata} is handed to the codec provider. An {@link IllegalStateException} whose
 * message contains {@code "Incorrect password"} is reported as an
 * {@link EncryptedDocumentException}; any other one is rethrown unchanged.
 *
 * @param stream   the database content to read
 * @param handler  receiver of the XHTML SAX events
 * @param metadata passed to the password provider and to the extractor
 * @param context  parse context; may carry a {@link PasswordProvider}
 * @throws IOException   if the backing file cannot be obtained or the database cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException on extraction failure, including {@link EncryptedDocumentException}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TikaInputStream tis = TikaInputStream.get(stream);
    Database database = null;
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    PasswordProvider passwordProvider = context.get(PasswordProvider.class);
    String password = (passwordProvider != null) ? passwordProvider.getPassword(metadata) : null;

    try {
        DatabaseBuilder builder = new DatabaseBuilder(tis.getFile());
        // A codec provider is installed even without a password so that an encrypted file
        // yields an encryption/wrong-password error instead of the more generic
        // "need right codec" message.
        CryptCodecProvider codecProvider = (password == null)
                ? new CryptCodecProvider()
                : new CryptCodecProvider(password);
        database = builder.setCodecProvider(codecProvider).setReadOnly(true).open();

        //just in case
        database.setLinkResolver(IGNORE_LINK_RESOLVER);
        JackcessExtractor extractor = new JackcessExtractor(metadata, context, locale);
        extractor.parse(database, xhtml);
    } catch (IllegalStateException e) {
        String message = e.getMessage();
        if (message == null || !message.contains("Incorrect password")) {
            throw e;
        }
        throw new EncryptedDocumentException(e);
    } finally {
        if (database != null) {
            try {
                database.close();
            } catch (IOException ignored) {
                //swallow = silent close
            }
        }
    }
    xhtml.endDocument();
}
Also used : DatabaseBuilder(com.healthmarketscience.jackcess.DatabaseBuilder) CryptCodecProvider(com.healthmarketscience.jackcess.CryptCodecProvider) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) Database(com.healthmarketscience.jackcess.Database) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider)

Example 25 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class OldExcelParser.

/**
 * Extracts text from an old-format Excel input stream.
 *
 * <p>The stream is opened with POI's {@link OldExcelExtractor} and the actual work is
 * delegated to the {@code parse} overload that takes the extractor and an
 * {@link XHTMLContentHandler}. No metadata is populated here: these old formats did not
 * store any.
 *
 * @param stream   the old Excel content to read
 * @param handler  receiver of the XHTML SAX events
 * @param metadata passed through to the XHTML handler
 * @param context  parse context (not consulted by this method)
 * @throws IOException   if the extractor cannot read the stream
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException declared by the signature; may come from the delegated parse
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Let POI open the legacy workbook.
    OldExcelExtractor oldExcel = new OldExcelExtractor(stream);
    // TODO Get the version and type, to set as the Content Type
    // Hand the extracted text to the caller's handler, wrapped as XHTML.
    parse(oldExcel, new XHTMLContentHandler(handler, metadata));
}
Also used : OldExcelExtractor(org.apache.poi.hssf.extractor.OldExcelExtractor) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 72, TikaException (org.apache.tika.exception.TikaException): 26, TikaInputStream (org.apache.tika.io.TikaInputStream): 22, TemporaryResources (org.apache.tika.io.TemporaryResources): 14, CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 13, IOException (java.io.IOException): 12, SAXException (org.xml.sax.SAXException): 9, File (java.io.File): 6, EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 6, Metadata (org.apache.tika.metadata.Metadata): 6, BufferedInputStream (java.io.BufferedInputStream): 5, InputStream (java.io.InputStream): 5, EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 5, ByteArrayInputStream (java.io.ByteArrayInputStream): 4, Charset (java.nio.charset.Charset): 4, ArrayList (java.util.ArrayList): 4, Map (java.util.Map): 4, MediaType (org.apache.tika.mime.MediaType): 4, OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler): 4, InputStreamReader (java.io.InputStreamReader): 3