Search in sources :

Example 16 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class EpubParser.

/**
 * Reads the stream as a zip archive and handles each entry according to its name:
 * the {@code mimetype} entry supplies the content type, {@code metadata.xml} and
 * {@code *.opf} entries go to the {@code meta} parser, and {@code *.html} /
 * {@code *.xhtml} entries go to the {@code content} parser. Other entries are skipped.
 *
 * @param stream   the raw EPub data, read as a zip archive
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object updated while the entries are processed
 * @param context  parse context handed on to the delegate parsers
 * @throws IOException   if reading the archive fails
 * @throws SAXException  if emitting SAX events fails
 * @throws TikaException if a delegate parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // An EPub usually bundles several XHTML files, so the start and end of the
    // output document are controlled here rather than per entry.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String name = entry.getName();
        if (name.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            // The mimetype entry often carries trailing newlines.
            metadata.set(Metadata.CONTENT_TYPE, type == null ? null : type.trim());
        } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
            // Both kinds of metadata entry go through the same parser; its SAX
            // events are discarded via a no-op DefaultHandler.
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }
    // Close the single enclosing document.
    xhtml.endDocument();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ZipInputStream(java.util.zip.ZipInputStream) ZipEntry(java.util.zip.ZipEntry) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ContentHandler(org.xml.sax.ContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 17 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class ExecutableParser.

/**
 * Reads the first four bytes of the stream and, depending on the signature found,
 * delegates to {@code parsePE} ("MZ") or {@code parseELF} (0x7f "ELF"). A stream
 * with neither signature produces an empty document.
 *
 * @param stream   the executable's bytes
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object passed on to the format-specific parser
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting SAX events fails
 * @throws TikaException if the format-specific parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only metadata is produced for now.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // The leading four bytes identify the container format.
    byte[] magic = new byte[4];
    IOUtils.readFully(stream, magic);

    boolean isPE = magic[0] == (byte) 'M' && magic[1] == (byte) 'Z';
    boolean isELF = magic[0] == (byte) 0x7f
            && magic[1] == (byte) 'E'
            && magic[2] == (byte) 'L'
            && magic[3] == (byte) 'F';

    if (isPE) {
        parsePE(xhtml, metadata, stream, magic);
    } else if (isELF) {
        parseELF(xhtml, metadata, stream, magic);
    }

    // Close the document whether or not a known signature was found.
    xhtml.endDocument();
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 18 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class TrueTypeParser.

/**
 * Parses a TrueType font with FontBox and copies its header and naming details
 * into {@code metadata}. The emitted XHTML document is empty: only metadata is
 * reported.
 *
 * @param stream   the font data; when it is a file-backed {@code TikaInputStream}
 *                 the underlying file is parsed instead of the stream
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata metadata object that receives the font properties
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading or parsing the font fails
 * @throws SAXException  if emitting SAX events fails
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TikaInputStream tis = TikaInputStream.cast(stream);
    TrueTypeFont font = null;
    try {
        // Let FontBox do the parsing, from the backing file when one exists.
        TTFParser parser = new TTFParser();
        font = (tis != null && tis.hasFile())
                ? parser.parse(tis.getFile())
                : parser.parse(stream);

        // Header-level details of the font.
        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
        metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated());
        metadata.set(TikaCoreProperties.MODIFIED, font.getHeader().getModified());
        metadata.set(AdobeFontMetricParser.MET_DOC_VERSION, Float.toString(font.getHeader().getVersion()));

        // Naming records: each recognised name id maps onto one or more metadata keys.
        for (NameRecord record : font.getNaming().getNameRecords()) {
            int nameId = record.getNameId();
            if (nameId == NameRecord.NAME_FONT_FAMILY_NAME) {
                metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, record.getString());
            }
            if (nameId == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
                metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, record.getString());
            }
            if (nameId == NameRecord.NAME_FULL_FONT_NAME) {
                // The full font name doubles as the document title.
                metadata.set(AdobeFontMetricParser.MET_FONT_NAME, record.getString());
                metadata.set(TikaCoreProperties.TITLE, record.getString());
            }
            if (nameId == NameRecord.NAME_POSTSCRIPT_NAME) {
                metadata.set(AdobeFontMetricParser.MET_PS_NAME, record.getString());
            }
            if (nameId == NameRecord.NAME_COPYRIGHT) {
                metadata.set("Copyright", record.getString());
            }
            if (nameId == NameRecord.NAME_TRADEMARK) {
                metadata.set("Trademark", record.getString());
            }
        }
    } finally {
        // Always release the font, even if metadata extraction failed part-way.
        if (font != null) {
            font.close();
        }
    }

    // No textual content yet: emit an empty document.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TrueTypeFont(org.apache.fontbox.ttf.TrueTypeFont) NameRecord(org.apache.fontbox.ttf.NameRecord) NamingTable(org.apache.fontbox.ttf.NamingTable) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) TTFParser(org.apache.fontbox.ttf.TTFParser)

Example 19 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class DBFParser.

/**
 * Reads a DBF table and writes it out as an XHTML {@code table}: a {@code thead}
 * with one {@code th} per column, followed by a {@code tbody} with the rows.
 * Up to {@code ROWS_TO_BUFFER_FOR_CHARSET_DETECTION} leading rows are buffered
 * first so that the charset can be determined before any text is emitted.
 *
 * @param stream   the DBF data
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object that receives content type, last-modified date
 *                 (when present) and content encoding
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting SAX events fails
 * @throws TikaException if the DBF data cannot be processed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    DBFReader reader = DBFReader.open(stream);
    DBFFileHeader header = reader.getHeader();

    // Header-derived metadata.
    metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString());
    Calendar lastModified = header.getLastModified();
    if (lastModified != null) {
        metadata.set(TikaCoreProperties.MODIFIED, lastModified);
    }

    // Buffer copies of the leading rows for charset detection. When the loop
    // ends, "row" holds the first row that was NOT buffered (or null).
    List<DBFRow> buffered = new LinkedList<>();
    DBFRow row = reader.next();
    for (int count = 0; row != null && count < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION; count++) {
        buffered.add(row.deepCopy());
        row = reader.next();
    }
    Charset charset = getCharset(buffered, header);
    metadata.set(Metadata.CONTENT_ENCODING, charset.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("table");

    // Column names form the table head.
    xhtml.startElement("thead");
    for (DBFColumnHeader column : header.getCols()) {
        xhtml.startElement("th");
        xhtml.characters(column.getName(charset));
        xhtml.endElement("th");
    }
    xhtml.endElement("thead");

    xhtml.startElement("tbody");
    // Buffered rows go out first, each dropped from the list as it is written...
    while (!buffered.isEmpty()) {
        writeRow(buffered.remove(0), charset, xhtml);
    }
    // ...then the remainder is streamed straight from the reader.
    for (; row != null; row = reader.next()) {
        writeRow(row, charset, xhtml);
    }
    xhtml.endElement("tbody");

    xhtml.endElement("table");
    xhtml.endDocument();
}
Also used : Calendar(java.util.Calendar) Charset(java.nio.charset.Charset) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkedList(java.util.LinkedList)

Example 20 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class DWGParser.

/**
 * Reads the 128-byte header of a DWG stream, takes its first six bytes as the
 * version tag and extracts properties with the routine matching that version.
 * Supported tags are AC1015, AC1018, AC1021 and AC1024.
 *
 * @param stream   the DWG data
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object that receives the content type and properties
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the version tag is not one of the supported ones
 * @throws SAXException  if emitting SAX events fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    // The version tag sits in the first six bytes of the header.
    byte[] header = new byte[128];
    IOUtils.readFully(stream, header);
    String version = new String(header, 0, 6, "US-ASCII");

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    switch (version) {
        case "AC1015":
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipTo2000PropertyInfoSection(stream, header)) {
                get2000Props(stream, metadata, xhtml);
            }
            break;
        case "AC1018":
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2004Props(stream, metadata, xhtml);
            }
            break;
        case "AC1021":
        case "AC1024":
            // These two tags share one property reader.
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2007and2010Props(stream, metadata, xhtml);
            }
            break;
        default:
            throw new TikaException("Unsupported AutoCAD drawing version: " + version);
    }

    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3