Search in sources :

Example 36 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class PSDParser.

/**
 * Reads the fixed PSD/PSB header and the Image Resources section of the
 * stream, records what it finds in {@code metadata}, and emits an empty
 * XHTML document (no textual content is extracted).
 *
 * @param stream   the PSD/PSB data, positioned at the first byte of the file
 * @param handler  receives the (empty) XHTML document
 * @param metadata populated with image length, width, bits per sample,
 *                 colour mode and any caption resource found
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the signature or version is not recognised, or
 *                       the Color Mode section ends prematurely
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Check for the magic header signature "8BPS"
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    if (signature[0] != (byte) '8' || signature[1] != (byte) 'B' || signature[2] != (byte) 'P' || signature[3] != (byte) 'S') {
        throw new TikaException("PSD/PSB magic signature invalid");
    }
    // Check the version: only 1 and 2 are supported
    int version = EndianUtils.readUShortBE(stream);
    if (version != 1 && version != 2) {
        throw new TikaException("Invalid PSD/PSB version " + version);
    }
    // Skip the reserved block
    IOUtils.readFully(stream, new byte[6]);
    // Number of channels in the image. The value is read only to advance
    // the stream past it.
    // TODO Identify a suitable metadata key for this
    EndianUtils.readUShortBE(stream);
    // Width and Height (height is stored first)
    int height = EndianUtils.readIntBE(stream);
    int width = EndianUtils.readIntBE(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);
    // Depth (bits per channel)
    int depth = EndianUtils.readUShortBE(stream);
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
    // Colour mode, eg Bitmap or RGB. The index comes straight from the file,
    // so it is range-checked before being used as an array index; an
    // unknown mode is simply not recorded.
    int colorMode = EndianUtils.readUShortBE(stream);
    if (colorMode >= 0 && colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
        metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
    }
    // Next is the Color Mode section, which we don't care about.
    // InputStream.skip() may skip fewer bytes than requested, so keep going
    // until the whole section is consumed. A negative size (corrupt file)
    // is treated as "nothing to skip".
    long colorModeSectionSize = EndianUtils.readIntBE(stream);
    long remaining = colorModeSectionSize;
    while (remaining > 0) {
        long skipped = stream.skip(remaining);
        if (skipped > 0) {
            remaining -= skipped;
        } else if (stream.read() == -1) {
            // skip() made no progress and the stream is exhausted
            throw new TikaException("PSD/PSB colour mode section truncated");
        } else {
            // skip() made no progress but one byte could still be read
            remaining--;
        }
    }
    // Next is the Image Resources section
    // Check for certain interesting keys here
    long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
    long read = 0;
    while (read < imageResourcesSectionSize) {
        ResourceBlock rb = new ResourceBlock(stream);
        read += rb.totalLength;
        // Is it one we can do something useful with?
        if (rb.id == ResourceBlock.ID_CAPTION) {
            metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
        }
        // TODO Parse ID_EXIF_1, ID_EXIF_3 and ID_XMP blocks via ImageMetadataExtractor
    }
    // Next is the Layer and Mask Info
    // Finally we have Image Data
    // We can't do anything with these parts
    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 37 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class IptcAnpaParser.

/**
 * Loads the properties from the stream, copies them into {@code metadata},
 * and writes the cleaned "body" property (if any) as a single XHTML
 * paragraph.
 *
 * @param stream   source the properties are loaded from
 * @param handler  receives the XHTML events
 * @param metadata populated from the loaded properties
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    HashMap<String, String> props = loadProperties(stream);
    setMetadata(metadata, props);

    XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();
    // TODO: put body content here
    out.startElement("p");
    // The paragraph is emitted even when there is no body text to write.
    String text = clean(props.get("body"));
    if (text != null) {
        out.characters(text);
    }
    out.endElement("p");
    out.endDocument();
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 38 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class ISArchiveParser.

/**
 * Parses the study file backing {@code stream}, together with the
 * investigation files ({@code i_*.txt}) listed from the configured location.
 * When no location has been set, the study file's parent directory is used.
 * Output is written as one XHTML document by the investigation, study and
 * assay helpers, in that order.
 *
 * @param stream   the study file
 * @param handler  receives the XHTML events
 * @param metadata passed on to the XHTML handler and the helper parsers
 * @param context  passed on to the helper parsers
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Temporary resources are only allocated (and later disposed) here when
    // the incoming stream is not already a TikaInputStream.
    TemporaryResources tmp = null;
    if (!TikaInputStream.isTikaInputStream(stream)) {
        tmp = new TemporaryResources();
    }
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    try {
        if (this.location == null) {
            this.location = tis.getFile().getParent() + File.separator;
        }
        this.studyFileName = tis.getFile().getName();

        // Accept only investigation files, i.e. names matching i_*.txt
        FilenameFilter investigationFilter = new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        };
        String[] investigationList = new File(location).list(investigationFilter);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parseInvestigation(investigationList, xhtml, metadata, context);
        parseStudy(stream, xhtml, metadata, context);
        parseAssay(xhtml, metadata, context);
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Also used : FilenameFilter(java.io.FilenameFilter) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File)

Example 39 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class IWorkPackageParser.

/**
 * Walks the zip entries of {@code stream} and, for each entry whose name is
 * in {@code IWORK_CONTENT_ENTRIES}, detects the iWork document type and
 * parses the entry with the matching content handler. For each such entry
 * whose type is detected, the content type is added to {@code metadata} and
 * an XHTML document is emitted; for encrypted files that document is empty.
 *
 * @param stream   the zip data; deliberately not closed here
 * @param handler  receives the XHTML events
 * @param metadata receives the detected content type
 * @param context  supplies the SAX parser used for the entry content
 * @throws IOException   if reading the zip fails
 * @throws SAXException  if SAX parsing or event handling fails
 * @throws TikaException if the detected type has no handler
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
    for (ZipArchiveEntry entry = zip.getNextZipEntry(); entry != null; entry = zip.getNextZipEntry()) {
        if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
            continue;
        }

        // Buffer the entry so type detection can peek at it and rewind.
        InputStream entryStream = new BufferedInputStream(zip, 4096);
        entryStream.mark(4096);
        IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
        entryStream.reset();
        if (type == null) {
            continue;
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        ContentHandler contentHandler;
        switch (type) {
            case KEYNOTE:
                contentHandler = new KeynoteContentHandler(xhtml, metadata);
                break;
            case NUMBERS:
                contentHandler = new NumbersContentHandler(xhtml, metadata);
                break;
            case PAGES:
                contentHandler = new PagesContentHandler(xhtml, metadata);
                break;
            case ENCRYPTED:
                // Nothing can be extracted from the file right now
                contentHandler = null;
                break;
            default:
                throw new TikaException("Unhandled iWorks file " + type);
        }

        metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
        xhtml.startDocument();
        if (contentHandler != null) {
            // Shield the entry stream so the SAX parser cannot close the zip.
            context.getSAXParser().parse(new CloseShieldInputStream(entryStream), new OfflineContentHandler(contentHandler));
        }
        xhtml.endDocument();
    }
    // Don't close the zip InputStream (TIKA-1117).
}
Also used : TikaException(org.apache.tika.exception.TikaException) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) BufferedInputStream(java.io.BufferedInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) ContentHandler(org.xml.sax.ContentHandler) OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) BufferedInputStream(java.io.BufferedInputStream) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 40 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class AbstractDBParser.

/**
 * Opens a connection for the database in {@code stream}, records every table
 * name in {@code metadata}, and writes each table to the handler as an XHTML
 * {@code <table>} with a header row followed by the data rows.
 *
 * <p>{@code close()} is called once the table names have been obtained,
 * whether or not writing the document succeeds; exceptions from that final
 * close are ignored.
 *
 * @param stream   the database content
 * @param handler  receives the XHTML events
 * @param metadata receives one {@code Database.TABLE_NAME} value per table
 * @param context  passed on to the connection, table-name and reader helpers
 * @throws IOException   if the table names cannot be read (wrapping the
 *                       {@link SQLException}) or reading fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException declared by the signature
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    connection = getConnection(stream, metadata, context);
    List<String> tableNames;
    try {
        tableNames = getTableNames(connection, metadata, context);
    } catch (SQLException e) {
        try {
            close();
        } catch (SQLException ignored) {
            // best effort: the original failure is the one reported below
        }
        throw new IOExceptionWithCause(e);
    }
    for (String tableName : tableNames) {
        //add table names to parent metadata
        metadata.add(Database.TABLE_NAME, tableName);
    }
    XHTMLContentHandler xHandler = new XHTMLContentHandler(handler, metadata);
    // Tracks whether startDocument() completed, so that endDocument() is
    // only emitted for a document that was actually started.
    boolean started = false;
    try {
        // startDocument() is inside the try so that a failure here still
        // closes the connection in the finally block.
        xHandler.startDocument();
        started = true;
        for (String tableName : tableNames) {
            JDBCTableReader tableReader = getTableReader(connection, tableName, context);
            xHandler.startElement("table", "name", tableReader.getTableName());
            xHandler.startElement("thead");
            xHandler.startElement("tr");
            for (String header : tableReader.getHeaders()) {
                xHandler.startElement("th");
                xHandler.characters(header);
                xHandler.endElement("th");
            }
            xHandler.endElement("tr");
            xHandler.endElement("thead");
            xHandler.startElement("tbody");
            while (tableReader.nextRow(xHandler, context)) {
                //no-op: nextRow writes the row itself
            }
            xHandler.endElement("tbody");
            xHandler.endElement("table");
        }
    } finally {
        try {
            close();
        } catch (IOException | SQLException ignored) {
            // best effort: a close failure must not mask the parse outcome
        }
        if (started) {
            xHandler.endDocument();
        }
    }
}
Also used : IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) SQLException(java.sql.SQLException) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3