Search in sources :

Example 1 with ByteChunk

use of org.apache.poi.hsmf.datatypes.ByteChunk in project tika by apache.

the class OutlookExtractor method parse.

/**
 * Extracts metadata and XHTML content from the Outlook message held in {@code msg}.
 * <p>
 * Metadata written: the MAPI message class, the subject (as title and
 * transitional description), recipient addresses, every normalized header
 * (prefixed with {@link Metadata#MESSAGE_RAW_HEADER_PREFIX}) and the
 * created/modified dates.
 * <p>
 * XHTML written: an {@code h1} with the subject, a {@code dl} with the
 * From/To/Cc/Bcc/Recipients details, a {@code div.message-body} holding the
 * first usable body form (HTML, then RTF, then plain text), and one
 * {@code div.attachment-entry} per attachment.
 *
 * @param xhtml    handler receiving the generated XHTML events
 * @param metadata metadata object to populate
 * @throws TikaException if POI throws {@link ChunkNotFoundException} from a call
 *                       that is not individually guarded, even though it was
 *                       asked to return null on missing chunks
 * @throws SAXException  if the content handler rejects an event
 * @throws IOException   if reading the message or an embedded part fails
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the message class is simply left unset
        }
        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: recipient addresses are optional
        }
        // Copy every header value across under the raw-header prefix
        for (Map.Entry<String, String[]> headerEntry : headers.entrySet()) {
            String headerKey = headerEntry.getKey();
            for (String headerValue : headerEntry.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Failing that, fall back to a date found in the headers.
            // NOTE(review): this looks for "date:" at the start of the map KEY and
            //  parses the remainder of the key - confirm against normalizeHeaders()
            //  that keys really keep the "name: value" form, otherwise this never matches
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching header is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the recipients line is simply omitted
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The html body may be stored either as raw bytes or as a string
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        // Only use the rtf body when it really is a byte chunk; a chunk stored
        //  with an unexpected type is skipped rather than failing with a
        //  ClassCastException, so the text body still gets a chance
        if (rtfChunk instanceof ByteChunk && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        // Same guard for the plain text body, which must be a string chunk
        if (textChunk instanceof StringChunk && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 2 with ByteChunk

use of org.apache.poi.hsmf.datatypes.ByteChunk in project poi by apache.

the class POIFSChunkParser method process.

/**
 * Creates a chunk for the given entry, and gives it to its parent group.
 * <p>
 * The properties entry (named {@code PropertiesChunk.NAME}) becomes a
 * message or storage properties chunk, depending on the kind of group.
 * Any other entry is expected to be named {@code __<name>_<id><type>},
 * where id and type are four hex digits each. The chunk class is picked
 * from the id (message submission) or else from the type (binary,
 * directory, string). Entries shorter than nine characters, without an
 * underscore, with non-hex id/type, or of an unsupported type are
 * silently skipped.
 * <p>
 * For document entries the chunk value is read before the chunk is
 * recorded; an {@link IOException} while reading is logged and the chunk
 * is not recorded.
 *
 * @param entry    the POIFS entry to turn into a chunk
 * @param grouping the chunk group which receives the created chunk
 * @throws IllegalArgumentException if fewer than eight characters follow
 *         the last underscore of a regular chunk name
 */
protected static void process(Entry entry, ChunkGroup grouping) {
    String entryName = entry.getName();
    Chunk chunk = null;
    // Is it a properties chunk? (They have special names)
    if (entryName.equals(PropertiesChunk.NAME)) {
        if (grouping instanceof Chunks) {
            // These should be the properties for the message itself
            chunk = new MessagePropertiesChunk(grouping);
        } else {
            // Will be properties on an attachment or recipient
            chunk = new StoragePropertiesChunk(grouping);
        }
    } else {
        // Check it's a regular chunk
        if (entryName.length() < 9) {
            // Name in the wrong format
            return;
        }
        if (!entryName.contains("_")) {
            // Name in the wrong format
            return;
        }
        // Split it into its parts
        int splitAt = entryName.lastIndexOf('_');
        String namePrefix = entryName.substring(0, splitAt + 1);
        String ids = entryName.substring(splitAt + 1);
        // Make sure we got what we expected, should be of
        //  the form __<name>_<id><type>
        if (namePrefix.equals("Olk10SideProps") || namePrefix.equals("Olk10SideProps_")) {
            // This is some odd Outlook 2002 thing, skip
            return;
        } else if (splitAt <= entryName.length() - 9) {
        // In the right form for a normal chunk: at least 8 characters
        //  (4 for the id, 4 for the type) follow the underscore, so the
        //  substring calls below cannot run off the end of the name
        // We'll process this further in a little bit
        } else {
            // Underscores not the right place, something's wrong
            throw new IllegalArgumentException("Invalid chunk name " + entryName);
        }
        // Now try to turn it into id + type
        try {
            int chunkId = Integer.parseInt(ids.substring(0, 4), 16);
            int typeId = Integer.parseInt(ids.substring(4, 8), 16);
            MAPIType type = Types.getById(typeId);
            if (type == null) {
                type = Types.createCustom(typeId);
            }
            // Special cases based on the ID
            if (chunkId == MAPIProperty.MESSAGE_SUBMISSION_ID.id) {
                chunk = new MessageSubmissionChunk(namePrefix, chunkId, type);
            } else {
                // So, do the usual thing which is by type
                if (type == Types.BINARY) {
                    chunk = new ByteChunk(namePrefix, chunkId, type);
                } else if (type == Types.DIRECTORY) {
                    // A directory-typed name on a non-directory entry yields no chunk
                    if (entry instanceof DirectoryNode) {
                        chunk = new DirectoryChunk((DirectoryNode) entry, namePrefix, chunkId, type);
                    }
                } else if (type == Types.ASCII_STRING || type == Types.UNICODE_STRING) {
                    chunk = new StringChunk(namePrefix, chunkId, type);
                } else {
                // Type of an unsupported type! Skipping... 
                }
            }
        } catch (NumberFormatException e) {
            // Name in the wrong format
            return;
        }
    }
    if (chunk != null) {
        if (entry instanceof DocumentNode) {
            // Document entries carry data: read it before recording the chunk
            DocumentInputStream inp = null;
            try {
                inp = new DocumentInputStream((DocumentNode) entry);
                chunk.readValue(inp);
                grouping.record(chunk);
            } catch (IOException e) {
                // A chunk whose value could not be read is logged and not recorded
                logger.log(POILogger.ERROR, "Error reading from part " + entry.getName() + " - " + e);
            } finally {
                if (inp != null)
                    inp.close();
            }
        } else {
            // Nothing to read for non-document entries, record as-is
            grouping.record(chunk);
        }
    }
}
Also used : StoragePropertiesChunk(org.apache.poi.hsmf.datatypes.StoragePropertiesChunk) Chunks(org.apache.poi.hsmf.datatypes.Chunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) NameIdChunks(org.apache.poi.hsmf.datatypes.NameIdChunks) DocumentNode(org.apache.poi.poifs.filesystem.DocumentNode) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) DirectoryChunk(org.apache.poi.hsmf.datatypes.DirectoryChunk) MessagePropertiesChunk(org.apache.poi.hsmf.datatypes.MessagePropertiesChunk) IOException(java.io.IOException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) PropertiesChunk(org.apache.poi.hsmf.datatypes.PropertiesChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) DirectoryChunk(org.apache.poi.hsmf.datatypes.DirectoryChunk) StoragePropertiesChunk(org.apache.poi.hsmf.datatypes.StoragePropertiesChunk) MessagePropertiesChunk(org.apache.poi.hsmf.datatypes.MessagePropertiesChunk) MessageSubmissionChunk(org.apache.poi.hsmf.datatypes.MessageSubmissionChunk) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) MessageSubmissionChunk(org.apache.poi.hsmf.datatypes.MessageSubmissionChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) MAPIType(org.apache.poi.hsmf.datatypes.Types.MAPIType)

Aggregations

AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)2 ByteChunk (org.apache.poi.hsmf.datatypes.ByteChunk)2 Chunk (org.apache.poi.hsmf.datatypes.Chunk)2 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 ParseException (java.text.ParseException)1 Date (java.util.Date)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 MAPIRtfAttribute (org.apache.poi.hmef.attribute.MAPIRtfAttribute)1 Chunks (org.apache.poi.hsmf.datatypes.Chunks)1 DirectoryChunk (org.apache.poi.hsmf.datatypes.DirectoryChunk)1 MessagePropertiesChunk (org.apache.poi.hsmf.datatypes.MessagePropertiesChunk)1 MessageSubmissionChunk (org.apache.poi.hsmf.datatypes.MessageSubmissionChunk)1 NameIdChunks (org.apache.poi.hsmf.datatypes.NameIdChunks)1 PropertiesChunk (org.apache.poi.hsmf.datatypes.PropertiesChunk)1 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)1 StoragePropertiesChunk (org.apache.poi.hsmf.datatypes.StoragePropertiesChunk)1 MAPIType (org.apache.poi.hsmf.datatypes.Types.MAPIType)1