Search in sources:

Example 1 with Chunk

Use of org.apache.poi.hsmf.datatypes.Chunk in the Apache Tika project.

Class OutlookExtractor, method parse.

/**
 * Extracts the Outlook message held in {@code msg} into metadata and XHTML.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Record the MAPI message class and, if the message has 7-bit encoded
 *       strings, guess an encoding for them.</li>
 *   <li>Populate {@code metadata}: from/to details (via {@code handleFromTo}),
 *       title, conversation topic, recipient addresses, every raw header, and
 *       the created/modified dates.</li>
 *   <li>Emit XHTML: an {@code h1} with the subject, a {@code dl} with the
 *       From/To/Cc/Bcc/Recipients lines, a {@code div class="message-body"}
 *       holding the body (HTML chunk preferred, then compressed RTF, then
 *       plain text), and one {@code div class="attachment-entry"} per
 *       attachment.</li>
 * </ol>
 *
 * <p>{@code msg} is deliberately not closed here; see the comment in the
 * {@code finally} block.
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that receives the message properties
 * @throws TikaException if a {@link ChunkNotFoundException} escapes even though
 *                       {@code msg} was told to return null on missing chunks
 * @throws SAXException  declared; presumably raised by the XHTML handler or the
 *                       delegated body parsers - TODO confirm
 * @throws IOException   declared; presumably raised while reading chunk or
 *                       attachment data - TODO confirm
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask POI to hand back null rather than throw when a chunk is absent
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the message class is simply left unset
        }
        // If the message has 7-bit encoded strings (i.e. strings not stored
        //  as Unicode), try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            // Null addresses are skipped rather than recorded
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Deliberately ignored: no recipient addresses are recorded
        }
        // Copy every header value into the metadata under the raw-header prefix
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Otherwise fall back to a "Date:" entry in the headers.
            // NOTE(review): headers was already dereferenced in the loop above,
            //  so the null check below cannot be what protects this code.
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    // NOTE(review): this matches on the map KEY and parses the text
                    //  after the key's first ':'; the header values are never read.
                    //  Whether normalizeHeaders() keeps "Date: <value>" in the key
                    //  is not visible here - confirm this branch can ever match.
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching entry is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the "Recipients" line is simply omitted
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        // If several chunks share an id, the last one seen wins
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body is accepted as either a byte chunk or a string chunk;
            //  any other chunk type leaves data null and falls through to rtf/text
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Prefer an HtmlParser already available via the parse context,
                //  creating a fresh one only when none is found
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the RTF_COMPRESSED chunk is
            //  always a ByteChunk; confirm
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            // Same parser lookup strategy as for the HTML body
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the BODY chunk is always a
            //  StringChunk; confirm
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                // NOTE(review): the TikaInputStream is not closed in this method;
                //  presumably handleEmbeddedResource() takes ownership - confirm
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        // With return-null-on-missing enabled this is not expected to happen
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 2 with Chunk

Use of org.apache.poi.hsmf.datatypes.Chunk in the Apache POI project.

Class POIFSChunkParser, method process.

/**
 * Creates a chunk for the given entry, and gives it to its parent group.
 *
 * <p>An entry named {@code PropertiesChunk.NAME} becomes a
 * {@code MessagePropertiesChunk} when {@code grouping} is a {@code Chunks},
 * otherwise a {@code StoragePropertiesChunk}. Any other entry is expected to
 * be named {@code __<name>_<id><type>}, where {@code <id>} and {@code <type>}
 * are four hex digits each; the chunk class is then chosen from the id and
 * the type.
 *
 * <p>The following entries are silently skipped: names shorter than nine
 * characters, names without an underscore, names with the
 * {@code Olk10SideProps_} prefix, names whose id or type is not valid hex,
 * types with no matching chunk class, and directory-typed entries that are
 * not a {@code DirectoryNode}.
 *
 * <p>For a {@code DocumentNode} the chunk's value is read from the entry
 * before it is recorded; if reading fails with an {@code IOException} the
 * error is logged and the chunk is not recorded. For any other entry the
 * chunk is recorded without reading a value.
 *
 * @param entry    the entry to turn into a chunk
 * @param grouping the group the resulting chunk is recorded on
 * @throws IllegalArgumentException if fewer than the eight characters needed
 *         for the id and type follow the last underscore of the name
 */
protected static void process(Entry entry, ChunkGroup grouping) {
    String entryName = entry.getName();
    Chunk chunk = null;

    // Is it a properties chunk? (They have special names)
    if (entryName.equals(PropertiesChunk.NAME)) {
        if (grouping instanceof Chunks) {
            // These should be the properties for the message itself
            chunk = new MessagePropertiesChunk(grouping);
        } else {
            // Will be properties on an attachment or recipient
            chunk = new StoragePropertiesChunk(grouping);
        }
    } else {
        // Check it's a regular chunk
        if (entryName.length() < 9) {
            // Name in the wrong format
            return;
        }
        if (!entryName.contains("_")) {
            // Name in the wrong format
            return;
        }

        // Split it into its parts
        int splitAt = entryName.lastIndexOf('_');
        String namePrefix = entryName.substring(0, splitAt + 1);
        String ids = entryName.substring(splitAt + 1);

        // Make sure we got what we expected, should be of
        //  the form __<name>_<id><type>
        // (namePrefix always ends with the underscore it was split on, so
        //  only the underscore-terminated form can ever match)
        if (namePrefix.equals("Olk10SideProps_")) {
            // This is some odd Outlook 2002 thing, skip
            return;
        }
        if (ids.length() < 8) {
            // Underscores not the right place, something's wrong. The id and
            //  the type need four characters each; checking for all eight
            //  here stops the substring calls below from running off the
            //  end of a seven character suffix.
            throw new IllegalArgumentException("Invalid chunk name " + entryName);
        }

        // Now try to turn it into id + type
        try {
            int chunkId = Integer.parseInt(ids.substring(0, 4), 16);
            int typeId = Integer.parseInt(ids.substring(4, 8), 16);

            MAPIType type = Types.getById(typeId);
            if (type == null) {
                type = Types.createCustom(typeId);
            }

            // Special cases based on the ID
            if (chunkId == MAPIProperty.MESSAGE_SUBMISSION_ID.id) {
                chunk = new MessageSubmissionChunk(namePrefix, chunkId, type);
            } else if (type == Types.BINARY) {
                // So, do the usual thing which is by type
                chunk = new ByteChunk(namePrefix, chunkId, type);
            } else if (type == Types.DIRECTORY) {
                // Only a real directory node can back a directory chunk;
                //  anything else leaves chunk as null and is skipped
                if (entry instanceof DirectoryNode) {
                    chunk = new DirectoryChunk((DirectoryNode) entry, namePrefix, chunkId, type);
                }
            } else if (type == Types.ASCII_STRING || type == Types.UNICODE_STRING) {
                chunk = new StringChunk(namePrefix, chunkId, type);
            }
            // Any other type is unsupported: chunk stays null and is skipped
        } catch (NumberFormatException e) {
            // Name in the wrong format
            return;
        }
    }

    if (chunk == null) {
        // Nothing was created for this entry
        return;
    }

    if (entry instanceof DocumentNode) {
        DocumentInputStream inp = null;
        try {
            inp = new DocumentInputStream((DocumentNode) entry);
            chunk.readValue(inp);
            // Only record the chunk once its value has been read successfully
            grouping.record(chunk);
        } catch (IOException e) {
            logger.log(POILogger.ERROR, "Error reading from part " + entry.getName() + " - " + e);
        } finally {
            if (inp != null) {
                inp.close();
            }
        }
    } else {
        grouping.record(chunk);
    }
}
Also used : StoragePropertiesChunk(org.apache.poi.hsmf.datatypes.StoragePropertiesChunk) Chunks(org.apache.poi.hsmf.datatypes.Chunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) NameIdChunks(org.apache.poi.hsmf.datatypes.NameIdChunks) DocumentNode(org.apache.poi.poifs.filesystem.DocumentNode) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) DirectoryChunk(org.apache.poi.hsmf.datatypes.DirectoryChunk) MessagePropertiesChunk(org.apache.poi.hsmf.datatypes.MessagePropertiesChunk) IOException(java.io.IOException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) PropertiesChunk(org.apache.poi.hsmf.datatypes.PropertiesChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) DirectoryChunk(org.apache.poi.hsmf.datatypes.DirectoryChunk) StoragePropertiesChunk(org.apache.poi.hsmf.datatypes.StoragePropertiesChunk) MessagePropertiesChunk(org.apache.poi.hsmf.datatypes.MessagePropertiesChunk) MessageSubmissionChunk(org.apache.poi.hsmf.datatypes.MessageSubmissionChunk) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) MessageSubmissionChunk(org.apache.poi.hsmf.datatypes.MessageSubmissionChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) MAPIType(org.apache.poi.hsmf.datatypes.Types.MAPIType)

Example 3 with Chunk

Use of org.apache.poi.hsmf.datatypes.Chunk in the Apache POI project.

Class HSMFDump, method dump.

/**
 * Prints a listing of every chunk group parsed from {@code fs}.
 *
 * <p>Each group starts with a line holding its simple class name and ends
 * with a blank line. A {@code PropertiesChunk} is printed as a property
 * count followed by each property and its values. Any other chunk is printed
 * as its id and name (or {@code (unknown)} when the id maps to
 * {@code MAPIProperty.UNKNOWN}) with its type name, followed by the chunk's
 * string form.
 *
 * @param out the stream the listing is written to
 * @throws IOException declared; presumably raised while parsing {@code fs} -
 *                     TODO confirm
 */
public void dump(PrintStream out) throws IOException {
    for (ChunkGroup group : POIFSChunkParser.parse(fs)) {
        out.println(group.getClass().getSimpleName());

        for (Chunk current : group.getChunks()) {
            MAPIProperty property = MAPIProperty.get(current.getChunkId());

            if (!(current instanceof PropertiesChunk)) {
                // Ordinary chunk: one header line, then the chunk itself
                String label = (property == MAPIProperty.UNKNOWN)
                        ? current.getChunkId() + " - (unknown)"
                        : property.id + " - " + property.name;
                out.println("   " + label + " - " + current.getType().getName());
                out.println("       " + current);
                continue;
            }

            // Properties chunk: list each property with all of its values
            PropertiesChunk propertiesChunk = (PropertiesChunk) current;
            out.println("   Properties - " + propertiesChunk.getProperties().size() + ":");
            for (MAPIProperty key : propertiesChunk.getProperties().keySet()) {
                out.println("       * " + key);
                for (PropertyValue value : propertiesChunk.getValues(key)) {
                    out.println("        = " + value);
                }
            }
        }

        out.println();
    }
}
Also used : ChunkGroup(org.apache.poi.hsmf.datatypes.ChunkGroup) PropertiesChunk(org.apache.poi.hsmf.datatypes.PropertiesChunk) PropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue) PropertiesChunk(org.apache.poi.hsmf.datatypes.PropertiesChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty)

Example 4 with Chunk

Use of org.apache.poi.hsmf.datatypes.Chunk in the Apache Tika project.

Class OutlookExtractor, method handleFromTo.

/**
 * Copies the sender and recipient details of {@code msg} into {@code metadata}.
 *
 * <p>Sets the creator and the display From/To/Cc/Bcc values, the
 * sent-by-server type when that chunk is present, and the sender and
 * "sent representing" names and email addresses (via {@code setFirstChunk}).
 * Then, for every recipient returned by {@code buildRecipients()}, adds the
 * name, display name and email address under the To, Cc or Bcc keys that
 * match the recipient's type. Recipients of any other type are skipped.
 *
 * @param headers  the normalized message headers; not read by this method
 * @param metadata the metadata object to populate
 * @throws ChunkNotFoundException declared; presumably propagated from the
 *         {@code msg} accessors - TODO confirm
 */
private void handleFromTo(Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
    String from = msg.getDisplayFrom();
    metadata.set(TikaCoreProperties.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    Chunks chunks = msg.getMainChunks();
    StringChunk sentByServerType = chunks.getSentByServerType();
    if (sentByServerType != null) {
        metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
    }

    Map<MAPIProperty, List<Chunk>> mainChunks = chunks.getAll();

    //sometimes in SMTP .msg files there is an email in the sender name field.
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), Office.MAPI_FROM_REPRESENTING_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);

    for (Recipient recipient : buildRecipients()) {
        switch(recipient.recipientType) {
            case TO:
                addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
                break;
            case CC:
                addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
                break;
            case BCC:
                addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
                break;
            default:
                //log unknown or undefined?
                break;
        }
    }
}
Also used : Chunks(org.apache.poi.hsmf.datatypes.Chunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk)

Aggregations

Chunk (org.apache.poi.hsmf.datatypes.Chunk)4 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)3 ByteChunk (org.apache.poi.hsmf.datatypes.ByteChunk)3 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)3 Chunks (org.apache.poi.hsmf.datatypes.Chunks)2 MAPIProperty (org.apache.poi.hsmf.datatypes.MAPIProperty)2 PropertiesChunk (org.apache.poi.hsmf.datatypes.PropertiesChunk)2 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 ParseException (java.text.ParseException)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 MAPIRtfAttribute (org.apache.poi.hmef.attribute.MAPIRtfAttribute)1 ChunkGroup (org.apache.poi.hsmf.datatypes.ChunkGroup)1 DirectoryChunk (org.apache.poi.hsmf.datatypes.DirectoryChunk)1