Search in sources :

Example 6 with AttachmentChunks

use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.

the class TestFileWithAttachmentsRead method testReadContentIDField.

/**
 * Bug 60550: checks that every inline image attachment of the test message
 * reports the expected file name, extension and Content-ID.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void testReadContentIDField() throws IOException {
    // Fetch the attachment array once and index into it, instead of
    // re-reading it for every attachment.
    AttachmentChunks[] attachments = inlineImgMsgAttachments.getAttachmentFiles();
    // One row per attachment, in array order: file name, extension, Content-ID
    String[][] expected = {
        { "image001.png", ".png", "image001.png@01D0A524.96D40F30" },
        { "image002.png", ".png", "image002.png@01D0A524.96D40F30" },
        { "image003.png", ".png", "image003.png@01D0A526.B4C739C0" },
        { "image006.jpg", ".jpg", "image006.jpg@01D0A526.B649E220" }
    };
    for (int i = 0; i < expected.length; i++) {
        AttachmentChunks attachment = attachments[i];
        assertEquals(expected[i][0], attachment.getAttachFileName().getValue());
        assertEquals(expected[i][1], attachment.getAttachExtension().getValue());
        // Check in Content-ID field
        assertEquals(expected[i][2], attachment.getAttachContentId().getValue());
    }
}
Also used : AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) Test(org.junit.Test)

Example 7 with AttachmentChunks

use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.

the class TestFileWithAttachmentsRead method testReadAttachments.

/**
 * Verifies that the attachments of the two-attachment test message are
 * populated: first a generic non-empty check over every attachment, then an
 * exact comparison of names, extension, MIME tag and data size for each one.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void testReadAttachments() throws IOException {
    // Generic pass: every name-like chunk must carry a non-empty value
    AttachmentChunks[] attachments = twoSimpleAttachments.getAttachmentFiles();
    for (AttachmentChunks chunk : attachments) {
        assertTrue(!chunk.getAttachFileName().getValue().isEmpty());
        assertTrue(!chunk.getAttachLongFileName().getValue().isEmpty());
        assertTrue(!chunk.getAttachExtension().getValue().isEmpty());
        // The MIME tag may be absent; only check it when present
        if (chunk.getAttachMimeTag() != null) {
            assertTrue(!chunk.getAttachMimeTag().getValue().isEmpty());
        }
    }

    // Detailed pass, first attachment
    AttachmentChunks first = twoSimpleAttachments.getAttachmentFiles()[0];
    assertEquals("TEST-U~1.DOC", first.getAttachFileName().getValue());
    assertEquals("test-unicode.doc", first.getAttachLongFileName().getValue());
    assertEquals(".doc", first.getAttachExtension().getValue());
    assertNull(first.getAttachMimeTag());
    // Only the size of the payload is compared, not its content
    assertEquals(24064, first.getAttachData().getValue().length);

    // Detailed pass, second attachment
    AttachmentChunks second = twoSimpleAttachments.getAttachmentFiles()[1];
    assertEquals("pj1.txt", second.getAttachFileName().getValue());
    assertEquals("pj1.txt", second.getAttachLongFileName().getValue());
    assertEquals(".txt", second.getAttachExtension().getValue());
    assertNull(second.getAttachMimeTag());
    // Only the size of the payload is compared, not its content
    assertEquals(89, second.getAttachData().getValue().length);
}
Also used : AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) Test(org.junit.Test)

Example 8 with AttachmentChunks

use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.

the class OLE2ScratchpadExtractorFactory method identifyEmbeddedResources.

/**
 * Collects the embedded resources of the document behind the given
 * extractor into the two supplied lists. Nothing is returned; the lists
 * are filled in place.
 * <ul>
 *  <li>For a {@link WordExtractor}: every entry under the root's
 *   "ObjectPool" directory whose name starts with "_" is added to
 *   {@code dirs}.</li>
 *  <li>For an {@link OutlookTextExtactor}: for each attachment, the raw
 *   attachment bytes (when present) are added to {@code nonPOIFS} as a
 *   stream; otherwise the attachment's directory (when present) is added
 *   to {@code dirs}.</li>
 *  <li>For any other extractor nothing is added.</li>
 * </ul>
 *
 * @param ext the extractor whose document should be inspected
 * @param dirs receives the embedded entries that were found
 * @param nonPOIFS receives streams over raw attachment data
 * @throws IllegalStateException if the extractor has no root directory
 * @throws IOException declared by the signature
 */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
    final DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof WordExtractor) {
        // Embedded objects live in ObjectPool -> _... under the root
        try {
            DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
            for (Iterator<Entry> children = objectPool.getEntries(); children.hasNext(); ) {
                Entry child = children.next();
                if (child.getName().startsWith("_")) {
                    dirs.add(child);
                }
            }
        } catch (FileNotFoundException ignored) {
            // Deliberately ignored - presumably the document simply has no ObjectPool
        }
        return;
    }

    // TODO PowerPoint: tricky, embedded objects are not stored directly in
    //  the POIFS, so PowerPointExtractor is not handled here yet.

    if (ext instanceof OutlookTextExtactor) {
        // Embedded content is stored in the Attachment blocks
        MAPIMessage message = ((OutlookTextExtactor) ext).getMAPIMessage();
        for (AttachmentChunks chunks : message.getAttachmentFiles()) {
            if (chunks.getAttachData() != null) {
                byte[] payload = chunks.getAttachData().getValue();
                nonPOIFS.add(new ByteArrayInputStream(payload));
            } else if (chunks.getAttachmentDirectory() != null) {
                dirs.add(chunks.getAttachmentDirectory().getDirectory());
            }
        }
    }
}
Also used : MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) ByteArrayInputStream(java.io.ByteArrayInputStream) FileNotFoundException(java.io.FileNotFoundException) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor)

Example 9 with AttachmentChunks

use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project tika by apache.

the class OutlookExtractor method parse.

/**
 * Extracts metadata and content from the Outlook message held in
 * {@code msg} (not declared in this method; presumably a field of the
 * enclosing class).
 * <p>
 * Metadata written: message class, title (subject), conversation topic,
 * recipient addresses, raw headers, and created/modified dates (from the
 * message date, or failing that from a "date:" header).
 * <p>
 * XHTML written, in order: an {@code h1} with the subject, a {@code dl}
 * with From/To/Cc/Bcc/Recipients, a {@code div class="message-body"} with
 * the body (preference order: html, rtf, text), and one
 * {@code div class="attachment-entry"} per attachment.
 *
 * @param xhtml handler that receives the generated XHTML events
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if a {@link ChunkNotFoundException} escapes from
 *         the message even though null-on-missing-chunk was requested
 * @throws SAXException declared by the signature
 * @throws IOException declared by the signature
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask the message to return null rather than throw when a chunk is missing
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the message class is simply left unset
        }
        // If the message contains 7 bit encoded strings rather than ones stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            // Record every non-null recipient address
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Deliberately ignored: no recipient addresses are recorded
        }
        // Copy every header value into the metadata under the raw-header prefix
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // Date: first try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Otherwise fall back to the first header whose key starts with "date:"
            // NOTE(review): headers was already dereferenced above, so the null
            //  check here can never be the first thing to notice a null map.
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    // NOTE(review): the date text is taken from the header KEY (the
                    //  part after the first ':'), not from the header value -
                    //  confirm this matches what normalizeHeaders produces.
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching header is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the Recipients entry is simply omitted
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        // Scan the main chunks once, remembering the body chunk of each flavour
        //  (if an id occurs more than once, the last occurrence wins)
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        // Set once one body flavour has been emitted, so the others are skipped
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The html body may be held as either a byte chunk or a string chunk;
            //  any other chunk type leaves data null and falls through to rtf/text
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Prefer an HtmlParser found via the parse context, else a new one
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - unlike the html branch there is no
            //  instanceof test, so a non-ByteChunk here would throw ClassCastException
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            // Prefer an RTFParser found via the parse context, else a new one
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // NOTE(review): unchecked cast to StringChunk, same caveat as for rtf
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            // Raw attachment bytes are handed on as an embedded resource
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            // An attachment directory is handed on as an embedded office document
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        // Null-on-missing-chunk was requested at the top, so this is unexpected
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 10 with AttachmentChunks

use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project Xponents by OpenSextant.

the class OLEMessageConverter method conversionImplementation.

/**
 * Converts an Outlook message read from the given stream into a
 * {@link ConvertedDocument}: the text body becomes the document text,
 * every attachment that carries raw data becomes a raw child, and the
 * subject, message date and sender are recorded as title, create date and
 * author.
 * <p>
 * Attachments without a data chunk are skipped instead of failing the
 * whole conversion with a NullPointerException.
 *
 * @param in stream positioned at the start of the message; always closed
 *        before this method returns
 * @param doc the file the converted document is created for
 * @return the converted document
 * @throws IOException if anything goes wrong while reading or parsing the
 *         message; the original exception is attached as the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);
        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing  or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);
        for (AttachmentChunks c : msg.getAttachmentFiles()) {
            if (c.attachData == null) {
                // No raw bytes for this attachment, so there is nothing to add
                // as a raw child. Dereferencing attachData here used to abort
                // the conversion of the whole message.
                continue;
            }
            Content child = new Content();
            // Child id is derived by getAttachmentName from the long and short file-name chunks
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }
        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }
        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: the document is left without a create date
        }
        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }
        return msgDoc;
    } catch (Exception xerr) {
        // Boundary catch: every failure is reported to the caller as an IOException
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Content(org.opensextant.xtext.Content) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) IOException(java.io.IOException) ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException)

Aggregations

AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)14 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)7 Test (org.junit.Test)5 ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 ChunkGroup (org.apache.poi.hsmf.datatypes.ChunkGroup)3 Chunks (org.apache.poi.hsmf.datatypes.Chunks)3 NameIdChunks (org.apache.poi.hsmf.datatypes.NameIdChunks)3 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)3 Entry (org.apache.poi.poifs.filesystem.Entry)3 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)2 OutlookTextExtactor (org.apache.poi.hsmf.extractor.OutlookTextExtactor)2 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)2 DirectoryEntry (org.apache.poi.poifs.filesystem.DirectoryEntry)2 DirectoryNode (org.apache.poi.poifs.filesystem.DirectoryNode)2 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)2