use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.
the class TestFileWithAttachmentsRead method testReadContentIDField.
/**
 * Bug 60550: Test to see if we get the correct Content-IDs of inline images.
 * <p>
 * Checks the short file name, extension and Content-ID of the first four
 * attachments of the inline-image sample message.
 *
 * @throws IOException declared for consistency with the other tests in this class
 */
@Test
public void testReadContentIDField() throws IOException {
    // Fetch the attachment array once and index into it, instead of
    // re-fetching it for every attachment (the original fetched it into
    // an unused local and then asked for it again four times).
    AttachmentChunks[] attachments = inlineImgMsgAttachments.getAttachmentFiles();
    AttachmentChunks attachment;

    // Check in Content-ID field
    attachment = attachments[0];
    assertEquals("image001.png", attachment.getAttachFileName().getValue());
    assertEquals(".png", attachment.getAttachExtension().getValue());
    assertEquals("image001.png@01D0A524.96D40F30", attachment.getAttachContentId().getValue());

    attachment = attachments[1];
    assertEquals("image002.png", attachment.getAttachFileName().getValue());
    assertEquals(".png", attachment.getAttachExtension().getValue());
    assertEquals("image002.png@01D0A524.96D40F30", attachment.getAttachContentId().getValue());

    attachment = attachments[2];
    assertEquals("image003.png", attachment.getAttachFileName().getValue());
    assertEquals(".png", attachment.getAttachExtension().getValue());
    assertEquals("image003.png@01D0A526.B4C739C0", attachment.getAttachContentId().getValue());

    attachment = attachments[3];
    assertEquals("image006.jpg", attachment.getAttachFileName().getValue());
    assertEquals(".jpg", attachment.getAttachExtension().getValue());
    assertEquals("image006.jpg@01D0A526.B649E220", attachment.getAttachContentId().getValue());
}
use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.
the class TestFileWithAttachmentsRead method testReadAttachments.
/**
 * Verifies that the attachments of the two-attachment sample message are
 * populated: first a generic non-empty check over every attachment, then an
 * exact comparison of the names, extension, MIME tag and data size of the
 * first two.
 */
@Test
public void testReadAttachments() throws IOException {
    // Generic pass: names and extension must be non-empty on every attachment
    final AttachmentChunks[] all = twoSimpleAttachments.getAttachmentFiles();
    for (int idx = 0; idx < all.length; idx++) {
        final AttachmentChunks current = all[idx];

        final String shortName = current.getAttachFileName().getValue();
        assertTrue(shortName.length() > 0);

        final String longName = current.getAttachLongFileName().getValue();
        assertTrue(longName.length() > 0);

        final String extension = current.getAttachExtension().getValue();
        assertTrue(extension.length() > 0);

        // The MIME tag is optional; only validate it when it is present
        if (current.getAttachMimeTag() == null) {
            continue;
        }
        assertTrue(current.getAttachMimeTag().getValue().length() > 0);
    }

    // Detailed pass, first attachment
    final AttachmentChunks first = twoSimpleAttachments.getAttachmentFiles()[0];
    assertEquals("TEST-U~1.DOC", first.getAttachFileName().getValue());
    assertEquals("test-unicode.doc", first.getAttachLongFileName().getValue());
    assertEquals(".doc", first.getAttachExtension().getValue());
    assertNull(first.getAttachMimeTag());
    // Only the payload size is compared here, not its content
    final byte[] firstPayload = first.getAttachData().getValue();
    assertEquals(24064, firstPayload.length);

    // Detailed pass, second attachment
    final AttachmentChunks second = twoSimpleAttachments.getAttachmentFiles()[1];
    assertEquals("pj1.txt", second.getAttachFileName().getValue());
    assertEquals("pj1.txt", second.getAttachLongFileName().getValue());
    assertEquals(".txt", second.getAttachExtension().getValue());
    assertNull(second.getAttachMimeTag());
    // Only the payload size is compared here, not its content
    final byte[] secondPayload = second.getAttachData().getValue();
    assertEquals(89, secondPayload.length);
}
use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project poi by apache.
the class OLE2ScratchpadExtractorFactory method identifyEmbeddedResources.
/**
 * Collects the embedded resources of the document behind the given extractor.
 * <p>
 * Nothing is returned; results are appended to the two supplied lists:
 * <ul>
 * <li>for a {@link WordExtractor}, every child of the root's
 *     {@code ObjectPool} entry whose name starts with {@code "_"} is added
 *     to {@code dirs};</li>
 * <li>for an {@link OutlookTextExtactor}, each attachment with a data chunk
 *     is added to {@code nonPOIFS} as a stream over its bytes, and each
 *     attachment without data but with an attachment directory has that
 *     directory added to {@code dirs}.</li>
 * </ul>
 * For any other extractor type both lists are left untouched.
 *
 * @param ext      the extractor whose document should be inspected
 * @param dirs     receives the embedded entries that were found
 * @param nonPOIFS receives streams over raw attachment payloads
 * @throws IOException           declared by the signature
 * @throws IllegalStateException if the extractor has no root directory
 */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
    final DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof WordExtractor) {
        // Word keeps its embedded objects in ObjectPool -> _... under the root
        try {
            final DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
            for (Iterator<Entry> children = objectPool.getEntries(); children.hasNext();) {
                final Entry child = children.next();
                if (child.getName().startsWith("_")) {
                    dirs.add(child);
                }
            }
        } catch (FileNotFoundException noObjectPool) {
            // No ObjectPool entry: nothing is added for this document
        }
        return;
    }

    // TODO PowerPoint: tricky, embedded objects are not stored directly in poifs

    if (!(ext instanceof OutlookTextExtactor)) {
        return;
    }

    // Outlook keeps its embedded content in the attachment blocks
    final MAPIMessage message = ((OutlookTextExtactor) ext).getMAPIMessage();
    for (AttachmentChunks chunks : message.getAttachmentFiles()) {
        if (chunks.getAttachData() != null) {
            nonPOIFS.add(new ByteArrayInputStream(chunks.getAttachData().getValue()));
            continue;
        }
        if (chunks.getAttachmentDirectory() != null) {
            dirs.add(chunks.getAttachmentDirectory().getDirectory());
        }
    }
}
use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project tika by apache.
the class OutlookExtractor method parse.
/**
 * Renders the Outlook message referenced by {@code msg} as XHTML and records
 * its properties in {@code metadata}.
 * <p>
 * In order, this method:
 * <ol>
 * <li>sets the message class, title, conversation topic, recipient
 *     addresses and raw headers in the metadata;</li>
 * <li>sets the created/modified dates from the message date, or otherwise
 *     attempts a fallback based on the headers;</li>
 * <li>emits an {@code h1} with the subject and a {@code dl} with the
 *     From/To/Cc/Bcc/Recipients details;</li>
 * <li>emits a {@code div class="message-body"} containing the first body
 *     that is available, tried in the order HTML, compressed RTF, plain
 *     text;</li>
 * <li>emits one {@code div class="attachment-entry"} per attachment and
 *     hands the attachment data and/or attachment directory on for
 *     embedded-document handling.</li>
 * </ol>
 *
 * @param xhtml    handler that receives the generated elements
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if a {@link ChunkNotFoundException} reaches the
 *                       outer catch even though
 *                       {@code setReturnNullOnMissingChunk(true)} was requested
 * @throws SAXException  declared; presumably raised by the XHTML handler or
 *                       the nested parsers -- TODO confirm
 * @throws IOException   declared; presumably raised by the nested parsers or
 *                       embedded-resource handling -- TODO confirm
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
try {
// Ask the message to return null for missing chunks instead of throwing
msg.setReturnNullOnMissingChunk(true);
try {
metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
} catch (ChunkNotFoundException e) {
// Best effort: if the message class chunk is missing the property is simply not set
}
// If the message contains 7-bit encoded strings (i.e. strings that are not stored
// as Unicode), try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
guess7BitEncoding(msg);
}
// Start with the metadata
String subject = msg.getSubject();
Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
String from = msg.getDisplayFrom();
handleFromTo(headers, metadata);
metadata.set(TikaCoreProperties.TITLE, subject);
// TODO: Move to description in Tika 2.0
metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
try {
for (String recipientAddress : msg.getRecipientEmailAddressList()) {
if (recipientAddress != null)
metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
}
} catch (ChunkNotFoundException he) {
// Best effort: without a recipient list no recipient addresses are recorded
}
// Copy every header value into the metadata under the raw-header prefix
for (Map.Entry<String, String[]> e : headers.entrySet()) {
String headerKey = e.getKey();
for (String headerValue : e.getValue()) {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
}
}
// First try via the proper chunk
if (msg.getMessageDate() != null) {
metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
} else {
// No message date chunk: fall back to looking for a date in the headers.
// headers was already dereferenced in the loop above, so the null check
// below cannot fail at this point.
if (headers != null && headers.size() > 0) {
for (Map.Entry<String, String[]> header : headers.entrySet()) {
String headerKey = header.getKey();
// NOTE(review): both the "date:" match and the date text come from the map KEY;
// the header values are never read here. This only finds a date if
// normalizeHeaders() leaves keys of the form "Date: <value>" -- confirm
// against normalizeHeaders().
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
// See if we can parse it as a normal mail date
try {
Date d = MboxParser.parseDate(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
} catch (ParseException e) {
// Store it as-is, and hope for the best...
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);
}
// Only the first matching entry is used
break;
}
}
}
}
xhtml.element("h1", subject);
// Output the from and to details in text, as you
// often want them in text form for searching
xhtml.startElement("dl");
if (from != null) {
header(xhtml, "From", from);
}
header(xhtml, "To", msg.getDisplayTo());
header(xhtml, "Cc", msg.getDisplayCC());
header(xhtml, "Bcc", msg.getDisplayBCC());
try {
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
} catch (ChunkNotFoundException e) {
// Best effort: the "Recipients" entry is omitted when the chunk is missing
}
xhtml.endElement("dl");
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
Chunk rtfChunk = null;
Chunk textChunk = null;
// Single pass over the main chunks; if an id occurs more than once the last one wins
for (Chunk chunk : msg.getMainChunks().getChunks()) {
if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
htmlChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
rtfChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.BODY.id) {
textChunk = chunk;
}
}
// doneBody records whether a richer body has already been written, so that
// the lower-preference representations are skipped
boolean doneBody = false;
xhtml.startElement("div", "class", "message-body");
if (htmlChunk != null) {
// The HTML body may be stored as bytes or as a string; get the raw bytes either way.
// Any other chunk type leaves data null and falls through to RTF/text.
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
data = ((ByteChunk) htmlChunk).getValue();
} else if (htmlChunk instanceof StringChunk) {
data = ((StringChunk) htmlChunk).getRawValue();
}
if (data != null) {
// Use the HtmlParser found via the parse context if there is one, else a fresh one
Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
if (htmlParser == null) {
htmlParser = new HtmlParser();
}
htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
}
if (rtfChunk != null && !doneBody) {
// NOTE(review): assumes the RTF_COMPRESSED chunk is always a ByteChunk; unlike
// the HTML branch there is no instanceof check before the cast
ByteChunk chunk = (ByteChunk) rtfChunk;
MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
// Use the RTFParser found via the parse context if there is one, else a fresh one
Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
if (textChunk != null && !doneBody) {
// NOTE(review): assumes the BODY chunk is always a StringChunk (unchecked cast)
xhtml.element("p", ((StringChunk) textChunk).getValue());
}
xhtml.endElement("div");
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
xhtml.startElement("div", "class", "attachment-entry");
// Prefer the long file name, fall back to the short one; may stay null
String filename = null;
if (attachment.getAttachLongFileName() != null) {
filename = attachment.getAttachLongFileName().getValue();
} else if (attachment.getAttachFileName() != null) {
filename = attachment.getAttachFileName().getValue();
}
if (filename != null && filename.length() > 0) {
xhtml.element("h1", filename);
}
// Raw attachment bytes are handed on as an embedded resource
if (attachment.getAttachData() != null) {
handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
}
// An attachment directory is handed on as an embedded office document
if (attachment.getAttachmentDirectory() != null) {
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
}
xhtml.endElement("div");
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
} finally {
//You'd think you'd want to call msg.close().
//Don't do that. That closes down the file system.
//If an msg has multiple msg attachments, some of them
//can reside in the same file system. After the first
//child is read, the fs is closed, and the other children
//get a java.nio.channels.ClosedChannelException
}
}
use of org.apache.poi.hsmf.datatypes.AttachmentChunks in project Xponents by OpenSextant.
the class OLEMessageConverter method conversionImplementation.
/**
 * Converts an Outlook message read from {@code in} into a
 * {@link ConvertedDocument}.
 * <p>
 * The message's text body becomes the document text, every attachment that
 * carries a data chunk is added as a raw child, and the subject, message
 * date and sender are added as title, create date and author. The input
 * stream is always closed before this method returns.
 *
 * @param in  stream positioned at the start of the message; closed by this method
 * @param doc the file the converted document is created for
 * @return the converted document
 * @throws IOException if the message cannot be read or parsed; any other
 *                     exception raised during conversion is wrapped as the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);

        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);

        for (AttachmentChunks c : msg.getAttachmentFiles()) {
            // An attachment is not guaranteed to have a data chunk (for example
            // when it is stored as an embedded directory instead). Skip those
            // rather than letting a NullPointerException abort the conversion
            // of the whole message.
            if (c.attachData == null) {
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }

        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }

        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: a message without a date chunk simply gets no create date
        }

        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }

        return msgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
Aggregations