Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache Tika project.
From the class OutlookExtractor, method parse.
/**
 * Writes the wrapped Outlook message ({@code msg}) to the given handler and
 * records its properties in the given metadata.
 * <p>
 * In order, this method records the message class, subject, conversation
 * topic, recipient addresses, raw headers and creation/modification dates
 * as metadata; emits the subject and the From/To/Cc/Bcc/Recipients lines;
 * emits the message body, preferring HTML, then compressed RTF, then plain
 * text; and finally hands every attachment to the embedded-resource handlers.
 *
 * @param xhtml    handler that receives the XHTML events for the message
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if POI reports a missing chunk even though it was
 *                       asked to return {@code null} for missing chunks
 * @throws SAXException  if the content handler fails
 * @throws IOException   if the underlying data cannot be read
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: a message without a message-class chunk is still parsed
        }

        // If the message contains strings that are not stored
        // as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: recipient addresses are optional
        }

        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }

        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else if (headers != null && headers.size() > 0) {
            // Fall back to a "Date" header.
            // NOTE(review): normalizeHeaders is not visible from here. The loop
            // above treats the map as header-name -> values, in which case the
            // key is a bare "Date"; the legacy "Date: <value>" key form is still
            // accepted so that either shape yields a date.
            for (Map.Entry<String, String[]> header : headers.entrySet()) {
                String headerKey = header.getKey();
                String lowerKey = headerKey.toLowerCase(Locale.ROOT);
                String date = null;
                if (lowerKey.startsWith("date:")) {
                    date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                } else if (lowerKey.equals("date")) {
                    String[] values = header.getValue();
                    if (values != null && values.length > 0 && values[0] != null) {
                        date = values[0].trim();
                    }
                }
                if (date == null) {
                    continue;
                }
                // See if we can parse it as a normal mail date
                try {
                    Date d = MboxParser.parseDate(date);
                    metadata.set(TikaCoreProperties.CREATED, d);
                    metadata.set(TikaCoreProperties.MODIFIED, d);
                } catch (ParseException e) {
                    // Store it as-is, and hope for the best...
                    metadata.set(TikaCoreProperties.CREATED, date);
                    metadata.set(TikaCoreProperties.MODIFIED, date);
                }
                break;
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        // often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the recipients line is simply omitted
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        // Type-check before casting, as the HTML branch does, so that an
        // unexpected chunk type is skipped instead of raising ClassCastException
        if (rtfChunk instanceof ByteChunk && !doneBody) {
            byte[] compressed = ((ByteChunk) rtfChunk).getValue();
            // Skip a missing or zero-length RTF chunk; doneBody stays false so
            // that the plain-text body below can still be used
            if (compressed != null && compressed.length > 0) {
                MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), compressed);
                Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
                if (rtfParser == null) {
                    rtfParser = new RTFParser();
                }
                rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (textChunk instanceof StringChunk && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
        //You'd think you'd want to call msg.close().
        //Don't do that. That closes down the file system.
        //If an msg has multiple msg attachments, some of them
        //can reside in the same file system. After the first
        //child is read, the fs is closed, and the other children
        //get a java.nio.channels.ClosedChannelException
    }
}
Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache POI project.
From the class TestCompressedRTF, method testQuickBasics.
/**
 * Sanity check of the raw (still compressed) RTF attribute of the
 * quick-winmail.dat sample: verifies its length, its header fields and the
 * first 25 bytes of the compressed stream. If this fails, then decoding
 * has no hope...
 */
public void testQuickBasics() throws Exception {
    HMEFMessage message = new HMEFMessage(_samples.openResourceAsStream("quick-winmail.dat"));
    MAPIAttribute attribute = message.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
    assertNotNull(attribute);
    assertTrue(attribute instanceof MAPIRtfAttribute);

    // Check the start of the compressed version
    final int rawLength = 5907;
    byte[] raw = ((MAPIRtfAttribute) attribute).getRawData();
    assertEquals(rawLength, raw.length);

    // First 16 bytes is header stuff
    // Check it has the length + compressed marker
    assertEquals(rawLength - 4, LittleEndian.getShort(raw));
    assertEquals("LZFu", StringUtil.getFromCompressedUnicode(raw, 8, 4));

    // Now look at the code, walking forward from the end of the header
    int pos = 16;

    // Flag: cccUUUUU
    assertEquals((byte) 0x07, raw[pos++]);
    // c1a: offset 0 / 0x000
    assertEquals((byte) 0x00, raw[pos++]);
    // c1b: length 6+2 -> {\rtf1\a
    assertEquals((byte) 0x06, raw[pos++]);
    // c2a: offset 16 / 0x010
    assertEquals((byte) 0x01, raw[pos++]);
    // c2b: length 1+2 -> def
    assertEquals((byte) 0x01, raw[pos++]);
    // c3a: offset 182 / 0xb6
    assertEquals((byte) 0x0b, raw[pos++]);
    // c3b: length 0+2 -> la
    assertEquals((byte) 0x60, raw[pos++]);
    // n
    assertEquals((byte) 0x6e, raw[pos++]);
    // g
    assertEquals((byte) 0x67, raw[pos++]);
    // 1
    assertEquals((byte) 0x31, raw[pos++]);
    // 0
    assertEquals((byte) 0x30, raw[pos++]);
    // 2
    assertEquals((byte) 0x32, raw[pos++]);

    // Flag: UccUUccU
    assertEquals((byte) 0x66, raw[pos++]);
    // 5
    assertEquals((byte) 0x35, raw[pos++]);
    // c2a: offset 6 / 0x006
    assertEquals((byte) 0x00, raw[pos++]);
    // c2b: length 4+2 -> \ansi\a
    assertEquals((byte) 0x64, raw[pos++]);
    // c3a: offset 7 / 0x007
    assertEquals((byte) 0x00, raw[pos++]);
    // c3b: length 2+2 -> nsi
    assertEquals((byte) 0x72, raw[pos++]);
    // c
    assertEquals((byte) 0x63, raw[pos++]);
    // p
    assertEquals((byte) 0x70, raw[pos++]);
    // c6a: offset 221 / 0x0dd
    assertEquals((byte) 0x0d, raw[pos++]);
    // c6b: length 0+2 -> g1
    assertEquals((byte) 0xd0, raw[pos++]);
    // c7a: offset 224 / 0x0e0
    assertEquals((byte) 0x0e, raw[pos++]);
    // c7b: length 0+2 -> 25
    assertEquals((byte) 0x00, raw[pos++]);
    // 2
    assertEquals((byte) 0x32, raw[pos++]);
}
Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache Tika project.
From the class TNEFParser, method parse.
/**
 * Reads a TNEF message from the stream with POI's {@code HMEFMessage},
 * records its subject (when non-empty) in the metadata, and passes the
 * RTF message body and each attachment to the embedded document extractor.
 *
 * @param stream   the stream the message is read from
 * @param handler  content handler that receives the embedded documents
 * @param metadata metadata object that receives the message subject
 * @param context  parse context used to look up the embedded document extractor
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Embedded parts are handled by recursion through this extractor
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

    // Let POI do the actual decoding of the file
    HMEFMessage message = new HMEFMessage(stream);

    // Record the subject, when there is one
    String subject = message.getSubject();
    if (subject != null && subject.length() > 0) {
        // TODO: Move to title in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
    }

    // The message body: instanceof is false for null, so no separate null test is needed
    MAPIAttribute bodyAttribute = message.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
    if (bodyAttribute instanceof MAPIRtfAttribute) {
        byte[] rtfBytes = ((MAPIRtfAttribute) bodyAttribute).getData();
        handleEmbedded("message.rtf", "application/rtf", rtfBytes, extractor, handler);
    }

    // Every attachment, in turn
    for (Attachment attachment : message.getAttachments()) {
        // Name preference: long filename, then short filename, then "unknown" + extension
        String label = attachment.getLongFilename();
        if (label == null || label.length() == 0) {
            label = attachment.getFilename();
        }
        if (label == null || label.length() == 0) {
            String extension = attachment.getExtension();
            if (extension != null) {
                label = "unknown" + extension;
            }
        }
        handleEmbedded(label, null, attachment.getContents(), extractor, handler);
    }
}
Aggregations