use of org.apache.poi.hsmf.datatypes.StringChunk in project tika by apache.
the class OutlookExtractor method parse.
/**
 * Extracts metadata and XHTML content from the Outlook message held in {@code msg}.
 * <p>
 * Metadata written: message class, title/subject, from/to details (via
 * {@code handleFromTo}), recipient addresses, raw headers and the
 * created/modified dates. XHTML written: the subject as an {@code h1}, a
 * {@code dl} with the From/To/Cc/Bcc/Recipients lines, the message body in a
 * {@code div class="message-body"} (preference order: html, rtf, text) and one
 * {@code div class="attachment-entry"} per attachment.
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if POI reports a missing chunk even though it was
 *                       asked to return null instead
 * @throws SAXException  declared for failures while emitting XHTML events
 * @throws IOException   declared for failures in the embedded body/attachment parsing
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask POI for null instead of ChunkNotFoundException on absent chunks
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: carry on without the message class
        }

        // If the message contains strings that aren't stored
        // as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: no recipient address metadata if the chunk is absent
        }

        for (Map.Entry<String, String[]> rawHeader : headers.entrySet()) {
            String headerKey = rawHeader.getKey();
            for (String headerValue : rawHeader.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }

        // First try via the proper chunk; look the date up only once
        final java.util.Calendar messageDate = msg.getMessageDate();
        if (messageDate != null) {
            metadata.set(TikaCoreProperties.CREATED, messageDate.getTime());
            metadata.set(TikaCoreProperties.MODIFIED, messageDate.getTime());
        } else {
            // Failing that, look for a date in the headers. No null/empty guard is
            // needed: headers was already dereferenced above, and iterating an
            // empty map is a no-op.
            // NOTE(review): this tests the map KEY for a "date:" prefix and parses
            // the remainder of the key, not the entry's values. Whether a key can
            // ever look like that depends on normalizeHeaders - confirm.
            for (Map.Entry<String, String[]> header : headers.entrySet()) {
                String headerKey = header.getKey();
                if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                    String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                    // See if we can parse it as a normal mail date
                    try {
                        Date d = MboxParser.parseDate(date);
                        metadata.set(TikaCoreProperties.CREATED, d);
                        metadata.set(TikaCoreProperties.MODIFIED, d);
                    } catch (ParseException pe) {
                        // Store it as-is, and hope for the best...
                        metadata.set(TikaCoreProperties.CREATED, date);
                        metadata.set(TikaCoreProperties.MODIFIED, date);
                    }
                    break;
                }
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        // often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: skip the Recipients line if the chunk is absent
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text.
        // If an id occurs more than once, the last matching chunk wins.
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        // Type-check before casting, as for the html chunk: a chunk of an
        // unexpected type falls through to the next format instead of
        // failing with a ClassCastException. (instanceof is false for null.)
        if (rtfChunk instanceof ByteChunk && !doneBody) {
            ByteChunk compressedRtf = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), compressedRtf.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk instanceof StringChunk && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
    //You'd think you'd want to call msg.close() here.
    //Don't do that. That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system. After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
}
use of org.apache.poi.hsmf.datatypes.StringChunk in project poi by apache.
the class OutlookTextExtactor method getText.
/**
 * Outputs something a little like a RFC822 email: From, To, CC, BCC, Date
 * and Subject lines, one "Attachment:" line per attachment, and finally
 * the text body. A part whose chunk is reported missing is skipped.
 *
 * @return the text rendering of the message held in {@code document}
 */
public String getText() {
    MAPIMessage msg = (MAPIMessage) document;
    // Kept as StringBuffer because it is handed to handleEmails(...)
    StringBuffer s = new StringBuffer();

    // See if we can get a suitable encoding for any
    // non unicode text in the file
    msg.guess7BitEncoding();

    // Off we go
    StringsIterator emails;
    try {
        emails = new StringsIterator(msg.getRecipientEmailAddressList());
    } catch (ChunkNotFoundException e) {
        emails = new StringsIterator(new String[0]);
    }

    try {
        // Fetch first so nothing is appended if the chunk lookup throws
        String from = msg.getDisplayFrom();
        s.append("From: ").append(from).append("\n");
    } catch (ChunkNotFoundException ignored) {
        // No From line without the chunk
    }

    // The same iterator is shared across To + CC + BCC
    try {
        handleEmails(s, "To", msg.getDisplayTo(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No To line without the chunk
    }
    try {
        handleEmails(s, "CC", msg.getDisplayCC(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No CC line without the chunk
    }
    try {
        handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No BCC line without the chunk
    }

    // Date - try two ways to find it
    boolean dateWritten = false;
    try {
        // First try via the proper chunk. The date can also come back as
        // null rather than as an exception; treat both as "not found"
        // instead of failing with a NullPointerException.
        java.util.Calendar messageDate = msg.getMessageDate();
        if (messageDate != null) {
            SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
            f.setTimeZone(LocaleUtil.getUserTimeZone());
            s.append("Date: ").append(f.format(messageDate.getTime())).append("\n");
            dateWritten = true;
        }
    } catch (ChunkNotFoundException ignored) {
        // Fall through to the raw headers below
    }
    if (!dateWritten) {
        try {
            // Failing that try via the raw headers
            String[] headers = msg.getHeaders();
            if (headers != null) {
                for (String header : headers) {
                    if (startsWithIgnoreCase(header, "date:")) {
                        s.append("Date:").append(header.substring(header.indexOf(':') + 1)).append("\n");
                        break;
                    }
                }
            }
        } catch (ChunkNotFoundException he) {
            // We can't find the date, sorry...
        }
    }

    try {
        String subject = msg.getSubject();
        s.append("Subject: ").append(subject).append("\n");
    } catch (ChunkNotFoundException ignored) {
        // No Subject line without the chunk
    }

    // Only the attachment names are listed here.
    // To get the attachments, use ExtractorFactory
    for (AttachmentChunks att : msg.getAttachmentFiles()) {
        StringChunk name = att.getAttachLongFileName();
        if (name == null) {
            name = att.getAttachFileName();
        }
        String attName = name == null ? null : name.getValue();
        if (att.getAttachMimeTag() != null && att.getAttachMimeTag().getValue() != null) {
            attName = att.getAttachMimeTag().getValue() + " = " + attName;
        }
        s.append("Attachment: ").append(attName).append("\n");
    }

    try {
        String body = msg.getTextBody();
        s.append("\n").append(body).append("\n");
    } catch (ChunkNotFoundException ignored) {
        // No body text without the chunk
    }

    return s.toString();
}
use of org.apache.poi.hsmf.datatypes.StringChunk in project poi by apache.
the class POIFSChunkParser method process.
/**
 * Creates a chunk, and gives it to its parent group.
 * <p>
 * The chunk type is picked from the entry name: the special properties
 * name yields a properties chunk, otherwise the name is expected to end in
 * {@code _<id><type>} with four hex digits each. Entries whose name is not
 * in that form, or whose type is unsupported, are skipped silently.
 *
 * @param entry    the POIFS entry to turn into a chunk
 * @param grouping the group that records the created chunk
 * @throws IllegalArgumentException if the last underscore is too close to
 *                                  the end of the name for an id + type suffix
 */
protected static void process(Entry entry, ChunkGroup grouping) {
    String entryName = entry.getName();
    Chunk chunk = null;

    // Is it a properties chunk? (They have special names)
    if (entryName.equals(PropertiesChunk.NAME)) {
        if (grouping instanceof Chunks) {
            // These should be the properties for the message itself
            chunk = new MessagePropertiesChunk(grouping);
        } else {
            // Will be properties on an attachment or recipient
            chunk = new StoragePropertiesChunk(grouping);
        }
    } else {
        // Check it's a regular chunk
        if (entryName.length() < 9) {
            // Name in the wrong format
            return;
        }
        if (!entryName.contains("_")) {
            // Name in the wrong format
            return;
        }

        // Split it into its parts; the name should be of
        // the form __<name>_<id><type>
        int splitAt = entryName.lastIndexOf('_');
        String namePrefix = entryName.substring(0, splitAt + 1);
        String ids = entryName.substring(splitAt + 1);

        if (namePrefix.equals("Olk10SideProps") || namePrefix.equals("Olk10SideProps_")) {
            // This is some odd Outlook 2002 thing, skip
            return;
        } else if (splitAt <= entryName.length() - 8) {
            // In the right form for a normal chunk
            // We'll process this further in a little bit
        } else {
            // Underscores not the right place, something's wrong
            throw new IllegalArgumentException("Invalid chunk name " + entryName);
        }

        // The check above still lets a 7 character suffix through, which
        // would make ids.substring(4, 8) throw an uncaught
        // StringIndexOutOfBoundsException. Such a name can't hold a
        // 4 digit id plus 4 digit type, so skip it as wrongly formatted.
        if (ids.length() < 8) {
            // Name in the wrong format
            return;
        }

        // Now try to turn it into id + type
        try {
            int chunkId = Integer.parseInt(ids.substring(0, 4), 16);
            int typeId = Integer.parseInt(ids.substring(4, 8), 16);

            MAPIType type = Types.getById(typeId);
            if (type == null) {
                type = Types.createCustom(typeId);
            }

            // Special cases based on the ID
            if (chunkId == MAPIProperty.MESSAGE_SUBMISSION_ID.id) {
                chunk = new MessageSubmissionChunk(namePrefix, chunkId, type);
            } else {
                // So, do the usual thing which is by type
                if (type == Types.BINARY) {
                    chunk = new ByteChunk(namePrefix, chunkId, type);
                } else if (type == Types.DIRECTORY) {
                    if (entry instanceof DirectoryNode) {
                        chunk = new DirectoryChunk((DirectoryNode) entry, namePrefix, chunkId, type);
                    }
                } else if (type == Types.ASCII_STRING || type == Types.UNICODE_STRING) {
                    chunk = new StringChunk(namePrefix, chunkId, type);
                } else {
                    // Type of an unsupported type! Skipping...
                }
            }
        } catch (NumberFormatException e) {
            // Name in the wrong format
            return;
        }
    }

    if (chunk != null) {
        if (entry instanceof DocumentNode) {
            // Document entries carry data: read it, then record the chunk.
            // A chunk whose value can't be read is logged and not recorded.
            DocumentInputStream inp = null;
            try {
                inp = new DocumentInputStream((DocumentNode) entry);
                chunk.readValue(inp);
                grouping.record(chunk);
            } catch (IOException e) {
                logger.log(POILogger.ERROR, "Error reading from part " + entry.getName() + " - " + e);
            } finally {
                if (inp != null) {
                    inp.close();
                }
            }
        } else {
            // Non-document entries (e.g. directories) have no value to read
            grouping.record(chunk);
        }
    }
}
use of org.apache.poi.hsmf.datatypes.StringChunk in project poi by apache.
the class TestPOIFSChunkParser method testFindsCore.
/**
 * Opens quick.msg, checks that the subject and sender-name entries can be
 * looked up in the root directory, then loads the message and verifies
 * its display names, subject and date.
 */
@Test
public void testFindsCore() throws IOException, ChunkNotFoundException {
    NPOIFSFileSystem fs = new NPOIFSFileSystem(samples.getFile("quick.msg"), true);

    // The core entries should be there under their chunk entry names
    StringChunk subjectChunk = new StringChunk(MAPIProperty.SUBJECT.id, Types.ASCII_STRING);
    fs.getRoot().getEntry(subjectChunk.getEntryName());
    StringChunk senderNameChunk = new StringChunk(MAPIProperty.SENDER_NAME.id, Types.ASCII_STRING);
    fs.getRoot().getEntry(senderNameChunk.getEntryName());

    // Load the file as a message and check the basics
    MAPIMessage message = new MAPIMessage(fs);
    assertEquals("Kevin Roast", message.getDisplayTo());
    assertEquals("Kevin Roast", message.getDisplayFrom());
    assertEquals("Test the content transformer", message.getSubject());

    // And the date
    Calendar expectedDate = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
    Calendar actualDate = message.getMessageDate();
    assertEquals(expectedDate, actualDate);

    message.close();
    fs.close();
}
use of org.apache.poi.hsmf.datatypes.StringChunk in project tika by apache.
the class OutlookExtractor method handleFromTo.
/**
 * Copies the sender and recipient details of {@code msg} into the metadata:
 * the display from/to/cc/bcc strings, the sent-by server type when present,
 * the sender / sent-representing name and email chunks, and the name,
 * display name and email address of each TO, CC and BCC recipient.
 *
 * @param headers  the message headers; not read by this method, kept so
 *                 the signature stays unchanged for callers
 * @param metadata the metadata object to populate
 * @throws ChunkNotFoundException declared by the POI getters used here
 */
private void handleFromTo(Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
    String from = msg.getDisplayFrom();
    metadata.set(TikaCoreProperties.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    Chunks chunks = msg.getMainChunks();
    StringChunk sentByServerType = chunks.getSentByServerType();
    if (sentByServerType != null) {
        metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
    }

    // Reuse the chunks fetched above rather than asking msg a second time
    Map<MAPIProperty, List<Chunk>> mainChunks = chunks.getAll();

    //sometimes in SMTP .msg files there is an email in the sender name field.
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), Office.MAPI_FROM_REPRESENTING_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);

    for (Recipient recipient : buildRecipients()) {
        switch (recipient.recipientType) {
            case TO:
                addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
                break;
            case CC:
                addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
                break;
            case BCC:
                addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
                break;
            default:
                //log unknown or undefined?
                break;
        }
    }
}
Aggregations