use of org.apache.poi.hsmf.datatypes.ByteChunk in project tika by apache.
the class OutlookExtractor method parse.
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
try {
msg.setReturnNullOnMissingChunk(true);
try {
metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
} catch (ChunkNotFoundException e) {
}
// as Unicode, try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
guess7BitEncoding(msg);
}
// Start with the metadata
String subject = msg.getSubject();
Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
String from = msg.getDisplayFrom();
handleFromTo(headers, metadata);
metadata.set(TikaCoreProperties.TITLE, subject);
// TODO: Move to description in Tika 2.0
metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
try {
for (String recipientAddress : msg.getRecipientEmailAddressList()) {
if (recipientAddress != null)
metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
}
} catch (ChunkNotFoundException he) {
}
for (Map.Entry<String, String[]> e : headers.entrySet()) {
String headerKey = e.getKey();
for (String headerValue : e.getValue()) {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
}
}
// First try via the proper chunk
if (msg.getMessageDate() != null) {
metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
} else {
if (headers != null && headers.size() > 0) {
for (Map.Entry<String, String[]> header : headers.entrySet()) {
String headerKey = header.getKey();
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
// See if we can parse it as a normal mail date
try {
Date d = MboxParser.parseDate(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
} catch (ParseException e) {
// Store it as-is, and hope for the best...
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);
}
break;
}
}
}
}
xhtml.element("h1", subject);
// Output the from and to details in text, as you
// often want them in text form for searching
xhtml.startElement("dl");
if (from != null) {
header(xhtml, "From", from);
}
header(xhtml, "To", msg.getDisplayTo());
header(xhtml, "Cc", msg.getDisplayCC());
header(xhtml, "Bcc", msg.getDisplayBCC());
try {
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
} catch (ChunkNotFoundException e) {
}
xhtml.endElement("dl");
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
Chunk rtfChunk = null;
Chunk textChunk = null;
for (Chunk chunk : msg.getMainChunks().getChunks()) {
if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
htmlChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
rtfChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.BODY.id) {
textChunk = chunk;
}
}
boolean doneBody = false;
xhtml.startElement("div", "class", "message-body");
if (htmlChunk != null) {
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
data = ((ByteChunk) htmlChunk).getValue();
} else if (htmlChunk instanceof StringChunk) {
data = ((StringChunk) htmlChunk).getRawValue();
}
if (data != null) {
Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
if (htmlParser == null) {
htmlParser = new HtmlParser();
}
htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
}
if (rtfChunk != null && !doneBody) {
ByteChunk chunk = (ByteChunk) rtfChunk;
MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
if (textChunk != null && !doneBody) {
xhtml.element("p", ((StringChunk) textChunk).getValue());
}
xhtml.endElement("div");
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
xhtml.startElement("div", "class", "attachment-entry");
String filename = null;
if (attachment.getAttachLongFileName() != null) {
filename = attachment.getAttachLongFileName().getValue();
} else if (attachment.getAttachFileName() != null) {
filename = attachment.getAttachFileName().getValue();
}
if (filename != null && filename.length() > 0) {
xhtml.element("h1", filename);
}
if (attachment.getAttachData() != null) {
handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
}
if (attachment.getAttachmentDirectory() != null) {
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
}
xhtml.endElement("div");
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
} finally {
//You'd think you'd want to call msg.close().
//Don't do that. That closes down the file system.
//If an msg has multiple msg attachments, some of them
//can reside in the same file system. After the first
//child is read, the fs is closed, and the other children
//get a java.nio.channels.ClosedChannelException
}
}
use of org.apache.poi.hsmf.datatypes.ByteChunk in project poi by apache.
the class POIFSChunkParser method process.
/**
* Creates a chunk, and gives it to its parent group
*/
protected static void process(Entry entry, ChunkGroup grouping) {
String entryName = entry.getName();
Chunk chunk = null;
// Is it a properties chunk? (They have special names)
if (entryName.equals(PropertiesChunk.NAME)) {
if (grouping instanceof Chunks) {
// These should be the properties for the message itself
chunk = new MessagePropertiesChunk(grouping);
} else {
// Will be properties on an attachment or recipient
chunk = new StoragePropertiesChunk(grouping);
}
} else {
// Check it's a regular chunk
if (entryName.length() < 9) {
// Name in the wrong format
return;
}
if (!entryName.contains("_")) {
// Name in the wrong format
return;
}
// Split it into its parts
int splitAt = entryName.lastIndexOf('_');
String namePrefix = entryName.substring(0, splitAt + 1);
String ids = entryName.substring(splitAt + 1);
// the form __<name>_<id><type>
if (namePrefix.equals("Olk10SideProps") || namePrefix.equals("Olk10SideProps_")) {
// This is some odd Outlook 2002 thing, skip
return;
} else if (splitAt <= entryName.length() - 8) {
// In the right form for a normal chunk
// We'll process this further in a little bit
} else {
// Underscores not the right place, something's wrong
throw new IllegalArgumentException("Invalid chunk name " + entryName);
}
// Now try to turn it into id + type
try {
int chunkId = Integer.parseInt(ids.substring(0, 4), 16);
int typeId = Integer.parseInt(ids.substring(4, 8), 16);
MAPIType type = Types.getById(typeId);
if (type == null) {
type = Types.createCustom(typeId);
}
// Special cases based on the ID
if (chunkId == MAPIProperty.MESSAGE_SUBMISSION_ID.id) {
chunk = new MessageSubmissionChunk(namePrefix, chunkId, type);
} else {
// So, do the usual thing which is by type
if (type == Types.BINARY) {
chunk = new ByteChunk(namePrefix, chunkId, type);
} else if (type == Types.DIRECTORY) {
if (entry instanceof DirectoryNode) {
chunk = new DirectoryChunk((DirectoryNode) entry, namePrefix, chunkId, type);
}
} else if (type == Types.ASCII_STRING || type == Types.UNICODE_STRING) {
chunk = new StringChunk(namePrefix, chunkId, type);
} else {
// Type of an unsupported type! Skipping...
}
}
} catch (NumberFormatException e) {
// Name in the wrong format
return;
}
}
if (chunk != null) {
if (entry instanceof DocumentNode) {
DocumentInputStream inp = null;
try {
inp = new DocumentInputStream((DocumentNode) entry);
chunk.readValue(inp);
grouping.record(chunk);
} catch (IOException e) {
logger.log(POILogger.ERROR, "Error reading from part " + entry.getName() + " - " + e);
} finally {
if (inp != null)
inp.close();
}
} else {
grouping.record(chunk);
}
}
}
Aggregations