use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project tika by apache.
the class OutlookExtractor method parse.
/**
 * Extracts metadata and XHTML content from the Outlook message held in {@code msg}.
 *
 * <p>Metadata written: the MAPI message class, title/subject, recipient addresses,
 * the raw headers and the created/modified dates. XHTML written: the subject as an
 * {@code h1}, a {@code dl} with the From/To/Cc/Bcc/Recipients details, the message
 * body (preference order: html, rtf, text) and one {@code div} per attachment.</p>
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if a {@link ChunkNotFoundException} escapes even though the
 *                       message was asked to return null on missing chunks
 * @throws SAXException  if the content handler rejects an event
 * @throws IOException   if the message content cannot be read
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the message class is optional metadata
        }

        // If the message contains strings that aren't stored
        // as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();

        handleFromTo(headers, metadata);

        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: a message without a recipients section simply has no addresses
        }

        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }

        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Failing that, look for a date in the headers. Two shapes are accepted:
            // a raw "Date: ..." line stored as the key (the form this code always
            // handled), and a plain "Date" key whose first value holds the date.
            // NOTE(review): the second shape assumes normalizeHeaders() maps header
            // name -> values, as the raw-header loop above suggests - confirm.
            for (Map.Entry<String, String[]> header : headers.entrySet()) {
                String headerKey = header.getKey();
                String lowerKey = headerKey.toLowerCase(Locale.ROOT);
                String date = null;
                if (lowerKey.startsWith("date:")) {
                    date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                } else if (lowerKey.equals("date")) {
                    String[] values = header.getValue();
                    if (values != null && values.length > 0 && values[0] != null) {
                        date = values[0].trim();
                    }
                }
                if (date == null) {
                    continue;
                }

                // See if we can parse it as a normal mail date
                try {
                    Date d = MboxParser.parseDate(date);
                    metadata.set(TikaCoreProperties.CREATED, d);
                    metadata.set(TikaCoreProperties.MODIFIED, d);
                } catch (ParseException e) {
                    // Store it as-is, and hope for the best...
                    metadata.set(TikaCoreProperties.CREATED, date);
                    metadata.set(TikaCoreProperties.MODIFIED, date);
                }
                break;
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        // often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: leave the Recipients entry out when it is unavailable
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        // Like the html chunk above, only use the rtf chunk when it has the expected
        // type; an unexpected chunk type falls through to the next alternative
        // instead of failing with a ClassCastException.
        if (rtfChunk instanceof ByteChunk && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk instanceof StringChunk && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
        //You'd think you'd want to call msg.close().
        //Don't do that. That closes down the file system.
        //If an msg has multiple msg attachments, some of them
        //can reside in the same file system. After the first
        //child is read, the fs is closed, and the other children
        //get a java.nio.channels.ClosedChannelException
    }
}
use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project Xponents by OpenSextant.
the class OLEMessageConverter method conversionImplementation.
/**
 * Converts an Outlook message read from {@code in} into a {@link ConvertedDocument}.
 *
 * <p>The text body becomes the document text, every attachment that carries raw
 * data is added as a raw child, and subject, date and sender are added as title,
 * create date and author. The input stream is always closed before returning.</p>
 *
 * @param in  stream positioned at the start of the message
 * @param doc file the stream was opened from; used to create the result document
 * @return the converted document
 * @throws IOException if the message cannot be parsed; the original failure is the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);

        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);

        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            // An attachment may have no data chunk (e.g. one stored as a nested
            // message rather than as raw bytes). There is nothing to add as a raw
            // child then, and dereferencing it would fail the whole conversion.
            if (c.attachData == null) {
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }

        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }

        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // No date available: leave the create date unset
        }

        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }

        return msgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.
the class OutlookTextExtactor method getText.
/**
 * Outputs something a little like a RFC822 email.
 *
 * <p>The result lists From, To, CC, BCC, Date and Subject lines, one
 * "Attachment:" line per attachment and finally the text body. Parts whose
 * chunk is missing are left out rather than failing the extraction.</p>
 *
 * @return the textual rendering of the message
 */
public String getText() {
    MAPIMessage msg = (MAPIMessage) document;
    // Kept as a StringBuffer because it is handed to handleEmails(),
    // whose parameter type is declared outside this method.
    StringBuffer s = new StringBuffer();

    // See if we can get a suitable encoding for any
    // non unicode text in the file
    msg.guess7BitEncoding();

    // Off we go
    StringsIterator emails;
    try {
        emails = new StringsIterator(msg.getRecipientEmailAddressList());
    } catch (ChunkNotFoundException e) {
        emails = new StringsIterator(new String[0]);
    }

    try {
        s.append("From: " + msg.getDisplayFrom() + "\n");
    } catch (ChunkNotFoundException ignored) {
        // No sender chunk: omit the From line
    }

    // The display names and the email addresses are stored separately;
    // handleEmails pairs them up for the people in To + CC + BCC.
    try {
        handleEmails(s, "To", msg.getDisplayTo(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No To chunk: omit the line
    }
    try {
        handleEmails(s, "CC", msg.getDisplayCC(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No CC chunk: omit the line
    }
    try {
        handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
    } catch (ChunkNotFoundException ignored) {
        // No BCC chunk: omit the line
    }

    // Date - try two ways to find it
    boolean dateWritten = false;
    try {
        // First try via the proper chunk. The date can also come back as null
        // instead of raising ChunkNotFoundException; treat both as "not found".
        if (msg.getMessageDate() != null) {
            SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
            f.setTimeZone(LocaleUtil.getUserTimeZone());
            s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
            dateWritten = true;
        }
    } catch (ChunkNotFoundException ignored) {
        // Fall through to the raw headers below
    }
    if (!dateWritten) {
        try {
            // Failing that try via the raw headers
            String[] headers = msg.getHeaders();
            if (headers != null) {
                for (String header : headers) {
                    if (header != null && startsWithIgnoreCase(header, "date:")) {
                        s.append("Date:" + header.substring(header.indexOf(':') + 1) + "\n");
                        break;
                    }
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // We can't find the date, sorry...
        }
    }

    try {
        s.append("Subject: " + msg.getSubject() + "\n");
    } catch (ChunkNotFoundException ignored) {
        // No subject chunk: omit the Subject line
    }

    // Only the attachment names are listed here.
    // To get the attachments, use ExtractorFactory
    for (AttachmentChunks att : msg.getAttachmentFiles()) {
        // Prefer the long file name, fall back to the short one
        StringChunk name = att.getAttachLongFileName();
        if (name == null) {
            name = att.getAttachFileName();
        }
        String attName = name == null ? null : name.getValue();
        if (att.getAttachMimeTag() != null && att.getAttachMimeTag().getValue() != null) {
            attName = att.getAttachMimeTag().getValue() + " = " + attName;
        }
        s.append("Attachment: " + attName + "\n");
    }

    try {
        s.append("\n" + msg.getTextBody() + "\n");
    } catch (ChunkNotFoundException ignored) {
        // No text body chunk: omit the body
    }

    return s.toString();
}
use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.
the class MAPIMessage method guess7BitEncoding.
/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 *  strings in the file.
 * <p>Many messages store their strings as unicode, which is
 *  nice and easy. Some use one-byte encodings for their
 *  strings, but don't always store the encoding anywhere
 *  helpful in the file.</p>
 * <p>This method checks for codepage properties, and failing that
 *  looks at the headers for the message, and failing that at the
 *  HTML body, and uses these to guess the correct encoding for
 *  your file. The first source that yields an answer wins.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 */
public void guess7BitEncoding() {
    // First choice is a codepage property
    for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) {
        List<PropertyValue> val = mainChunks.getProperties().get(prop);
        // Only trust the property when it really holds a numeric value;
        // anything else is skipped rather than failing on the cast.
        if (val != null && val.size() > 0 && val.get(0) instanceof LongPropertyValue) {
            int codepage = ((LongPropertyValue) val.get(0)).getValue();
            try {
                String encoding = CodePageUtil.codepageToEncoding(codepage, true);
                set7BitEncoding(encoding);
                return;
            } catch (UnsupportedEncodingException e) {
                logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", prop, ", ignoring");
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            final String contentType = "Content-Type";
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);

            for (String header : headers) {
                // Header names are matched case-insensitively, in line with the pattern
                if (header == null || !header.regionMatches(true, 0, contentType, 0, contentType.length())) {
                    continue;
                }
                Matcher m = p.matcher(header);
                // lookingAt() rather than matches(): parameters that follow the
                // charset (e.g. "; format=flowed") must not hide the charset.
                if (m.lookingAt()) {
                    String charset = m.group(1).trim();
                    if (charset.length() == 0) {
                        continue;
                    }
                    // Found it! Tell all the string chunks
                    if (!charset.equalsIgnoreCase("utf-8")) {
                        set7BitEncoding(charset);
                    }
                    return;
                }
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // No headers chunk: move on to the next source
    }

    // Nothing suitable in the headers, try HTML
    try {
        String html = getHtmlBody();
        if (html != null && html.length() > 0) {
            // Look for a content type in the meta headers
            Pattern p = Pattern.compile("<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"");
            Matcher m = p.matcher(html);
            if (m.find()) {
                // Found it! Tell all the string chunks
                String charset = m.group(1);
                set7BitEncoding(charset);
                return;
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // No HTML body chunk: nothing left to try, the encoding stays as it is
    }
}
use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.
the class MAPIMessage method getRecipientNamesList.
/**
 * Collects the display name of every recipient, one array slot per
 *  recipient chunk and in the order the chunks are held (normally
 *  TO, then CC, then BCC).
 * Related accessors: {@link #getDisplayTo()}, {@link #getDisplayCC()}
 *  and {@link #getDisplayBCC()}.
 *
 * @return the recipient names; never contains a null entry
 * @throws ChunkNotFoundException if there are no recipient chunks at all,
 *          or if any recipient has no name
 */
public String[] getRecipientNamesList() throws ChunkNotFoundException {
    final int recipientCount = (recipientChunks == null) ? 0 : recipientChunks.length;
    if (recipientCount == 0) {
        throw new ChunkNotFoundException("No recipients section present");
    }

    String[] result = new String[recipientCount];
    int position = 0;
    for (RecipientChunks recipient : recipientChunks) {
        String displayName = recipient.getRecipientName();
        if (displayName == null) {
            // A single nameless recipient makes the whole list unavailable
            throw new ChunkNotFoundException("No display name holding chunks found for the " + (position + 1) + "th recipient");
        }
        result[position] = displayName;
        position++;
    }
    return result;
}
Aggregations