use of org.opensextant.xtext.Content in project Xponents by OpenSextant.
the class WebArchiveConverter method conversionImplementation.
/**
 * Converts an MHT or .webarchive file to plain text.
 *
 * The parent implementation parses the archive; afterwards the text of each raw child is
 * rendered and appended to the text of the returned document. As an alternative, the archive
 * could be exploded on disk and every child converted individually -- see the MessageConverter
 * base class and the ArchiveNavigator solutions for that approach.
 *
 * @param in stream
 * @param doc original file
 * @return the converted archive, flagged as a web archive
 * @throws IOException if the parent conversion or a child conversion fails
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    final String separator = "\n==================\n";

    TikaHTMLConverter htmlConverter = new TikaHTMLConverter(false);
    DefaultConverter genericConverter = new DefaultConverter();

    ConvertedDocument archive = super.conversionImplementation(in, doc);
    archive.is_webArchive = true;

    if (archive.hasRawChildren()) {
        StringBuilder childText = new StringBuilder();

        for (Content item : archive.getRawChildren()) {
            logger.info("{} {} {}", archive.id, item.id, item.mimeType);

            String mime = item.mimeType;
            if (mime == null) {
                // Nothing to dispatch on; skip the child entirely.
                continue;
            }

            ConvertedDocument rendered = null;
            if ("application/octet-stream".equalsIgnoreCase(mime)) {
                rendered = genericConverter.convert(TikaInputStream.get(item.content));
            } else if (mime.startsWith("text/html")) {
                rendered = htmlConverter.convert(TikaInputStream.get(item.content));
            } else if (mime.startsWith("image")) {
                // Images contribute only a placeholder line.
                childText.append(String.format("\n[Image: %s type='%s'] ", item.id, mime));
            }

            // Web-script residue (comments, javascript, etc. found in these archives) is filtered
            // out for both the generic and the HTML branch.
            if (rendered != null && rendered.hasText()) {
                String text = rendered.getText();
                if (!isWebScript(text)) {
                    childText.append(text).append(separator);
                }
            }
        }

        String merged = archive.hasText()
                ? archive.getText() + "\n\n==================\n\n" + childText.toString()
                : childText.toString();
        archive.setText(merged);
    }
    return archive;
}
use of org.opensextant.xtext.Content in project Xponents by OpenSextant.
the class MessageConverter method parseMessage.
/**
 * Recursive parser that pulls attachments off into child content, or appends plain text to
 * the main message text buffer.
 *
 * <ul>
 * <li>Calendar parts are ignored.</li>
 * <li>{@code multipart/*} and {@code message/rfc822} parts are broken down recursively.</li>
 * <li>String content is appended to {@code buf}, unless the part is an attachment, in which
 * case it becomes a raw child of {@code parent}.</li>
 * <li>Stream content becomes a raw child of {@code parent}.</li>
 * <li>HTML parts and parts with an unrecognized content object fall through to the final
 * section, which saves the raw MIME body as a child when the part qualifies.</li>
 * </ul>
 *
 * The instance counter {@code attachmentNumber} is incremented on every call, including the
 * recursive ones. A {@link MessagingException} is logged and not rethrown.
 *
 * @param bodyPart individual sub-part to append to buffer
 * @param parent parent doc
 * @param buf text to append
 * @param msgPrefixId msgId prefix
 * @throws IOException on error
 */
public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder buf, String msgPrefixId) throws IOException {
    InputStream partIO = null;
    ++attachmentNumber;
    try {
        PartMetadata meta = new PartMetadata(bodyPart);
        textEncodings.add(meta.charset);

        String filename = bodyPart.getFileName();
        String fileext = meta.getPossibleFileExtension();
        if (filename != null) {
            // A real file name is preferred over the extension guessed from the metadata.
            fileext = FilenameUtils.getExtension(filename);
            logger.debug("original filename: {}", filename);
        }
        boolean hasExtension = StringUtils.isNotBlank(fileext);
        if (!hasExtension) {
            logger.debug("Unknown message part");
            fileext = "dat";
        }
        if (filename == null && attachmentNumber > 1) {
            // Synthesize a name for unnamed parts after the first one.
            filename = String.format("%s-Att%d.%s", msgPrefixId, attachmentNumber, fileext);
        }
        logger.debug("Charset for part is {}", meta.charset);

        // IGNORE types: calendar.
        if (meta.isCalendar()) {
            logger.debug("{}# Ignore item", msgPrefixId);
            return;
        }

        if (meta.isHTML()) {
            // No work here; the HTML part is saved by the raw-part section after this chain.
            logger.debug("{}# Save HTML part as its own file", msgPrefixId);
        } else if (bodyPart.isMimeType("multipart/*")) {
            Multipart mp = (Multipart) bodyPart.getContent();
            int count = mp.getCount();
            for (int i = 0; i < count; i++) {
                // This step does not actually save any content; it calls itself to keep
                // breaking the parts down into the finest grained elements.
                parseMessage(mp.getBodyPart(i), parent, buf, msgPrefixId);
            }
            // Exit point
            return;
        } else if (bodyPart.isMimeType("message/rfc822")) {
            /* normal mail message body */
            parseMessage((Part) bodyPart.getContent(), parent, buf, msgPrefixId);
            // Exit point
            return;
        } else {
            Object part = bodyPart.getContent();
            boolean isTextPlain = bodyPart.isMimeType("text/plain");
            if (part instanceof String) {
                /* We will take the first charset encoding found for the body text of the message.
                 * If there are HTML views of the data, those individual documents will be child
                 * documents with their own encodings.
                 */
                if (meta.charset != null && parent.getEncoding() == null) {
                    parent.setEncoding(meta.charset);
                }
                String text = (String) part;
                if (!isTextPlain) {
                    // Decode TEXT from MIME base64 or QP encoded data.
                    // TODO: Is this necessary? The mime libraries seem to handle base64 unencoding automatically
                    // (at least for text/plain attachments). -jgibson
                    logger.debug("{}# Save String MIME part", msgPrefixId);
                    if (meta.isQP() || meta.isBase64()) {
                        try {
                            partIO = IOUtils.toInputStream(text);
                            byte[] textBytes = decodeMIMEText(partIO, meta.transferEncoding);
                            if (meta.charset != null) {
                                text = new String(textBytes, meta.charset);
                            } else {
                                // NOTE(review): platform default charset is used when the part declares none.
                                text = new String(textBytes);
                            }
                        } catch (Exception decodeErr) {
                            // Best effort: keep the undecoded text, but record the cause so the
                            // failure can be diagnosed.
                            logger.error("Decoding error with bare text in body of message", decodeErr);
                        }
                    } else {
                        logger.debug("Other encoding is unaccounted: {}", meta.transferEncoding);
                    }
                }
                if (meta.isAttachment()) {
                    Content child = createBaseChildContent(filename, meta);
                    if (child.encoding == null) {
                        child.encoding = "UTF-8";
                    }
                    child.content = text.getBytes(child.encoding);
                    copyMailAttrs(parent, child);
                    parent.addRawChild(child);
                } else {
                    // Note, the "=XX" sequence is reserved for RFC822 encoding of special chars and non-ASCII.
                    // So "=====" is avoided as a separator.
                    buf.append(TextUtils.delete_controls(text));
                    buf.append("\n*******************\n");
                }
                // Exit point
                return;
            } else if (part instanceof InputStream) {
                // Retrieve byte stream.
                partIO = (InputStream) part;
                Content child = createChildContent(filename, partIO, meta);
                copyMailAttrs(parent, child);
                parent.addRawChild(child);
                // Exit point.
                return;
            } else {
                /* MCU: identify unknown MIME parts */
                // A null content object also lands here, so the class name lookup is null-safe.
                logger.debug("Skipping this an unknown bodyPart type: {}",
                        (part == null ? "null" : part.getClass().getName()));
            }
        }

        // Reached by HTML parts and by parts whose content object was not recognized above.
        if (bodyPart instanceof MimeBodyPart && !bodyPart.isMimeType("multipart/*")) {
            logger.debug("{}# Saving {} ", msgPrefixId, filename);
            // NOTE(review): this reads the field meta.isAttachment while other checks in this
            // method call meta.isAttachment() -- confirm against PartMetadata that both agree.
            if (meta.disposition == null || meta.isAttachment) {
                partIO = ((MimeBodyPart) bodyPart).getRawInputStream();
                Content child = createChildContent(filename, partIO, meta);
                copyMailAttrs(parent, child);
                if (meta.isHTML() && (meta.isInline() || (!meta.isAttachment()))) {
                    child.meta.setProperty(MAIL_KEY_PREFIX + "html-body", "true");
                }
                parent.addRawChild(child);
                return;
            }
        }
    } catch (MessagingException e2) {
        logger.error("Extraction Failed on Messaging Exception", e2);
    } finally {
        // Closes whichever stream was opened for this part, if any.
        if (partIO != null) {
            partIO.close();
        }
    }
}
use of org.opensextant.xtext.Content in project Xponents by OpenSextant.
the class OLEMessageConverter method conversionImplementation.
/**
 * Converts an Outlook (OLE/MAPI) message into a ConvertedDocument.
 *
 * The text body becomes the document text, each attachment that carries data is added as a
 * raw child, and subject, message date and sender are copied when the message provides them.
 * The input stream is always closed before returning.
 *
 * @param in stream holding the message
 * @param doc original file
 * @return the converted message
 * @throws IOException wrapping any failure raised while parsing the message
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);
        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);

        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            if (c.attachData == null) {
                // No data chunk to copy (presumably an embedded message or similar -- TODO confirm).
                // Skip it rather than letting a NullPointerException fail the whole message.
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }

        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }

        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: a message without a date chunk simply gets no create date.
        }

        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }
        return msgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
use of org.opensextant.xtext.Content in project Xponents by OpenSextant.
the class EmbeddedContentConverter method renderText.
/**
 * Renders the given embedded children as one text block.
 *
 * Each child contributes a "[Embedded: id; media type]" header line followed by the text
 * produced by converting its bytes. A child whose conversion raises an IOException
 * contributes the marker "Unconvertable content" instead; a child that converts to no text
 * contributes only its header.
 *
 * @param childObjects children
 * @return text assembled from children
 */
private String renderText(List<Content> childObjects) {
    StringBuilder buf = new StringBuilder();
    for (Content c : childObjects) {
        // %s formats a null media type as "null" instead of failing on toString().
        buf.append(String.format("\n[Embedded: %s; %s]\n", c.id, c.tikaMediatype));
        try {
            // NOTE: To do this well, you may have to write bytes to disk as a valid file name
            // And let Tika convert in full.
            ConvertedDocument text = conv.conversionImplementation(TikaInputStream.get(c.content, c.tikaMetadata), null);
            // Guard against a missing document or missing text so that neither a
            // NullPointerException nor the literal "null" ends up in the output.
            if (text != null && text.getText() != null) {
                buf.append(text.getText());
            }
        } catch (IOException ioe) {
            buf.append("Unconvertable content");
        }
        buf.append("\n");
    }
    return buf.toString();
}
use of org.opensextant.xtext.Content in project Xponents by OpenSextant.
the class MessageConverter method createBaseChildContent.
/**
 * Builds a child Content item and fills in its identity and mail metadata from the MIME part.
 * No content bytes are set here.
 *
 * @param file_id file ID, if Tika found one, or a custom one.
 * @param meta metadata pulled from the MIME part
 * @return content abstraction for the child
 */
private Content createBaseChildContent(String file_id, PartMetadata meta) {
    Content item = new Content();

    // Identity and type information.
    item.id = file_id;
    item.mimeType = meta.mimeType;
    item.encoding = meta.charset;

    // Mail-specific properties; a missing disposition is recorded as "none".
    String disposition = meta.disposition;
    if (disposition == null) {
        disposition = "none";
    }
    item.meta.setProperty(ConvertedDocument.CHILD_ENTRY_KEY, file_id);
    item.meta.setProperty(MAIL_KEY_PREFIX + "disposition", disposition);
    if (meta.contentId != null) {
        item.meta.setProperty(MAIL_KEY_PREFIX + "content-id", meta.contentId);
    }
    return item;
}
Aggregations