Search in sources:

Example 1 with Content

use of org.opensextant.xtext.Content in project Xponents by OpenSextant.

the class WebArchiveConverter method conversionImplementation.

/**
 * Convert an MHT or .webarchive file to pure text.
 *
 * <p>The parent document is produced by the superclass conversion; the text of its raw
 * children is then appended to the parent text. Children are handled by MIME type:
 * <ul>
 *   <li>{@code application/octet-stream} parts go through a {@link DefaultConverter};</li>
 *   <li>{@code text/html} parts go through a {@link TikaHTMLConverter};</li>
 *   <li>{@code image*} parts contribute only a one-line placeholder;</li>
 *   <li>parts with no MIME type, or any other type, are skipped.</li>
 * </ul>
 * Converted parts that are empty, or that {@code isWebScript} flags, are dropped.
 *
 * <p>Alternatively, export the "archive" exploded on disk and then convert all children items.
 * See MessageConverter base and ArchiveNavigator solutions for that.
 *
 * @param in stream
 * @param doc original file
 * @return the parent document, flagged as a web archive, with child text appended
 * @throws IOException if the superclass conversion or a child conversion fails
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    final String partSeparator = "\n==================\n";

    // Converters are created up front, before the parent conversion, as in the original flow.
    TikaHTMLConverter htmlConverter = new TikaHTMLConverter(false);
    DefaultConverter binaryConverter = new DefaultConverter();

    ConvertedDocument archive = super.conversionImplementation(in, doc);
    archive.is_webArchive = true;
    if (!archive.hasRawChildren()) {
        return archive;
    }

    StringBuilder childText = new StringBuilder();
    for (Content child : archive.getRawChildren()) {
        String mime = child.mimeType;
        logger.info("{} {} {}", archive.id, child.id, mime);
        if (mime == null) {
            continue;
        }

        ConvertedDocument part;
        if ("application/octet-stream".equalsIgnoreCase(mime)) {
            part = binaryConverter.convert(TikaInputStream.get(child.content));
        } else if (mime.startsWith("text/html")) {
            part = htmlConverter.convert(TikaInputStream.get(child.content));
        } else {
            if (mime.startsWith("image")) {
                // Images are not converted; only a marker is left in the text.
                childText.append(String.format("\n[Image: %s type='%s']  ", child.id, mime));
            }
            continue;
        }

        // Filter out parts with no text and parts that look like comments, javascript, etc.
        if (part == null || !part.hasText() || isWebScript(part.getText())) {
            continue;
        }
        childText.append(part.getText());
        childText.append(partSeparator);
    }

    String combined = archive.hasText()
            ? archive.getText() + "\n\n==================\n\n" + childText.toString()
            : childText.toString();
    archive.setText(combined);
    return archive;
}
Also used : Content(org.opensextant.xtext.Content) ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 2 with Content

use of org.opensextant.xtext.Content in project Xponents by OpenSextant.

the class MessageConverter method parseMessage.

/**
 * This is a recursive parser that pulls off attachments into Child content or saves plain text as main message text.
 * Calendar invites are ignored.
 *
 * <p>Handling by part type, in order:
 * <ul>
 *   <li>calendar parts are skipped;</li>
 *   <li>HTML parts fall through to the raw-stream handling at the end of the method;</li>
 *   <li>{@code multipart/*} and {@code message/rfc822} parts recurse into their content;</li>
 *   <li>String content is either appended to {@code buf} or, when the part is an attachment,
 *       stored as a child;</li>
 *   <li>InputStream content is stored as a child.</li>
 * </ul>
 * A {@link MessagingException} is logged and not rethrown.
 *
 * @param bodyPart  individual sub-part to append to buffer
 * @param parent parent doc
 * @param buf text to append
 * @param msgPrefixId msgId prefix
 * @throws IOException on error
 */
public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder buf, String msgPrefixId) throws IOException {
    InputStream partIO = null;
    // Counts every part visited, including container parts reached through recursion.
    ++attachmentNumber;
    try {
        PartMetadata meta = new PartMetadata(bodyPart);
        // NOTE(review): meta.charset may be null here (it is null-checked further down) -- confirm
        // textEncodings is meant to receive null entries.
        textEncodings.add(meta.charset);
        String filename = bodyPart.getFileName();
        String fileext = meta.getPossibleFileExtension();
        if (filename != null) {
            fileext = FilenameUtils.getExtension(filename);
            logger.debug("original filename: {}", filename);
        }
        boolean hasExtension = StringUtils.isNotBlank(fileext);
        if (!hasExtension) {
            logger.debug("Unknown message part");
            fileext = "dat";
        }
        // A name is synthesized only from the second part onward; for the first part the
        // filename can remain null.
        if (filename == null && attachmentNumber > 1) {
            filename = String.format("%s-Att%d.%s", msgPrefixId, attachmentNumber, fileext);
        }
        logger.debug("Charset for part is {}", meta.charset);
        // IGNORE types: calendar.
        if (meta.isCalendar()) {
            logger.debug("{}# Ignore item", msgPrefixId);
            return;
        }
        if (meta.isHTML()) {
            // Nothing to do here: HTML parts are saved by the raw-stream block at the bottom.
            logger.debug("{}# Save HTML part as its own file", msgPrefixId);
        } else if (bodyPart.isMimeType("multipart/*")) {
            Multipart mp = (Multipart) bodyPart.getContent();
            int count = mp.getCount();
            for (int i = 0; i < count; i++) {
                // This step does not actually save any content, it calls
                // itself to continue to break down the parts into the
                // finest grained elements, at which point the leaf branches
                // below save the content.
                parseMessage(mp.getBodyPart(i), parent, buf, msgPrefixId);
            }
            // Exit point
            return;
        } else if (bodyPart.isMimeType("message/rfc822")) {
            /* normal mail message body */
            parseMessage((Part) bodyPart.getContent(), parent, buf, msgPrefixId);
            // Exit point
            return;
        } else {
            Object part = bodyPart.getContent();
            boolean isTextPlain = bodyPart.isMimeType("text/plain");
            if (part instanceof String) {
                /* We will take the first charset encoding found for the body text of the message.
                 * If there are HTML views of the data, those individual documents will be child documents with their own encodings.
                 */
                if (meta.charset != null && parent.getEncoding() == null) {
                    parent.setEncoding(meta.charset);
                }
                String text = (String) part;
                if (!isTextPlain) {
                    // Decode TEXT from MIME base64 or QP encoded data.
                    // TODO: Is this necessary? The mime libraries seem to handle base64 unencoding automatically
                    // (at least for text/plain attachments). -jgibson
                    logger.debug("{}# Save String MIME part", msgPrefixId);
                    if (meta.isQP() || meta.isBase64()) {
                        try {
                            // NOTE(review): toInputStream(String) and new String(byte[]) use the
                            // platform default charset -- confirm that is intended.
                            partIO = IOUtils.toInputStream(text);
                            byte[] textBytes = decodeMIMEText(partIO, meta.transferEncoding);
                            if (meta.charset != null) {
                                text = new String(textBytes, meta.charset);
                            } else {
                                text = new String(textBytes);
                            }
                        } catch (Exception decodeErr) {
                            // Best effort: keep the undecoded text, but record the cause so the
                            // failure can be diagnosed.
                            logger.error("Decoding error with bare text in body of message", decodeErr);
                        }
                    } else {
                        logger.debug("Other encoding is unaccounted: {}", meta.transferEncoding);
                    }
                }
                if (meta.isAttachment()) {
                    Content child = createBaseChildContent(filename, meta);
                    if (child.encoding == null) {
                        child.encoding = "UTF-8";
                    }
                    child.content = text.getBytes(child.encoding);
                    copyMailAttrs(parent, child);
                    parent.addRawChild(child);
                } else {
                    // Inline text becomes part of the main message text.
                    buf.append(TextUtils.delete_controls(text));
                    buf.append("\n*******************\n");
                    // Note, the "=XX" sequence is reserved for RFC822 encoding of special chars and non-ASCII.
                    // So I avoid using "=====".... as a separator.
                }
                // Exit point
                return;
            } else if (part instanceof InputStream) {
                // Retrieve byte stream; it is closed by the finally block below.
                partIO = (InputStream) part;
                Content child = createChildContent(filename, partIO, meta);
                copyMailAttrs(parent, child);
                parent.addRawChild(child);
                // Exit point.
                return;
            } else {
                /* MCU: identify unknown MIME parts */
                // No return here: unknown content falls through to the raw-stream block below.
                logger.debug("Skipping this an unknown bodyPart type: {}", part.getClass().getName());
            }
        }
        // Reached for HTML parts and for parts whose content type was not recognized above.
        if (bodyPart instanceof MimeBodyPart && !bodyPart.isMimeType("multipart/*")) {
            logger.debug("{}# Saving {} ", msgPrefixId, filename);
            // NOTE(review): this reads the isAttachment field while other checks in this method
            // call isAttachment() -- confirm the two agree.
            if (meta.disposition == null || meta.isAttachment) {
                partIO = ((MimeBodyPart) bodyPart).getRawInputStream();
                Content child = createChildContent(filename, partIO, meta);
                copyMailAttrs(parent, child);
                if (meta.isHTML() && (meta.isInline() || (!meta.isAttachment()))) {
                    child.meta.setProperty(MAIL_KEY_PREFIX + "html-body", "true");
                }
                parent.addRawChild(child);
                return;
            }
        }
    } catch (MessagingException e2) {
        logger.error("Extraction Failed on Messaging Exception", e2);
    } finally {
        if (partIO != null) {
            partIO.close();
        }
    }
}
Also used : Multipart(javax.mail.Multipart) MessagingException(javax.mail.MessagingException) InputStream(java.io.InputStream) Content(org.opensextant.xtext.Content) MimeBodyPart(javax.mail.internet.MimeBodyPart) MessagingException(javax.mail.MessagingException) IOException(java.io.IOException)

Example 3 with Content

use of org.opensextant.xtext.Content in project Xponents by OpenSextant.

the class OLEMessageConverter method conversionImplementation.

/**
 * Converts a MAPI message (parsed with Apache POI HSMF) into a {@link ConvertedDocument}.
 *
 * <p>The text body becomes the document text, each attachment that carries a byte payload
 * becomes a raw child, and subject, message date and sender are copied when present.
 * The input stream is always closed before returning.
 *
 * @param in stream positioned at the start of the message
 * @param doc original file
 * @return the converted message with its attachments as raw children
 * @throws IOException if the message cannot be parsed; the underlying error is the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);
        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing  or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);
        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            if (c.attachData == null) {
                // No byte payload for this attachment (POI leaves attachData unset when the
                // attachment is not stored as a byte chunk, e.g. a nested message). Skip it
                // rather than letting a NullPointerException abort the whole message.
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }
        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }
        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: the message has no date chunk, so no create date is recorded.
        }
        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }
        return msgDoc;
    } catch (Exception xerr) {
        // Any failure (including a missing text body) is reported as an IOException with its cause.
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Content(org.opensextant.xtext.Content) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) IOException(java.io.IOException) ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException)

Example 4 with Content

use of org.opensextant.xtext.Content in project Xponents by OpenSextant.

the class EmbeddedContentConverter method renderText.

/**
 * Assembles a single text rendering of the given embedded children.
 *
 * <p>Each child contributes a header line with its id and Tika media type, followed by the
 * text produced by converting its content. A child whose conversion throws an
 * {@link IOException} contributes the marker "Unconvertable content" instead; a child whose
 * conversion yields no document or no text contributes only its header.
 *
 * @param childObjects children
 * @return text assembled from children
 */
private String renderText(List<Content> childObjects) {
    StringBuilder buf = new StringBuilder();
    for (Content c : childObjects) {
        // %s renders a missing media type as "null"; calling toString() explicitly would throw.
        buf.append(String.format("\n[Embedded: %s; %s]\n", c.id, c.tikaMediatype));
        try {
            // NOTE: To do this well, you may have to write bytes to disk as a valid file name
            //  And let Tika convert in full.
            ConvertedDocument converted = conv.conversionImplementation(TikaInputStream.get(c.content, c.tikaMetadata), null);
            // Guard against a missing document or missing text so neither an exception nor the
            // literal string "null" ends up in the output.
            if (converted != null && converted.getText() != null) {
                buf.append(converted.getText());
            }
        } catch (IOException ioe) {
            // Best effort: one unreadable child must not prevent rendering the others.
            buf.append("Unconvertable content");
        }
        buf.append("\n");
    }
    return buf.toString();
}
Also used : Content(org.opensextant.xtext.Content) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 5 with Content

use of org.opensextant.xtext.Content in project Xponents by OpenSextant.

the class MessageConverter method createBaseChildContent.

/**
 * Create a Child item with the metadata taken from the MIME part; the content bytes are
 * not set here.
 *
 * <p>Sets id, encoding and MIME type on the child, and records the child entry key,
 * the disposition ("none" when the part has no disposition) and, when present, the content-id
 * as child properties.
 *
 * @param file_id file ID, if Tika found one, or a custom one; may be null, in which case no
 *                child-entry property is recorded
 * @param meta metadata pulled from the MIME part
 * @return content abstraction for the child
 */
private Content createBaseChildContent(String file_id, PartMetadata meta) {
    Content child = new Content();
    child.id = file_id;
    child.encoding = meta.charset;
    child.mimeType = meta.mimeType;

    // Guarded like content-id below: a null value must not be handed to setProperty.
    if (file_id != null) {
        child.meta.setProperty(ConvertedDocument.CHILD_ENTRY_KEY, file_id);
    }
    child.meta.setProperty(MAIL_KEY_PREFIX + "disposition", (meta.disposition == null ? "none" : meta.disposition));
    if (meta.contentId != null) {
        child.meta.setProperty(MAIL_KEY_PREFIX + "content-id", meta.contentId);
    }
    return child;
}
Also used : Content(org.opensextant.xtext.Content)

Aggregations

Content (org.opensextant.xtext.Content): 6, ConvertedDocument (org.opensextant.xtext.ConvertedDocument): 4, IOException (java.io.IOException): 3, InputStream (java.io.InputStream): 1, HashMap (java.util.HashMap): 1, MimeType (javax.activation.MimeType): 1, MessagingException (javax.mail.MessagingException): 1, Multipart (javax.mail.Multipart): 1, MimeBodyPart (javax.mail.internet.MimeBodyPart): 1, MAPIMessage (org.apache.poi.hsmf.MAPIMessage): 1, AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks): 1, ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException): 1, Test (org.junit.Test): 1, MessageConverter (org.opensextant.xtext.converters.MessageConverter): 1