Search in sources :

Example 1 with ChunkNotFoundException

use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project tika by apache.

the class OutlookExtractor method parse.

/**
 * Extracts metadata and content from the Outlook message held in {@code msg}.
 * <p>Metadata (message class, subject, recipients, raw headers, dates) is
 *  written to {@code metadata}; the subject, the from/to details, the message
 *  body and each attachment are written as XHTML events to {@code xhtml}.</p>
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that is populated from the message
 * @throws TikaException if a {@link ChunkNotFoundException} escapes even though
 *         the message was asked to return null on missing chunks
 * @throws SAXException declared for the XHTML handler calls
 * @throws IOException declared for the parsing / stream handling calls
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask POI to hand back null for absent chunks instead of throwing
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Best effort: the message class property is simply left unset
        }
        // If the message has strings that are not stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            // Record every non-null recipient address
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Best effort: no recipient addresses are recorded
        }
        // Copy every raw header value into the metadata under a prefixed key
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // Date: first try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Failing that, fall back to the headers.
            // NOTE(review): headers was already dereferenced in the loop above,
            //  so the null check here can never be the first to see a null.
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    // NOTE(review): both the "date:" test and the substring below
                    //  operate on the header KEY, not on its values - confirm that
                    //  normalizeHeaders() really leaves "date: ..." text in the key.
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching entry is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Best effort: the "Recipients" entry is omitted
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        // Pick the candidate body chunks out of the main chunk list by chunk id
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        // Set once a body variant has been written, so lower-preference ones are skipped
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The html body may be held as raw bytes or as a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Prefer an html parser found via the parse context, else use a new one
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the RTF chunk is always a ByteChunk
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            // Prefer an rtf parser found via the parse context, else use a new one
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the text body is always a StringChunk
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Use the long file name when present, otherwise the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            // An attachment may carry raw data, an attachment directory, or both
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        // Should not happen once setReturnNullOnMissingChunk(true) has been called
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 2 with ChunkNotFoundException

use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project Xponents by OpenSextant.

the class OLEMessageConverter method conversionImplementation.

/**
 * Converts an Outlook (OLE) message stream into a {@link ConvertedDocument}.
 * <p>The text body becomes the document text, each attachment that carries
 *  raw data is added as a raw child, and subject, date and sender are added
 *  as title, create date and author when they can be read.</p>
 *
 * @param in  stream holding the message; it is always closed before returning
 * @param doc file the resulting document is created for
 * @return the populated document
 * @throws IOException if the message cannot be read or parsed; the original
 *         failure is attached as the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);
        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // TextTranscodingConverter.setTextAndEncoding(doc, msg.getM);
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing  or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);
        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            if (c.attachData == null) {
                // An attachment without a raw data chunk has no bytes to hand on;
                // skip it instead of failing the whole message with an NPE.
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }
        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }
        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException err) {
            // No date chunk available: the create date is deliberately left unset.
        }
        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }
        return msgDoc;
    } catch (Exception xerr) {
        // Boundary catch: any failure is reported to the caller as an IOException.
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Content(org.opensextant.xtext.Content) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) IOException(java.io.IOException) ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException)

Example 3 with ChunkNotFoundException

use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.

the class OutlookTextExtactor method getText.

/**
 * Renders the message as plain text laid out roughly like an RFC822 email:
 *  From / To / CC / BCC / Date / Subject lines, one line per attachment,
 *  then the text body. Any part whose chunk is missing is left out.
 *
 * @return the text rendering of the message
 */
public String getText() {
    final MAPIMessage message = (MAPIMessage) document;
    final StringBuffer out = new StringBuffer();

    // Try to work out an encoding for any non unicode text in the file
    message.guess7BitEncoding();

    // Recipient addresses, consumed in order while writing To / CC / BCC
    StringsIterator addresses;
    try {
        addresses = new StringsIterator(message.getRecipientEmailAddressList());
    } catch (ChunkNotFoundException e) {
        addresses = new StringsIterator(new String[0]);
    }

    try {
        // Fetch the value first so nothing is appended if the chunk is missing
        String sender = message.getDisplayFrom();
        out.append("From: ").append(sender).append("\n");
    } catch (ChunkNotFoundException e) {
    }

    // To, CC and BCC are written in that order, sharing the address iterator
    try {
        handleEmails(out, "To", message.getDisplayTo(), addresses);
    } catch (ChunkNotFoundException e) {
    }
    try {
        handleEmails(out, "CC", message.getDisplayCC(), addresses);
    } catch (ChunkNotFoundException e) {
    }
    try {
        handleEmails(out, "BCC", message.getDisplayBCC(), addresses);
    } catch (ChunkNotFoundException e) {
    }

    // Date - try two ways to find it
    try {
        // Preferred source: the proper date chunk
        SimpleDateFormat rfcFormat = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
        rfcFormat.setTimeZone(LocaleUtil.getUserTimeZone());
        String formatted = rfcFormat.format(message.getMessageDate().getTime());
        out.append("Date: ").append(formatted).append("\n");
    } catch (ChunkNotFoundException e) {
        try {
            // Fallback: the first raw header that starts with "date:"
            for (String rawHeader : message.getHeaders()) {
                if (!startsWithIgnoreCase(rawHeader, "date:")) {
                    continue;
                }
                String value = rawHeader.substring(rawHeader.indexOf(':') + 1);
                out.append("Date:").append(value).append("\n");
                break;
            }
        } catch (ChunkNotFoundException he) {
            // We can't find the date, sorry...
        }
    }

    try {
        String subject = message.getSubject();
        out.append("Subject: ").append(subject).append("\n");
    } catch (ChunkNotFoundException e) {
    }

    // Only attachment names are listed here.
    // To get the attachments, use ExtractorFactory
    for (AttachmentChunks attachment : message.getAttachmentFiles()) {
        StringChunk nameChunk = attachment.getAttachLongFileName();
        if (nameChunk == null) {
            nameChunk = attachment.getAttachFileName();
        }
        String label = null;
        if (nameChunk != null) {
            label = nameChunk.getValue();
        }
        // Prefix the name with the mime tag when one is available
        StringChunk mimeTag = attachment.getAttachMimeTag();
        if (mimeTag != null) {
            String mimeValue = mimeTag.getValue();
            if (mimeValue != null) {
                label = mimeValue + " = " + label;
            }
        }
        out.append("Attachment: ").append(label).append("\n");
    }

    try {
        String body = message.getTextBody();
        out.append("\n").append(body).append("\n");
    } catch (ChunkNotFoundException e) {
    }

    return out.toString();
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) SimpleDateFormat(java.text.SimpleDateFormat) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) StringsIterator(org.apache.poi.util.StringUtil.StringsIterator) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk)

Example 4 with ChunkNotFoundException

use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.

the class MAPIMessage method guess7BitEncoding.

/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 *  strings in the file.
 * <p>Many messages store their strings as unicode, which is
 *  nice and easy. Some use one-byte encodings for their
 *  strings, but don't always store the encoding anywhere
 *  helpful in the file.</p>
 * <p>This method checks for codepage properties, and failing that
 *  looks at the headers for the message, and finally at the HTML
 *  body, and uses these to guess the correct encoding for your file.
 *  The first source that yields a usable answer wins; if none does,
 *  the encoding is left untouched.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 */
public void guess7BitEncoding() {
    // First choice is a codepage property
    for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) {
        List<PropertyValue> val = mainChunks.getProperties().get(prop);
        if (val != null && val.size() > 0) {
            int codepage = ((LongPropertyValue) val.get(0)).getValue();
            try {
                String encoding = CodePageUtil.codepageToEncoding(codepage, true);
                set7BitEncoding(encoding);
                return;
            } catch (UnsupportedEncodingException e) {
                // Unknown codepage: log it and fall through to the next source
                logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", prop, ", ignoring");
            }
        }
    }
    // Second choice is a charset on a content type header
    try {
        String[] headers = getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
            final String contentTypeName = "Content-Type";
            for (String header : headers) {
                // Header field names are case-insensitive and the pattern above
                //  is compiled CASE_INSENSITIVE, so the cheap prefix filter has
                //  to ignore case too or "content-type:" never gets matched.
                if (header.regionMatches(true, 0, contentTypeName, 0, contentTypeName.length())) {
                    Matcher m = p.matcher(header);
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        // utf-8 is not applied as a 7-bit encoding; stop looking either way
                        if (!charset.equalsIgnoreCase("utf-8")) {
                            set7BitEncoding(charset);
                        }
                        return;
                    }
                }
            }
        }
    } catch (ChunkNotFoundException e) {
        // No headers chunk: nothing to learn here, try the HTML body next
    }
    // Nothing suitable in the headers, try HTML
    try {
        String html = getHtmlBody();
        if (html != null && html.length() > 0) {
            // Look for a content type in the meta headers
            Pattern p = Pattern.compile("<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"");
            Matcher m = p.matcher(html);
            if (m.find()) {
                // Found it! Tell all the string chunks
                String charset = m.group(1);
                set7BitEncoding(charset);
                return;
            }
        }
    } catch (ChunkNotFoundException e) {
        // No HTML body either: give up and leave the encoding as it is
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) LongPropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue) LongPropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue) TimePropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue) PropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue) UnsupportedEncodingException(java.io.UnsupportedEncodingException) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty)

Example 5 with ChunkNotFoundException

use of org.apache.poi.hsmf.exceptions.ChunkNotFoundException in project poi by apache.

the class MAPIMessage method getRecipientNamesList.

/**
 * Collects the name of every recipient, one array slot per entry of
 *  the recipient chunks and in the same order (normally TO, then CC,
 *  then BCC).
 * See also {@link #getDisplayTo()}, {@link #getDisplayCC()}
 *  and {@link #getDisplayBCC()}.
 *
 * @return the recipient names, never containing null entries
 * @throws ChunkNotFoundException if there are no recipient chunks at all,
 *         or if any recipient has no name
 */
public String[] getRecipientNamesList() throws ChunkNotFoundException {
    final boolean noRecipients = recipientChunks == null || recipientChunks.length == 0;
    if (noRecipients) {
        throw new ChunkNotFoundException("No recipients section present");
    }

    final String[] result = new String[recipientChunks.length];
    int position = 0;
    for (RecipientChunks recipient : recipientChunks) {
        final String displayName = recipient.getRecipientName();
        if (displayName == null) {
            // Fail on the first recipient without a name, reporting its 1-based position
            throw new ChunkNotFoundException("No display name holding chunks found for the " + (position + 1) + "th recipient");
        }
        result[position] = displayName;
        position++;
    }
    return result;
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks)

Aggregations

ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)8 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)5 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)3 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)3 ByteArrayInputStream (java.io.ByteArrayInputStream)2 IOException (java.io.IOException)2 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)2 MAPIProperty (org.apache.poi.hsmf.datatypes.MAPIProperty)2 PropertyValue (org.apache.poi.hsmf.datatypes.PropertyValue)2 File (java.io.File)1 PrintWriter (java.io.PrintWriter)1 Charset (java.nio.charset.Charset)1 ParseException (java.text.ParseException)1 SimpleDateFormat (java.text.SimpleDateFormat)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 LinkedHashMap (java.util.LinkedHashMap)1