Search in sources :

Example 6 with MAPIRtfAttribute

Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache Tika project.

Class OutlookExtractor, method parse.

/**
 * Writes the metadata and XHTML representation of the Outlook message held in the
 * {@code msg} field: message class, subject, headers, recipients and dates go into
 * {@code metadata}; the subject, address block, message body and attachments are
 * emitted through {@code xhtml}.
 *
 * <p>The body is taken from the first available of the HTML, compressed-RTF and
 * plain-text chunks, in that order of preference.
 *
 * @param xhtml    handler that receives the generated XHTML elements
 * @param metadata metadata object populated from the message
 * @throws TikaException if POI throws {@code ChunkNotFoundException} somewhere that is
 *                       not handled locally, even though it was asked to return null
 * @throws SAXException  declared; presumably raised by the content handler or a nested parser
 * @throws IOException   declared; presumably raised while reading chunk or attachment data
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask POI for nulls instead of exceptions on missing chunks; the outer catch
        // below treats any exception that still escapes as a POI defect.
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Best effort: the message class is simply left unset
        }
        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        // Display name of the sender; only written to the XHTML further down
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            // Null entries in the recipient list are skipped
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Best effort: no recipient addresses are recorded
        }
        // Expose every header value under the raw-header prefix
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Otherwise fall back to a "Date:" entry in the headers.
            // NOTE(review): headers was already dereferenced in the loop above, so the
            // null check here can never be the thing that prevents an NPE.
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    // NOTE(review): both the "date:" test and the extracted value use the
                    // map KEY, not header.getValue(). This only matches if normalizeHeaders
                    // can leave a whole "Date: ..." line as a key - confirm against it.
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        // Only the first matching entry is used
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Best effort: the "Recipients" entry is omitted
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        // Scan all main chunks; if an id occurs more than once the last one wins
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        // Set once one body representation has been written, so later ones are skipped
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored as either a byte or a string chunk;
            // any other chunk type leaves data null and falls through to RTF/text
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Prefer a parser found via the parse context, else a fresh default one
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the RTF chunk is always a ByteChunk
            ByteChunk chunk = (ByteChunk) rtfChunk;
            // Wrap the raw chunk bytes in the HMEF attribute class; getData() is what
            // gets handed to the RTF parser (presumably the decompressed RTF - confirm)
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // NOTE(review): unchecked cast - assumes the plain-text body is a StringChunk
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name, falling back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            // Attachment stored as plain data
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            // Attachment stored as a directory, handed on as an embedded office document
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 7 with MAPIRtfAttribute

Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache POI project.

Class TestCompressedRTF, method testQuickBasics.

/**
 * Check that things are as we expected. If this fails,
 *  then decoding has no hope...
 *
 * <p>Loads the {@code quick-winmail.dat} sample, fetches its compressed RTF
 * attribute and verifies the raw (still compressed) bytes: the total length,
 * the length field and "LZFu" marker in the header, and then the first 25
 * bytes that follow the 16 byte header.
 *
 * @throws Exception if the sample cannot be opened or parsed
 */
public void testQuickBasics() throws Exception {
    // Close the sample stream once the message has been built, rather than leaking it.
    // Fully qualified so that no new import is needed.
    java.io.InputStream sample = _samples.openResourceAsStream("quick-winmail.dat");
    HMEFMessage msg;
    try {
        msg = new HMEFMessage(sample);
    } finally {
        sample.close();
    }
    MAPIAttribute rtfAttr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
    assertNotNull(rtfAttr);
    assertTrue(rtfAttr instanceof MAPIRtfAttribute);
    // Check the start of the compressed version
    byte[] data = ((MAPIRtfAttribute) rtfAttr).getRawData();
    assertEquals(5907, data.length);
    // First 16 bytes is header stuff
    // Check it has the length + compressed marker
    assertEquals(5907 - 4, LittleEndian.getShort(data));
    assertEquals("LZFu", StringUtil.getFromCompressedUnicode(data, 8, 4));
    // Now Look at the code
    // Flag: cccUUUUU
    assertEquals((byte) 0x07, data[16 + 0]);
    //  c1a: offset 0 / 0x000
    assertEquals((byte) 0x00, data[16 + 1]);
    //  c1b: length 6+2  -> {\rtf1\a
    assertEquals((byte) 0x06, data[16 + 2]);
    //  c2a: offset 16 / 0x010
    assertEquals((byte) 0x01, data[16 + 3]);
    //  c2b: length 1+2  ->  def
    assertEquals((byte) 0x01, data[16 + 4]);
    //  c3a: offset 182 / 0xb6
    assertEquals((byte) 0x0b, data[16 + 5]);
    //  c3b: length 0+2  -> la 
    assertEquals((byte) 0x60, data[16 + 6]);
    // n
    assertEquals((byte) 0x6e, data[16 + 7]);
    // g
    assertEquals((byte) 0x67, data[16 + 8]);
    // 1
    assertEquals((byte) 0x31, data[16 + 9]);
    // 0
    assertEquals((byte) 0x30, data[16 + 10]);
    // 2
    assertEquals((byte) 0x32, data[16 + 11]);
    // Flag:  UccUUccU
    assertEquals((byte) 0x66, data[16 + 12]);
    // 5 
    assertEquals((byte) 0x35, data[16 + 13]);
    //  c2a: offset 6 / 0x006
    assertEquals((byte) 0x00, data[16 + 14]);
    //  c2b: length 4+2  -> \ansi\a
    assertEquals((byte) 0x64, data[16 + 15]);
    //  c3a: offset 7 / 0x007
    assertEquals((byte) 0x00, data[16 + 16]);
    //  c3b: length 2+2  -> nsi
    assertEquals((byte) 0x72, data[16 + 17]);
    // c 
    assertEquals((byte) 0x63, data[16 + 18]);
    // p
    assertEquals((byte) 0x70, data[16 + 19]);
    //  c6a: offset 221 / 0x0dd
    assertEquals((byte) 0x0d, data[16 + 20]);
    //  c6b: length 0+2  -> g1
    assertEquals((byte) 0xd0, data[16 + 21]);
    //  c7a: offset 224 / 0x0e0
    assertEquals((byte) 0x0e, data[16 + 22]);
    //  c7b: length 0+2  -> 25
    assertEquals((byte) 0x00, data[16 + 23]);
    // 2
    assertEquals((byte) 0x32, data[16 + 24]);
}
Also used : MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) MAPIAttribute(org.apache.poi.hmef.attribute.MAPIAttribute)

Example 8 with MAPIRtfAttribute

Use of org.apache.poi.hmef.attribute.MAPIRtfAttribute in the Apache Tika project.

Class TNEFParser, method parse.

/**
 * Parses a TNEF stream with POI's {@code HMEFMessage} and hands its parts on as
 * embedded documents.
 *
 * <p>A non-empty subject is stored in the metadata. If the message carries a
 * compressed RTF body, it is passed on as {@code message.rtf}. Every attachment is
 * then passed on in turn, named by its long filename, else its short filename, else
 * {@code "unknown"} plus its extension when one is available.
 *
 * @param stream   the input to read the message from
 * @param handler  content handler forwarded to the embedded-document handling
 * @param metadata receives the subject, when there is one
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   declared by this method
 * @throws SAXException  declared by this method
 * @throws TikaException declared by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Look up the extractor first, then let POI read the whole message
    final EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    final HMEFMessage message = new HMEFMessage(stream);

    // Record the subject, but only when there is something to record
    final String title = message.getSubject();
    if (title != null && !title.isEmpty()) {
        // TODO: Move to title in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, title);
    }

    // The message body, if stored as compressed RTF (instanceof also rules out null)
    final MAPIAttribute body = message.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
    if (body instanceof MAPIRtfAttribute) {
        handleEmbedded("message.rtf", "application/rtf", ((MAPIRtfAttribute) body).getData(), extractor, handler);
    }

    // Each attachment in turn, with the best name we can find for it
    for (Attachment part : message.getAttachments()) {
        String label = part.getLongFilename();
        boolean unnamed = (label == null) || label.isEmpty();
        if (unnamed) {
            label = part.getFilename();
            unnamed = (label == null) || label.isEmpty();
        }
        if (unnamed) {
            final String suffix = part.getExtension();
            if (suffix != null) {
                label = "unknown" + suffix;
            }
        }
        handleEmbedded(label, null, part.getContents(), extractor, handler);
    }
}
Also used : HMEFMessage(org.apache.poi.hmef.HMEFMessage) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) MAPIAttribute(org.apache.poi.hmef.attribute.MAPIAttribute) Attachment(org.apache.poi.hmef.Attachment)

Aggregations

MAPIRtfAttribute (org.apache.poi.hmef.attribute.MAPIRtfAttribute)8 MAPIAttribute (org.apache.poi.hmef.attribute.MAPIAttribute)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 InputStream (java.io.InputStream)1 ParseException (java.text.ParseException)1 Date (java.util.Date)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 Attachment (org.apache.poi.hmef.Attachment)1 HMEFMessage (org.apache.poi.hmef.HMEFMessage)1 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)1 ByteChunk (org.apache.poi.hsmf.datatypes.ByteChunk)1 Chunk (org.apache.poi.hsmf.datatypes.Chunk)1 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)1 ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)1 TikaException (org.apache.tika.exception.TikaException)1 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)1 Metadata (org.apache.tika.metadata.Metadata)1 Parser (org.apache.tika.parser.Parser)1 HtmlParser (org.apache.tika.parser.html.HtmlParser)1