Search in sources :

Example 6 with Chunks

use of org.apache.poi.hsmf.datatypes.Chunks in project poi by apache.

the class TestPOIFSChunkParser method testFindsRecips.

/**
 * Checks that recipient chunk groups are found both through
 * {@link POIFSChunkParser} directly and through {@link MAPIMessage},
 * using the "quick.msg" and "simple_test_msg.msg" sample files.
 */
@Test
public void testFindsRecips() throws IOException, ChunkNotFoundException {
    final String smtpAddress = "kevin.roast@alfresco.org";
    final String exchangeAddress = "/O=HOSTEDSERVICE2/OU=FIRST ADMINISTRATIVE GROUP/CN=RECIPIENTS/CN=Kevin.roast@ben";

    NPOIFSFileSystem quickFs = new NPOIFSFileSystem(samples.getFile("quick.msg"), true);
    // Result is discarded; presumably this only verifies the entry can be looked up.
    quickFs.getRoot().getEntry("__recip_version1.0_#00000000");

    // Low-level parse: expect main, recipient and name-id groups, in that order
    ChunkGroup[] parsed = POIFSChunkParser.parse(quickFs.getRoot());
    assertEquals(3, parsed.length);
    assertTrue(parsed[0] instanceof Chunks);
    assertTrue(parsed[1] instanceof RecipientChunks);
    assertTrue(parsed[2] instanceof NameIdChunks);

    RecipientChunks parsedRecipient = (RecipientChunks) parsed[1];
    assertEquals(smtpAddress, parsedRecipient.recipientSMTPChunk.getValue());
    assertEquals(exchangeAddress, parsedRecipient.recipientEmailChunk.getValue());
    String searchKey = new String(parsedRecipient.recipientSearchChunk.getValue(), "ASCII");
    String searchKeyTail = searchKey.substring(searchKey.length() - 19);
    assertEquals("CN=KEVIN.ROAST@BEN\0", searchKeyTail);

    // Same file again, this time through MAPIMessage
    MAPIMessage quickMsg = new MAPIMessage(quickFs);
    RecipientChunks[] quickRecipients = quickMsg.getRecipientDetailsChunks();
    assertNotNull(quickRecipients);
    assertEquals(1, quickRecipients.length);
    RecipientChunks quickRecipient = quickRecipients[0];
    assertEquals(smtpAddress, quickRecipient.recipientSMTPChunk.getValue());
    assertEquals(smtpAddress, quickRecipient.getRecipientEmailAddress());
    assertEquals("Kevin Roast", quickRecipient.getRecipientName());
    assertEquals(smtpAddress, quickMsg.getRecipientEmailAddress());

    // This recipient has delivery type EX and carries both an SMTP and an EX-style address
    assertEquals("EX", quickRecipient.deliveryTypeChunk.getValue());
    assertEquals(smtpAddress, quickRecipient.recipientSMTPChunk.getValue());
    assertEquals(exchangeAddress, quickRecipient.recipientEmailChunk.getValue());
    quickMsg.close();
    quickFs.close();

    // Second sample: delivery type SMTP, no separate SMTP or name chunk
    final String travisAddress = "travis@overwrittenstack.com";
    NPOIFSFileSystem simpleFs = new NPOIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
    MAPIMessage simpleMsg = new MAPIMessage(simpleFs);
    RecipientChunks[] simpleRecipients = simpleMsg.getRecipientDetailsChunks();
    assertNotNull(simpleRecipients);
    assertEquals(1, simpleRecipients.length);
    RecipientChunks simpleRecipient = simpleRecipients[0];
    assertEquals("SMTP", simpleRecipient.deliveryTypeChunk.getValue());
    assertEquals(null, simpleRecipient.recipientSMTPChunk);
    assertEquals(null, simpleRecipient.recipientNameChunk);
    assertEquals(travisAddress, simpleRecipient.recipientEmailChunk.getValue());
    assertEquals(travisAddress, simpleMsg.getRecipientEmailAddress());
    simpleMsg.close();
    simpleFs.close();
}
Also used : MAPIMessage(org.apache.poi.hsmf.MAPIMessage) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) ChunkGroup(org.apache.poi.hsmf.datatypes.ChunkGroup) Chunks(org.apache.poi.hsmf.datatypes.Chunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) NameIdChunks(org.apache.poi.hsmf.datatypes.NameIdChunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) NameIdChunks(org.apache.poi.hsmf.datatypes.NameIdChunks) Test(org.junit.Test)

Example 7 with Chunks

use of org.apache.poi.hsmf.datatypes.Chunks in project poi by apache.

the class TestFixedSizedProperties method testPropertyValueTypes.

/**
 * Verifies that the main chunks report property values of several
 * different classes, both in the looked-up view and in the raw view.
 */
@Test
public void testPropertyValueTypes() throws Exception {
    Chunks chunks = mapiMessageSucceeds.getMainChunks();
    HashSet<Class<? extends PropertyValue>> observed = new HashSet<Class<? extends PropertyValue>>();

    // Looked-up view: collect the class of every value of every property
    Map<MAPIProperty, List<PropertyValue>> lookedUp = chunks.getProperties();
    for (List<PropertyValue> valuesForProperty : lookedUp.values()) {
        for (PropertyValue value : valuesForProperty) {
            observed.add(value.getClass());
        }
    }
    // The set's contents double as the failure message
    String lookedUpSummary = observed.toString();
    assertTrue(lookedUpSummary, observed.size() > 3);
    assertTrue(lookedUpSummary, observed.contains(LongPropertyValue.class));
    assertTrue(lookedUpSummary, observed.contains(TimePropertyValue.class));
    // No chunk-based placeholder is expected in the looked-up view
    assertFalse(lookedUpSummary, observed.contains(ChunkBasedPropertyValue.class));

    // Raw view: the chunk-based placeholder is expected to be present
    observed.clear();
    for (PropertyValue rawValue : chunks.getRawProperties().values()) {
        observed.add(rawValue.getClass());
    }
    String rawSummary = observed.toString();
    assertTrue(rawSummary, observed.size() > 3);
    assertTrue(rawSummary, observed.contains(LongPropertyValue.class));
    assertTrue(rawSummary, observed.contains(TimePropertyValue.class));
    assertTrue(rawSummary, observed.contains(ChunkBasedPropertyValue.class));
}
Also used : Chunks(org.apache.poi.hsmf.datatypes.Chunks) LongPropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue) ChunkBasedPropertyValue(org.apache.poi.hsmf.datatypes.ChunkBasedPropertyValue) LongPropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue) TimePropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue) ChunkBasedPropertyValue(org.apache.poi.hsmf.datatypes.ChunkBasedPropertyValue) PropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue) List(java.util.List) BeforeClass(org.junit.BeforeClass) AfterClass(org.junit.AfterClass) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty) TimePropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 8 with Chunks

use of org.apache.poi.hsmf.datatypes.Chunks in project tika by apache.

the class OutlookExtractor method handleFromTo.

/**
 * Copies sender and recipient information from the current message
 * ({@code msg}) into the supplied metadata.
 * <p>The display From/To/CC/BCC strings are set first, followed by the
 * "sent by server type" (when that chunk is present), the sender and
 * "sent representing" name/e-mail chunks, and finally one
 * name / display name / e-mail triple per recipient returned by
 * {@code buildRecipients()}, filed under TO, CC or BCC.</p>
 *
 * @param headers  message headers; not read by this method, kept so the
 *                 signature stays unchanged for existing callers
 * @param metadata metadata object that receives the extracted values
 * @throws ChunkNotFoundException if one of the message accessors used
 *                                here reports a missing chunk
 */
private void handleFromTo(Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
    String from = msg.getDisplayFrom();
    metadata.set(TikaCoreProperties.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    Chunks chunks = msg.getMainChunks();
    StringChunk sentByServerType = chunks.getSentByServerType();
    if (sentByServerType != null) {
        metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
    }

    // Reuse the Chunks instance fetched above instead of asking the message again.
    Map<MAPIProperty, List<Chunk>> mainChunks = chunks.getAll();

    //sometimes in SMTP .msg files there is an email in the sender name field.
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), Office.MAPI_FROM_REPRESENTING_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);

    // All three fields are passed to addEvenIfNull for every recipient,
    // whether or not an individual field is null.
    for (Recipient recipient : buildRecipients()) {
        switch(recipient.recipientType) {
            case TO:
                addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
                break;
            case CC:
                addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
                break;
            case BCC:
                addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
                break;
            default:
                // Any other recipient type is skipped without comment.
                //log unknown or undefined?
                break;
        }
    }
}
Also used : Chunks(org.apache.poi.hsmf.datatypes.Chunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk)

Example 9 with Chunks

use of org.apache.poi.hsmf.datatypes.Chunks in project tika by apache.

the class OutlookExtractor method guess7BitEncoding.

/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 * strings in the file.
 * <p>Many messages store their strings as unicode, which is
 * nice and easy. Some use one-byte encodings for their
 * strings, but don't always store the encoding anywhere
 * helpful in the file.</p>
 * <p>Candidates are tried in the order below; the method returns as soon
 * as {@code tryToSet7BitEncoding} returns {@code true} for one of them
 * (presumably meaning the encoding was applied - confirm against that
 * helper):</p>
 * <ol>
 * <li>the {@code MESSAGE_CODEPAGE} property, then the
 *     {@code INTERNET_CPID} property;</li>
 * <li>a charset named on a {@code Content-Type} message header;</li>
 * <li>whatever the {@code detector} member reports for the HTML body;</li>
 * <li>a {@link CharsetDetector} match on the raw bytes of the text body,
 *     accepted only when its confidence is above 35.</li>
 * </ol>
 * <p>If the message has no main chunks, nothing is done.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 * <p>This is taken verbatim from POI (TIKA-1238)
 * as a temporary workaround to prevent unsupported encoding exceptions</p>
 *
 * @param msg the message whose 7-bit string encoding should be guessed
 */
private void guess7BitEncoding(MAPIMessage msg) {
    Chunks mainChunks = msg.getMainChunks();
    //sanity check
    if (mainChunks == null) {
        return;
    }
    Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
    if (props != null) {
        // First choice is a codepage property
        for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) {
            List<PropertyValue> val = props.get(prop);
            if (val != null && val.size() > 0) {
                // Only the first value is consulted; the cast assumes it is a LongPropertyValue
                int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                String encoding = null;
                try {
                    encoding = CodePageUtil.codepageToEncoding(codepage, true);
                } catch (UnsupportedEncodingException e) {
                    // swallow: encoding stays null and is still handed to tryToSet7BitEncoding below
                }
                if (tryToSet7BitEncoding(msg, encoding)) {
                    return;
                }
            }
        }
    }
    // Second choice is a charset on a content type header
    try {
        String[] headers = msg.getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
            for (String header : headers) {
                // Note: this prefix test is case-sensitive even though the pattern is not
                if (header.startsWith("Content-Type")) {
                    Matcher m = p.matcher(header);
                    // matches() needs the whole header to fit the pattern, so a header with
                    // anything after the charset value (e.g. a further ";param") is skipped
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        if (tryToSet7BitEncoding(msg, charset)) {
                            return;
                        }
                    }
                }
            }
        }
    } catch (ChunkNotFoundException e) {
        // Deliberately ignored: fall through to the next strategy
    }
    // Third choice: run the outer "detector" over the HTML body
    // (original note: meta header if there is no other information?)
    try {
        String html = msg.getHtmlBody();
        if (html != null && html.length() > 0) {
            Charset charset = null;
            try {
                // "detector" here is NOT the local CharsetDetector declared further down;
                // it resolves to a member declared outside this method
                charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException e) {
                // swallow: charset stays null and this strategy is skipped
            }
            if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException e) {
        // Deliberately ignored: fall through to the last strategy
    }
    //absolute last resort, try charset detector
    StringChunk text = mainChunks.getTextBodyChunk();
    if (text != null) {
        // This local shadows the outer "detector" from here to the end of the block
        CharsetDetector detector = new CharsetDetector();
        detector.setText(text.getRawValue());
        CharsetMatch match = detector.detect();
        // A match is only used when its confidence is above 35
        if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) {
            return;
        }
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) Pattern(java.util.regex.Pattern) Chunks(org.apache.poi.hsmf.datatypes.Chunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) Matcher(java.util.regex.Matcher) CharsetDetector(org.apache.tika.parser.txt.CharsetDetector) PropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Charset(java.nio.charset.Charset) IOException(java.io.IOException) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) CharsetMatch(org.apache.tika.parser.txt.CharsetMatch) ByteArrayInputStream(java.io.ByteArrayInputStream) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty)

Aggregations

Chunks (org.apache.poi.hsmf.datatypes.Chunks)9 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)8 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)8 NameIdChunks (org.apache.poi.hsmf.datatypes.NameIdChunks)6 ChunkGroup (org.apache.poi.hsmf.datatypes.ChunkGroup)5 Test (org.junit.Test)5 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)4 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)4 ArrayList (java.util.ArrayList)3 List (java.util.List)3 MAPIProperty (org.apache.poi.hsmf.datatypes.MAPIProperty)3 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)3 IOException (java.io.IOException)2 LinkedList (java.util.LinkedList)2 ByteChunk (org.apache.poi.hsmf.datatypes.ByteChunk)2 Chunk (org.apache.poi.hsmf.datatypes.Chunk)2 PropertyValue (org.apache.poi.hsmf.datatypes.PropertyValue)2 DirectoryNode (org.apache.poi.poifs.filesystem.DirectoryNode)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1