Search in sources :

Example 1 with MessageConverter

use of org.opensextant.xtext.converters.MessageConverter in project Xponents by OpenSextant.

The method setup of the class XText.

/**
 * Prepares the converters used for text extraction: creates the default and
 * embedded-content converters, configures the paths, and registers one
 * converter per requested file type.
 * <p>
 * If by this point you have taken items out of the requested types the
 * converters will not be setup. E.g., if you don't want PDF or HTML
 * conversion - those resources will not be initialized. To honor that, each
 * type-specific converter is only constructed when at least one of the file
 * types it serves is still requested.
 *
 * @throws IOException
 *             on err
 */
public void setup() throws IOException {
    defaultConversion = new DefaultConverter(maxBuffer);
    embeddedConversion = new EmbeddedContentConverter(maxBuffer);
    paths.configure();

    // Invoke converter instances only as requested types suggest.
    // If caller has removed file types from the list, then the converter
    // for that type is neither constructed nor registered.
    if (requestedFileTypes.contains("txt")) {
        converters.put("txt", new TextTranscodingConverter());
    }

    if (requestedFileTypes.contains("html")) {
        // One HTML converter instance serves all three HTML-like extensions.
        Converter webConv = new TikaHTMLConverter(this.scrubHTML, maxHTMLBuffer);
        converters.put("html", webConv);
        converters.put("htm", webConv);
        converters.put("xhtml", webConv);
        // Requesting "html" implicitly requests its sibling extensions.
        requestedFileTypes.add("htm");
        requestedFileTypes.add("xhtml");
    }

    // "eml" and "msg" share a single parser, created on first need.
    MessageConverter emailParser = null;
    for (String mailType : new String[] { "eml", "msg" }) {
        if (requestedFileTypes.contains(mailType)) {
            if (emailParser == null) {
                emailParser = new MessageConverter();
            }
            converters.put(mailType, emailParser);
        }
    }

    /* RFC822 */
    if (requestedFileTypes.contains("mht")) {
        converters.put("mht", new WebArchiveConverter());
    }

    // "jpeg" and "jpg" share a single metadata converter, created on first need.
    ImageMetadataConverter imgConv = null;
    for (String img : new String[] { "jpeg", "jpg" }) {
        if (requestedFileTypes.contains(img)) {
            if (imgConv == null) {
                imgConv = new ImageMetadataConverter();
            }
            converters.put(img, imgConv);
        }
    }

    // For every requested type, mark "<type>.txt" as ignored.
    // NOTE(review): presumably this keeps previously converted output
    // (e.g. "report.pdf.txt") from being picked up as input - confirm
    // against ignoreFileType().
    for (String t : requestedFileTypes) {
        ignoreFileType(t + ".txt");
    }

    // Snapshot the final set of requested types as the file filter list.
    fileFilters = requestedFileTypes.toArray(new String[requestedFileTypes.size()]);
}
Also used : ImageMetadataConverter(org.opensextant.xtext.converters.ImageMetadataConverter) WebArchiveConverter(org.opensextant.xtext.converters.WebArchiveConverter) TextTranscodingConverter(org.opensextant.xtext.converters.TextTranscodingConverter) EmbeddedContentConverter(org.opensextant.xtext.converters.EmbeddedContentConverter) ImageMetadataConverter(org.opensextant.xtext.converters.ImageMetadataConverter) TikaHTMLConverter(org.opensextant.xtext.converters.TikaHTMLConverter) MessageConverter(org.opensextant.xtext.converters.MessageConverter) WebArchiveConverter(org.opensextant.xtext.converters.WebArchiveConverter) TextTranscodingConverter(org.opensextant.xtext.converters.TextTranscodingConverter) DefaultConverter(org.opensextant.xtext.converters.DefaultConverter) EmbeddedContentConverter(org.opensextant.xtext.converters.EmbeddedContentConverter) MessageConverter(org.opensextant.xtext.converters.MessageConverter) TikaHTMLConverter(org.opensextant.xtext.converters.TikaHTMLConverter) DefaultConverter(org.opensextant.xtext.converters.DefaultConverter)

Example 2 with MessageConverter

use of org.opensextant.xtext.converters.MessageConverter in project Xponents by OpenSextant.

The method complexEmailTest of the class MessageConverterTest.

/**
 * Converts the multi-part test email and verifies the extracted body text,
 * the number of raw children, and - for each expected attachment - its MIME
 * base type, Content-ID and Content-Disposition. Finally checks that exactly
 * one child is flagged as the HTML body.
 *
 * @throws Exception
 *             if conversion, resource loading or MIME type parsing fails
 */
@Test
public void complexEmailTest() throws Exception {
    MessageConverter conv = new MessageConverter();
    ConvertedDocument doc = conv.convert(TEST_FILE);
    Assert.assertEquals((MESSAGE_BODY + MESSAGE_BOUNDARY).trim(), doc.getText());
    Assert.assertEquals(5, doc.getRawChildren().size());

    // Index the raw children by id so each attachment can be looked up by name.
    final HashMap<String, Content> children = new HashMap<>();
    for (final Content child : doc.getRawChildren()) {
        children.put(child.id, child);
    }

    // --- plain-text attachment ---
    Content text_attach = children.get("xtext-embedded-attached-text.txt");
    Assert.assertNotNull("text attachment was not found, available attachments are: " + children.keySet(), text_attach);

    // IOUtils.toString does not close the stream, so close it here.
    String orig_text_attach;
    try (java.io.InputStream in = getClass().getResourceAsStream("xtext-embedded-attached-text.txt")) {
        Assert.assertNotNull("test resource xtext-embedded-attached-text.txt is missing from the classpath", in);
        orig_text_attach = IOUtils.toString(in, "UTF-8");
    }
    // The attachment is compared using CRLF line endings. Normalize the
    // reference text regardless of the platform line separator or of how the
    // fixture was checked out: LF becomes CRLF, and an existing CRLF is
    // matched as a unit so it is never doubled into "\r\r\n".
    orig_text_attach = orig_text_attach.replaceAll("\\r?\\n", "\r\n");

    Assert.assertEquals("text/plain", new MimeType(text_attach.mimeType).getBaseType());
    Assert.assertEquals(orig_text_attach, new String(text_attach.content, text_attach.encoding));
    Assert.assertEquals("A686FA7D9F4FB64E99601455209639C5@imc.mitre.org", text_attach.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", text_attach.meta.getProperty(CONTENT_DISPOSITION));

    // --- HTML attachment ---
    Content html_attach = children.get("word_doc_as_html.htm");
    Assert.assertNotNull("Embedded HTML was not found.", html_attach);
    Assert.assertEquals("text/html", new MimeType(html_attach.mimeType).getBaseType());
    Assert.assertEquals("64B706D14F6CAF4598A5A756E2E763A0@imc.mitre.org", html_attach.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", html_attach.meta.getProperty(CONTENT_DISPOSITION));

    // --- Word document attachment ---
    Content word_attach = children.get("doc_with_embedded_geocoded_image2.docx");
    Assert.assertNotNull("Doc with geocoded image was not found.", word_attach);
    Assert.assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", new MimeType(word_attach.mimeType).getBaseType());
    Assert.assertEquals("3ED3B89ABF3D1840B551B527B4DA054D@imc.mitre.org", word_attach.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", word_attach.meta.getProperty(CONTENT_DISPOSITION));

    // --- inline JPEG ---
    Content jpeg_attach = children.get("android_photo_with_gps1.jpeg");
    Assert.assertNotNull("Photo with attached image was not found.", jpeg_attach);
    Assert.assertEquals("image/jpeg", new MimeType(jpeg_attach.mimeType).getBaseType());
    Assert.assertEquals("485710da-7b60-461a-a566-0ad2e0a14b82@imc.mitre.org", jpeg_attach.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("inline", jpeg_attach.meta.getProperty(CONTENT_DISPOSITION));

    // --- HTML body: exactly one child may carry the html-body flag ---
    Content htmlbody = null;
    for (final Content child : doc.getRawChildren()) {
        if ("true".equals(child.meta.getProperty(MessageConverter.MAIL_KEY_PREFIX + "html-body"))) {
            // htmlbody is still null unless an earlier child was already flagged.
            Assert.assertNull("multiple html bodies found", htmlbody);
            Assert.assertEquals("text/html", new MimeType(child.mimeType).getBaseType());
            Assert.assertEquals("BEA4D58835C6A342B10D665B40F9D105@imc.mitre.org", child.meta.getProperty(CONTENT_ID));
            htmlbody = child;
        }
    }
    Assert.assertNotNull("html body was not found", htmlbody);
}
Also used : HashMap(java.util.HashMap) Content(org.opensextant.xtext.Content) MessageConverter(org.opensextant.xtext.converters.MessageConverter) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) MimeType(javax.activation.MimeType) Test(org.junit.Test)

Aggregations

MessageConverter (org.opensextant.xtext.converters.MessageConverter)2 HashMap (java.util.HashMap)1 MimeType (javax.activation.MimeType)1 Test (org.junit.Test)1 Content (org.opensextant.xtext.Content)1 ConvertedDocument (org.opensextant.xtext.ConvertedDocument)1 DefaultConverter (org.opensextant.xtext.converters.DefaultConverter)1 EmbeddedContentConverter (org.opensextant.xtext.converters.EmbeddedContentConverter)1 ImageMetadataConverter (org.opensextant.xtext.converters.ImageMetadataConverter)1 TextTranscodingConverter (org.opensextant.xtext.converters.TextTranscodingConverter)1 TikaHTMLConverter (org.opensextant.xtext.converters.TikaHTMLConverter)1 WebArchiveConverter (org.opensextant.xtext.converters.WebArchiveConverter)1