Use of org.opensextant.xtext.converters.MessageConverter in the Xponents project by OpenSextant.
Class XText, method setup.
/**
 * Initializes the default and embedded-content converters, configures the
 * path manager, and registers one converter per requested file type.
 *
 * If by this point you have taken items out of the requested types, the
 * matching converters will not be set up. E.g., if you don't want PDF or HTML
 * conversion - those resources will not be initialized. Converter instances
 * are therefore only constructed when at least one of their file types is
 * still present in the requested list.
 *
 * Requesting "html" also adds "htm" and "xhtml" to the requested types.
 *
 * @throws IOException
 *             on error during setup
 */
public void setup() throws IOException {
    defaultConversion = new DefaultConverter(maxBuffer);
    embeddedConversion = new EmbeddedContentConverter(maxBuffer);
    paths.configure();

    // Invoke converter instances only as requested types suggest.
    // If the caller has removed a file type from the list, no converter is
    // created or registered for it.
    String mimetype = "txt";
    if (requestedFileTypes.contains(mimetype)) {
        converters.put(mimetype, new TextTranscodingConverter());
    }

    mimetype = "html";
    if (requestedFileTypes.contains(mimetype)) {
        // One HTML converter instance serves all three HTML-like extensions.
        Converter webConv = new TikaHTMLConverter(this.scrubHTML, maxHTMLBuffer);
        converters.put(mimetype, webConv);
        converters.put("htm", webConv);
        converters.put("xhtml", webConv);
        requestedFileTypes.add("htm");
        requestedFileTypes.add("xhtml");
    }

    // "eml" and "msg" share a single parser; build it only if one of them is wanted.
    boolean wantsEml = requestedFileTypes.contains("eml");
    boolean wantsMsg = requestedFileTypes.contains("msg");
    if (wantsEml || wantsMsg) {
        MessageConverter emailParser = new MessageConverter();
        if (wantsEml) {
            converters.put("eml", emailParser);
        }
        if (wantsMsg) {
            converters.put("msg", emailParser);
        }
    }

    mimetype = "mht"; /* RFC822 */
    if (requestedFileTypes.contains(mimetype)) {
        converters.put(mimetype, new WebArchiveConverter());
    }

    // "jpeg" and "jpg" share a single converter, created on first use.
    ImageMetadataConverter imgConv = null;
    String[] imageTypes = { "jpeg", "jpg" };
    for (String img : imageTypes) {
        if (requestedFileTypes.contains(img)) {
            if (imgConv == null) {
                imgConv = new ImageMetadataConverter();
            }
            converters.put(img, imgConv);
        }
    }

    // Mark "<type>.txt" as ignored for every requested type.
    // NOTE(review): presumably this keeps previously saved conversions
    // (e.g. "report.pdf.txt") from being re-ingested - confirm against ignoreFileType.
    for (String t : requestedFileTypes) {
        ignoreFileType(t + ".txt");
    }

    fileFilters = requestedFileTypes.toArray(new String[requestedFileTypes.size()]);
}
Use of org.opensextant.xtext.converters.MessageConverter in the Xponents project by OpenSextant.
Class MessageConverterTest, method complexEmailTest.
@Test
public void complexEmailTest() throws Exception {
    // Convert the multi-part test message and verify its body text and child count.
    final MessageConverter converter = new MessageConverter();
    final ConvertedDocument converted = converter.convert(TEST_FILE);

    final String expectedBody = (MESSAGE_BODY + MESSAGE_BOUNDARY).trim();
    Assert.assertEquals(expectedBody, converted.getText());
    Assert.assertEquals(5, converted.getRawChildren().size());

    // Index the raw children by id so each attachment can be looked up by name.
    final HashMap<String, Content> partsById = new HashMap<String, Content>();
    for (final Content part : converted.getRawChildren()) {
        partsById.put(part.id, part);
    }

    // --- plain text attachment ---
    final Content textPart = partsById.get("xtext-embedded-attached-text.txt");
    Assert.assertNotNull("text attachment was not found, available attachments are: " + partsById.keySet(), textPart);

    final String rawExpectedText = IOUtils.toString(
            getClass().getResourceAsStream("xtext-embedded-attached-text.txt"), "UTF-8");
    final String lineSep = System.getProperty("line.separator");
    // The attachment content is compared using CRLF line endings, so the
    // reference text is converted when the platform separator is not CRLF.
    final String expectedText = "\r\n".equals(lineSep)
            ? rawExpectedText
            : rawExpectedText.replaceAll(lineSep, "\r\n");

    final String textBaseType = new MimeType(textPart.mimeType).getBaseType();
    Assert.assertEquals("text/plain", textBaseType);
    final String actualText = new String(textPart.content, textPart.encoding);
    Assert.assertEquals(expectedText, actualText);
    Assert.assertEquals("A686FA7D9F4FB64E99601455209639C5@imc.mitre.org", textPart.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", textPart.meta.getProperty(CONTENT_DISPOSITION));

    // --- HTML attachment ---
    final Content htmlPart = partsById.get("word_doc_as_html.htm");
    Assert.assertNotNull("Embedded HTML was not found.", htmlPart);
    final String htmlBaseType = new MimeType(htmlPart.mimeType).getBaseType();
    Assert.assertEquals("text/html", htmlBaseType);
    Assert.assertEquals("64B706D14F6CAF4598A5A756E2E763A0@imc.mitre.org", htmlPart.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", htmlPart.meta.getProperty(CONTENT_DISPOSITION));

    // --- Word document attachment ---
    final Content wordPart = partsById.get("doc_with_embedded_geocoded_image2.docx");
    Assert.assertNotNull("Doc with geocoded image was not found.", wordPart);
    final String wordBaseType = new MimeType(wordPart.mimeType).getBaseType();
    Assert.assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", wordBaseType);
    Assert.assertEquals("3ED3B89ABF3D1840B551B527B4DA054D@imc.mitre.org", wordPart.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("attachment", wordPart.meta.getProperty(CONTENT_DISPOSITION));

    // --- inline JPEG ---
    final Content jpegPart = partsById.get("android_photo_with_gps1.jpeg");
    Assert.assertNotNull("Photo with attached image was not found.", jpegPart);
    final String jpegBaseType = new MimeType(jpegPart.mimeType).getBaseType();
    Assert.assertEquals("image/jpeg", jpegBaseType);
    Assert.assertEquals("485710da-7b60-461a-a566-0ad2e0a14b82@imc.mitre.org", jpegPart.meta.getProperty(CONTENT_ID));
    Assert.assertEquals("inline", jpegPart.meta.getProperty(CONTENT_DISPOSITION));

    // --- HTML body: exactly one child must carry the html-body marker ---
    final String htmlBodyKey = MessageConverter.MAIL_KEY_PREFIX + "html-body";
    Content htmlBody = null;
    for (final Content part : converted.getRawChildren()) {
        if (!"true".equals(part.meta.getProperty(htmlBodyKey))) {
            continue;
        }
        Assert.assertNull("multiple html bodies found", htmlBody);
        Assert.assertEquals("text/html", new MimeType(part.mimeType).getBaseType());
        Assert.assertEquals("BEA4D58835C6A342B10D665B40F9D105@imc.mitre.org", part.meta.getProperty(CONTENT_ID));
        htmlBody = part;
    }
    Assert.assertNotNull("html body was not found", htmlBody);
}
Aggregations