Use of org.apache.tika.sax.ToHTMLContentHandler in the project tika by apache.
The method testParse of the class OutlookPSTParserTest.
/**
 * Parses the {@code testPST.pst} test document with an {@link AutoDetectParser},
 * rendering the result through a {@link ToHTMLContentHandler}, and verifies both
 * the generated HTML and the metadata captured for the embedded documents.
 *
 * <p>The checks cover the {@code Content-Length} and {@code Content-Type} meta
 * tags, the {@code email-folder} and {@code embedded} div markup, a sample of
 * body text, the number of metadata entries recorded by the tracking extractor,
 * and selected fields of the first recorded entry.
 *
 * @throws Exception if obtaining or parsing the test document fails
 */
@Test
public void testParse() throws Exception {
    Parser pstParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new ToHTMLContentHandler();

    // Register a tracking extractor so the metadata of every embedded document
    // handed to it can be inspected after the parse.
    ParseContext context = new ParseContext();
    EmbeddedTrackingExtrator trackingExtractor = new EmbeddedTrackingExtrator(context);
    context.set(EmbeddedDocumentExtractor.class, trackingExtractor);
    context.set(Parser.class, new AutoDetectParser());

    // try-with-resources guarantees the test resource stream is closed,
    // even when parsing throws.
    try (java.io.InputStream stream = getResourceAsStream("/test-documents/testPST.pst")) {
        pstParser.parse(stream, handler, metadata, context);
    }

    // Checks on the HTML produced by the content handler.
    String output = handler.toString();
    assertFalse(output.isEmpty());
    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"<530D9CAC.5080901@gmail.com>\"><h1>Re: Feature Generators</h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"<1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));

    // Checks on the metadata recorded by the tracking extractor.
    List<Metadata> metaList = trackingExtractor.trackingMetadata;
    assertEquals(6, metaList.size());

    Metadata firstMail = metaList.get(0);
    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
    assertEquals("", firstMail.get("displayCC"));
    assertEquals("", firstMail.get("displayBCC"));
}
Aggregations