Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
From the class OldExcelParserTest, method testMetadata.
/**
 * Parses the test file with {@link OldExcelParser} and checks that the
 * content type is reported while title and subject stay unset.
 */
// Disabled, until we can get the POI code to tell us the version
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // try-with-resources so the test stream is closed even if parse() throws
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
From the class OutlookParserTest, method testOutlookParsing.
/**
 * Runs test-outlook.msg through {@link AutoDetectParser} and verifies the
 * reported content type, the message metadata (title, sender, recipient,
 * raw "From" header, creation date) and the extracted body text.
 */
@Test
public void testOutlookParsing() throws Exception {
    // Expected values that several assertions below share
    final String title = "Microsoft Outlook Express 6";
    final String sender = "L'Équipe Microsoft Outlook Express";
    final String recipient = "Nouvel utilisateur de Outlook Express";

    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    ParseContext context = new ParseContext();
    // No Outlook-specific parser here: the format should be auto-detected
    Parser parser = new AutoDetectParser();

    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook.msg")) {
        parser.parse(msg, handler, metadata, context);
    }

    assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals(title, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(recipient, metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(sender, metadata.get(TikaCoreProperties.CREATOR));
    assertEquals(sender, metadata.get(Metadata.AUTHOR));

    // The "raw" From header must come back correctly decoded
    assertEquals(sender + " <msoe@microsoft.com>", metadata.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    assertEquals(recipient, metadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", metadata.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipient, metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", metadata.get(TikaCoreProperties.CREATED));

    String bodyText = handler.toString();
    assertContains(title, bodyText);
    assertContains(sender, bodyText);
    assertContains(recipient, bodyText);
    assertContains("Messagerie et groupes de discussion", bodyText);
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
From the class OutlookParserTest, method testOutlookNew.
/**
 * Test case for TIKA-395: checks that test-outlook2003.msg, a newer Outlook
 * format, is parsed by {@link AutoDetectParser} with the expected content
 * type, title, body text and recipient metadata.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    final String recipientName = "New Outlook User";

    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    ParseContext context = new ParseContext();
    Parser parser = new AutoDetectParser();

    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        parser.parse(msg, handler, metadata, context);
    }

    assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003", metadata.get(TikaCoreProperties.TITLE));

    String bodyText = handler.toString();
    assertContains("Outlook 2003", bodyText);
    assertContains("Streamlined Mail Experience", bodyText);
    assertContains("Navigation Pane", bodyText);

    // The three recipient fields must stay parallel: the e-mail entry is
    // an empty string while both name fields carry the recipient's name
    assertEquals("", metadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals(recipientName, metadata.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipientName, metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
From the class OOXMLParserTest, method testEncrypted.
/**
 * Checks password-protected OOXML test documents in two passes: with a
 * {@code PasswordProvider} registered in the {@link ParseContext} the
 * expected text must be extracted, and with an empty context each parse
 * must throw {@link EncryptedDocumentException}.
 */
@Test
public void testEncrypted() throws Exception {
    // Test document name -> text expected in its extracted content
    Map<String, String> tests = new HashMap<>();
    tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");
    tests.put("testPPT_protected_passtika.pptx", "This is an encrypted PowerPoint 2007 slide.");
    tests.put("testEXCEL_protected_passtika.xlsx", "This is an Encrypted Excel spreadsheet.");

    Parser parser = new AutoDetectParser();
    PasswordProvider passwordProvider = new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);

    // First pass: the password is available, so the text must be extracted
    for (Map.Entry<String, String> e : tests.entrySet()) {
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            // Fresh Metadata per document so values recorded for one file
            // cannot leak into the parse of the next one
            parser.parse(is, handler, new Metadata(), passwordContext);
            assertContains(e.getValue(), handler.toString());
        }
    }

    // Second pass: no password provider, so every parse must be rejected
    ParseContext context = new ParseContext();
    for (Map.Entry<String, String> e : tests.entrySet()) {
        boolean exc = false;
        try (InputStream is = getTestDocument(e.getKey())) {
            ContentHandler handler = new BodyContentHandler();
            parser.parse(is, handler, new Metadata(), context);
        } catch (EncryptedDocumentException expected) {
            exc = true;
        }
        // Name the document so a failure identifies which file was accepted
        assertTrue("Expected EncryptedDocumentException for " + e.getKey(), exc);
    }
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
From the class OOXMLParserTest, method testWordArt.
/**
 * Parses testWordArt.pptx with {@link AutoDetectParser} and checks that
 * the WordArt text appears in the extracted body content.
 */
@Test
public void testWordArt() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();

    try (InputStream pptx = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWordArt.pptx")) {
        parser.parse(pptx, handler, metadata, new ParseContext());
    }

    assertContains("Here is some red word Art", handler.toString());
}
Aggregations