use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testPowerPointMetadataEarly.
/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109
 *
 * <p>The metadata checks run inside the handler's {@code startDocument()}
 * callback. Because assertions placed in a callback are silently skipped if
 * the callback never fires, the test also records that the callback ran and
 * fails explicitly for any file where it did not.
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    // Expected content types, parallel to the extensions array
    final String[] mimeTypes = new String[] {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.ms-powerpoint.template.macroenabled.12"
    };
    for (int i = 0; i < extensions.length; i++) {
        final String filename = "testPPT." + extensions[i];
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // Allow the values to be accessed from the inner class
        final String expectedMimeType = mimeTypes[i];
        // Single-element array so the anonymous class can flip the flag
        final boolean[] startDocumentSeen = { false };
        ContentHandler handler = new BodyContentHandler() {
            @Override
            public void startDocument() {
                startDocumentSeen[0] = true;
                // Name the file in every message: the loop covers several documents
                assertEquals("Mime-type checking for " + filename, expectedMimeType, metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Title checking for " + filename, "Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Creator checking for " + filename, "Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Author checking for " + filename, "Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // Without this check the test would pass even if no assertion above ever ran
        if (!startDocumentSeen[0]) {
            throw new AssertionError("startDocument() was never called for " + filename
                    + ", so the early metadata was not checked");
        }
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testUnsupportedPowerPoint.
/**
 * The PowerPoint-related formats we don't currently support must still
 * parse without breaking. Only the detected content type is verified.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    final String[] suffixes = { "xps", "thmx" };
    // Expected content types, parallel to the suffixes above
    final String[] expectedTypes = {
        "application/vnd.ms-xpsdocument", // Is this right?
        "application/vnd.openxmlformats-officedocument"
    };
    int index = 0;
    while (index < suffixes.length) {
        final String testFile = "testPPT." + suffixes[index];
        final Parser autoParser = new AutoDetectParser();
        final Metadata meta = new Metadata();
        // Give the parser the file name alongside the stream
        meta.set(Metadata.RESOURCE_NAME_KEY, testFile);
        final ContentHandler bodyHandler = new BodyContentHandler();
        final ParseContext parseContext = new ParseContext();
        try (InputStream in = getTestDocument(testFile)) {
            autoParser.parse(in, bodyHandler, meta, parseContext);
            // The content type is the only metadata we expect to get
            assertEquals("Mime-type checking for " + testFile, expectedTypes[index], meta.get(Metadata.CONTENT_TYPE));
        }
        index++;
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OldExcelParserTest method testMetadata.
/**
 * Parses the test file with {@link OldExcelParser} and checks the resulting
 * metadata: the content type is expected to be set, while title and subject
 * are expected to be absent.
 *
 * @throws Exception if the test file cannot be opened or parsing fails
 */
// Disabled, until we can get the POI code to tell us the version
@Test
@Ignore
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    OldExcelParser parser = new OldExcelParser();
    // try-with-resources so the stream is closed even when parsing throws
    try (TikaInputStream stream = getTestFile(file)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // We can get the content type
    assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE));
    // But no other metadata
    assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(null, metadata.get(Metadata.SUBJECT));
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OutlookParserTest method testOutlookParsing.
/**
 * Parses the Outlook Express sample message through auto-detection and
 * checks the content type, the sender/recipient/date metadata and a few
 * phrases of the extracted body text.
 */
@Test
public void testOutlookParsing() throws Exception {
    // Values that show up both in the metadata and in the extracted text
    final String subject = "Microsoft Outlook Express 6";
    final String recipient = "Nouvel utilisateur de Outlook Express";
    final String sender = "L'Équipe Microsoft Outlook Express";

    final Metadata meta = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();
    // Should auto-detect!
    final Parser autoParser = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook.msg")) {
        autoParser.parse(msg, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals(subject, meta.get(TikaCoreProperties.TITLE));
    assertEquals(recipient, meta.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(sender, meta.get(TikaCoreProperties.CREATOR));
    assertEquals(sender, meta.get(Metadata.AUTHOR));

    // ensure that "raw" header is correctly decoded
    assertEquals("L'Équipe Microsoft Outlook Express <msoe@microsoft.com>", meta.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

    assertEquals(recipient, meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipient, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", meta.get(TikaCoreProperties.CREATED));

    final String bodyText = bodyHandler.toString();
    assertContains(subject, bodyText);
    assertContains(sender, bodyText);
    assertContains(recipient, bodyText);
    assertContains("Messagerie et groupes de discussion", bodyText);
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OutlookParserTest method testOutlookNew.
/**
 * Test case for TIKA-395, to ensure parser works for new Outlook formats.
 * Parses the Outlook 2003 sample message and checks its content type, title,
 * body text and recipient fields.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    final Metadata meta = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Parser autoParser = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        autoParser.parse(msg, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003", meta.get(TikaCoreProperties.TITLE));

    final String bodyText = bodyHandler.toString();
    assertContains("Outlook 2003", bodyText);
    assertContains("Streamlined Mail Experience", bodyText);
    assertContains("Navigation Pane", bodyText);

    // make sure these are parallel
    final String recipientName = "New Outlook User";
    assertEquals("", meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(recipientName, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
Aggregations