use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OOXMLParserTest method testPowerPointMetadataEarly.
/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109
 * <p>
 * For each PowerPoint OOXML variant, the content handler's
 * {@code startDocument()} callback asserts that the content type, title and
 * author are already present in the metadata object passed to the parser.
 * The test also fails if that callback is never invoked, so the assertions
 * inside it cannot be skipped silently.
 *
 * @throws Exception if a test document cannot be opened or parsed
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    final String[] mimeTypes = new String[] {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.ms-powerpoint.template.macroenabled.12"
    };
    for (int i = 0; i < extensions.length; i++) {
        String extension = extensions[i];
        final String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // Allow the value to be accessed from the inner class
        final int currentI = i;
        // One-element array so the anonymous class can record that its
        // callback ran (captured locals themselves must be final).
        final boolean[] startDocumentCalled = { false };
        ContentHandler handler = new BodyContentHandler() {
            @Override
            public void startDocument() {
                startDocumentCalled[0] = true;
                // The metadata must already be populated at this point
                assertEquals("Mime-type checking for " + filename, mimeTypes[currentI], metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // Without this check the test would pass vacuously if the parser
        // never called startDocument() on the handler.
        assertEquals("startDocument() was never invoked for " + filename, true, startDocumentCalled[0]);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OOXMLParserTest method testUnsupportedPowerPoint.
/**
 * For the PowerPoint formats we don't currently support, ensure that
 * we don't break either.
 * <p>
 * Each test document is parsed with the resource name set in the metadata,
 * and the only expectation is the resulting content type.
 *
 * @throws Exception if a test document cannot be opened or parsed
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Each row is { file extension, expected content type }
    String[][] cases = {
        // Is this right?
        { "xps", "application/vnd.ms-xpsdocument" },
        { "thmx", "application/vnd.openxmlformats-officedocument" }
    };
    for (String[] testCase : cases) {
        String filename = "testPPT." + testCase[0];
        String expectedMimeType = testCase[1];

        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, filename);
        Parser autoDetect = new AutoDetectParser();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();

        try (InputStream doc = getTestDocument(filename)) {
            autoDetect.parse(doc, bodyHandler, meta, parseContext);
            // Should get the metadata, but that's about it
            assertEquals("Mime-type checking for " + filename, expectedMimeType, meta.get(Metadata.CONTENT_TYPE));
        }
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OutlookParserTest method testOutlookHTMLVersion.
/**
 * Parses {@code testMSG_chinese.msg} into an XML-serialising handler and
 * checks the resulting markup, the Chinese text, and the sender/recipient
 * metadata.
 *
 * @throws Exception if the transformer cannot be set up or parsing fails
 */
@Test
public void testOutlookHTMLVersion() throws Exception {
    // Serialise the SAX events as indented XML into an in-memory buffer
    StringWriter xmlOut = new StringWriter();
    SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler xmlHandler = transformerFactory.newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOut));

    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
        autoDetect.parse(msg, xmlHandler, meta, new ParseContext());
    }

    // The HTML version should have been processed, so some of the
    // links and list items must be present
    String xml = xmlOut.toString();
    assertContains("<dd>tests.chang@fengttt.com</dd>", xml);
    assertContains("<p>Alfresco MSG format testing", xml);
    assertContains("<li>1", xml);
    assertContains("<li>2", xml);

    // No nested html docs: a single body tag splits the text into two parts
    String[] aroundBodyOpen = xml.split("<body>");
    assertEquals(2, aroundBodyOpen.length);
    String[] aroundBodyClose = xml.split("<\\/body>");
    assertEquals(2, aroundBodyClose.length);

    // The Chinese text must have come through intact
    assertContains("張毓倫", meta.get(TikaCoreProperties.CREATOR));
    assertContains("陳惠珍", xml);

    assertEquals("tests.chang@fengttt.com", meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("Tests Chang@FT (張毓倫)", meta.get(Office.MAPI_FROM_REPRESENTING_NAME));
    assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG", meta.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OutlookParserTest method testOutlookParsing.
/**
 * Parses {@code test-outlook.msg} with the auto-detecting parser and
 * verifies the detected content type, the message metadata and the
 * extracted body text.
 *
 * @throws Exception if the test document cannot be parsed
 */
@Test
public void testOutlookParsing() throws Exception {
    String expectedSubject = "Microsoft Outlook Express 6";
    String expectedSender = "L'Équipe Microsoft Outlook Express";
    String expectedRecipient = "Nouvel utilisateur de Outlook Express";

    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    // Should auto-detect!
    Parser autoDetect = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook.msg")) {
        autoDetect.parse(msg, bodyHandler, meta, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", meta.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedSubject, meta.get(TikaCoreProperties.TITLE));
    assertEquals(expectedRecipient, meta.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    assertEquals(expectedSender, meta.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedSender, meta.get(Metadata.AUTHOR));

    // The "raw" header must be correctly decoded
    assertEquals("L'Équipe Microsoft Outlook Express <msoe@microsoft.com>", meta.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));
    assertEquals(expectedRecipient, meta.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("", meta.get(Message.MESSAGE_TO_NAME));
    assertEquals(expectedRecipient, meta.get(Message.MESSAGE_TO_DISPLAY_NAME));

    // Stored as Thu, 5 Apr 2007 09:26:06 -0700
    assertEquals("2007-04-05T16:26:06Z", meta.get(TikaCoreProperties.CREATED));

    String bodyText = bodyHandler.toString();
    assertContains(expectedSubject, bodyText);
    assertContains(expectedSender, bodyText);
    assertContains(expectedRecipient, bodyText);
    assertContains("Messagerie et groupes de discussion", bodyText);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OutlookParserTest method testOutlookForwarded.
/**
 * Parses {@code testMSG_forwarded.msg} into an XML-serialising handler and
 * checks that the output contains no nested documents.
 *
 * @throws Exception if the transformer cannot be set up or parsing fails
 */
@Test
public void testOutlookForwarded() throws Exception {
    // Serialise the SAX events as indented XML into an in-memory buffer
    StringWriter xmlOut = new StringWriter();
    SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler xmlHandler = transformerFactory.newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOut));

    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/testMSG_forwarded.msg")) {
        autoDetect.parse(msg, xmlHandler, meta, new ParseContext());
    }

    // No nested docs: a single body tag splits the text into two parts
    String xml = xmlOut.toString();
    String[] aroundBodyOpen = xml.split("<body>");
    assertEquals(2, aroundBodyOpen.length);
    String[] aroundBodyClose = xml.split("<\\/body>");
    assertEquals(2, aroundBodyClose.length);
}
Aggregations