Use of org.xml.sax.ContentHandler in project tika by apache.
The class ExcelParserTest, method testHeaderAndFooterExtraction.
/**
 * Parses the {@code testEXCEL_headers_footers.xls} test document with
 * {@link OfficeParser} under the UK locale, then asserts on the reported
 * metadata and on the presence of cell, header and footer text in the
 * extracted body content.
 */
@Test
public void testHeaderAndFooterExtraction() throws Exception {
    final String resource = "/test-documents/testEXCEL_headers_footers.xls";
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.UK);
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        new OfficeParser().parse(input, handler, metadata, context);

        // Metadata reported by the parser.
        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));

        // Text that must show up in the extracted body, checked in this order:
        // cell values first, then the header lines, then the footer lines.
        final String[] expectedSnippets = {
            "John Smith1",
            "John Smith50",
            "1 Corporate HQ",
            "Header - Corporate Spreadsheet",
            "Header - For Internal Use Only",
            "Header - Author: John Smith",
            "Footer - Corporate Spreadsheet",
            "Footer - For Internal Use Only",
            "Footer - Author: John Smith",
        };
        String content = handler.toString();
        for (String snippet : expectedSnippets) {
            assertContains(snippet, content);
        }
    }
}
Use of org.xml.sax.ContentHandler in project tika by apache.
The class ExcelParserTest, method testWorksSpreadsheet70.
/**
 * Runs {@link OfficeParser} over the {@code testWORKSSpreadsheet7.0.xlr}
 * test document under the US locale and checks that the extracted text
 * contains "Microsoft Works".
 */
@Test
public void testWorksSpreadsheet70() throws Exception {
    // NOTE(review): -1 presumably disables the handler's write limit -- confirm
    // against the BodyContentHandler documentation.
    ContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    context.set(Locale.class, Locale.US);

    final String resource = "/test-documents/testWORKSSpreadsheet7.0.xlr";
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(resource)) {
        new OfficeParser().parse(input, handler, metadata, context);
        assertContains("Microsoft Works", handler.toString());
    }
}
Use of org.xml.sax.ContentHandler in project tika by apache.
The class ExcelParserTest, method testExcelParserPassword.
/**
 * Parses the {@code testEXCEL_protected_passtika.xls} test document twice:
 * first without a password, where an {@link EncryptedDocumentException} is
 * required, and then with a {@link PasswordProvider} that returns "tika",
 * where metadata and extracted text are asserted.
 */
@Test
public void testExcelParserPassword() throws Exception {
    final String resource = "/test-documents/testEXCEL_protected_passtika.xls";

    // First pass: no password is supplied, so parsing must be rejected.
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        new OfficeParser().parse(input, handler, metadata, context);
        fail("Document is encrypted, shouldn't parse");
    } catch (EncryptedDocumentException expected) {
        // Expected outcome: the parser refused the encrypted document.
    }

    // Second pass: same document, this time with the password available.
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(resource)) {
        PasswordProvider passwordProvider = new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "tika";
            }
        };
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        context.set(PasswordProvider.class, passwordProvider);
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        new OfficeParser().parse(input, handler, metadata, context);

        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
        // No title is expected for this document.
        assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED));

        String content = handler.toString();
        assertContains("This is an Encrypted Excel spreadsheet", content);
        assertNotContained("9.0", content);
    }
}
Use of org.xml.sax.ContentHandler in project tika by apache.
The class PowerPointParserTest, method testPowerPointParser.
/**
 * Parses the {@code testPPT.ppt} test document with {@link OfficeParser}
 * using a default {@link ParseContext}, then asserts on the content type,
 * title, creator and author metadata and on two phrases in the extracted text.
 */
@Test
public void testPowerPointParser() throws Exception {
    final String resource = "/test-documents/testPPT.ppt";
    try (InputStream input = PowerPointParserTest.class.getResourceAsStream(resource)) {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        new OfficeParser().parse(input, handler, metadata, context);

        // Metadata reported by the parser.
        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));

        // Phrases that must appear in the extracted body text.
        String content = handler.toString();
        assertContains("Sample Powerpoint Slide", content);
        assertContains("Powerpoint X for Mac", content);
    }
}
Use of org.xml.sax.ContentHandler in project tika by apache.
The class PowerPointParserTest, method testMasterText.
/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files should be
 * extracted as well. This test parses {@code testPPT_masterText.ppt} and
 * checks that the added master-slide text is present while the
 * "Click to edit Master" boilerplate and any asterisk are absent.
 */
@Test
public void testMasterText() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    ParseContext context = new ParseContext();

    final String resource = "/test-documents/testPPT_masterText.ppt";
    try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(resource)) {
        new OfficeParser().parse(stream, handler, metadata, context);
    }

    String content = handler.toString();
    assertContains("Text that I added to the master slide", content);

    // The placeholder boilerplate must not leak into the output.
    int boilerplateIndex = content.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateIndex);

    // TIKA-1171: no asterisk may appear in the extracted text.
    int asteriskIndex = content.indexOf("*");
    assertEquals(-1, asteriskIndex);
}
Aggregations