use of org.xml.sax.ContentHandler in project tika by apache.
the class SXSLFExtractorTest method testUnsupportedPowerPoint.
/**
 * Checks that the PowerPoint-family test files we do not currently
 * support (the XPS and THMX samples) can still be pushed through the
 * parser without an exception, and that the expected content type is
 * recorded in the metadata.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Each row is { file extension, expected mime type }.
    // NOTE(review): the original author flagged uncertainty ("Is this right?")
    // about the expected mime types listed here.
    String[][] expectations = {
        { "xps", "application/vnd.ms-xpsdocument" },
        { "thmx", "application/vnd.openxmlformats-officedocument" }
    };

    for (String[] expectation : expectations) {
        String filename = "testPPT." + expectation[0];
        String expectedMimeType = expectation[1];

        // A fresh parser, metadata and handler are used for every file.
        Parser autoParser = new AutoDetectParser();
        Metadata fileMetadata = new Metadata();
        fileMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
        ContentHandler bodyHandler = new BodyContentHandler();

        try (InputStream docStream = getResourceAsStream("/test-documents/" + filename)) {
            autoParser.parse(docStream, bodyHandler, fileMetadata, parseContext);

            // Only the detected content type is verified; nothing is
            // asserted about the extracted text.
            String detectedMimeType = fileMetadata.get(Metadata.CONTENT_TYPE);
            assertEquals("Mime-type checking for " + filename, expectedMimeType, detectedMimeType);
        }
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testMasterText.
/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files
 * should show up in the extracted content as well.
 */
@Test
public void testMasterText() throws Exception {
    Metadata docMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();

    try (InputStream pptxStream =
            OOXMLParserTest.class.getResourceAsStream("/test-documents/testPPT_masterText.pptx")) {
        AutoDetectParser autoParser = new AutoDetectParser();
        ParseContext freshContext = new ParseContext();
        autoParser.parse(pptxStream, bodyHandler, docMetadata, freshContext);
    }

    // The handler's string form is the extracted body text.
    assertContains("Text that I added to the master slide", bodyHandler.toString());
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testExcelXLSB.
/**
 * Exercises the binary Excel (.xlsb) sample: the detector must report
 * the xlsb media type, OfficeParser must not list that type as
 * supported while OOXMLParser must, and parsing through
 * AutoDetectParser must yield the expected spreadsheet text.
 */
@Test
public void testExcelXLSB() throws Exception {
    Detector defaultDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();

    // The same metadata object is used for detection and for parsing.
    Metadata xlsbMetadata = new Metadata();
    xlsbMetadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Step 1: detection should identify the file correctly.
    MediaType detectedType;
    try (InputStream detectStream =
            ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        detectedType = defaultDetector.detect(detectStream, xlsbMetadata);
        String detectedName = detectedType.toString();
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detectedName);
    }

    // Step 2: OfficeParser does not claim the type...
    boolean officeParserSupports =
            (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detectedType);
    assertEquals(false, officeParserSupports);

    // ...whereas OOXMLParser does.
    boolean ooxmlParserSupports =
            (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detectedType);
    assertTrue(ooxmlParserSupports);

    // Step 3: AutoDetectParser copes with the file and extracts its text.
    try (InputStream parseStream =
            ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);

        autoParser.parse(parseStream, bodyHandler, xlsbMetadata, usContext);

        assertContains("This is an example spreadsheet", bodyHandler.toString());
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testNoFormat.
/**
 * TIKA-1044: Word documents in which parts of the text carry no
 * formatting or styles must still have that text extracted.
 */
@Test
public void testNoFormat() throws Exception {
    Metadata docMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();

    try (InputStream docxStream =
            WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.docx")) {
        // The OOXML parser is invoked directly here, not via auto-detection.
        OOXMLParser ooxmlParser = new OOXMLParser();
        ParseContext freshContext = new ParseContext();
        ooxmlParser.parse(docxStream, bodyHandler, docMetadata, freshContext);
    }

    assertContains("This is a piece of text that causes an exception", bodyHandler.toString());
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class OOXMLParserTest method testProtectedExcelSheets.
/**
 * Workbooks in which some, but not all, sheets are protected.
 * See TIKA-364.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    Parser autoParser = new AutoDetectParser();
    Metadata workbookMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    ParseContext parseCtx = new ParseContext();

    try (InputStream xlsxStream =
            OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        autoParser.parse(xlsxStream, bodyHandler, workbookMetadata, parseCtx);

        // The workbook must be typed as an OOXML spreadsheet...
        String contentType = workbookMetadata.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", contentType);

        // ...and flagged as protected in the metadata.
        String protectedFlag = workbookMetadata.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);
    }
}
Aggregations