Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class OOXMLParserTest, method testMacrosInPptm.
/**
 * Checks macro extraction from a macro-enabled PowerPoint file in three
 * configurations: the default (no macros expected), enabled through an
 * {@link OfficeParserConfig} placed in the {@link ParseContext}, and
 * enabled through a Tika config file.
 */
@Test
public void testMacrosInPptm() throws Exception {
    // Default configuration: no macro entries may show up.
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // Constant-first equals so that an entry without a content type
        // does not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    // Now switch macro extraction on via the ParseContext.
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    // The minimum set of values the extracted macro entry has to carry.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));

    // Finally, configure the parser from a config file instead.
    TikaConfig tikaConfig;
    // Close the config stream once TikaConfig has been built from it.
    try (InputStream configStream = this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class OOXMLParserTest, method testExcelXLSB.
/**
 * Runs an XLSB workbook through detection, checks which parsers list the
 * detected type as supported, and finally parses it with the
 * {@link AutoDetectParser}.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resourcePath = "/test-documents/testEXCEL.xlsb";

    Metadata xlsbMetadata = new Metadata();
    xlsbMetadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Detection has to come up with the binary, macro-enabled sheet type
    Detector defaultDetector = new DefaultDetector();
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        detected = defaultDetector.detect(in, xlsbMetadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // The type is not among those OfficeParser declares ...
    boolean supportedByOfficeParser = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, supportedByOfficeParser);

    // ... but it is among those OOXMLParser declares
    boolean supportedByOoxmlParser = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(supportedByOoxmlParser);

    // Parsing through AutoDetectParser must yield the sheet text
    AutoDetectParser autoDetectParser = new AutoDetectParser();
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        autoDetectParser.parse(in, bodyHandler, xlsbMetadata, usContext);
        assertContains("This is an example spreadsheet", bodyHandler.toString());
    }
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class OOXMLParserTest, method testNoFormat.
/**
 * TIKA-1044: a Word document in which parts of the text have no
 * formatting or styles applied must still be parsed, and its text
 * must appear in the extracted content.
 */
@Test
public void testNoFormat() throws Exception {
    Metadata docMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    OOXMLParser ooxmlParser = new OOXMLParser();

    try (InputStream docx = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.docx")) {
        ooxmlParser.parse(docx, bodyHandler, docMetadata, new ParseContext());
    }

    // The handler's string form is the extracted body text
    assertContains("This is a piece of text that causes an exception", bodyHandler.toString());
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class OOXMLParserTest, method testProtectedExcelSheets.
/**
 * Parses a workbook in which some, but not all, sheets are protected and
 * checks the reported content type and the protected flag.
 * See TIKA-364.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    Metadata sheetMetadata = new Metadata();

    try (InputStream xlsx = OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        new AutoDetectParser().parse(xlsx, new BodyContentHandler(), sheetMetadata, new ParseContext());

        String contentType = sheetMetadata.get(Metadata.CONTENT_TYPE);
        String protectedFlag = sheetMetadata.get(TikaMetadataKeys.PROTECTED);

        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", contentType);
        assertEquals("true", protectedFlag);
    }
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class OOXMLParserTest, method testPowerPointMetadataEarly.
/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109
 * <p>
 * For each PowerPoint variant the metadata is inspected from inside the
 * handler's {@code startDocument()} callback. The test also verifies that
 * the callback was actually invoked, so it cannot pass without running
 * those checks.
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    // extensions[i] is expected to be detected as mimeTypes[i]
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    final String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    for (int i = 0; i < extensions.length; i++) {
        String extension = extensions[i];
        final String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // Allow the value to be accessed from the inner class
        final int currentI = i;
        // Single-element array so the anonymous class can record the call
        final boolean[] startDocumentCalled = new boolean[] { false };
        ContentHandler handler = new BodyContentHandler() {
            @Override
            public void startDocument() {
                startDocumentCalled[0] = true;
                assertEquals("Mime-type checking for " + filename, mimeTypes[currentI], metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // Without this check the test would pass if the callback never ran
        assertTrue("startDocument() was never invoked for " + filename, startDocumentCalled[0]);
    }
}
Aggregations