Example use of org.apache.tika.parser.ParseContext in the Apache Tika project.
From the class SXSLFExtractorTest, method testPowerPointCustomProperties.
/**
 * Parses {@code testPPT_custom_props.pptx} with the SAX-based PPTX extractor
 * enabled and checks the content type, the core/extended document properties
 * and the {@code custom:} properties reported in the metadata.
 */
@Test
public void testPowerPointCustomProperties() throws Exception {
    Metadata metadata = new Metadata();

    // Build the parse context for this test: US locale plus the SAX PPTX extractor.
    ParseContext context = new ParseContext();
    context.set(Locale.class, Locale.US);
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    // Use the context configured above. It was previously built and then ignored
    // in favour of a context declared outside this method.
    getXML("testPPT_custom_props.pptx", metadata, context);

    assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation", metadata.get(Metadata.CONTENT_TYPE));

    // Author / modifier information
    assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
    assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));

    // Creation and modification timestamps
    assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
    assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));

    // Document statistics and title
    assertEquals("1", metadata.get(Office.SLIDE_COUNT));
    assertEquals("3", metadata.get(Office.WORD_COUNT));
    assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));

    // User-defined (custom) properties
    assertEquals("true", metadata.get("custom:myCustomBoolean"));
    assertEquals("3", metadata.get("custom:myCustomNumber"));
    assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
    assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
    assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
Example use of org.apache.tika.parser.ParseContext in the Apache Tika project.
From the class SXWPFExtractorTest, method testEncrypted.
/**
 * Checks handling of a password-protected DOCX with the SAX DOCX extractor enabled:
 * with a {@code PasswordProvider} in the parse context the expected text is extracted,
 * and with the externally declared {@code parseContext} an
 * {@code EncryptedDocumentException} must be thrown.
 */
@Test
public void testEncrypted() throws Exception {
    // file name -> text expected in the extracted XML once decrypted
    Map<String, String> tests = new HashMap<String, String>();
    tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");

    // Always answers with the password used by the test document
    PasswordProvider passwordProvider = new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    OfficeParserConfig opc = new OfficeParserConfig();
    opc.setUseSAXDocxExtractor(true);
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
    passwordContext.set(OfficeParserConfig.class, opc);

    for (Map.Entry<String, String> e : tests.entrySet()) {
        assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml);
    }

    //now try with no password
    // NOTE(review): parseContext is declared outside this method; presumably it
    // carries no PasswordProvider - confirm against the enclosing class.
    for (Map.Entry<String, String> e : tests.entrySet()) {
        boolean exc = false;
        try {
            getXML(e.getKey(), parseContext);
        } catch (EncryptedDocumentException ex) {
            // expected: the document cannot be opened without its password
            exc = true;
        }
        assertTrue(exc);
    }
}
Example use of org.apache.tika.parser.ParseContext in the Apache Tika project.
From the class OOXMLParserTest, method testMasterText.
/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files
 * should be extracted too. This test parses the PPTX sample with an
 * {@code AutoDetectParser} and looks for the master-slide text in the body.
 */
@Test
public void testMasterText() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler textHandler = new BodyContentHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    ParseContext emptyContext = new ParseContext();

    try (InputStream in = OOXMLParserTest.class.getResourceAsStream("/test-documents/testPPT_masterText.pptx")) {
        autoParser.parse(in, textHandler, meta, emptyContext);
    }

    assertContains("Text that I added to the master slide", textHandler.toString());
}
Example use of org.apache.tika.parser.ParseContext in the Apache Tika project.
From the class OOXMLParserTest, method testMacrosInPptm.
/**
 * Checks macro extraction from a PPTM file in three configurations: not extracted
 * by default, extracted when enabled through {@code OfficeParserConfig} in the
 * parse context, and extracted when enabled through a Tika config file.
 */
@Test
public void testMacrosInPptm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // Constant-first equals: an entry without a content type must not cause an NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    // Minimum metadata that one of the extracted entries must contain
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));

    //test configuring via config file
    // Close the config stream once the configuration has been used
    try (InputStream configStream = this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
    }
}
Example use of org.apache.tika.parser.ParseContext in the Apache Tika project.
From the class OOXMLParserTest, method testExcelXLSB.
/**
 * Exercises the XLSB sample end to end: media type detection, which parser
 * advertises support for the detected type, and text extraction through
 * {@code AutoDetectParser}.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resourcePath = "/test-documents/testEXCEL.xlsb";

    Detector defaultDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata meta = new Metadata();
    meta.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Detection must yield the XLSB media type
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        detected = defaultDetector.detect(in, meta);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser does not list the type as supported
    boolean officeParserSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeParserSupports);

    // OOXMLParser does list the type as supported
    boolean ooxmlParserSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(ooxmlParserSupports);

    // A full parse through AutoDetectParser succeeds and yields the sheet text
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resourcePath)) {
        ContentHandler textHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        autoParser.parse(in, textHandler, meta, usContext);
        assertContains("This is an example spreadsheet", textHandler.toString());
    }
}
Aggregations