Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
The class TikaCLI, method configure.
/**
 * Loads the Tika configuration found at the given path and rebuilds the
 * parser and detector from it, registering the parser in the parse context.
 *
 * @param configFilePath path of the configuration file to load
 * @throws Exception propagated from building the configuration
 */
private void configure(String configFilePath) throws Exception {
    this.configFilePath = configFilePath;
    config = new TikaConfig(new File(configFilePath));

    // Only wrap the auto-detecting parser when a digester has been set.
    if (digester == null) {
        parser = new AutoDetectParser(config);
    } else {
        parser = new DigestingParser(new AutoDetectParser(config), digester);
    }

    detector = config.getDetector();
    context.set(Parser.class, parser);
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
The class DigestingAutoDetectParserFactory, method getParser.
/**
 * Builds an auto-detecting parser for the given configuration, wrapped in a
 * {@code DigestingParser} when this factory has a digester.
 *
 * @param config configuration handed to the {@code AutoDetectParser}
 * @return the plain auto-detecting parser if no digester is set, otherwise
 *         the digesting wrapper around it
 */
@Override
public Parser getParser(TikaConfig config) {
    Parser autoDetect = new AutoDetectParser(config);
    return (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
The class SXSLFExtractorTest, method testMacrosInPptm.
/**
 * Checks macro handling for a macro-enabled PowerPoint file: macros must not
 * be extracted by default, and must be extracted when enabled either through
 * an {@code OfficeParserConfig} in the parse context or through a Tika
 * configuration file. In every case the SAX-based extractor is expected to
 * appear in the "X-Parsed-By" metadata.
 */
@Test
public void testMacrosInPptm() throws Exception {
    final String macroMimeType = "text/x-vbasic";
    final String testFile = "testPPT_macros.pptm";

    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");

    List<Metadata> metadataList = getRecursiveMetadata(testFile, parseContext);
    //test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // Constant-first comparison so an entry without a content type
        // cannot abort the test with a NullPointerException.
        if (macroMimeType.equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, macroMimeType);
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    metadataList = getRecursiveMetadata(testFile, context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata(testFile, parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
The class SXSLFExtractorTest, method testPowerPoint.
/**
 * We have a number of different PowerPoint files,
 * such as presentation, macro-enabled etc.
 * For each one, checks the detected mime type, the title/creator/author
 * metadata and that the expected text fragments are present in the body.
 */
@Test
public void testPowerPoint() throws Exception {
    // Parallel arrays: mimeTypes[i] is the expected content type for extensions[i].
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    String[] expectedText = new String[] { "Attachment Test", "This is a test file data with the same content", "content parsing", "Different words to test against", "Mystery" };

    // The parser does not depend on the loop variable, so build it once.
    Parser parser = new AutoDetectParser();
    for (int i = 0; i < extensions.length; i++) {
        String filename = "testPPT." + extensions[i];
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
            parser.parse(input, handler, metadata, parseContext);
            assertEquals("Mime-type checking for " + filename, mimeTypes[i], metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));

            // The former "thmx" special case was unreachable here (that
            // extension is never in the list above), so every file is
            // required to contain all of the expected text.
            String content = handler.toString();
            for (String expected : expectedText) {
                assertTrue("Text missing for " + filename + "\n" + content, content.contains(expected));
            }
        }
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
The class SXSLFExtractorTest, method testUnsupportedPowerPoint.
/**
 * Guards the PowerPoint-related formats that are not currently supported:
 * parsing them must not break, and the expected content type must still be
 * reported in the metadata.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Parallel arrays: mimeTypes[idx] is the expected type for extensions[idx].
    String[] extensions = { "xps", "thmx" };
    String[] mimeTypes = {
        "application/vnd.ms-xpsdocument", // Is this right?
        "application/vnd.openxmlformats-officedocument"
    };

    for (int idx = 0; idx < extensions.length; idx++) {
        String resourceName = "testPPT." + extensions[idx];

        Parser autoDetect = new AutoDetectParser();
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, resourceName);
        ContentHandler bodyHandler = new BodyContentHandler();

        try (InputStream stream = getResourceAsStream("/test-documents/" + resourceName)) {
            autoDetect.parse(stream, bodyHandler, meta, parseContext);
            // Only the content type is checked for these formats.
            assertEquals("Mime-type checking for " + resourceName, mimeTypes[idx], meta.get(Metadata.CONTENT_TYPE));
        }
    }
}
Aggregations