Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class TikaCLI, method configure.
/**
 * Loads the Tika configuration from the given file and rebuilds the parser
 * and detector from it.
 *
 * @param configFilePath path of the Tika configuration file to load
 * @throws Exception if loading the configuration or building the parser fails
 */
private void configure(String configFilePath) throws Exception {
    this.configFilePath = configFilePath;

    File configFile = new File(configFilePath);
    this.config = new TikaConfig(configFile);

    // Start from an auto-detecting parser; wrap it only when a digester is set.
    this.parser = new AutoDetectParser(this.config);
    if (this.digester != null) {
        this.parser = new DigestingParser(this.parser, this.digester);
    }

    this.detector = this.config.getDetector();

    // Register the (possibly wrapped) parser in the parse context.
    this.context.set(Parser.class, this.parser);
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class SXSLFExtractorTest, method testMacrosInPptm.
/**
 * Checks macro handling for {@code testPPT_macros.pptm} in three steps:
 * <ol>
 *   <li>with the fixture's {@code parseContext}, no {@code text/x-vbasic} entry is produced;</li>
 *   <li>with {@code extractMacros} and {@code useSAXPptxExtractor} enabled on an
 *       {@link OfficeParserConfig}, the macro text and macro resource type are present;</li>
 *   <li>with a parser built from {@code tika-config-sax-macros.xml}, the same
 *       expectations hold.</li>
 * </ol>
 * Every step also expects the {@code X-Parsed-By} entry for the event-based
 * PowerPoint extractor.
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacrosInPptm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");

    //test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry without a content type is not a macro
        // and must not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    //test configuring via config file
    // try-with-resources so the classpath stream is closed even if parsing fails
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
    }
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class OOXMLParserTest, method testMacrosInPptm.
/**
 * Checks macro handling for {@code testPPT_macros.pptm} in three steps:
 * by default no {@code text/x-vbasic} entry is produced; with
 * {@code extractMacros} enabled on an {@link OfficeParserConfig} the macro
 * text and macro resource type are present; and the same expectations hold
 * for a parser built from {@code tika-config-dom-macros.xml}.
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacrosInPptm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // Constant-first equals: an entry without a content type is not a macro
        // and must not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));

    //test configuring via config file
    // try-with-resources so the classpath stream is closed even if parsing fails
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
    }
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class SXWPFExtractorTest, method testMacrosInDocm.
/**
 * Checks macro handling for {@code testWORD_macros.docm} in three steps:
 * <ol>
 *   <li>with the fixture's {@code parseContext}, no {@code text/x-vbasic} entry is produced;</li>
 *   <li>with {@code extractMacros} and {@code useSAXDocxExtractor} enabled on an
 *       {@link OfficeParserConfig}, the document text, macro text and macro
 *       resource type are present;</li>
 *   <li>with a parser built from {@code tika-config-sax-macros.xml}, the macro
 *       expectations still hold.</li>
 * </ol>
 * Every step also expects the {@code X-Parsed-By} entry for the event-based
 * Word extractor.
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");

    //test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry without a content type is not a macro
        // and must not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);

    //check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, metadataList);
    // parsedBy is asserted once per parse result (the original repeated this check on the same list)
    assertContainsAtLeast(parsedBy, metadataList);

    //test configuring via config file
    // try-with-resources so the classpath stream is closed even if parsing fails
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    }
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Use of org.apache.tika.config.TikaConfig in the Apache Tika project.
Class OOXMLParserTest, method testMacrosInDocm.
/**
 * Checks macro handling for {@code testWORD_macros.docm} in three steps:
 * by default no {@code text/x-vbasic} entry is produced; with
 * {@code extractMacros} enabled on an {@link OfficeParserConfig} the macro
 * text and macro resource type are present; and the same expectations hold
 * for a parser built from {@code tika-config-dom-macros.xml}.
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacrosInDocm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) {
        // Constant-first equals: an entry without a content type is not a macro
        // and must not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context));

    //test configuring via config file
    // try-with-resources so the classpath stream is closed even if parsing fails
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser));
    }
}
Aggregations