Use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
Class SXWPFExtractorTest, method testTurningOffTextBoxExtraction.
//TIKA-2346
/**
 * Parses testWORD_text_box.docx with the SAX docx extractor enabled and
 * shape-based content disabled, then checks that the plain body text is
 * still emitted while the text-box text from the body, header and footer
 * is not.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testTurningOffTextBoxExtraction() throws Exception {
    OfficeParserConfig config = new OfficeParserConfig();
    config.setIncludeShapeBasedContent(false);
    config.setUseSAXDocxExtractor(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, config);

    XMLResult result = getXML("testWORD_text_box.docx", parseContext);
    String xml = result.xml;

    // Text that lives directly in the body must survive.
    assertContains("This text is directly in the body of the document.", xml);

    // Text that lives inside text boxes must be dropped, wherever the box sits.
    String[] textBoxSentences = {
        "This text is inside of a text box in the body of the document.",
        "This text is inside of a text box in the header of the document.",
        "This text is inside of a text box in the footer of the document."
    };
    for (String sentence : textBoxSentences) {
        assertNotContained(sentence, xml);
    }
}
Use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
Class OOXMLParserTest, method testMacroinXlsm.
/**
 * Exercises macro extraction for testEXCEL_macro.xlsm in three steps:
 * <ol>
 *   <li>with no configuration, no entry may report the content type
 *       {@code text/x-vbasic};</li>
 *   <li>with {@code setExtractMacros(true)} passed through a
 *       {@link ParseContext}, the recursive metadata must contain at least
 *       the expected macro content, content type and embedded resource type;</li>
 *   <li>the same expectation must hold for an {@link AutoDetectParser} built
 *       from tika-config-dom-macros.xml.</li>
 * </ol>
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacroinXlsm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm")) {
        // Constant-first equals: an entry that carries no content type
        // must be skipped rather than blow up with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", context));

    //test configuring via config file
    // The config stream is opened here, so it is closed here as well.
    try (InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        if (configStream == null) {
            // getResourceAsStream signals a missing resource with null;
            // report that directly instead of failing inside TikaConfig.
            fail("Missing test resource: tika-config-dom-macros.xml");
        }
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
    }
}
Use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
Class OOXMLParserTest, method testTurningOffTextBoxExtraction.
//TIKA-2346
/**
 * Parses testWORD_text_box.docx with shape-based content disabled and
 * checks that the plain body text is still emitted while the text-box
 * text from the body, header and footer is not.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testTurningOffTextBoxExtraction() throws Exception {
    OfficeParserConfig config = new OfficeParserConfig();
    config.setIncludeShapeBasedContent(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, config);

    XMLResult result = getXML("testWORD_text_box.docx", parseContext);
    String xml = result.xml;

    // Text that lives directly in the body must survive.
    assertContains("This text is directly in the body of the document.", xml);

    // Text that lives inside text boxes must be dropped, wherever the box sits.
    String[] textBoxSentences = {
        "This text is inside of a text box in the body of the document.",
        "This text is inside of a text box in the header of the document.",
        "This text is inside of a text box in the footer of the document."
    };
    for (String sentence : textBoxSentences) {
        assertNotContained(sentence, xml);
    }
}
Use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
Class OOXMLParserTest, method testBatch.
//@Test //use this for lightweight benchmarking to compare xwpf options
/**
 * Lightweight benchmark: parses every {@code .docx} file found under
 * /test-documents 100 times with the SAX docx extractor enabled, then
 * prints the elapsed milliseconds and the number of parses that threw.
 * Embedded documents are handed to an {@link EmptyParser} so that only the
 * main document content is measured.
 *
 * @throws Exception if the test-documents directory cannot be resolved or listed
 */
public void testBatch() throws Exception {
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setUseSAXDocxExtractor(true);

    // List the directory once, outside the timed section: the listing does
    // not change between iterations and is not what is being measured.
    File[] candidates = getResourceAsFile("/test-documents").listFiles();
    if (candidates == null) {
        // listFiles() returns null for a non-directory or on an I/O error.
        throw new IllegalStateException("/test-documents is not a readable directory");
    }

    // nanoTime is monotonic, so the measurement is immune to wall-clock adjustments.
    long startedNanos = System.nanoTime();
    int ex = 0;
    for (int i = 0; i < 100; i++) {
        for (File f : candidates) {
            if (!f.getName().endsWith(".docx")) {
                continue;
            }
            try (InputStream is = TikaInputStream.get(f)) {
                ParseContext parseContext = new ParseContext();
                parseContext.set(OfficeParserConfig.class, officeParserConfig);
                //test only the extraction of the main docx content, not embedded docs
                parseContext.set(Parser.class, new EmptyParser());
                // The result is intentionally discarded; only the parse time matters.
                getXML(is, parser, new Metadata(), parseContext);
            } catch (Exception e) {
                // Deliberate best effort: a failing document is counted and the
                // benchmark carries on with the remaining files.
                ex++;
            }
        }
    }
    long elapsedMillis = (System.nanoTime() - startedNanos) / 1_000_000L;
    System.out.println("elapsed: " + elapsedMillis + " with " + ex + " exceptions");
}
Use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
Class SXWPFExtractorTest, method testSkipDeleted.
/**
 * Parses testWORD_2006ml.docx with the SAX docx extractor enabled and with
 * both deleted content and move-from content included, then checks that
 * "frog" appears in the output and that "Second paragraph" is counted
 * 2 times.
 *
 * NOTE(review): the method name says "skip", yet the configuration
 * includes deleted content - confirm the name still reflects the intent.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testSkipDeleted() throws Exception {
    OfficeParserConfig config = new OfficeParserConfig();
    config.setIncludeDeletedContent(true);
    config.setUseSAXDocxExtractor(true);
    config.setIncludeMoveFromContent(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, config);

    String xml = getXML("testWORD_2006ml.docx", parseContext).xml;
    assertContains("frog", xml);
    assertContainsCount("Second paragraph", xml, 2);
}
Aggregations