use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
the class SXWPFExtractorTest method setUp.
/**
 * Prepares the shared {@code parseContext} before each test: a new
 * {@link ParseContext} holding an {@link OfficeParserConfig} with the
 * SAX docx extractor enabled.
 */
@Before
public void setUp() {
    OfficeParserConfig saxDocxConfig = new OfficeParserConfig();
    saxDocxConfig.setUseSAXDocxExtractor(true);

    parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, saxDocxConfig);
}
use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
the class Word2006MLParserTest method testSkipDeletedAndMoveFrom.
/**
 * Parses testWORD_2006ml.xml with both "include deleted content" and
 * "include move-from content" switched on, then checks that the XML
 * output contains "frog" and that "Second paragraph" is counted twice.
 */
@Test
public void testSkipDeletedAndMoveFrom() throws Exception {
    OfficeParserConfig config = new OfficeParserConfig();
    config.setIncludeDeletedContent(true);
    config.setIncludeMoveFromContent(true);

    ParseContext context = new ParseContext();
    context.set(OfficeParserConfig.class, config);

    XMLResult result = getXML("testWORD_2006ml.xml", context);

    assertContains("frog", result.xml);
    assertContainsCount("Second paragraph", result.xml, 2);
}
use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
the class OOXMLParserTest method testMacrosInDocm.
/**
 * Verifies macro extraction for testWORD_macros.docm in three steps:
 * <ol>
 *   <li>with no configuration, no entry may have content type text/x-vbasic;</li>
 *   <li>with {@code setExtractMacros(true)} supplied through the
 *       {@link ParseContext}, the expected macro metadata must be present;</li>
 *   <li>the same expectation must hold for a parser built from
 *       tika-config-dom-macros.xml.</li>
 * </ol>
 *
 * @throws Exception if parsing or loading the config fails
 */
@Test
public void testMacrosInDocm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) {
        // constant-first equals: an entry lacking a content type must not
        // abort the test with a NullPointerException
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context));

    //test configuring via config file
    // Close the resource stream once the config has been built so the test
    // does not leak it. NOTE(review): assumes TikaConfig reads the stream
    // fully inside its constructor -- confirm against the TikaConfig docs.
    TikaConfig tikaConfig;
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser));
}
use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
the class OOXMLExtractorFactory method parse.
/**
 * Opens the OOXML package behind {@code stream}, selects a POI text
 * extractor for it, wraps that extractor in the matching Tika decorator and
 * then emits metadata followed by XHTML content.
 *
 * <p>If the detected type is {@code null} or listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@code EmptyParser.INSTANCE} and the method returns.</p>
 *
 * @param stream      the document; an already-open {@link OPCPackage} or a
 *                    backing file on a {@code TikaInputStream} is reused
 *                    when available
 * @param baseHandler receives the XHTML output
 * @param metadata    receives the content type and extracted metadata
 * @param context     supplies the {@link Locale} and the
 *                    {@link OfficeParserConfig}; a default-constructed
 *                    config is used when none is registered
 * @throws IOException   propagated from opening or reading the package
 * @throws SAXException  propagated from the content handler
 * @throws TikaException if the extractor cannot be created, or no usable
 *                       document is found
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        OOXMLExtractor extractor;
        OPCPackage pkg;
        // Locate or Open the OPCPackage for the file
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            // Remember the package on the stream so later callers can reuse it
            tis.setOpenContainer(pkg);
        } else {
            // Shield the caller's stream so opening the package cannot close it
            InputStream shield = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(shield);
        }
        // Get the type, and ensure it's one we handle
        MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
        if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the appropriate OOXML text extractor picked
        POIXMLTextExtractor poiExtractor = null;
        // OOXMLParser normally registers the config before calling in, but
        // this method is public: fall back to a default config rather than
        // failing with a NullPointerException for direct callers.
        OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
        if (config.getUseSAXDocxExtractor()) {
            poiExtractor = trySXWPF(pkg);
        }
        if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
            poiExtractor = trySXSLF(pkg);
        }
        if (poiExtractor == null) {
            poiExtractor = ExtractorFactory.createExtractor(pkg);
        }
        POIXMLDocument document = poiExtractor.getDocument();
        // The order of this chain matters: the event-based extractors are
        // matched before the null-document check is reached.
        if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
            extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
            extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
            extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
        } else if (document instanceof XMLSlideShow) {
            extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
        } else if (document instanceof XWPFDocument) {
            extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
        } else {
            extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
        }
        // Get the bulk of the metadata first, so that it's accessible during
        // parsing if desired by the client (see TIKA-1109)
        extractor.getMetadataExtractor().extract(metadata);
        // Extract the text, along with any in-document metadata
        extractor.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        String message = e.getMessage();
        if (message != null && message.startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        } else {
            throw new TikaException("Error creating OOXML extractor", e);
        }
    } catch (OpenXML4JException | XmlException e) {
        // OpenXML4JException also covers its subclass InvalidFormatException
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.
the class OOXMLParserTest method testXLSBVarious.
/**
 * Parses testEXCEL_various.xlsb with macro extraction enabled, expects four
 * metadata entries, and checks that the content of the first entry contains
 * each of the listed fragments.
 */
@Test
public void testXLSBVarious() throws Exception {
    OfficeParserConfig config = new OfficeParserConfig();
    config.setExtractMacros(true);

    ParseContext context = new ParseContext();
    context.set(OfficeParserConfig.class, config);

    List<Metadata> results = getRecursiveMetadata("testEXCEL_various.xlsb", context);
    assertEquals(4, results.size());

    String xml = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

    // Checked in this order; the first missing fragment fails the test.
    String[] expectedFragments = {
        "<td>13</td>",
        "<td>13.1211231321</td>",
        "<td>$ 3.03</td>",
        "<td>20%</td>",
        "<td>13.12</td>",
        "<td>123456789012345</td>",
        "<td>1.23456789012345E+15</td>",
        "test comment2",
        "comment4 (end of row)",
        "<td>1/4</td>",
        "<td>3/9/17</td>",
        "<td>4</td>",
        "<td>2</td>",
        "<td> 46/1963</td>",
        "<td> 3/128</td>",
        "test textbox",
        "test WordArt",
        "<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>",
        "<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
        "OddLeftHeader OddCenterHeader OddRightHeader",
        "EvenLeftHeader EvenCenterHeader EvenRightHeader",
        "FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader",
        "OddLeftFooter OddCenterFooter OddRightFooter",
        "EvenLeftFooter EvenCenterFooter EvenRightFooter",
        "FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter"
    };
    for (String fragment : expectedFragments) {
        assertContains(fragment, xml);
    }
}
Aggregations