Search in sources :

Example 6 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class SXWPFExtractorTest method setUp.

/**
 * Builds the {@code ParseContext} used by the tests: a new context carrying an
 * {@code OfficeParserConfig} on which {@code setUseSAXDocxExtractor(true)} was called.
 */
@Before
public void setUp() {
    OfficeParserConfig saxDocxConfig = new OfficeParserConfig();
    saxDocxConfig.setUseSAXDocxExtractor(true);

    ParseContext freshContext = new ParseContext();
    freshContext.set(OfficeParserConfig.class, saxDocxConfig);

    // Publish to the field only once the context is fully populated.
    parseContext = freshContext;
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) Before(org.junit.Before)

Example 7 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class Word2006MLParserTest method testSkipDeletedAndMoveFrom.

/**
 * Parses {@code testWORD_2006ml.xml} with both "include deleted content" and
 * "include move-from content" enabled on the {@code OfficeParserConfig}, then
 * checks that "frog" is present and that "Second paragraph" is counted twice.
 */
@Test
public void testSkipDeletedAndMoveFrom() throws Exception {
    OfficeParserConfig includeEverything = new OfficeParserConfig();
    includeEverything.setIncludeMoveFromContent(true);
    includeEverything.setIncludeDeletedContent(true);

    ParseContext context = new ParseContext();
    context.set(OfficeParserConfig.class, includeEverything);

    XMLResult result = getXML("testWORD_2006ml.xml", context);

    assertContains("frog", result.xml);
    assertContainsCount("Second paragraph", result.xml, 2);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 8 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class OOXMLParserTest method testMacrosInDocm.

/**
 * Checks macro extraction for {@code testWORD_macros.docm} in three configurations:
 * <ol>
 *   <li>default parsing: no entry may have content type {@code text/x-vbasic};</li>
 *   <li>a {@code ParseContext} whose {@code OfficeParserConfig} has
 *       {@code setExtractMacros(true)}: the expected macro metadata must be present;</li>
 *   <li>an {@code AutoDetectParser} built from {@code tika-config-dom-macros.xml}:
 *       the same expected macro metadata must be present.</li>
 * </ol>
 */
@Test
public void testMacrosInDocm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) {
        // Constant-first equals: an entry without a Content-Type must not
        // surface as a NullPointerException instead of a real assertion result.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context));

    //test configuring via config file
    // The resource stream is opened here, so it is closed here as well.
    java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-dom-macros.xml");
    try {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser));
    } finally {
        if (configStream != null) {
            configStream.close();
        }
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 9 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class OOXMLExtractorFactory method parse.

/**
 * Extracts metadata and XHTML content from an OOXML document.
 * <p>
 * The method obtains an {@code OPCPackage} for the input (reusing the open
 * container of a {@code TikaInputStream}, opening its backing file, or reading
 * the plain stream through a close shield), detects the OOXML media type,
 * selects a POI text extractor, wraps it in the matching Tika decorator, and
 * then runs metadata extraction followed by XHTML extraction.
 * <p>
 * If the type cannot be detected or is listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@code EmptyParser.INSTANCE} and the method returns.
 *
 * @param stream      the document input; may be a {@code TikaInputStream}
 * @param baseHandler the handler that receives the XHTML output
 * @param metadata    the metadata object to populate; its content type is set
 *                    to the detected OOXML type
 * @param context     the parse context; consulted for a {@code Locale} and an
 *                    {@code OfficeParserConfig}. A default
 *                    {@code OfficeParserConfig} is used if none is present.
 * @throws IOException   declared for I/O failures raised by the calls made here
 * @throws SAXException  declared for failures raised while emitting XHTML
 * @throws TikaException if no usable extractor can be created, or if POI/XmlBeans
 *                       report an error (the original exception is kept as cause)
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        OOXMLExtractor extractor;
        OPCPackage pkg;
        // Locate or Open the OPCPackage for the file
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            // Remember the package on the stream so later consumers can reuse it
            tis.setOpenContainer(pkg);
        } else {
            // Shield the caller's stream so that opening the package does not close it
            InputStream shield = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(shield);
        }
        // Get the type, and ensure it's one we handle
        MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
        if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the appropriate OOXML text extractor picked
        POIXMLTextExtractor poiExtractor = null;
        // Normally this has already been set by OOXMLParser's call to configure().
        // Fall back to a default config so that direct callers of this public
        // method do not hit a NullPointerException.
        OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
        if (config.getUseSAXDocxExtractor()) {
            poiExtractor = trySXWPF(pkg);
        }
        if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
            poiExtractor = trySXSLF(pkg);
        }
        if (poiExtractor == null) {
            poiExtractor = ExtractorFactory.createExtractor(pkg);
        }
        POIXMLDocument document = poiExtractor.getDocument();
        // Event-based extractors are matched first; the document-based branches
        // below are only reached when none of them applies.
        if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
            extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
            extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
            extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
        } else if (document instanceof XMLSlideShow) {
            extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
        } else if (document instanceof XWPFDocument) {
            extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
        } else {
            extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
        }
        // Get the bulk of the metadata first, so that it's accessible during
        //  parsing if desired by the client (see TIKA-1109)
        extractor.getMetadataExtractor().extract(metadata);
        // Extract the text, along with any in-document metadata
        extractor.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        } else {
            throw new TikaException("Error creating OOXML extractor", e);
        }
    } catch (InvalidFormatException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (XmlException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 10 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class OOXMLParserTest method testXLSBVarious.

/**
 * Parses {@code testEXCEL_various.xlsb} with macro extraction enabled, expects
 * four metadata entries, and checks that the content of the first entry holds
 * every expected fragment (cell values, comments, text boxes, hyperlinks,
 * headers and footers).
 */
@Test
public void testXLSBVarious() throws Exception {
    ParseContext macroContext = new ParseContext();
    OfficeParserConfig macroConfig = new OfficeParserConfig();
    macroConfig.setExtractMacros(true);
    macroContext.set(OfficeParserConfig.class, macroConfig);

    List<Metadata> extracted = getRecursiveMetadata("testEXCEL_various.xlsb", macroContext);
    assertEquals(4, extracted.size());

    String containerXml = extracted.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

    // Checked in this order; the first missing fragment fails the test.
    String[] expectedFragments = {
        "<td>13</td>",
        "<td>13.1211231321</td>",
        "<td>$   3.03</td>",
        "<td>20%</td>",
        "<td>13.12</td>",
        "<td>123456789012345</td>",
        "<td>1.23456789012345E+15</td>",
        "test comment2",
        "comment4 (end of row)",
        "<td>1/4</td>",
        "<td>3/9/17</td>",
        "<td>4</td>",
        "<td>2</td>",
        "<td>   46/1963</td>",
        "<td>  3/128</td>",
        "test textbox",
        "test WordArt",
        "<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>",
        "<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
        "OddLeftHeader OddCenterHeader OddRightHeader",
        "EvenLeftHeader EvenCenterHeader EvenRightHeader",
        "FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader",
        "OddLeftFooter OddCenterFooter OddRightFooter",
        "EvenLeftFooter EvenCenterFooter EvenRightFooter",
        "FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter"
    };
    for (String fragment : expectedFragments) {
        assertContains(fragment, containerXml);
    }
}
Also used : OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

OfficeParserConfig (org.apache.tika.parser.microsoft.OfficeParserConfig): 16; ParseContext (org.apache.tika.parser.ParseContext): 15; TikaTest (org.apache.tika.TikaTest): 13; Test (org.junit.Test): 13; Metadata (org.apache.tika.metadata.Metadata): 9; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 6; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 6; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 6; TikaConfig (org.apache.tika.config.TikaConfig): 5; InputStream (java.io.InputStream): 2; EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException): 2; TikaInputStream (org.apache.tika.io.TikaInputStream): 2; File (java.io.File): 1; Date (java.util.Date): 1; HashMap (java.util.HashMap): 1; Locale (java.util.Locale): 1; Map (java.util.Map): 1; CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 1; POIXMLDocument (org.apache.poi.POIXMLDocument): 1; POIXMLTextExtractor (org.apache.poi.POIXMLTextExtractor): 1