Search in sources:

Example 6 with XWPFWordExtractor

use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project tika by apache.

the class OOXMLExtractorFactory method parse.

/**
 * Picks a POI OOXML text extractor for the given stream, wraps it in the
 * matching Tika decorator, and emits metadata followed by XHTML content.
 *
 * <p>If the package type cannot be detected, or is listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@code EmptyParser} and nothing else is done.
 *
 * @param stream      the document stream; reused as-is when it is a TikaInputStream
 * @param baseHandler receives the XHTML output
 * @param metadata    receives the detected content type and document metadata
 * @param context     supplies the Locale (optional) and the OfficeParserConfig
 * @throws IOException   propagated from the underlying calls
 * @throws SAXException  propagated from the underlying calls
 * @throws TikaException if no usable extractor can be created, or when a
 *                       POI/XmlBeans failure is wrapped
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Step 1: find an already-open package, open one from the backing
        // file, or fall back to reading the (close-shielded) raw stream.
        TikaInputStream tis = TikaInputStream.cast(stream);
        OPCPackage pkg;
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            // Remember the package on the stream so later lookups find it.
            tis.setOpenContainer(pkg);
        } else {
            InputStream guarded = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(guarded);
        }

        // Step 2: bail out early for anything we do not handle.
        MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
        boolean handled = type != null && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type);
        if (!handled) {
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());

        // Step 3: choose the POI extractor. The config is expected to have
        // been placed in the context by OOXMLParser's configure() call, so
        // it is not null-checked here.
        OfficeParserConfig config = context.get(OfficeParserConfig.class);
        POIXMLTextExtractor textExtractor = null;
        if (config.getUseSAXDocxExtractor()) {
            textExtractor = trySXWPF(pkg);
        }
        if (textExtractor == null && config.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(pkg);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(pkg);
        }

        // Step 4: wrap it. Extractor-type checks come first; the
        // document-type checks only apply once a document is known to exist.
        POIXMLDocument doc = textExtractor.getDocument();
        OOXMLExtractor decorator;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorator = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorator = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            XWPFEventBasedWordExtractor saxWord = (XWPFEventBasedWordExtractor) textExtractor;
            decorator = new SXWPFWordExtractorDecorator(metadata, context, saxWord);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            XSLFEventBasedPowerPointExtractor saxSlides = (XSLFEventBasedPowerPointExtractor) textExtractor;
            decorator = new SXSLFPowerPointExtractorDecorator(metadata, context, saxSlides);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (doc == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (doc instanceof XMLSlideShow) {
            decorator = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (doc instanceof XWPFDocument) {
            decorator = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorator = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Step 5: metadata goes first so that clients can read it while the
        // body is being parsed (see TIKA-1109), then the text itself.
        decorator.getMetadataExtractor().extract(metadata);
        decorator.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        String detail = e.getMessage();
        boolean noSupportedDocuments = detail != null && detail.startsWith("No supported documents found");
        if (noSupportedDocuments) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (InvalidFormatException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (XmlException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 7 with XWPFWordExtractor

use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.

the class TestXWPFBugs method bug53475NoCSPName.

/**
 * A word document that's encrypted with non-standard
 * Encryption options, and no cspname section. See bug 53475
 *
 * <p>The file system and the extractor are closed in {@code finally}
 * blocks so that a failing assertion does not leave them open.
 */
@Test
public void bug53475NoCSPName() throws Exception {
    File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
    NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
    try {
        // Check the encryption details
        EncryptionInfo info = new EncryptionInfo(filesystem);
        assertEquals(128, info.getHeader().getKeySize());
        assertEquals(CipherAlgorithm.aes128, info.getHeader().getCipherAlgorithm());
        assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());
        // Check it can be decoded
        Decryptor d = Decryptor.getInstance(info);
        assertTrue("Unable to process: document is encrypted", d.verifyPassword("solrcell"));
        // Check we can read the word document in that
        InputStream dataStream = d.getDataStream(filesystem);
        OPCPackage opc = OPCPackage.open(dataStream);
        XWPFDocument doc = new XWPFDocument(opc);
        XWPFWordExtractor ex = new XWPFWordExtractor(doc);
        try {
            String text = ex.getText();
            assertNotNull(text);
            assertEquals("This is password protected Word document.", text.trim());
        } finally {
            // Release the extractor even when an assertion above fails
            ex.close();
        }
    } finally {
        // Always release the underlying file, regardless of test outcome
        filesystem.close();
    }
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) File(java.io.File) ZipFile(java.util.zip.ZipFile) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) Test(org.junit.Test)

Example 8 with XWPFWordExtractor

use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.

the class TestXWPFBugs method bug53475_aes256.

/**
 * A word document with aes-256, i.e. aes is always 128 bit (= 128 bit block size),
 * but the key can be 128/192/256 bits
 *
 * <p>The test is skipped unless the JVM reports an unrestricted AES key
 * length. The file system and the extractor are closed in {@code finally}
 * blocks so that a failing assertion does not leave them open.
 */
@Test
public void bug53475_aes256() throws Exception {
    int maxKeyLen = Cipher.getMaxAllowedKeyLength("AES");
    // Integer.MAX_VALUE (2147483647) is what the JVM reports when no key-length limit applies
    Assume.assumeTrue("Please install JCE Unlimited Strength Jurisdiction Policy files for AES 256", maxKeyLen == Integer.MAX_VALUE);
    File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-pass.docx");
    NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
    try {
        // Check the encryption details
        EncryptionInfo info = new EncryptionInfo(filesystem);
        assertEquals(16, info.getHeader().getBlockSize());
        assertEquals(256, info.getHeader().getKeySize());
        assertEquals(CipherAlgorithm.aes256, info.getHeader().getCipherAlgorithm());
        assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());
        // Check it can be decoded
        Decryptor d = Decryptor.getInstance(info);
        assertTrue("Unable to process: document is encrypted", d.verifyPassword("pass"));
        // Check we can read the word document in that
        InputStream dataStream = d.getDataStream(filesystem);
        OPCPackage opc = OPCPackage.open(dataStream);
        XWPFDocument doc = new XWPFDocument(opc);
        XWPFWordExtractor ex = new XWPFWordExtractor(doc);
        try {
            String text = ex.getText();
            assertNotNull(text);
            // I know ... a stupid typo, maybe next time ...
            assertEquals("The is a password protected document.", text.trim());
        } finally {
            // Release the extractor even when an assertion above fails
            ex.close();
        }
    } finally {
        // Always release the underlying file, regardless of test outcome
        filesystem.close();
    }
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) File(java.io.File) ZipFile(java.util.zip.ZipFile) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) Test(org.junit.Test)

Example 9 with XWPFWordExtractor

use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.

the class TestXWPFDocument method testWriteFromReadOnlyOPC.

/**
 * Opens SampleDoc.docx from a read-only package, captures its text, writes
 * the document out and reads it back, and checks the text is unchanged.
 * Currently disabled via {@code @Ignore}.
 */
@Test
@Ignore("XWPF should be able to write to a new Stream when opened Read-Only")
public void testWriteFromReadOnlyOPC() throws Exception {
    OPCPackage readOnlyPkg = OPCPackage.open(POIDataSamples.getDocumentInstance().getFile("SampleDoc.docx"), PackageAccess.READ);
    XWPFDocument source = new XWPFDocument(readOnlyPkg);

    // Text before the round trip
    XWPFWordExtractor before = new XWPFWordExtractor(source);
    String expectedText = before.getText();

    // Round trip, then release the first extractor
    XWPFDocument reloaded = XWPFTestDataSamples.writeOutAndReadBack(source);
    before.close();

    // Text after the round trip must match
    XWPFWordExtractor after = new XWPFWordExtractor(reloaded);
    String actualText = after.getText();
    assertEquals(expectedText, actualText);
    after.close();
}
Also used : XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 10 with XWPFWordExtractor

use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.

the class TestExtractorFactory method testFile.

/**
 * Walks through the sample files one format at a time, asking
 * {@code ExtractorFactory} for an extractor and checking both the concrete
 * extractor type and that some text comes back. Formats that are expected
 * to be rejected are checked via the exception they raise.
 */
@Test
public void testFile() throws Exception {
    String text;

    // ---- Excel ----
    POITextExtractor hssfExtractor = ExtractorFactory.createExtractor(xls);
    assertNotNull("Had empty extractor for " + xls, hssfExtractor);
    assertTrue("Expected instanceof ExcelExtractor, but had: " + hssfExtractor.getClass(), hssfExtractor instanceof ExcelExtractor);
    text = hssfExtractor.getText();
    assertTrue(text.length() > 200);
    hssfExtractor.close();

    POITextExtractor ex = ExtractorFactory.createExtractor(xlsx);
    assertTrue(ex.getClass().getName(), ex instanceof XSSFExcelExtractor);
    ex.close();

    ex = ExtractorFactory.createExtractor(xlsx);
    text = ex.getText();
    assertTrue(text.length() > 200);
    ex.close();

    ex = ExtractorFactory.createExtractor(xltx);
    assertTrue(ex.getClass().getName(), ex instanceof XSSFExcelExtractor);
    ex.close();

    ex = ExtractorFactory.createExtractor(xlsb);
    text = ex.getText();
    assertContains(text, "test");
    ex.close();

    ex = ExtractorFactory.createExtractor(xltx);
    text = ex.getText();
    assertContains(text, "test");
    ex.close();

    // TODO Support OOXML-Strict, see bug #57699
    try {
        /*extractor =*/
        ExtractorFactory.createExtractor(xlsxStrict);
        fail("OOXML-Strict isn't yet supported");
    } catch (POIXMLException e) {
        // Expected, for now
    }
    // Checks to enable once OOXML-Strict is supported:
    //        extractor = ExtractorFactory.createExtractor(xlsxStrict);
    //        assertTrue(
    //                extractor
    //                instanceof XSSFExcelExtractor
    //        );
    //        extractor.close();
    //
    //        extractor = ExtractorFactory.createExtractor(xlsxStrict);
    //        assertTrue(
    //                extractor.getText().contains("test")
    //        );
    //        extractor.close();

    // ---- Word ----
    ex = ExtractorFactory.createExtractor(doc);
    assertTrue(ex instanceof WordExtractor);
    text = ex.getText();
    assertTrue(text.length() > 120);
    ex.close();

    ex = ExtractorFactory.createExtractor(doc6);
    assertTrue(ex instanceof Word6Extractor);
    text = ex.getText();
    assertTrue(text.length() > 20);
    ex.close();

    ex = ExtractorFactory.createExtractor(doc95);
    assertTrue(ex instanceof Word6Extractor);
    text = ex.getText();
    assertTrue(text.length() > 120);
    ex.close();

    ex = ExtractorFactory.createExtractor(docx);
    assertTrue(ex instanceof XWPFWordExtractor);
    ex.close();

    ex = ExtractorFactory.createExtractor(docx);
    text = ex.getText();
    assertTrue(text.length() > 120);
    ex.close();

    ex = ExtractorFactory.createExtractor(dotx);
    assertTrue(ex instanceof XWPFWordExtractor);
    ex.close();

    ex = ExtractorFactory.createExtractor(dotx);
    text = ex.getText();
    assertContains(text, "Test");
    ex.close();

    // ---- PowerPoint (PPT) ----
    ex = ExtractorFactory.createExtractor(ppt);
    assertTrue(ex instanceof PowerPointExtractor);
    text = ex.getText();
    assertTrue(text.length() > 120);
    ex.close();

    // ---- PowerPoint (PPTX) ----
    ex = ExtractorFactory.createExtractor(pptx);
    assertTrue(ex instanceof XSLFPowerPointExtractor);
    text = ex.getText();
    assertTrue(text.length() > 120);
    ex.close();

    // ---- Visio - binary ----
    ex = ExtractorFactory.createExtractor(vsd);
    assertTrue(ex instanceof VisioTextExtractor);
    text = ex.getText();
    assertTrue(text.length() > 50);
    ex.close();

    // ---- Visio - vsdx ----
    ex = ExtractorFactory.createExtractor(vsdx);
    assertTrue(ex instanceof XDGFVisioExtractor);
    text = ex.getText();
    assertTrue(text.length() > 20);
    ex.close();

    // ---- Publisher ----
    ex = ExtractorFactory.createExtractor(pub);
    assertTrue(ex instanceof PublisherTextExtractor);
    text = ex.getText();
    assertTrue(text.length() > 50);
    ex.close();

    // ---- Outlook msg ----
    ex = ExtractorFactory.createExtractor(msg);
    assertTrue(ex instanceof OutlookTextExtactor);
    text = ex.getText();
    assertTrue(text.length() > 50);
    ex.close();

    // ---- Plain text: must be rejected ----
    try {
        ExtractorFactory.createExtractor(txt);
        fail();
    } catch (IllegalArgumentException e) {
        // Good
    }
}
Also used : XDGFVisioExtractor(org.apache.poi.xdgf.extractor.XDGFVisioExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) Word6Extractor(org.apache.poi.hwpf.extractor.Word6Extractor) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) POIXMLException(org.apache.poi.POIXMLException) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) POITextExtractor(org.apache.poi.POITextExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor) Test(org.junit.Test)

Aggregations

XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor)10 Test (org.junit.Test)7 XSLFPowerPointExtractor (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor)6 XSSFExcelExtractor (org.apache.poi.xssf.extractor.XSSFExcelExtractor)6 OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage)5 XSSFEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor)5 XDGFVisioExtractor (org.apache.poi.xdgf.extractor.XDGFVisioExtractor)4 File (java.io.File)3 InputStream (java.io.InputStream)3 POITextExtractor (org.apache.poi.POITextExtractor)3 PowerPointExtractor (org.apache.poi.hslf.extractor.PowerPointExtractor)3 OutlookTextExtactor (org.apache.poi.hsmf.extractor.OutlookTextExtactor)3 EventBasedExcelExtractor (org.apache.poi.hssf.extractor.EventBasedExcelExtractor)3 ExcelExtractor (org.apache.poi.hssf.extractor.ExcelExtractor)3 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)3 XWPFDocument (org.apache.poi.xwpf.usermodel.XWPFDocument)3 FileInputStream (java.io.FileInputStream)2 IOException (java.io.IOException)2 ZipFile (java.util.zip.ZipFile)2 POIXMLException (org.apache.poi.POIXMLException)2