use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project tika by apache.
the class OOXMLExtractorFactory method parse.
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, Locale.getDefault());
ExtractorFactory.setThreadPrefersEventExtractors(true);
try {
OOXMLExtractor extractor;
OPCPackage pkg;
// Locate or Open the OPCPackage for the file
TikaInputStream tis = TikaInputStream.cast(stream);
if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
pkg = (OPCPackage) tis.getOpenContainer();
} else if (tis != null && tis.hasFile()) {
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
tis.setOpenContainer(pkg);
} else {
InputStream shield = new CloseShieldInputStream(stream);
pkg = OPCPackage.open(shield);
}
// Get the type, and ensure it's one we handle
MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
// Not a supported type, delegate to Empty Parser
EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
return;
}
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
// This has already been set by OOXMLParser's call to configure()
// We can rely on this being non-null.
OfficeParserConfig config = context.get(OfficeParserConfig.class);
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
poiExtractor = trySXSLF(pkg);
}
if (poiExtractor == null) {
poiExtractor = ExtractorFactory.createExtractor(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
} else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
} else if (document == null) {
throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
} else if (document instanceof XMLSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
} else {
extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
}
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
// Extract the text, along with any in-document metadata
extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
} else {
throw new TikaException("Error creating OOXML extractor", e);
}
} catch (InvalidFormatException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (OpenXML4JException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
}
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestXWPFBugs method bug53475NoCSPName.
/**
* A word document that's encrypted with non-standard
* Encryption options, and no cspname section. See bug 53475
*/
@Test
public void bug53475NoCSPName() throws Exception {
File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
// Check the encryption details
EncryptionInfo info = new EncryptionInfo(filesystem);
assertEquals(128, info.getHeader().getKeySize());
assertEquals(CipherAlgorithm.aes128, info.getHeader().getCipherAlgorithm());
assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());
// Check it can be decoded
Decryptor d = Decryptor.getInstance(info);
assertTrue("Unable to process: document is encrypted", d.verifyPassword("solrcell"));
// Check we can read the word document in that
InputStream dataStream = d.getDataStream(filesystem);
OPCPackage opc = OPCPackage.open(dataStream);
XWPFDocument doc = new XWPFDocument(opc);
XWPFWordExtractor ex = new XWPFWordExtractor(doc);
String text = ex.getText();
assertNotNull(text);
assertEquals("This is password protected Word document.", text.trim());
ex.close();
filesystem.close();
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestXWPFBugs method bug53475_aes256.
/**
* A word document with aes-256, i.e. aes is always 128 bit (= 128 bit block size),
* but the key can be 128/192/256 bits
*/
@Test
public void bug53475_aes256() throws Exception {
int maxKeyLen = Cipher.getMaxAllowedKeyLength("AES");
Assume.assumeTrue("Please install JCE Unlimited Strength Jurisdiction Policy files for AES 256", maxKeyLen == 2147483647);
File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-pass.docx");
NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
// Check the encryption details
EncryptionInfo info = new EncryptionInfo(filesystem);
assertEquals(16, info.getHeader().getBlockSize());
assertEquals(256, info.getHeader().getKeySize());
assertEquals(CipherAlgorithm.aes256, info.getHeader().getCipherAlgorithm());
assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());
// Check it can be decoded
Decryptor d = Decryptor.getInstance(info);
assertTrue("Unable to process: document is encrypted", d.verifyPassword("pass"));
// Check we can read the word document in that
InputStream dataStream = d.getDataStream(filesystem);
OPCPackage opc = OPCPackage.open(dataStream);
XWPFDocument doc = new XWPFDocument(opc);
XWPFWordExtractor ex = new XWPFWordExtractor(doc);
String text = ex.getText();
assertNotNull(text);
// I know ... a stupid typo, maybe next time ...
assertEquals("The is a password protected document.", text.trim());
ex.close();
filesystem.close();
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestXWPFDocument method testWriteFromReadOnlyOPC.
@Test
@Ignore("XWPF should be able to write to a new Stream when opened Read-Only")
public void testWriteFromReadOnlyOPC() throws Exception {
OPCPackage opc = OPCPackage.open(POIDataSamples.getDocumentInstance().getFile("SampleDoc.docx"), PackageAccess.READ);
XWPFDocument doc = new XWPFDocument(opc);
XWPFWordExtractor ext = new XWPFWordExtractor(doc);
String origText = ext.getText();
doc = XWPFTestDataSamples.writeOutAndReadBack(doc);
ext.close();
ext = new XWPFWordExtractor(doc);
assertEquals(origText, ext.getText());
ext.close();
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestExtractorFactory method testFile.
@Test
public void testFile() throws Exception {
// Excel
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), xlsExtractor instanceof ExcelExtractor);
assertTrue(xlsExtractor.getText().length() > 200);
xlsExtractor.close();
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsb);
assertContains(extractor.getText(), "test");
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertContains(extractor.getText(), "test");
extractor.close();
// TODO Support OOXML-Strict, see bug #57699
try {
/*extractor =*/
ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
}
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor
// instanceof XSSFExcelExtractor
// );
// extractor.close();
//
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor.getText().contains("test")
// );
// extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(extractor instanceof WordExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertContains(extractor.getText(), "Test");
extractor.close();
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(extractor instanceof PowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(extractor instanceof XSLFPowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(extractor instanceof VisioTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(extractor instanceof PublisherTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(extractor instanceof OutlookTextExtactor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(txt);
fail();
} catch (IllegalArgumentException e) {
// Good
}
}
Aggregations