use of org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor in project poi by apache.
the class ExtractorFactory method createExtractor.
/**
* Tries to determine the actual type of file and produces a matching text-extractor for it.
*
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
* @throws IOException If an error occurs while reading the file
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
// If nothing was found, try some of the other OOXML-based core types
if (core.size() == 0) {
// Could it be an OOXML-Strict one?
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
}
if (core.size() == 0) {
// Could it be a visio one?
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
if (core.size() == 1)
return new XDGFVisioExtractor(pkg);
}
// Should just be a single core document, complain if not
if (core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
// Grab the core document part, and try to identify from that
final PackagePart corePart = pkg.getPart(core.getRelationship(0));
final String contentType = corePart.getContentType();
// Is it XSSF?
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
if (getPreferEventExtractor()) {
return new XSSFEventBasedExcelExtractor(pkg);
}
return new XSSFExcelExtractor(pkg);
}
}
// Is it XWPF?
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XWPFWordExtractor(pkg);
}
}
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(pkg);
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
}
// How about xlsb?
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XSSFBEventBasedExcelExtractor(pkg);
}
}
throw new IllegalArgumentException("No supported documents found in the OOXML package (found " + contentType + ")");
} catch (IOException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (OpenXML4JException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (XmlException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (RuntimeException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
}
}
use of org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor in project tika by apache.
the class OOXMLExtractorFactory method parse.
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, Locale.getDefault());
ExtractorFactory.setThreadPrefersEventExtractors(true);
try {
OOXMLExtractor extractor;
OPCPackage pkg;
// Locate or Open the OPCPackage for the file
TikaInputStream tis = TikaInputStream.cast(stream);
if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
pkg = (OPCPackage) tis.getOpenContainer();
} else if (tis != null && tis.hasFile()) {
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
tis.setOpenContainer(pkg);
} else {
InputStream shield = new CloseShieldInputStream(stream);
pkg = OPCPackage.open(shield);
}
// Get the type, and ensure it's one we handle
MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
// Not a supported type, delegate to Empty Parser
EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
return;
}
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
// This has already been set by OOXMLParser's call to configure()
// We can rely on this being non-null.
OfficeParserConfig config = context.get(OfficeParserConfig.class);
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
poiExtractor = trySXSLF(pkg);
}
if (poiExtractor == null) {
poiExtractor = ExtractorFactory.createExtractor(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
} else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
} else if (document == null) {
throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
} else if (document instanceof XMLSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
} else {
extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
}
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
// Extract the text, along with any in-document metadata
extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
} else {
throw new TikaException("Error creating OOXML extractor", e);
}
} catch (InvalidFormatException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (OpenXML4JException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
}
}
use of org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor in project poi by apache.
the class TestSecureTempZip method protectedTempZip.
/**
* Test case for #59841 - this is an example on how to use encrypted temp files,
* which are streamed into POI opposed to having everything in memory
*/
@Test
public void protectedTempZip() throws IOException, GeneralSecurityException, XmlException, OpenXML4JException {
File tikaProt = XSSFTestDataSamples.getSampleFile("protected_passtika.xlsx");
FileInputStream fis = new FileInputStream(tikaProt);
POIFSFileSystem poifs = new POIFSFileSystem(fis);
EncryptionInfo ei = new EncryptionInfo(poifs);
Decryptor dec = ei.getDecryptor();
boolean passOk = dec.verifyPassword("tika");
assertTrue(passOk);
// extract encrypted ooxml file and write to custom encrypted zip file
InputStream is = dec.getDataStream(poifs);
// provide ZipEntrySource to poi which decrypts on the fly
ZipEntrySource source = AesZipFileZipEntrySource.createZipEntrySource(is);
// test the source
OPCPackage opc = OPCPackage.open(source);
String expected = "This is an Encrypted Excel spreadsheet.";
XSSFEventBasedExcelExtractor extractor = new XSSFEventBasedExcelExtractor(opc);
extractor.setIncludeSheetNames(false);
String txt = extractor.getText();
assertEquals(expected, txt.trim());
XSSFWorkbook wb = new XSSFWorkbook(opc);
txt = wb.getSheetAt(0).getRow(0).getCell(0).getStringCellValue();
assertEquals(expected, txt);
extractor.close();
wb.close();
opc.close();
source.close();
poifs.close();
fis.close();
}
use of org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor in project poi by apache.
the class TestExtractorFactory method testPreferEventBased.
@Test
public void testPreferEventBased() throws Exception {
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setThreadPrefersEventExtractors(true);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// Check we get the right extractors now
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor instanceof EventBasedExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor.getText().length() > 200);
extractor.close();
// Put back to normal
ExtractorFactory.setThreadPrefersEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// And back
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor instanceof ExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(extractor.getText().length() > 200);
extractor.close();
}
Aggregations