Use of org.apache.poi.POIXMLTextExtractor in the Apache POI project.
Class TestExtractorFactory, method testPackage.
/**
 * Checks that {@code ExtractorFactory.createExtractor(OPCPackage)} returns the
 * expected extractor type for the Excel, Word, PowerPoint and Visio sample
 * packages, that each extractor yields a minimum amount of text, and that a
 * plain-text file is rejected with an {@code UnsupportedFileFormatException}.
 *
 * <p>Each extractor is closed in a {@code finally} block so that a failing
 * assertion does not leave the extractor (and the package it was created
 * from) open.
 *
 * @throws Exception if a package cannot be opened or an extractor cannot be created
 */
@Test
public void testPackage() throws Exception {
    // Excel
    POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
    try {
        assertTrue(extractor instanceof XSSFExcelExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
    try {
        assertTrue(extractor.getText().length() > 200);
    } finally {
        extractor.close();
    }

    // Word
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
    try {
        assertTrue(extractor instanceof XWPFWordExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
    try {
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // PowerPoint
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
    try {
        assertTrue(extractor instanceof XSLFPowerPointExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
    try {
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // Visio
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
    try {
        assertTrue(extractor instanceof XDGFVisioExtractor);
        assertTrue(extractor.getText().length() > 20);
    } finally {
        extractor.close();
    }

    // Text
    try {
        POIXMLTextExtractor unexpected = ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
        // Getting here is already a failure; release the extractor before reporting it.
        unexpected.close();
        fail("TestExtractorFactory.testPackage() failed on " + txt);
    } catch (UnsupportedFileFormatException e) {
        // Good
    } catch (Exception e) {
        System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
        throw e;
    }
}
Use of org.apache.poi.POIXMLTextExtractor in the Apache POI project.
Class XSSFBEventBasedExcelExtractor, method main.
/**
 * Command-line entry point: prints to standard output the text extracted
 * from the file named by the first argument.
 *
 * <p>With no arguments, a usage message is written to standard error and the
 * JVM exits with status 1.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to read
 * @throws Exception if the extractor cannot be created, read or closed
 */
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Use:");
        System.err.println(" XSSFBEventBasedExcelExtractor <filename.xlsb>");
        System.exit(1);
    }
    POIXMLTextExtractor extractor = new XSSFBEventBasedExcelExtractor(args[0]);
    try {
        System.out.println(extractor.getText());
    } finally {
        // Close even when text extraction throws.
        extractor.close();
    }
}
Use of org.apache.poi.POIXMLTextExtractor in the Apache Tika project.
Class OOXMLExtractorFactory, method parse.
/**
 * Parses an OOXML document: obtains an {@code OPCPackage} for the input,
 * detects its media type, selects a POI text extractor, wraps it in the
 * matching Tika decorator, and then emits metadata and XHTML.
 *
 * @param stream      the document input; if it is a {@code TikaInputStream} its
 *                    already-open container or backing file is used
 * @param baseHandler receives the XHTML produced by the chosen extractor
 * @param metadata    receives the detected content type and extracted metadata
 * @param context     supplies the {@code Locale} (default locale if absent) and
 *                    the {@code OfficeParserConfig}
 * @throws IOException   declared by this method; not raised directly in this body
 * @throws SAXException  declared by this method; not raised directly in this body
 * @throws TikaException if no document-backed extractor is found, or wrapping an
 *                       {@code IllegalArgumentException}, {@code InvalidFormatException},
 *                       {@code OpenXML4JException} or {@code XmlException}
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// Use the locale supplied in the context, falling back to the JVM default.
Locale locale = context.get(Locale.class, Locale.getDefault());
// NOTE(review): this thread-level preference is set here and never reset in this method.
ExtractorFactory.setThreadPrefersEventExtractors(true);
try {
OOXMLExtractor extractor;
OPCPackage pkg;
// Locate or Open the OPCPackage for the file
TikaInputStream tis = TikaInputStream.cast(stream);
if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
// Reuse the package already attached to the stream.
pkg = (OPCPackage) tis.getOpenContainer();
} else if (tis != null && tis.hasFile()) {
// Open read-only from the backing file and attach the package to the stream.
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
tis.setOpenContainer(pkg);
} else {
// Presumably the shield keeps the package from closing the caller's stream - confirm.
InputStream shield = new CloseShieldInputStream(stream);
pkg = OPCPackage.open(shield);
}
// Get the type, and ensure it's one we handle
MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
// Not a supported type, delegate to Empty Parser
// NOTE(review): pkg is not closed on this early-return path in the code shown here.
EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
return;
}
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
// This has already been set by OOXMLParser's call to configure()
// We can rely on this being non-null.
OfficeParserConfig config = context.get(OfficeParserConfig.class);
// Try the SAX-based docx extractor first, then the SAX-based pptx one, when enabled in the config.
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
poiExtractor = trySXSLF(pkg);
}
// Neither SAX extractor was selected: let POI's factory choose.
if (poiExtractor == null) {
poiExtractor = ExtractorFactory.createExtractor(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
// Pick the decorator: the event-based extractor types are checked first, then the document type.
// NOTE(review): the XSSFB check precedes the XSSF check, presumably because of a
// subtype relationship between the two - confirm before reordering these branches.
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
} else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
} else if (document == null) {
// The remaining branches need a document; fail if the extractor has none.
throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
} else if (document instanceof XMLSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
} else {
// Any other document type gets the generic decorator.
extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
}
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
// Extract the text, along with any in-document metadata
extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
// A message starting with "No supported documents found" is reported with the TIKA-418 text below.
if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
} else {
throw new TikaException("Error creating OOXML extractor", e);
}
} catch (InvalidFormatException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (OpenXML4JException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
}
}
Aggregations