use of org.apache.poi.POITextExtractor in project poi by apache.
the class OLE2ExtractorFactory method getEmbededDocsTextExtractors.
/**
* Returns an array of text extractors, one for each of
* the embedded documents in the file (if there are any).
* If there are no embedded documents, you'll get back an
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
// All the embedded directories we spotted
List<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
List<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
if (root == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
}
if (ext instanceof ExcelExtractor) {
// These are in MBD... under the root
Iterator<Entry> it = root.getEntries();
while (it.hasNext()) {
Entry entry = it.next();
if (entry.getName().startsWith("MBD")) {
dirs.add(entry);
}
}
} else {
// Ask Scratchpad, or fail trying
Class<?> cls = getScratchpadClass();
try {
Method m = cls.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
m.invoke(null, ext, dirs, nonPOIFS);
} catch (Exception e) {
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
}
}
// Create the extractors
if (dirs.size() == 0 && nonPOIFS.size() == 0) {
return new POITextExtractor[0];
}
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for (Entry dir : dirs) {
e.add(createExtractor((DirectoryNode) dir));
}
for (InputStream nonPOIF : nonPOIFS) {
try {
e.add(createExtractor(nonPOIF));
} catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
LOGGER.log(POILogger.WARN, ie);
} catch (Exception xe) {
// Ignore, invalid format
LOGGER.log(POILogger.WARN, xe);
}
}
return e.toArray(new POITextExtractor[e.size()]);
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class AbstractFileHandler method handleExtractingAsStream.
private void handleExtractingAsStream(File file) throws IOException, OpenXML4JException, XmlException {
InputStream stream = new FileInputStream(file);
try {
POITextExtractor streamExtractor = ExtractorFactory.createExtractor(stream);
try {
assertNotNull(streamExtractor);
assertNotNull(streamExtractor.getText());
} finally {
streamExtractor.close();
}
} finally {
stream.close();
}
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestExtractorFactory method testPreferEventBased.
@Test
public void testPreferEventBased() throws Exception {
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setThreadPrefersEventExtractors(true);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
assertTrue(ExtractorFactory.getPreferEventExtractor());
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// Check we get the right extractors now
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor instanceof EventBasedExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor.getText().length() > 200);
extractor.close();
// Put back to normal
ExtractorFactory.setThreadPrefersEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// And back
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor instanceof ExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(extractor.getText().length() > 200);
extractor.close();
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestExtractorFactory method test45565.
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
// When this happens, change this from @Test(expected=...) to @Test
// bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
@Test(expected = AssertionError.class)
public void test45565() throws Exception {
POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
try {
String text = extractor.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
} finally {
extractor.close();
}
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestExtractorFactory method testFile.
@Test
public void testFile() throws Exception {
// Excel
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), xlsExtractor instanceof ExcelExtractor);
assertTrue(xlsExtractor.getText().length() > 200);
xlsExtractor.close();
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsb);
assertContains(extractor.getText(), "test");
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertContains(extractor.getText(), "test");
extractor.close();
// TODO Support OOXML-Strict, see bug #57699
try {
/*extractor =*/
ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
}
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor
// instanceof XSSFExcelExtractor
// );
// extractor.close();
//
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor.getText().contains("test")
// );
// extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(extractor instanceof WordExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertContains(extractor.getText(), "Test");
extractor.close();
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(extractor instanceof PowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(extractor instanceof XSLFPowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(extractor instanceof VisioTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(extractor instanceof PublisherTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(extractor instanceof OutlookTextExtactor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(txt);
fail();
} catch (IllegalArgumentException e) {
// Good
}
}
Aggregations