use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class WordToTextConverter method processOle2.
/**
 * Tries to append the textual content of an embedded OLE2 object to
 * {@code block}.
 * <p>
 * An entry containing a "WordDocument" child is converted directly via
 * {@link WordToTextConverter#getText(DirectoryNode)}. Any other directory
 * entry is handed, through reflection, to
 * {@code org.apache.poi.extractor.ExtractorFactory}, so this class keeps
 * working when that factory is not on the classpath.
 *
 * @param wordDocument the document being converted (not used by this method)
 * @param block        element that receives the extracted text node
 * @param entry        the OLE2 entry to extract text from
 * @return {@code true} if a text node was appended to {@code block};
 *         {@code false} if the entry is not a {@link DirectoryNode}, no
 *         extractor could be created, or text extraction failed
 */
@Override
protected boolean processOle2(HWPFDocument wordDocument, Element block, Entry entry) throws Exception {
    if (!(entry instanceof DirectoryNode)) {
        return false;
    }
    final DirectoryNode oleDirectory = (DirectoryNode) entry;

    // Embedded Word objects are supported even if there is no
    // ExtractorFactory in the classpath.
    if (oleDirectory.hasEntry("WordDocument")) {
        final String embeddedText = WordToTextConverter.getText(oleDirectory);
        block.appendChild(textDocumentFacade.createText(UNICODECHAR_ZERO_WIDTH_SPACE + embeddedText + UNICODECHAR_ZERO_WIDTH_SPACE));
        return true;
    }

    // Look the factory up reflectively: any failure here means "no extractor".
    final Object textExtractor;
    try {
        final Class<?> factoryClass = Class.forName("org.apache.poi.extractor.ExtractorFactory");
        final Method factoryMethod = factoryClass.getMethod("createExtractor", DirectoryNode.class);
        textExtractor = factoryMethod.invoke(null, oleDirectory);
    } catch (Exception e) {
        // no extractor in classpath
        logger.log(POILogger.WARN, "There is an OLE object entry '", entry.getName(), "', but there is no text extractor for this object type ", "or text extractor factory is not available: ", "" + e);
        return false;
    }

    // The extractor type is only known at runtime, so getText() is invoked reflectively too.
    try {
        final Method textAccessor = textExtractor.getClass().getMethod("getText");
        final String extractedText = (String) textAccessor.invoke(textExtractor);
        block.appendChild(textDocumentFacade.createText(UNICODECHAR_ZERO_WIDTH_SPACE + extractedText + UNICODECHAR_ZERO_WIDTH_SPACE));
        return true;
    } catch (Exception e) {
        logger.log(POILogger.ERROR, "Unable to extract text from OLE entry '", entry.getName(), "': ", e, e);
        return false;
    }
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class TestWordExtractor method testDifferentPOIFS.
/**
 * Tests that we can work with both {@link POIFSFileSystem}
 * and {@link NPOIFSFileSystem}.
 * <p>
 * The same sample file is opened once through an {@code OPOIFSFileSystem}
 * (from a stream) and once through an {@code NPOIFSFileSystem} (from the
 * file); the extracted text must match the expected block for both roots.
 *
 * @throws Exception if the sample file cannot be opened or read
 */
@Test
public void testDifferentPOIFS() throws Exception {
    // Open the two filesystems
    File file = docTests.getFile("test2.doc");
    OPOIFSFileSystem opoifs;
    InputStream is = new FileInputStream(file);
    try {
        opoifs = new OPOIFSFileSystem(is);
    } finally {
        // release the stream even if the filesystem could not be parsed
        is.close();
    }
    NPOIFSFileSystem npoifs = new NPOIFSFileSystem(file);
    try {
        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
        // Open directly
        for (DirectoryNode dir : files) {
            // NOTE(review): this extractor is deliberately left open, as in the
            // original test (hence the suppressed warning); presumably closing it
            // would affect the filesystems reused by the next loop - confirm.
            @SuppressWarnings("resource") WordExtractor extractor = new WordExtractor(dir);
            assertEqualsTrim(p_text1_block, extractor.getText());
        }
        // Open via a HWPFDocument
        for (DirectoryNode dir : files) {
            HWPFDocument doc = new HWPFDocument(dir);
            WordExtractor extractor = new WordExtractor(doc);
            try {
                assertEqualsTrim(p_text1_block, extractor.getText());
            } finally {
                extractor.close();
            }
        }
    } finally {
        // always release the file-backed filesystem, even when an assertion fails
        npoifs.close();
    }
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class TestWordExtractor method testExtractFromEmbeded.
/**
 * Test that we can get data from two different embedded word documents.
 * <p>
 * Both embedded directories of {@code excel_with_embeded.xls} must contain
 * the "1Table" and "WordDocument" entries, and each must yield its own text,
 * title and subject. All resources are released in the {@code finally}
 * block so that a failing assertion does not leak them.
 *
 * @throws IOException if the sample file cannot be opened or read
 */
@Test
public void testExtractFromEmbeded() throws IOException {
    POIFSFileSystem fs = null;
    HWPFDocument docA = null, docB = null;
    WordExtractor extractorA = null, extractorB = null;
    try {
        InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
        try {
            fs = new POIFSFileSystem(is);
        } finally {
            // release the stream even if the filesystem could not be parsed
            is.close();
        }
        DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7");
        DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2");
        // Should have WordDocument and 1Table
        assertNotNull(dirA.getEntry("1Table"));
        assertNotNull(dirA.getEntry("WordDocument"));
        assertNotNull(dirB.getEntry("1Table"));
        assertNotNull(dirB.getEntry("WordDocument"));
        // Check each in turn
        docA = new HWPFDocument(dirA);
        extractorA = new WordExtractor(docA);
        assertNotNull(extractorA.getText());
        assertTrue(extractorA.getText().length() > 20);
        assertEqualsTrim("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractorA.getText());
        assertEquals("Sample Doc 1", extractorA.getSummaryInformation().getTitle());
        assertEquals("Sample Test", extractorA.getSummaryInformation().getSubject());
        docB = new HWPFDocument(dirB);
        extractorB = new WordExtractor(docB);
        assertNotNull(extractorB.getText());
        assertTrue(extractorB.getText().length() > 20);
        assertEqualsTrim("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractorB.getText());
        assertEquals("Sample Doc 2", extractorB.getSummaryInformation().getTitle());
        assertEquals("Another Sample Test", extractorB.getSummaryInformation().getSubject());
    } finally {
        // same close order as before, but now also executed when an assertion fails
        if (extractorA != null) {
            extractorA.close();
        }
        if (docA != null) {
            docA.close();
        }
        if (extractorB != null) {
            extractorB.close();
        }
        if (docB != null) {
            docB.close();
        }
        if (fs != null) {
            fs.close();
        }
    }
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class TestExcelExtractor method testWithEmbededInOwn.
/**
 * Excel embedded in Excel: checks the text and title of the two workbooks
 * embedded in {@code excel_with_embeded.xls}, and then of the containing
 * workbook itself.
 *
 * @throws Exception if the sample file cannot be opened or read
 */
@Test
public void testWithEmbededInOwn() throws Exception {
    final String expectedTextA = "Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n";
    final String expectedTextB = "Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n";
    final String expectedTextBase = "Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n";

    POIDataSamples samples = POIDataSamples.getSpreadSheetInstance();
    POIFSFileSystem poifs = null;
    HSSFWorkbook workbookA = null;
    HSSFWorkbook workbookB = null;
    ExcelExtractor extractorA = null;
    ExcelExtractor extractorB = null;
    ExcelExtractor baseExtractor = null;
    try {
        poifs = new POIFSFileSystem(samples.getFile("excel_with_embeded.xls"));
        DirectoryNode root = poifs.getRoot();
        DirectoryNode embeddedA = (DirectoryNode) root.getEntry("MBD0000A3B5");
        DirectoryNode embeddedB = (DirectoryNode) root.getEntry("MBD0000A3B4");

        workbookA = new HSSFWorkbook(embeddedA, poifs, true);
        workbookB = new HSSFWorkbook(embeddedB, poifs, true);
        extractorA = new ExcelExtractor(workbookA);
        extractorB = new ExcelExtractor(workbookB);

        assertEquals(expectedTextA, extractorA.getText());
        assertEquals("Sample Excel", extractorA.getSummaryInformation().getTitle());
        assertEquals(expectedTextB, extractorB.getText());
        assertEquals("Sample Excel 2", extractorB.getSummaryInformation().getTitle());

        // And the base file too
        baseExtractor = new ExcelExtractor(poifs);
        assertEquals(expectedTextBase, baseExtractor.getText());
        assertEquals("Excel With Embeded", baseExtractor.getSummaryInformation().getTitle());
    } finally {
        // release in reverse order of creation; anything never created is skipped
        if (baseExtractor != null) {
            baseExtractor.close();
        }
        if (extractorB != null) {
            extractorB.close();
        }
        if (extractorA != null) {
            extractorA.close();
        }
        if (workbookB != null) {
            workbookB.close();
        }
        if (workbookA != null) {
            workbookA.close();
        }
        if (poifs != null) {
            poifs.close();
        }
    }
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class TestExcelExtractor method testWithEmbeded.
/**
 * Excel embedded in a non-Excel file: checks the text and title of the two
 * workbooks stored under the "ObjectPool" directory of
 * {@code word_with_embeded.doc}.
 *
 * @throws Exception if the sample file cannot be opened or read
 */
@Test
public void testWithEmbeded() throws Exception {
    final String expectedTextA = "Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n";
    final String expectedTextB = "Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n";

    POIFSFileSystem poifs = null;
    HSSFWorkbook workbookA = null;
    HSSFWorkbook workbookB = null;
    ExcelExtractor extractorA = null;
    ExcelExtractor extractorB = null;
    try {
        poifs = new POIFSFileSystem(POIDataSamples.getDocumentInstance().getFile("word_with_embeded.doc"));
        DirectoryNode objectPool = (DirectoryNode) poifs.getRoot().getEntry("ObjectPool");
        DirectoryNode embeddedA = (DirectoryNode) objectPool.getEntry("_1269427460");
        DirectoryNode embeddedB = (DirectoryNode) objectPool.getEntry("_1269427461");

        // each workbook is paired with its extractor before the next one is opened
        workbookA = new HSSFWorkbook(embeddedA, poifs, true);
        extractorA = new ExcelExtractor(workbookA);
        workbookB = new HSSFWorkbook(embeddedB, poifs, true);
        extractorB = new ExcelExtractor(workbookB);

        assertEquals(expectedTextA, extractorA.getText());
        assertEquals("Sample Excel", extractorA.getSummaryInformation().getTitle());
        assertEquals(expectedTextB, extractorB.getText());
        assertEquals("Sample Excel 2", extractorB.getSummaryInformation().getTitle());
    } finally {
        // release in reverse order of creation; anything never created is skipped
        if (extractorB != null) {
            extractorB.close();
        }
        if (workbookB != null) {
            workbookB.close();
        }
        if (extractorA != null) {
            extractorA.close();
        }
        if (workbookA != null) {
            workbookA.close();
        }
        if (poifs != null) {
            poifs.close();
        }
    }
}
Aggregations