use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestExtractorFactory method testEmbeded.
/**
 * Tests text extraction from embedded documents.
 * Covers containers without embeddings, Excel, Word (including a Word
 * file that embeds an OOXML document) and Outlook messages.
 * PowerPoint, Publisher and Visio containers are not covered yet.
 */
@Test
public void testEmbeded() throws Exception {
    // Argument order for the helper: total, ppt, xls, word, wordX, msg

    // No embeddings
    assertEmbeddedExtractors(
            (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls),
            0, 0, 0, 0, 0, 0);

    // Excel
    assertEmbeddedExtractors(
            (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb),
            6, 2, 2, 2, 0, 0);

    // Word
    assertEmbeddedExtractors(
            (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb),
            4, 1, 2, 1, 0, 0);

    // Word which contains an OOXML file
    assertEmbeddedExtractors(
            (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML),
            3, 1, 1, 0, 1, 0);

    // Outlook (the cast also verifies that an Outlook extractor was created)
    assertEmbeddedExtractors(
            (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb),
            1, 0, 0, 1, 0, 0);

    // Outlook with another outlook file in it
    assertEmbeddedExtractors(
            (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg),
            1, 0, 0, 0, 0, 1);

    // TODO - PowerPoint
    // TODO - Publisher
    // TODO - Visio
}

/**
 * Fetches the embedded-document extractors of {@code ext}, checks that each
 * one yields more than 20 characters of text, and compares the number of
 * extractors of each kind against the expected counts. The given extractor
 * is always closed, even when an assertion fails.
 *
 * @param ext           the container extractor to inspect; closed on return
 * @param expectedTotal expected number of embedded extractors
 * @param expectedPpt   expected number of {@code PowerPointExtractor}s
 * @param expectedXls   expected number of {@code ExcelExtractor}s
 * @param expectedWord  expected number of {@code WordExtractor}s
 * @param expectedWordX expected number of {@code XWPFWordExtractor}s
 * @param expectedMsg   expected number of {@code OutlookTextExtactor}s
 * @throws Exception if extraction or closing the extractor fails
 */
private void assertEmbeddedExtractors(POIOLE2TextExtractor ext, int expectedTotal,
        int expectedPpt, int expectedXls, int expectedWord, int expectedWordX,
        int expectedMsg) throws Exception {
    try {
        POITextExtractor[] embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
        assertEquals(expectedTotal, embeds.length);

        int numPpt = 0;
        int numXls = 0;
        int numWord = 0;
        int numWordX = 0;
        int numMsg = 0;
        for (POITextExtractor embed : embeds) {
            assertTrue(embed.getText().length() > 20);
            if (embed instanceof PowerPointExtractor) {
                numPpt++;
            } else if (embed instanceof ExcelExtractor) {
                numXls++;
            } else if (embed instanceof WordExtractor) {
                numWord++;
            } else if (embed instanceof OutlookTextExtactor) {
                numMsg++;
            } else if (embed instanceof XWPFWordExtractor) {
                numWordX++;
            }
        }

        assertEquals(expectedPpt, numPpt);
        assertEquals(expectedXls, numXls);
        assertEquals(expectedWord, numWord);
        assertEquals(expectedWordX, numWordX);
        assertEquals(expectedMsg, numMsg);
    } finally {
        // Release the container even if one of the assertions above failed
        ext.close();
    }
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestXSLFPowerPointExtractor method test45541.
/**
 * Checks where the string "testdoc" shows up in the extracted text of two
 * sample presentations: it must be absent from the default text of both,
 * present in the header sample when the two-argument
 * {@code getText(false, true)} is used, and absent from the footer sample
 * for every {@code getText} variant exercised here.
 * Each extractor is closed in a {@code finally} block so a failing
 * assertion does not leak it.
 */
@Test
public void test45541() throws Exception {
    // extract text from a powerpoint that has a header in the notes-element
    POITextExtractor extr = ExtractorFactory.createExtractor(slTests.getFile("45541_Header.pptx"));
    try {
        String text = extr.getText();
        assertNotNull(text);
        assertFalse("Had: " + text, text.contains("testdoc"));

        text = ((XSLFPowerPointExtractor) extr).getText(false, true);
        // Null check comes first so a null result fails here rather than inside assertContains
        assertNotNull(text);
        assertContains(text, "testdoc");
    } finally {
        extr.close();
    }

    // extract text from a powerpoint that has a footer in the master-slide
    extr = ExtractorFactory.createExtractor(slTests.getFile("45541_Footer.pptx"));
    try {
        String text = extr.getText();
        assertNotContained(text, "testdoc");

        text = ((XSLFPowerPointExtractor) extr).getText(false, true);
        assertNotContained(text, "testdoc");

        text = ((XSLFPowerPointExtractor) extr).getText(false, false, true);
        assertNotContained(text, "testdoc");
    } finally {
        extr.close();
    }
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestZipPackage method testZipEntityExpansionSharedStringTableEvents.
/**
 * Opens the "poc-shared-strings.xlsx" sample with event extractors
 * preferred for this thread. Either text extraction is attempted (an
 * {@link IllegalStateException} from it is tolerated) or an
 * {@code XmlException} is raised and checked with
 * {@code assertEntityLimitReached}. The previous thread preference is
 * restored afterwards.
 */
@Test
public void testZipEntityExpansionSharedStringTableEvents() throws Exception {
    // Remember the current per-thread setting so it can be put back at the end
    final boolean previousPreference = ExtractorFactory.getThreadPrefersEventExtractors();
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        final POITextExtractor textExtractor = ExtractorFactory.createExtractor(
                HSSFTestDataSamples.getSampleFile("poc-shared-strings.xlsx"));
        try {
            assertNotNull(textExtractor);
            try {
                textExtractor.getText();
            } catch (IllegalStateException expected) {
                // expected due to shared strings expansion
            }
        } finally {
            textExtractor.close();
        }
    } catch (XmlException limitReached) {
        assertEntityLimitReached(limitReached);
    } finally {
        ExtractorFactory.setThreadPrefersEventExtractors(previousPreference);
    }
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestZipPackage method testZipEntityExpansionExceedsMemory.
/**
 * Exercises the "poc-xmlbomb.xlsx" sample in two ways. Opening it as a
 * workbook must fail with a {@code POIXMLException} that passes
 * {@code assertEntityLimitReached}. Creating a text extractor for it may
 * either fail the same way or succeed, in which case an
 * {@link IllegalStateException} from {@code getText()} is tolerated.
 */
@Test
public void testZipEntityExpansionExceedsMemory() throws Exception {
    // Part 1: loading through WorkbookFactory must be rejected
    try {
        WorkbookFactory.create(XSSFTestDataSamples.openSamplePackage("poc-xmlbomb.xlsx")).close();
        fail("Should catch exception due to entity expansion limitations");
    } catch (POIXMLException limitReached) {
        assertEntityLimitReached(limitReached);
    }

    // Part 2: loading through ExtractorFactory
    try {
        final POITextExtractor textExtractor = ExtractorFactory.createExtractor(
                HSSFTestDataSamples.getSampleFile("poc-xmlbomb.xlsx"));
        try {
            assertNotNull(textExtractor);
            try {
                textExtractor.getText();
            } catch (IllegalStateException expected) {
                // expected due to shared strings expansion
            }
        } finally {
            textExtractor.close();
        }
    } catch (POIXMLException limitReached) {
        assertEntityLimitReached(limitReached);
    }
}
use of org.apache.poi.POITextExtractor in project poi by apache.
the class TestWordExtractor method testExtractorFromWord6Extractor.
/**
 * Opens "TestMickey.doc" with a {@code Word6Extractor} and verifies that
 * the text of its metadata extractor contains the expected TEMPLATE,
 * SUBJECT, MANAGER and COMPANY entries.
 * Every resource is released in its own {@code finally} block, so a
 * failure while opening or closing one of them cannot leak the others.
 */
@Test
public void testExtractorFromWord6Extractor() throws Exception {
    InputStream is = POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc");
    POIFSFileSystem fs;
    try {
        fs = new POIFSFileSystem(is);
    } finally {
        // The stream is closed as soon as the filesystem has been constructed, even on failure
        is.close();
    }

    try {
        Word6Extractor wExt = new Word6Extractor(fs);
        try {
            POITextExtractor ext = wExt.getMetadataTextExtractor();
            try {
                String text = ext.getText();
                assertContains(text, "TEMPLATE = Normal");
                assertContains(text, "SUBJECT = sample subject");
                assertContains(text, "MANAGER = sample manager");
                assertContains(text, "COMPANY = sample company");
            } finally {
                ext.close();
            }
        } finally {
            wExt.close();
        }
    } finally {
        // Runs even if creating or closing the extractor threw
        fs.close();
    }
}
Aggregations