use of org.apache.poi.POIOLE2TextExtractor in project poi by apache.
the class AbstractFileHandler method handleExtractingInternal.
/**
 * Runs text extraction on the given file and verifies the outcome.
 * <p>
 * The checks performed are: an extractor is returned, it yields non-null text,
 * its metadata extractor yields non-null text, the file's length and
 * last-modified timestamp are unchanged afterwards, stream-based extraction
 * (via {@code handleExtractingAsStream}) also works, and - for OLE2 based
 * extractors - the HPSF property texts are non-null.
 * <p>
 * Files listed in {@code EXPECTED_EXTRACTOR_FAILURES} (keyed as
 * {@code parentDirName/fileName}) are expected to fail with an
 * {@link IllegalArgumentException}; for those the exception is swallowed,
 * while a successful extraction is reported as a test failure.
 *
 * @param file the document to run the extractors against
 * @throws Exception if extraction fails for a file that is not listed as an
 *         expected failure, or if closing the extractor fails
 */
private void handleExtractingInternal(File file) throws Exception {
    long length = file.length();
    long modified = file.lastModified();
    // Lookup key used by EXPECTED_EXTRACTOR_FAILURES: "<parent dir name>/<file name>"
    String fileAndParentName = file.getParentFile().getName() + "/" + file.getName();

    POITextExtractor extractor = null;
    try {
        // Created inside the try block so that an IllegalArgumentException raised
        // while opening the file is subject to the expected-failure check below.
        extractor = ExtractorFactory.createExtractor(file);
        assertNotNull("Should get a POITextExtractor but had none for file " + file, extractor);
        assertNotNull("Should get some text but had none for file " + file, extractor.getText());

        // also try metadata
        @SuppressWarnings("resource")
        POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
        assertNotNull(metadataExtractor.getText());

        // Reaching this point means extraction succeeded, which must not happen
        // for files that are listed as expected failures.
        assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!",
                EXPECTED_EXTRACTOR_FAILURES.contains(fileAndParentName));

        assertEquals("File should not be modified by extractor", length, file.length());
        assertEquals("File should not be modified by extractor", modified, file.lastModified());

        handleExtractingAsStream(file);

        if (extractor instanceof POIOLE2TextExtractor) {
            HPSFPropertiesExtractor hpsfExtractor = new HPSFPropertiesExtractor((POIOLE2TextExtractor) extractor);
            try {
                assertNotNull(hpsfExtractor.getDocumentSummaryInformationText());
                assertNotNull(hpsfExtractor.getSummaryInformationText());
                String text = hpsfExtractor.getText();
                assertNotNull(text);
            } finally {
                hpsfExtractor.close();
            }
        }
    } catch (IllegalArgumentException e) {
        if (!EXPECTED_EXTRACTOR_FAILURES.contains(fileAndParentName)) {
            throw e;
        }
    } finally {
        // Guard against null so that a failed creation or a failed assertNotNull
        // is not replaced by a NullPointerException from this block.
        if (extractor != null) {
            extractor.close();
        }
    }
}
use of org.apache.poi.POIOLE2TextExtractor in project poi by apache.
the class ExtractorFactory method createExtractor.
/**
 * Creates a text extractor for the given file.
 * <p>
 * The file is first opened as an OLE2 container ({@code NPOIFSFileSystem}).
 * If that signals an OOXML file ({@code OfficeXmlFileException}), the OLE2
 * file system is closed and the file is reopened read-only as an
 * {@code OPCPackage} instead. On every failure path the OLE2 file system is
 * closed quietly before the exception leaves this method, so that the file
 * handle is released.
 *
 * @param f the file to create an extractor for
 * @return the extractor for the file
 * @throws IllegalArgumentException if the file is neither an OLE2 nor an OOXML
 *         file; the underlying {@code NotOLE2FileException} is attached as cause
 * @throws IOException if reading the file fails
 * @throws OpenXML4JException propagated from extractor creation
 * @throws XmlException propagated from extractor creation
 */
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
    NPOIFSFileSystem fs = null;
    try {
        fs = new NPOIFSFileSystem(f);
        POIOLE2TextExtractor extractor = createExtractor(fs);
        // Hand the file system to the extractor - presumably so that closing the
        // extractor also closes the file system; confirm against setFilesystem().
        extractor.setFilesystem(fs);
        return extractor;
    } catch (OfficeXmlFileException e) {
        // ensure file-handle release before reopening the file as OOXML
        IOUtils.closeQuietly(fs);
        // NOTE(review): if the delegate throws, it is not visible here whether the
        // freshly opened OPCPackage gets closed - confirm in createExtractor(OPCPackage).
        return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
    } catch (NotOLE2FileException ne) {
        // ensure file-handle release
        IOUtils.closeQuietly(fs);
        // keep the original exception as cause so its details and stack trace are not lost
        throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
    } catch (OpenXML4JException e) {
        // ensure file-handle release
        IOUtils.closeQuietly(fs);
        throw e;
    } catch (XmlException e) {
        // ensure file-handle release
        IOUtils.closeQuietly(fs);
        throw e;
    } catch (IOException e) {
        // ensure file-handle release
        IOUtils.closeQuietly(fs);
        throw e;
    } catch (RuntimeException e) {
        // ensure file-handle release
        IOUtils.closeQuietly(fs);
        throw e;
    }
}
use of org.apache.poi.POIOLE2TextExtractor in project poi by apache.
the class TestExtractorFactory method testEmbeded.
/**
 * Test embedded docs text extraction. For now, only
 *  does poifs embedded, but will do ooxml ones
 *  at some point.
 * <p>
 * Each sample file is opened, its embedded extractors are counted by type and
 * compared with the expected numbers; see the private helper below.
 */
@Test
public void testEmbeded() throws Exception {
    // Expected counts are given in the order: PowerPoint, Excel, Word (OLE2), Word (OOXML), Outlook

    // No embeddings
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls), 0, 0, 0, 0, 0);

    // Excel
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb), 2, 2, 2, 0, 0);

    // Word
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb), 1, 2, 1, 0, 0);

    // Word which contains an OOXML file
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML), 1, 1, 0, 1, 0);

    // Outlook
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb), 0, 0, 1, 0, 0);

    // Outlook with another outlook file in it
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg), 0, 0, 0, 0, 1);

    // TODO - PowerPoint
    // TODO - Publisher
    // TODO - Visio
}

/**
 * Checks the embedded-document extractors of {@code ext} and then closes it.
 * <p>
 * Asserts, in this order: the number of embedded extractors equals the sum of
 * the expected per-type counts; every embedded extractor returns more than 20
 * characters of text; and the number of extractors of each type matches the
 * expected value. An embedded extractor is counted for the first matching type
 * in the order PowerPoint, Excel, Word (OLE2), Outlook, Word (OOXML).
 * <p>
 * {@code ext} is closed even when one of the assertions fails, so that a
 * failing check does not leave the underlying file open.
 *
 * @param ext the parent extractor to inspect; closed by this method
 * @param expectedPpt expected number of {@code PowerPointExtractor} embeds
 * @param expectedXls expected number of {@code ExcelExtractor} embeds
 * @param expectedWord expected number of {@code WordExtractor} embeds
 * @param expectedWordX expected number of {@code XWPFWordExtractor} embeds
 * @param expectedMsg expected number of {@code OutlookTextExtactor} embeds
 * @throws Exception if retrieving the embedded extractors or closing {@code ext} fails
 */
private static void assertEmbeddedCounts(POIOLE2TextExtractor ext, int expectedPpt, int expectedXls,
        int expectedWord, int expectedWordX, int expectedMsg) throws Exception {
    try {
        POITextExtractor[] embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
        // Every embed is expected to fall into exactly one of the counted types,
        // so the total is the sum of the per-type expectations.
        assertEquals(expectedPpt + expectedXls + expectedWord + expectedWordX + expectedMsg, embeds.length);

        int numPpt = 0;
        int numXls = 0;
        int numWord = 0;
        int numMsg = 0;
        int numWordX = 0;
        for (POITextExtractor embed : embeds) {
            assertTrue(embed.getText().length() > 20);
            if (embed instanceof PowerPointExtractor) {
                numPpt++;
            } else if (embed instanceof ExcelExtractor) {
                numXls++;
            } else if (embed instanceof WordExtractor) {
                numWord++;
            } else if (embed instanceof OutlookTextExtactor) {
                numMsg++;
            } else if (embed instanceof XWPFWordExtractor) {
                numWordX++;
            }
        }

        assertEquals(expectedPpt, numPpt);
        assertEquals(expectedXls, numXls);
        assertEquals(expectedWord, numWord);
        assertEquals(expectedWordX, numWordX);
        assertEquals(expectedMsg, numMsg);
    } finally {
        ext.close();
    }
}
Aggregations