use of org.apache.poi.hsmf.extractor.OutlookTextExtactor in project poi by apache.
the class OLE2ScratchpadExtractorFactory method identifyEmbeddedResources.
/**
 * Collects the embedded resources of the document behind the given extractor.
 * <p>
 * Nothing is returned; the two supplied lists are appended to instead.
 * Embedded POIFS entries are added to {@code dirs}, and attachments that are
 * only available as raw bytes are added to {@code nonPOIFS}, each wrapped in
 * a {@link ByteArrayInputStream}.
 * <p>
 * Only {@link WordExtractor} and {@link OutlookTextExtactor} are handled.
 * For any other extractor type both lists are left untouched.
 *
 * @param ext the extractor whose underlying document is searched
 * @param dirs receives the embedded POIFS entries that were found
 * @param nonPOIFS receives one stream per attachment that has inline data
 * @throws IOException if one of the underlying POI calls fails with an I/O error
 * @throws IllegalStateException if the extractor has no root directory
 */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
    // Find all the embedded directories
    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof WordExtractor) {
        // These are in ObjectPool -> _... under the root
        try {
            Entry pool = root.getEntry("ObjectPool");
            // Guard the cast: in a malformed file "ObjectPool" could be a plain
            // stream rather than a directory, which has no children to inspect.
            if (pool instanceof DirectoryEntry) {
                Iterator<Entry> children = ((DirectoryEntry) pool).getEntries();
                while (children.hasNext()) {
                    Entry child = children.next();
                    if (child.getName().startsWith("_")) {
                        dirs.add(child);
                    }
                }
            }
        } catch (FileNotFoundException ignored) {
            // Deliberately ignored: presumably raised when the document has no
            // ObjectPool entry, in which case there is nothing to collect.
        }
    } else if (ext instanceof OutlookTextExtactor) {
        // Stored in the Attachment blocks
        MAPIMessage msg = ((OutlookTextExtactor) ext).getMAPIMessage();
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            if (attachment.getAttachData() != null) {
                // Inline attachment: expose its bytes as a stream
                byte[] data = attachment.getAttachData().getValue();
                nonPOIFS.add(new ByteArrayInputStream(data));
            } else if (attachment.getAttachmentDirectory() != null) {
                // Attachment stored as a nested directory
                dirs.add(attachment.getAttachmentDirectory().getDirectory());
            }
        }
    }
    // TODO PowerPointExtractor: tricky, embedded objects are not stored
    // directly in the POIFS, so they are not collected yet.
}
use of org.apache.poi.hsmf.extractor.OutlookTextExtactor in project poi by apache.
the class TestExtractorFactory method testPOIFS.
/**
 * Checks that {@link ExtractorFactory} picks the expected extractor class for
 * each sample file when it is handed a {@link POIFSFileSystem}, that the
 * extracted text has a plausible minimum length, and that the text sample
 * file is rejected with an {@link IOException}.
 */
@Test
public void testPOIFS() throws Exception {
    // Excel
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(xls)), ExcelExtractor.class, 200);
    // Word
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(doc)), WordExtractor.class, 120);
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(doc6)), Word6Extractor.class, 20);
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(doc95)), Word6Extractor.class, 120);
    // PowerPoint
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(ppt)), PowerPointExtractor.class, 120);
    // Visio
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(vsd)), VisioTextExtractor.class, 50);
    // Publisher
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(pub)), PublisherTextExtractor.class, 50);
    // Outlook msg
    assertPoifsExtractor(new POIFSFileSystem(new FileInputStream(msg)), OutlookTextExtactor.class, 50);
    // Text
    try {
        ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
        fail("Expected an IOException for a plain text file");
    } catch (IOException e) {
        // Good
    }
}

/**
 * Creates an extractor for the given filesystem, asserts its type and that
 * the extracted text is longer than {@code minTextLength} characters, and
 * always closes the extractor afterwards so a failing assertion does not
 * leak it.
 * NOTE(review): closing the extractor is assumed to release the filesystem
 * it was created from as well - confirm.
 *
 * @param fs the opened OLE2 filesystem to create the extractor from
 * @param expectedType the extractor class the factory is expected to return
 * @param minTextLength the extracted text must be strictly longer than this
 */
private static void assertPoifsExtractor(POIFSFileSystem fs, Class<?> expectedType, int minTextLength) throws Exception {
    POITextExtractor extractor = ExtractorFactory.createExtractor(fs);
    try {
        assertTrue("Expected instanceof " + expectedType.getName() + ", but had: " + extractor.getClass(),
                expectedType.isInstance(extractor));
        assertTrue(extractor.getText().length() > minTextLength);
    } finally {
        extractor.close();
    }
}
use of org.apache.poi.hsmf.extractor.OutlookTextExtactor in project poi by apache.
the class TestExtractorFactory method testFile.
/**
 * Checks that {@link ExtractorFactory} picks the expected extractor class
 * for each sample file when it is handed the file directly, and that the
 * extracted text has a plausible minimum length or contains a known marker.
 * OOXML-Strict and the text sample file are expected to be rejected.
 */
@Test
public void testFile() throws Exception {
    // Excel
    assertTrue(extractTextAndClose(xls, ExcelExtractor.class).length() > 200);
    assertTrue(extractTextAndClose(xlsx, XSSFExcelExtractor.class).length() > 200);
    assertContains(extractTextAndClose(xltx, XSSFExcelExtractor.class), "test");
    // No specific extractor class is pinned for xlsb, only its text
    assertContains(extractTextAndClose(xlsb, POITextExtractor.class), "test");

    // TODO Support OOXML-Strict, see bug #57699. Once supported, check
    // xlsxStrict like xlsx: XSSFExcelExtractor with text containing "test".
    try {
        ExtractorFactory.createExtractor(xlsxStrict);
        fail("OOXML-Strict isn't yet supported");
    } catch (POIXMLException e) {
        // Expected, for now
    }

    // Word
    assertTrue(extractTextAndClose(doc, WordExtractor.class).length() > 120);
    assertTrue(extractTextAndClose(doc6, Word6Extractor.class).length() > 20);
    assertTrue(extractTextAndClose(doc95, Word6Extractor.class).length() > 120);
    assertTrue(extractTextAndClose(docx, XWPFWordExtractor.class).length() > 120);
    assertContains(extractTextAndClose(dotx, XWPFWordExtractor.class), "Test");

    // PowerPoint (PPT)
    assertTrue(extractTextAndClose(ppt, PowerPointExtractor.class).length() > 120);
    // PowerPoint (PPTX)
    assertTrue(extractTextAndClose(pptx, XSLFPowerPointExtractor.class).length() > 120);

    // Visio - binary
    assertTrue(extractTextAndClose(vsd, VisioTextExtractor.class).length() > 50);
    // Visio - vsdx
    assertTrue(extractTextAndClose(vsdx, XDGFVisioExtractor.class).length() > 20);

    // Publisher
    assertTrue(extractTextAndClose(pub, PublisherTextExtractor.class).length() > 50);

    // Outlook msg
    assertTrue(extractTextAndClose(msg, OutlookTextExtactor.class).length() > 50);

    // Text
    try {
        ExtractorFactory.createExtractor(txt);
        fail();
    } catch (IllegalArgumentException e) {
        // Good
    }
}

/**
 * Creates an extractor for the given file, asserts that it is an instance of
 * {@code expectedType}, and returns the extracted text. The extractor is
 * closed in all cases, so a failing assertion does not leave the file open.
 *
 * @param file the sample file to create an extractor for
 * @param expectedType the extractor class the factory is expected to return
 * @return the text extracted from the file
 */
private static String extractTextAndClose(File file, Class<?> expectedType) throws Exception {
    POITextExtractor extractor = ExtractorFactory.createExtractor(file);
    try {
        assertNotNull("Had empty extractor for " + file, extractor);
        assertTrue("Expected instanceof " + expectedType.getName() + ", but had: " + extractor.getClass(),
                expectedType.isInstance(extractor));
        return extractor.getText();
    } finally {
        // Null only when assertNotNull above has already failed
        if (extractor != null) {
            extractor.close();
        }
    }
}
use of org.apache.poi.hsmf.extractor.OutlookTextExtactor in project poi by apache.
the class TestFixedSizedProperties method testReadMessageDateFailsWithOutlookTextExtractor.
/**
 * Verifies that the text produced by {@link OutlookTextExtactor} for the
 * {@code mapiMessageFails} message contains the expected "Date:" line.
 */
// @Ignore("TODO Work out why the Thu 21st vs Monday 25th problem is occurring and fix")
@Test
public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception {
    OutlookTextExtactor extractor = new OutlookTextExtactor(mapiMessageFails);
    // The message is a re-used test resource, so detach the filesystem
    // before the extractor is closed at the end of the test.
    extractor.setFilesystem(null);
    assertContains(extractor.getText(), "Date: Thu, 21 Jun 2012 14:14:04 +0000\n");
    extractor.close();
}
Aggregations