use of org.apache.poi.hwpf.extractor.WordExtractor in project Xponents by OpenSextant.
the class MSDocConverter method conversionImplementation.
/**
 * Extracts the paragraph text of a Word document supplied as a stream.
 * <p>
 * Every paragraph returned by the extractor is passed through
 * {@link WordExtractor#stripFields(String)}, trimmed, and appended to the
 * output followed by a single newline.
 *
 * @param input stream holding the Word document; this method always closes
 *              it, whether extraction succeeds or fails
 * @param doc   file handed to the {@link ConvertedDocument} constructor
 * @return a converted document whose text is the concatenated paragraphs
 * @throws IOException if the document cannot be read or a resource cannot
 *                     be closed
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    WordExtractor ex = null;
    try {
        ex = new WordExtractor(input);
        String[] ps = ex.getParagraphText();

        StringBuilder sb = new StringBuilder();
        for (String paragraph : ps) {
            sb.append(WordExtractor.stripFields(paragraph).trim());
            sb.append('\n');
        }

        ConvertedDocument textdoc = new ConvertedDocument(doc);
        textdoc.setText(sb.toString());
        return textdoc;
    } finally {
        // Release both resources even when extraction fails part-way; the
        // nested finally guarantees the stream is closed if ex.close() throws.
        try {
            if (ex != null) {
                ex.close();
            }
        } finally {
            if (input != null) {
                input.close();
            }
        }
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class ExtractorFactory method getEmbededDocsTextExtractors.
/**
 * Creates one text extractor per document embedded in the file that the
 * given extractor was built from.
 * <p>
 * Where embedded content is looked for depends on the extractor type:
 * Excel files are searched for root entries whose name starts with
 * {@code MBD}, Word files for {@code ObjectPool} entries whose name starts
 * with {@code _}, and Outlook messages for their attachments. Any other
 * extractor type yields no embedded documents.
 *
 * @param ext the extractor whose underlying file is inspected
 * @return an open {@link POITextExtractor} for each embedded document, or
 *         an empty array when nothing was found
 * @throws IllegalStateException if the extractor has no root directory
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
    // Embedded documents held as POIFS directories
    ArrayList<Entry> embeddedDirs = new ArrayList<Entry>();
    // Embedded documents that are only available as raw bytes
    ArrayList<InputStream> rawStreams = new ArrayList<InputStream>();

    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof ExcelExtractor) {
        // Excel: MBD... entries directly under the root
        for (Iterator<Entry> entries = root.getEntries(); entries.hasNext(); ) {
            Entry candidate = entries.next();
            if (candidate.getName().startsWith("MBD")) {
                embeddedDirs.add(candidate);
            }
        }
    } else if (ext instanceof WordExtractor) {
        // Word: ObjectPool -> _... entries under the root
        try {
            DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
            for (Iterator<Entry> entries = objectPool.getEntries(); entries.hasNext(); ) {
                Entry candidate = entries.next();
                if (candidate.getName().startsWith("_")) {
                    embeddedDirs.add(candidate);
                }
            }
        } catch (FileNotFoundException e) {
            // Deliberately not rethrown: the lookup failure is only logged
            logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
        }
        // TODO PowerPoint (PowerPointExtractor): tricky, not stored directly in poifs
    } else if (ext instanceof OutlookTextExtactor) {
        // Outlook: embedded content is stored in the attachment blocks
        MAPIMessage message = ((OutlookTextExtactor) ext).getMAPIMessage();
        for (AttachmentChunks attachment : message.getAttachmentFiles()) {
            if (attachment.getAttachData() != null) {
                rawStreams.add(new ByteArrayInputStream(attachment.getAttachData().getValue()));
            } else if (attachment.getAttachmentDirectory() != null) {
                embeddedDirs.add(attachment.getAttachmentDirectory().getDirectory());
            }
        }
    }

    // Nothing embedded at all: hand back an empty array
    if (embeddedDirs.isEmpty() && rawStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    ArrayList<POITextExtractor> extractors = new ArrayList<POITextExtractor>();
    for (Entry embeddedDir : embeddedDirs) {
        DirectoryNode node = (DirectoryNode) embeddedDir;
        extractors.add(createExtractor(node));
    }
    for (InputStream rawStream : rawStreams) {
        try {
            POITextExtractor created = createExtractor(rawStream);
            extractors.add(created);
        } catch (IllegalArgumentException e) {
            // The stream did not contain a format supported so far; skip it
            logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
        } catch (XmlException e) {
            throw new IOException(e.getMessage(), e);
        } catch (OpenXML4JException e) {
            throw new IOException(e.getMessage(), e);
        }
    }

    POITextExtractor[] result = new POITextExtractor[extractors.size()];
    return extractors.toArray(result);
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestExtractorFactory method testOPOIFS.
/**
 * Checks that an extractor of the expected type is created from an
 * {@code OPOIFSFileSystem} for each sample file, that the extracted text
 * exceeds a minimum length, and that a plain text file is rejected with an
 * {@link IOException}.
 */
@Test
public void testOPOIFS() throws Exception {
    Object created;
    String text;

    // Excel
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)));
    assertTrue(created instanceof ExcelExtractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText();
    assertTrue(text.length() > 200);

    // Word
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)));
    assertTrue(created instanceof WordExtractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText();
    assertTrue(text.length() > 120);

    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)));
    assertTrue(created instanceof Word6Extractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText();
    assertTrue(text.length() > 20);

    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)));
    assertTrue(created instanceof Word6Extractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText();
    assertTrue(text.length() > 120);

    // PowerPoint
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)));
    assertTrue(created instanceof PowerPointExtractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText();
    assertTrue(text.length() > 120);

    // Visio
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)));
    assertTrue(created instanceof VisioTextExtractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText();
    assertTrue(text.length() > 50);

    // Publisher
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)));
    assertTrue(created instanceof PublisherTextExtractor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText();
    assertTrue(text.length() > 50);

    // Outlook msg
    created = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)));
    assertTrue(created instanceof OutlookTextExtactor);
    text = ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText();
    assertTrue(text.length() > 50);

    // Text: an IOException is the expected outcome
    boolean rejected = false;
    try {
        ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
    } catch (IOException expected) {
        rejected = true;
    }
    if (!rejected) {
        fail();
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestExtractorFactory method testInputStream.
/**
 * Checks that an extractor of the expected type is created directly from a
 * {@link FileInputStream} for each sample file, that the extracted text
 * exceeds a minimum length, and that a plain text file is rejected with an
 * {@link IllegalArgumentException}. Every extractor is closed after use.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
    assertTrue(xlsExtractor instanceof ExcelExtractor);
    assertTrue(xlsExtractor.getText().length() > 200);
    xlsExtractor.close();

    POITextExtractor xlsxExtractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
    assertTrue(xlsxExtractor.getClass().getName(), xlsxExtractor instanceof XSSFExcelExtractor);
    assertTrue(xlsxExtractor.getText().length() > 200);
    // TODO Support OOXML-Strict, see bug #57699: once supported, repeat the
    // two assertions above (XSSFExcelExtractor type, text length > 200)
    // for an extractor created from new FileInputStream(xlsxStrict)
    xlsxExtractor.close();

    // Word
    POITextExtractor docExtractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
    assertTrue(docExtractor.getClass().getName(), docExtractor instanceof WordExtractor);
    assertTrue(docExtractor.getText().length() > 120);
    docExtractor.close();

    POITextExtractor doc6Extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
    assertTrue(doc6Extractor.getClass().getName(), doc6Extractor instanceof Word6Extractor);
    assertTrue(doc6Extractor.getText().length() > 20);
    doc6Extractor.close();

    POITextExtractor doc95Extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
    assertTrue(doc95Extractor.getClass().getName(), doc95Extractor instanceof Word6Extractor);
    assertTrue(doc95Extractor.getText().length() > 120);
    doc95Extractor.close();

    POITextExtractor docxExtractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
    assertTrue(docxExtractor instanceof XWPFWordExtractor);
    assertTrue(docxExtractor.getText().length() > 120);
    docxExtractor.close();

    // PowerPoint
    POITextExtractor pptExtractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
    assertTrue(pptExtractor instanceof PowerPointExtractor);
    assertTrue(pptExtractor.getText().length() > 120);
    pptExtractor.close();

    POITextExtractor pptxExtractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
    assertTrue(pptxExtractor instanceof XSLFPowerPointExtractor);
    assertTrue(pptxExtractor.getText().length() > 120);
    pptxExtractor.close();

    // Visio
    POITextExtractor vsdExtractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
    assertTrue(vsdExtractor instanceof VisioTextExtractor);
    assertTrue(vsdExtractor.getText().length() > 50);
    vsdExtractor.close();

    // Visio - vsdx
    POITextExtractor vsdxExtractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
    assertTrue(vsdxExtractor instanceof XDGFVisioExtractor);
    assertTrue(vsdxExtractor.getText().length() > 20);
    vsdxExtractor.close();

    // Publisher
    POITextExtractor pubExtractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
    assertTrue(pubExtractor instanceof PublisherTextExtractor);
    assertTrue(pubExtractor.getText().length() > 50);
    pubExtractor.close();

    // Outlook msg
    POITextExtractor msgExtractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
    assertTrue(msgExtractor instanceof OutlookTextExtactor);
    assertTrue(msgExtractor.getText().length() > 50);
    msgExtractor.close();

    // Text: an IllegalArgumentException is the expected outcome
    try {
        FileInputStream textStream = new FileInputStream(txt);
        try {
            ExtractorFactory.createExtractor(textStream);
            fail();
        } finally {
            // Close the stream whether or not the factory rejected it
            IOUtils.closeQuietly(textStream);
        }
    } catch (IllegalArgumentException expected) {
        // Good
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestExtractorFactory method testEmbeded.
/**
 * Tests text extraction from embedded documents. At present only
 * POIFS-embedded content is covered; OOXML-embedded content is meant to be
 * added at some point.
 * <p>
 * For each host file the embedded extractors are counted by type and the
 * counts are compared with the expected numbers.
 */
@Test
public void testEmbeded() throws Exception {
    POIOLE2TextExtractor host;
    POITextExtractor[] embedded;
    int words;
    int sheets;
    int slides;
    int messages;
    int wordsOoxml;

    // No embeddings
    host = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    assertEquals(0, embedded.length);
    host.close();

    // Excel
    host = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    assertEquals(6, embedded.length);
    words = sheets = slides = messages = 0;
    for (POITextExtractor item : embedded) {
        assertTrue(item.getText().length() > 20);
        if (item instanceof PowerPointExtractor) {
            slides++;
        } else if (item instanceof ExcelExtractor) {
            sheets++;
        } else if (item instanceof WordExtractor) {
            words++;
        } else if (item instanceof OutlookTextExtactor) {
            messages++;
        }
    }
    assertEquals(2, slides);
    assertEquals(2, sheets);
    assertEquals(2, words);
    assertEquals(0, messages);
    host.close();

    // Word
    host = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    words = sheets = slides = messages = 0;
    assertEquals(4, embedded.length);
    for (POITextExtractor item : embedded) {
        assertTrue(item.getText().length() > 20);
        if (item instanceof PowerPointExtractor) {
            slides++;
        } else if (item instanceof ExcelExtractor) {
            sheets++;
        } else if (item instanceof WordExtractor) {
            words++;
        } else if (item instanceof OutlookTextExtactor) {
            messages++;
        }
    }
    assertEquals(1, slides);
    assertEquals(2, sheets);
    assertEquals(1, words);
    assertEquals(0, messages);
    host.close();

    // Word which contains an OOXML file
    host = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    words = sheets = slides = messages = wordsOoxml = 0;
    assertEquals(3, embedded.length);
    for (POITextExtractor item : embedded) {
        assertTrue(item.getText().length() > 20);
        if (item instanceof PowerPointExtractor) {
            slides++;
        } else if (item instanceof ExcelExtractor) {
            sheets++;
        } else if (item instanceof WordExtractor) {
            words++;
        } else if (item instanceof OutlookTextExtactor) {
            messages++;
        } else if (item instanceof XWPFWordExtractor) {
            wordsOoxml++;
        }
    }
    assertEquals(1, slides);
    assertEquals(1, sheets);
    assertEquals(0, words);
    assertEquals(1, wordsOoxml);
    assertEquals(0, messages);
    host.close();

    // Outlook
    host = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    words = sheets = slides = messages = 0;
    assertEquals(1, embedded.length);
    for (POITextExtractor item : embedded) {
        assertTrue(item.getText().length() > 20);
        if (item instanceof PowerPointExtractor) {
            slides++;
        } else if (item instanceof ExcelExtractor) {
            sheets++;
        } else if (item instanceof WordExtractor) {
            words++;
        } else if (item instanceof OutlookTextExtactor) {
            messages++;
        }
    }
    assertEquals(0, slides);
    assertEquals(0, sheets);
    assertEquals(1, words);
    assertEquals(0, messages);
    host.close();

    // Outlook with another outlook file in it
    host = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(host);
    words = sheets = slides = messages = 0;
    assertEquals(1, embedded.length);
    for (POITextExtractor item : embedded) {
        assertTrue(item.getText().length() > 20);
        if (item instanceof PowerPointExtractor) {
            slides++;
        } else if (item instanceof ExcelExtractor) {
            sheets++;
        } else if (item instanceof WordExtractor) {
            words++;
        } else if (item instanceof OutlookTextExtactor) {
            messages++;
        }
    }
    assertEquals(0, slides);
    assertEquals(0, sheets);
    assertEquals(0, words);
    assertEquals(1, messages);
    host.close();

    // TODO - PowerPoint
    // TODO - Publisher
    // TODO - Visio
}
Aggregations