use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestBugs method test51678And51524.
/**
 * Bug 51678 - Extracting text from Bug51524.zip is slow.
 * Bug 51524 - PapBinTable constructor is slow.
 * <p>
 * Runs only when the {@code poi.test.remote} system property is set, because
 * the sample document is opened from a remote URL. The test makes no
 * assertions on the extracted text; it only checks that extraction completes
 * without throwing.
 *
 * @throws IOException if opening, reading or closing the document fails
 */
@Test
public void test51678And51524() throws IOException {
    // TODO: refactor into something nicer!
    if (System.getProperty("poi.test.remote") != null) {
        String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
        HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile(href);
        try {
            WordExtractor wordExtractor = new WordExtractor(hwpfDocument);
            try {
                wordExtractor.getText();
            } finally {
                wordExtractor.close();
            }
        } finally {
            // Close the document as well, as the other tests in this class do.
            hwpfDocument.close();
        }
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestBugs method getText.
/**
 * Opens the named sample document and returns the text produced by
 * {@link WordExtractor#getText()}.
 * <p>
 * The extractor and the document are closed before returning. The document is
 * closed even if creating or closing the extractor throws.
 *
 * @param samplefile name of the sample file, passed to
 *                   {@code HWPFTestDataSamples.openSampleFile}
 * @return the extracted text
 * @throws IOException if opening, reading or closing the document fails
 */
private String getText(String samplefile) throws IOException {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile(samplefile);
    try {
        WordExtractor extractor = new WordExtractor(doc);
        try {
            return extractor.getText();
        } finally {
            extractor.close();
        }
    } finally {
        // Separate finally so a failure in the extractor cannot skip this close.
        doc.close();
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestBugs method test44431.
/**
 * Bug 44431 - HWPFDocument.write destroys fields.
 * <p>
 * Writes the sample document out, reads it back, and checks that footer text,
 * header text, paragraph text and full text are the same (ignoring newline
 * differences) before and after the round trip.
 *
 * @throws IOException if opening, writing, reading or closing a document fails
 */
@SuppressWarnings("deprecation")
@Test
public void test44431() throws IOException {
    HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug44431.doc");
    try {
        WordExtractor extractor1 = new WordExtractor(doc1);
        try {
            HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
            try {
                WordExtractor extractor2 = new WordExtractor(doc2);
                try {
                    assertEqualsIgnoreNewline(extractor1.getFooterText(), extractor2.getFooterText());
                    assertEqualsIgnoreNewline(extractor1.getHeaderText(), extractor2.getHeaderText());
                    assertEqualsIgnoreNewline(Arrays.toString(extractor1.getParagraphText()), Arrays.toString(extractor2.getParagraphText()));
                    assertEqualsIgnoreNewline(extractor1.getText(), extractor2.getText());
                } finally {
                    extractor2.close();
                }
            } finally {
                // The round-tripped document is a separate resource and must be closed too.
                doc2.close();
            }
        } finally {
            extractor1.close();
        }
    } finally {
        doc1.close();
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestProblems method testProblemHeaderStories49936.
/**
 * Bug #49936 - Problems with reading the header out of the Header Stories.
 * <p>
 * Checks the header and footer strings reported by {@link HeaderStories} and
 * by {@link WordExtractor} for the sample file
 * {@code HeaderFooterProblematic.doc}.
 *
 * @throws IOException if opening or closing the document fails
 */
@SuppressWarnings("deprecation")
@Test
public void testProblemHeaderStories49936() throws IOException {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile("HeaderFooterProblematic.doc");
    try {
        HeaderStories hs = new HeaderStories(doc);
        assertEquals("", hs.getFirstHeader());
        assertEquals("\r", hs.getEvenHeader());
        assertEquals("", hs.getOddHeader());
        assertEquals("", hs.getFirstFooter());
        assertEquals("", hs.getEvenFooter());
        assertEquals("", hs.getOddFooter());
        WordExtractor ext = new WordExtractor(doc);
        try {
            assertEquals("\n", ext.getHeaderText());
            assertEquals("", ext.getFooterText());
        } finally {
            ext.close();
        }
    } finally {
        // try/finally so a failing assertion does not leave the document open.
        doc.close();
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class OLE2ScratchpadExtractorFactory method identifyEmbeddedResources.
/**
 * Collects the embedded resources of the document behind the given extractor
 * into the two supplied lists. Nothing is returned; the lists are appended to
 * in place.
 * <ul>
 * <li>For a {@link WordExtractor}, every entry under the root's
 *     {@code ObjectPool} directory whose name starts with {@code "_"} is added
 *     to {@code dirs}. If looking up {@code ObjectPool} raises a
 *     {@link FileNotFoundException}, nothing is added.</li>
 * <li>For an {@link OutlookTextExtactor}, each attachment with non-null attach
 *     data has its bytes added to {@code nonPOIFS} as a
 *     {@link ByteArrayInputStream}; otherwise, if it has a non-null attachment
 *     directory, that directory is added to {@code dirs}.</li>
 * <li>For any other extractor type, both lists are left unchanged.</li>
 * </ul>
 *
 * @param ext      the extractor whose document is inspected
 * @param dirs     receives the embedded directory entries that were found
 * @param nonPOIFS receives streams over embedded attachment data
 * @throws IllegalStateException if {@code ext.getRoot()} returns {@code null}
 * @throws IOException declared by the signature
 */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
    final DirectoryEntry rootDir = ext.getRoot();
    if (rootDir == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof WordExtractor) {
        // Embedded objects live in ObjectPool -> _... under the root
        try {
            final DirectoryEntry objectPool = (DirectoryEntry) rootDir.getEntry("ObjectPool");
            for (Iterator<Entry> children = objectPool.getEntries(); children.hasNext();) {
                final Entry child = children.next();
                if (child.getName().startsWith("_")) {
                    dirs.add(child);
                }
            }
        } catch (FileNotFoundException ignored) {
            // ignored here
        }
        return;
    }

    // TODO: PowerPointExtractor is not handled - tricky, its embedded
    // objects are not stored directly in poifs.

    if (!(ext instanceof OutlookTextExtactor)) {
        return;
    }

    // Outlook keeps embedded content in the Attachment blocks
    final MAPIMessage message = ((OutlookTextExtactor) ext).getMAPIMessage();
    for (AttachmentChunks chunks : message.getAttachmentFiles()) {
        if (chunks.getAttachData() != null) {
            final byte[] bytes = chunks.getAttachData().getValue();
            nonPOIFS.add(new ByteArrayInputStream(bytes));
            continue;
        }
        if (chunks.getAttachmentDirectory() != null) {
            dirs.add(chunks.getAttachmentDirectory().getDirectory());
        }
    }
}
Aggregations