use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestBugs method test45473.
/**
 * Bug 45473 - HWPF cannot read file after save
 *
 * <p>Extracts the text of the sample document, then opens the sample again,
 * writes it out and reads it back, and checks that the extracted text of the
 * round-tripped copy matches the original once line feeds are disregarded.
 */
@Test
public void test45473() throws IOException {
    // Baseline: text of the sample exactly as it ships
    HWPFDocument pristine = HWPFTestDataSamples.openSampleFile("Bug45473.doc");
    WordExtractor pristineExtractor = new WordExtractor(pristine);
    final String expectedText;
    try {
        expectedText = pristineExtractor.getText().trim();
    } finally {
        pristineExtractor.close();
        pristine.close();
    }

    // Round trip: open the sample afresh, save it, read the saved copy back
    HWPFDocument reopened = HWPFTestDataSamples.openSampleFile("Bug45473.doc");
    HWPFDocument roundTripped = HWPFTestDataSamples.writeOutAndReadBack(reopened);
    WordExtractor roundTrippedExtractor = new WordExtractor(roundTripped);
    final String actualText;
    try {
        actualText = roundTrippedExtractor.getText().trim();
    } finally {
        roundTrippedExtractor.close();
        reopened.close();
    }

    // Saving is allowed to change line separators, so "\n" is stripped from
    // both sides before the comparison
    String expectedFlat = expectedText.replaceAll("\n", "");
    String actualFlat = actualText.replaceAll("\n", "");
    assertEqualsIgnoreNewline(expectedFlat, actualFlat);
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project Gargoyle by callakrsos.
the class DocFileParser method DocFileContentParser.
/**
 * Extracts the text content of a legacy Office file.
 *
 * <p>The file is opened as a {@link POIFSFileSystem} and the extractor is
 * picked from the file-name suffix:
 * <ul>
 *   <li>{@code .doc} - {@link WordExtractor}</li>
 *   <li>{@code .xls} - {@link ExcelExtractor}, configured to emit formulas
 *       rather than results and to include sheet names</li>
 *   <li>{@code .ppt} - {@link PowerPointExtractor}</li>
 * </ul>
 * The suffix comparison is case-sensitive.
 *
 * @param fileName path of the file to read
 * @return the extracted text, or an empty string when the suffix is not one
 *         of the three above or when any exception is raised while opening
 *         or parsing the file
 */
public String DocFileContentParser(String fileName) {
    // The stream is scoped to this method so that it is closed on every exit
    // path, instead of depending on what the POIFS constructor does with it.
    try (FileInputStream in = new FileInputStream(fileName)) {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        if (fileName.endsWith(".doc")) {
            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);
            return we.getText();
        } else if (fileName.endsWith(".xls")) {
            ExcelExtractor ex = new ExcelExtractor(fs);
            ex.setFormulasNotResults(true);
            ex.setIncludeSheetNames(true);
            return ex.getText();
        } else if (fileName.endsWith(".ppt")) {
            PowerPointExtractor extractor = new PowerPointExtractor(fs);
            return extractor.getText();
        }
    } catch (Exception e) {
        // Deliberately best-effort: a file that cannot be parsed yields "".
        // The exception is handed to the logger so the cause is not lost.
        LOGGER.debug("document file cant be indexed", e);
    }
    return "";
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class HWPFFileHandler method test.
// Convenience entry point: runs this handler against a single sample file so
// it can be checked locally without executing the full TestAllFiles
@Override
@Test
public void test() throws Exception {
    final File sample = new File("test-data/document/52117.doc");

    // Phase 1: feed the raw bytes through handleFile
    final InputStream handlerInput = new FileInputStream(sample);
    try {
        handleFile(handlerInput, sample.getPath());
    } finally {
        handlerInput.close();
    }

    // Phase 2: run the extraction checks on the file itself
    handleExtracting(sample);

    // Phase 3: build a WordExtractor straight from a fresh stream and make
    // sure it yields some text
    final InputStream extractorInput = new FileInputStream(sample);
    try {
        final WordExtractor textExtractor = new WordExtractor(extractorInput);
        try {
            assertNotNull(textExtractor.getText());
        } finally {
            textExtractor.close();
        }
    } finally {
        extractorInput.close();
    }
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestBugs method test47286.
/**
 * [FAILING] Bug 47286 - Word documents saves in wrong format if source
 * contains form elements
 *
 * <p>Compares the sample document with a written-out-and-read-back copy of
 * itself: extracted text (ignoring line feeds), number of character text
 * runs, number of main-part field entries, and table structure.
 */
@SuppressWarnings("deprecation")
@Test
public void test47286() throws IOException {
    // Baseline: text of the sample exactly as it ships
    HWPFDocument pristine = HWPFTestDataSamples.openSampleFile("Bug47286.doc");
    WordExtractor pristineExtractor = new WordExtractor(pristine);
    final String expectedText;
    try {
        expectedText = pristineExtractor.getText().trim();
    } finally {
        pristineExtractor.close();
        pristine.close();
    }

    // Round trip: open the sample afresh, save it, read the saved copy back
    HWPFDocument reopened = HWPFTestDataSamples.openSampleFile("Bug47286.doc");
    HWPFDocument roundTripped = HWPFTestDataSamples.writeOutAndReadBack(reopened);
    WordExtractor roundTrippedExtractor = new WordExtractor(roundTripped);
    final String actualText;
    try {
        actualText = roundTrippedExtractor.getText().trim();
    } finally {
        roundTrippedExtractor.close();
        reopened.close();
    }

    // Saving is allowed to change line separators, so "\n" is stripped from
    // both sides before the comparison
    String expectedFlat = expectedText.replaceAll("\n", "");
    String actualFlat = actualText.replaceAll("\n", "");
    assertEqualsIgnoreNewline(expectedFlat, actualFlat);

    // NOTE: 'reopened' has already been closed above; the structural checks
    // below still query it, exactly as the test has always done.
    int expectedRunCount = reopened.getCharacterTable().getTextRuns().size();
    int actualRunCount = roundTripped.getCharacterTable().getTextRuns().size();
    assertEquals(expectedRunCount, actualRunCount);

    List<PlexOfField> expectedFields =
            reopened.getFieldsTables().getFieldsPLCF(FieldsDocumentPart.MAIN);
    List<PlexOfField> actualFields =
            roundTripped.getFieldsTables().getFieldsPLCF(FieldsDocumentPart.MAIN);
    assertEquals(expectedFields.size(), actualFields.size());

    assertTableStructures(reopened.getRange(), roundTripped.getRange());
}
use of org.apache.poi.hwpf.extractor.WordExtractor in project poi by apache.
the class TestExtractorFactory method testPOIFS.
/**
 * Checks that {@code ExtractorFactory.createExtractor} given a
 * {@link POIFSFileSystem} returns the expected extractor class for each
 * sample file and that the extracted text exceeds a minimum length. A plain
 * text file is expected to be rejected with an {@link IOException}.
 */
@Test
public void testPOIFS() throws Exception {
    // Each check opens its own file system, mirroring one open per assertion
    POIFSFileSystem fs;
    int textLength;

    // Excel
    fs = new POIFSFileSystem(new FileInputStream(xls));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof ExcelExtractor);
    fs = new POIFSFileSystem(new FileInputStream(xls));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 200);

    // Word
    fs = new POIFSFileSystem(new FileInputStream(doc));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof WordExtractor);
    fs = new POIFSFileSystem(new FileInputStream(doc));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 120);

    fs = new POIFSFileSystem(new FileInputStream(doc6));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof Word6Extractor);
    fs = new POIFSFileSystem(new FileInputStream(doc6));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 20);

    fs = new POIFSFileSystem(new FileInputStream(doc95));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof Word6Extractor);
    fs = new POIFSFileSystem(new FileInputStream(doc95));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 120);

    // PowerPoint
    fs = new POIFSFileSystem(new FileInputStream(ppt));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof PowerPointExtractor);
    fs = new POIFSFileSystem(new FileInputStream(ppt));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 120);

    // Visio
    fs = new POIFSFileSystem(new FileInputStream(vsd));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof VisioTextExtractor);
    fs = new POIFSFileSystem(new FileInputStream(vsd));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 50);

    // Publisher
    fs = new POIFSFileSystem(new FileInputStream(pub));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof PublisherTextExtractor);
    fs = new POIFSFileSystem(new FileInputStream(pub));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 50);

    // Outlook msg
    fs = new POIFSFileSystem(new FileInputStream(msg));
    assertTrue(ExtractorFactory.createExtractor(fs) instanceof OutlookTextExtactor);
    fs = new POIFSFileSystem(new FileInputStream(msg));
    textLength = ExtractorFactory.createExtractor(fs).getText().length();
    assertTrue(textLength > 50);

    // Text: not an OLE2 container, so an IOException is the expected outcome
    try {
        fs = new POIFSFileSystem(new FileInputStream(txt));
        ExtractorFactory.createExtractor(fs);
        fail();
    } catch (IOException e) {
        // Good
    }
}
Aggregations