Use of org.apache.poi.hwpf.HWPFDocument in project Gargoyle by callakrsos.
From the class DocFileParser, method DocFileContentParser.
/**
 * Extracts the text content of a legacy Office document.
 * <p>
 * The extractor is selected by file name suffix: {@code .doc} (Word),
 * {@code .xls} (Excel, with formulas instead of results and with sheet
 * names included) or {@code .ppt} (PowerPoint).
 *
 * @param fileName path of the file to read
 * @return the extracted text, or an empty string when the suffix is not one
 *         of the three above or when any exception occurs while reading
 */
public String DocFileContentParser(String fileName) {
    // try-with-resources releases the file handle on every exit path;
    // the original never closed the stream.
    try (FileInputStream in = new FileInputStream(fileName)) {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        if (fileName.endsWith(".doc")) {
            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);
            return we.getText();
        } else if (fileName.endsWith(".xls")) {
            ExcelExtractor ex = new ExcelExtractor(fs);
            ex.setFormulasNotResults(true);
            ex.setIncludeSheetNames(true);
            return ex.getText();
        } else if (fileName.endsWith(".ppt")) {
            PowerPointExtractor extractor = new PowerPointExtractor(fs);
            return extractor.getText();
        }
    } catch (Exception e) {
        // Best-effort: a failure is reported as "no content", but the cause
        // is now passed to the logger instead of being dropped.
        LOGGER.debug("document file cant be indexed", e);
    }
    return "";
}
Use of org.apache.poi.hwpf.HWPFDocument in project wechat by dllwh.
From the class WordUtil, method convertHtmlByWord2003.
/**
 * Converts a Word 2003 (.doc) file into an HTML file.
 * <p>
 * Pictures found in the document are written into {@code parentPath} under
 * their suggested file names, which is also the relative path the generated
 * HTML uses to reference them.
 *
 * @param sourceFile
 *            path of the source Word file
 * @param parentPath
 *            target directory; created if it does not exist
 * @param saveFileName
 *            name of the HTML file to create inside {@code parentPath}
 * @param encode
 *            output encoding; defaults to UTF-8 when blank
 * @return {@code true} once the HTML file has been handed to
 *         {@code writeFile}; failures are reported by exception
 * @throws Exception
 *             if reading the document, transforming it, or writing a
 *             picture fails (a picture file that cannot be opened is
 *             skipped instead)
 */
public static boolean convertHtmlByWord2003(String sourceFile, String parentPath, String saveFileName, String encode) throws Exception {
    if (StringUtils.isBlank(encode)) {
        encode = "UTF-8";
    }
    File imgPath = new File(parentPath);
    if (!imgPath.exists()) {
        // Create the picture/target directory when it is missing
        imgPath.mkdirs();
    }
    // Load the document; the source stream is closed as soon as it has been read
    HWPFDocument wordDocument;
    try (FileInputStream in = new FileInputStream(sourceFile)) {
        wordDocument = new HWPFDocument(in);
    }
    // Converter for the regular text content
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    // Picture handling: the returned value is the picture's path relative to the HTML file
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
            return suggestedName;
        }
    });
    // Save the pictures next to the HTML file
    List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
    if (pics != null) {
        for (Picture pic : pics) {
            // Resolve against the directory so a parentPath without a trailing
            // separator still places the picture inside it, matching the HTML path below.
            File target = new File(imgPath, pic.suggestFullFileName());
            try (FileOutputStream pictureOut = new FileOutputStream(target)) {
                pic.writeImageContent(pictureOut);
            } catch (FileNotFoundException e) {
                // Best-effort: a picture that cannot be created does not abort the conversion
                e.printStackTrace();
            }
        }
    }
    // Parse the Word document
    wordToHtmlConverter.processDocument(wordDocument);
    Document htmlDocument = wordToHtmlConverter.getDocument();
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    DOMSource domSource = new DOMSource(htmlDocument);
    StreamResult streamResult = new StreamResult(output);
    // Serialize the DOM as HTML in the requested encoding
    TransformerFactory tf = TransformerFactory.newInstance();
    Transformer serializer = tf.newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, encode);
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(domSource, streamResult);
    // Decode with the same charset the serializer wrote, not the platform default,
    // then hand the text to writeFile
    writeFile(output.toString(encode), parentPath + File.separator + saveFileName, encode);
    return true;
}
Use of org.apache.poi.hwpf.HWPFDocument in project poi by apache.
From the class HWPFFileHandler, method handleFile.
/**
 * Opens the given stream as an {@link HWPFDocument}, checks that its
 * bookmarks, character table and endnotes are non-null, and then passes
 * the document to {@code handlePOIDocument}.
 *
 * @param stream input to read the Word document from
 * @param path   not used by this method
 * @throws Exception if the document cannot be opened or handled
 */
@Override
public void handleFile(InputStream stream, String path) throws Exception {
    final HWPFDocument wordDocument = new HWPFDocument(stream);

    // Basic sanity checks on the parsed structures
    assertNotNull(wordDocument.getBookmarks());
    assertNotNull(wordDocument.getCharacterTable());
    assertNotNull(wordDocument.getEndnotes());

    handlePOIDocument(wordDocument);
}
Use of org.apache.poi.hwpf.HWPFDocument in project poi by apache.
From the class TestHWPFWrite, method testInvalidInPlaceWriteOPOIFS.
/**
 * Opens the sample document through an {@code OPOIFSFileSystem} and
 * attempts an in-place write; the test passes only when an
 * {@link IllegalStateException} is thrown.
 */
@Test(expected = IllegalStateException.class)
public void testInvalidInPlaceWriteOPOIFS() throws Exception {
    // In-place write can't work for OPOIFS
    final OPOIFSFileSystem legacyFs = new OPOIFSFileSystem(SAMPLES.openResourceAsStream("SampleDoc.doc"));
    final HWPFDocument document = new HWPFDocument(legacyFs.getRoot());
    try {
        document.write();
    } finally {
        // Always release the document, whether or not write() threw
        document.close();
    }
}
Use of org.apache.poi.hwpf.HWPFDocument in project poi by apache.
From the class TestBugs, method test46220.
/**
 * Bug 46220 - images are not properly extracted.
 * <p>
 * Opens {@code Bug46220.doc}, expects one picture per reference checksum,
 * and compares the MD5 hex digest of each picture's raw content with the
 * checksum at the same index.
 *
 * @throws IOException if the sample document cannot be opened or closed
 */
@Test
public void test46220() throws IOException {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug46220.doc");
    try {
        // reference checksums as in Bugzilla
        String[] md5 = { "851be142bce6d01848e730cb6903f39e", "7fc6d8fb58b09ababd036d10a0e8c039", "a7dc644c40bc2fbf17b2b62d07f99248", "72d07b8db5fad7099d90bc4c304b4666" };
        List<Picture> pics = doc.getPicturesTable().getAllPictures();
        // One picture per reference checksum (4)
        assertEquals(md5.length, pics.size());
        for (int i = 0; i < pics.size(); i++) {
            Picture pic = pics.get(i);
            byte[] data = pic.getRawContent();
            // use Apache Commons Codec utils to compute md5
            assertEqualsIgnoreNewline(md5[i], DigestUtils.md5Hex(data));
        }
    } finally {
        // Close the document even when an assertion above fails
        doc.close();
    }
}
Aggregations