Use of org.apache.poi.hwpf.extractor.Word6Extractor in the Apache POI project.
Class TestExtractorFactory, method testOPOIFS.
/**
 * Checks that {@link ExtractorFactory} returns the expected extractor type, with a
 * plausible amount of text, for each sample opened through an {@link OPOIFSFileSystem},
 * and that a plain text file is rejected.
 *
 * Each sample is opened once and its extractor is always closed, even when an
 * assertion fails.
 */
@Test
public void testOPOIFS() throws Exception {
    // Excel
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))),
            ExcelExtractor.class, 200);
    // Word
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))),
            WordExtractor.class, 120);
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))),
            Word6Extractor.class, 20);
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))),
            Word6Extractor.class, 120);
    // PowerPoint
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))),
            PowerPointExtractor.class, 120);
    // Visio
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))),
            VisioTextExtractor.class, 50);
    // Publisher
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))),
            PublisherTextExtractor.class, 50);
    // Outlook msg
    assertOPOIFSExtractor(
            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))),
            OutlookTextExtactor.class, 50);
    // Text
    // The stream is opened outside the try block so that a missing sample file
    // surfaces as a test error instead of being mistaken for the expected rejection.
    FileInputStream textStream = new FileInputStream(txt);
    try {
        ExtractorFactory.createExtractor(new OPOIFSFileSystem(textStream));
        fail("Plain text input should have been rejected");
    } catch (IOException e) {
        // Good
    } finally {
        IOUtils.closeQuietly(textStream);
    }
}

/**
 * Asserts that the extractor is an instance of the expected type and yields more
 * than {@code minTextLength} characters of text, then closes it.
 *
 * @param extractor     the extractor under test; closed before this method returns
 * @param expectedType  the class the extractor must be an instance of
 * @param minTextLength exclusive lower bound on the length of the extracted text
 * @throws IOException if closing the extractor fails
 */
private static void assertOPOIFSExtractor(POITextExtractor extractor, Class<?> expectedType, int minTextLength) throws IOException {
    try {
        assertTrue(extractor.getClass().getName(), expectedType.isInstance(extractor));
        assertTrue(extractor.getText().length() > minTextLength);
    } finally {
        extractor.close();
    }
}
Use of org.apache.poi.hwpf.extractor.Word6Extractor in the Apache POI project.
Class TestExtractorFactory, method testInputStream.
/**
 * Checks that {@link ExtractorFactory} returns the expected extractor type, with a
 * plausible amount of text, for each sample supplied as a raw input stream, and
 * that a plain text stream is rejected with an {@link IllegalArgumentException}.
 *
 * Every extractor is closed even when one of its assertions fails.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(xls)), ExcelExtractor.class, 200);
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(xlsx)), XSSFExcelExtractor.class, 200);
    // TODO Support OOXML-Strict, see bug #57699
    // Once supported, the xlsxStrict sample should also yield an XSSFExcelExtractor
    // with more than 200 characters of text.
    // Word
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(doc)), WordExtractor.class, 120);
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(doc6)), Word6Extractor.class, 20);
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(doc95)), Word6Extractor.class, 120);
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(docx)), XWPFWordExtractor.class, 120);
    // PowerPoint
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(ppt)), PowerPointExtractor.class, 120);
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(pptx)), XSLFPowerPointExtractor.class, 120);
    // Visio
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(vsd)), VisioTextExtractor.class, 50);
    // Visio - vsdx
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(vsdx)), XDGFVisioExtractor.class, 20);
    // Publisher
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(pub)), PublisherTextExtractor.class, 50);
    // Outlook msg
    assertStreamExtractor(ExtractorFactory.createExtractor(new FileInputStream(msg)), OutlookTextExtactor.class, 50);
    // Text
    FileInputStream stream = new FileInputStream(txt);
    try {
        ExtractorFactory.createExtractor(stream);
        fail();
    } catch (IllegalArgumentException e) {
        // Good
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

/**
 * Asserts that the extractor is an instance of the expected type and yields more
 * than {@code minTextLength} characters of text, then closes it.
 *
 * @param extractor     the extractor under test; closed before this method returns
 * @param expectedType  the class the extractor must be an instance of
 * @param minTextLength exclusive lower bound on the length of the extracted text
 * @throws IOException if closing the extractor fails
 */
private static void assertStreamExtractor(POITextExtractor extractor, Class<?> expectedType, int minTextLength) throws IOException {
    try {
        // The actual class name is reported so a mismatch is easy to diagnose.
        assertTrue(extractor.getClass().getName(), expectedType.isInstance(extractor));
        assertTrue(extractor.getText().length() > minTextLength);
    } finally {
        extractor.close();
    }
}
Use of org.apache.poi.hwpf.extractor.Word6Extractor in the Apache POI project.
Class TestHWPFOldDocument, method testCodePageBug50955.
/**
 * Checks that the concatenated paragraph text of the old-format sample
 * "Bug50955.doc" contains the expected Cyrillic word.
 *
 * The extractor and the document are closed in {@code finally} blocks so they are
 * released even when the assertion fails.
 */
@Test
public void testCodePageBug50955() throws IOException {
    // windows 1251
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        try {
            StringBuilder sb = new StringBuilder();
            for (String p : ex.getParagraphText()) {
                sb.append(p);
            }
            // Greetings!
            assertContains(sb.toString(), "привет");
        } finally {
            ex.close();
        }
    } finally {
        doc.close();
    }
}
Use of org.apache.poi.hwpf.extractor.Word6Extractor in the Apache POI project.
Class TestHWPFOldDocument, method testDefaultCodePageEncoding.
/**
 * Checks that the text extracted from the old-format sample "Bug60942.doc"
 * contains a set of expected phrases.
 *
 * The extractor and the document are closed in {@code finally} blocks so they are
 * released even when an assertion fails.
 */
@Test
public void testDefaultCodePageEncoding() throws IOException {
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        try {
            String txt = ex.getText();
            assertContains(txt, "BERTHOD");
            assertContains(txt, "APPLICOLOR");
            assertContains(txt, "les meilleurs");
            assertContains(txt, "GUY LECOLE");
        } finally {
            ex.close();
        }
    } finally {
        doc.close();
    }
}
Use of org.apache.poi.hwpf.extractor.Word6Extractor in the Apache Tika project.
Class WordExtractor, method parseWord6.
/**
 * Emits the paragraphs of an old-format Word document as XHTML {@code <p>} elements.
 *
 * An {@link HWPFOldDocument} is built from the given directory node, its paragraph
 * text is obtained through a {@link Word6Extractor}, and each paragraph is written
 * to the handler in the order the extractor returns it.
 *
 * NOTE(review): neither the document nor the extractor is closed here; presumably
 * the caller owns the filesystem behind {@code root} - confirm before adding a close.
 *
 * @param root  directory node holding the Word document
 * @param xhtml handler that receives one "p" element per paragraph
 */
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    Word6Extractor word6 = new Word6Extractor(new HWPFOldDocument(root));
    for (String paragraph : word6.getParagraphText()) {
        xhtml.element("p", paragraph);
    }
}
Aggregations