use of org.apache.poi.hslf.extractor.PowerPointExtractor in project poi by apache.
the class TestExtractorFactory method testOPOIFS.
/**
 * Checks that {@link ExtractorFactory} returns the expected extractor type
 * for each sample file when it is handed an {@link OPOIFSFileSystem}, and
 * that a plain text file is rejected with an {@link IOException}.
 */
@Test
public void testOPOIFS() throws Exception {
    // Excel
    assertOPOIFSExtractor(xls, ExcelExtractor.class, 200);
    // Word
    assertOPOIFSExtractor(doc, WordExtractor.class, 120);
    assertOPOIFSExtractor(doc6, Word6Extractor.class, 20);
    assertOPOIFSExtractor(doc95, Word6Extractor.class, 120);
    // PowerPoint
    assertOPOIFSExtractor(ppt, PowerPointExtractor.class, 120);
    // Visio
    assertOPOIFSExtractor(vsd, VisioTextExtractor.class, 50);
    // Publisher
    assertOPOIFSExtractor(pub, PublisherTextExtractor.class, 50);
    // Outlook msg
    assertOPOIFSExtractor(msg, OutlookTextExtactor.class, 50);
    // Text
    FileInputStream stream = new FileInputStream(txt);
    try {
        ExtractorFactory.createExtractor(new OPOIFSFileSystem(stream));
        fail();
    } catch (IOException e) {
        // Good
    } finally {
        // Closing again is harmless if the file system already closed the stream.
        stream.close();
    }
}

/**
 * Opens {@code sample} through an {@link OPOIFSFileSystem} exactly once,
 * asserts the type of the extractor the factory produces and that the
 * extracted text is longer than {@code minTextLength}, then releases the
 * extractor and the stream even when an assertion fails.
 *
 * @param sample        file to open
 * @param expectedType  class the created extractor must be an instance of
 * @param minTextLength the extracted text must be strictly longer than this
 */
private static void assertOPOIFSExtractor(File sample, Class<?> expectedType, int minTextLength)
        throws Exception {
    FileInputStream stream = new FileInputStream(sample);
    try {
        POITextExtractor extractor = ExtractorFactory.createExtractor(new OPOIFSFileSystem(stream));
        try {
            // Report the actual class on failure to make mismatches easy to diagnose.
            assertTrue(extractor.getClass().getName(), expectedType.isInstance(extractor));
            assertTrue(extractor.getText().length() > minTextLength);
        } finally {
            extractor.close();
        }
    } finally {
        stream.close();
    }
}
use of org.apache.poi.hslf.extractor.PowerPointExtractor in project poi by apache.
the class TestExtractorFactory method testInputStream.
/**
 * Checks that {@link ExtractorFactory} returns the expected extractor type
 * for each sample file when it is handed a raw input stream, and that a
 * plain text file is rejected with an {@link IllegalArgumentException}.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    assertStreamExtractor(xls, ExcelExtractor.class, 200);
    assertStreamExtractor(xlsx, XSSFExcelExtractor.class, 200);
    // TODO Support OOXML-Strict, see bug #57699
    // assertStreamExtractor(xlsxStrict, XSSFExcelExtractor.class, 200);
    // Word
    assertStreamExtractor(doc, WordExtractor.class, 120);
    assertStreamExtractor(doc6, Word6Extractor.class, 20);
    assertStreamExtractor(doc95, Word6Extractor.class, 120);
    assertStreamExtractor(docx, XWPFWordExtractor.class, 120);
    // PowerPoint
    assertStreamExtractor(ppt, PowerPointExtractor.class, 120);
    assertStreamExtractor(pptx, XSLFPowerPointExtractor.class, 120);
    // Visio
    assertStreamExtractor(vsd, VisioTextExtractor.class, 50);
    // Visio - vsdx
    assertStreamExtractor(vsdx, XDGFVisioExtractor.class, 20);
    // Publisher
    assertStreamExtractor(pub, PublisherTextExtractor.class, 50);
    // Outlook msg
    assertStreamExtractor(msg, OutlookTextExtactor.class, 50);
    // Text
    FileInputStream stream = new FileInputStream(txt);
    try {
        ExtractorFactory.createExtractor(stream);
        fail();
    } catch (IllegalArgumentException e) {
        // Good
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

/**
 * Feeds {@code sample} to the factory as an input stream, asserts the type
 * of the extractor it produces and that the extracted text is longer than
 * {@code minTextLength}, then releases the extractor and the stream even
 * when an assertion fails.
 *
 * @param sample        file to open
 * @param expectedType  class the created extractor must be an instance of
 * @param minTextLength the extracted text must be strictly longer than this
 */
private static void assertStreamExtractor(File sample, Class<?> expectedType, int minTextLength)
        throws Exception {
    FileInputStream stream = new FileInputStream(sample);
    try {
        POITextExtractor extractor = ExtractorFactory.createExtractor(stream);
        try {
            // Report the actual class on failure to make mismatches easy to diagnose.
            assertTrue(extractor.getClass().getName(), expectedType.isInstance(extractor));
            assertTrue(extractor.getText().length() > minTextLength);
        } finally {
            extractor.close();
        }
    } finally {
        IOUtils.closeQuietly(stream);
    }
}
use of org.apache.poi.hslf.extractor.PowerPointExtractor in project poi by apache.
the class TestExtractorFactory method testEmbeded.
/**
 * Test embedded docs text extraction. For now, only
 * does poifs embedded, but will do ooxml ones
 * at some point.
 */
@Test
public void testEmbeded() throws Exception {
    // Expected counts are listed as: total, ppt, xls, word (doc), word (docx), msg

    // No embeddings
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls),
            0, 0, 0, 0, 0, 0);
    // Excel
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb),
            6, 2, 2, 2, 0, 0);
    // Word
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb),
            4, 1, 2, 1, 0, 0);
    // Word which contains an OOXML file
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML),
            3, 1, 1, 0, 1, 0);
    // Outlook
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb),
            1, 0, 0, 1, 0, 0);
    // Outlook with another outlook file in it
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg),
            1, 0, 0, 0, 0, 1);
    // TODO - PowerPoint
    // TODO - Publisher
    // TODO - Visio
}

/**
 * Extracts the embedded documents of {@code ext}, asserts that each one
 * yields more than 20 characters of text, and checks how many embedded
 * extractors of each kind were produced. {@code ext} is closed afterwards,
 * even when an assertion fails.
 *
 * @param ext           extractor of the container document; closed by this method
 * @param expectedTotal expected number of embedded extractors
 * @param expectedPpt   expected number of {@link PowerPointExtractor}s
 * @param expectedXls   expected number of {@link ExcelExtractor}s
 * @param expectedWord  expected number of {@link WordExtractor}s
 * @param expectedWordX expected number of {@link XWPFWordExtractor}s
 * @param expectedMsg   expected number of {@link OutlookTextExtactor}s
 */
private static void assertEmbeddedCounts(POIOLE2TextExtractor ext, int expectedTotal,
        int expectedPpt, int expectedXls, int expectedWord, int expectedWordX, int expectedMsg)
        throws Exception {
    try {
        POITextExtractor[] embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
        assertEquals(expectedTotal, embeds.length);

        int numPpt = 0;
        int numXls = 0;
        int numWord = 0;
        int numWordX = 0;
        int numMsg = 0;
        for (POITextExtractor embed : embeds) {
            assertTrue(embed.getText().length() > 20);
            if (embed instanceof PowerPointExtractor) {
                numPpt++;
            } else if (embed instanceof ExcelExtractor) {
                numXls++;
            } else if (embed instanceof WordExtractor) {
                numWord++;
            } else if (embed instanceof OutlookTextExtactor) {
                numMsg++;
            } else if (embed instanceof XWPFWordExtractor) {
                numWordX++;
            }
        }

        assertEquals(expectedPpt, numPpt);
        assertEquals(expectedXls, numXls);
        assertEquals(expectedWord, numWord);
        assertEquals(expectedWordX, numWordX);
        assertEquals(expectedMsg, numMsg);
    } finally {
        ext.close();
    }
}
use of org.apache.poi.hslf.extractor.PowerPointExtractor in project CodeUtils by boredream.
the class TempUtils method setPPT.
/**
 * Opens the sample presentation under {@code temp/office} and prints its
 * slide count followed by the name of its root directory entry. Any failure
 * is reported by printing the stack trace.
 */
public static void setPPT() {
    final String samplePath = "temp" + File.separator + "office" + File.separator + "ppt2007.ppt";
    try {
        PowerPointExtractor extractor = new PowerPointExtractor(samplePath);
        // Fetch both pieces of metadata before printing anything.
        DocumentSummaryInformation summary = extractor.getDocSummaryInformation();
        DirectoryEntry rootEntry = extractor.getRoot();
        System.out.println(summary.getSlideCount());
        System.out.println(rootEntry.getName());
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    // Disabled sample: building a slide show from scratch.
    //
    // SlideShow _slideShow = new SlideShow();
    // Slide slide = _slideShow.createSlide();
    //
    // // Create and place a simple text box
    // TextBox _text = new TextBox();
    // TextRun _textRun = _text.createTextRun();
    // _textRun.setRawText("杜磊米");
    // _text.setAnchor(new Rectangle(10,10,100,100));
    //
    // // Create and place styled text
    // AutoShape _autoShape = new AutoShape(ShapeTypes.Rectangle); // set the shape
    // TextRun _autoText = _autoShape.createTextRun();
    // _autoText.setRawText("杜磊米");
    // _autoShape.setAnchor(new Rectangle(200,200,100,100));
    // _autoShape.setFillColor(new Color(170,215,255));
    // _autoShape.setLineWidth(5.0);
    // _autoShape.setLineStyle(Line.LINE_DOUBLE);
    //
    // // An AutoShape object can hold several differently styled text runs
    // TextRun _autoText2 = _autoShape.createTextRun();
    // RichTextRun _richText = _autoText2.appendText("杜");
    // _richText.setFontColor(new Color(255,255,255));
    // RichTextRun _richText2 = _autoText2.appendText("磊米");
    // _richText2.setFontColor(new Color(255,0,0));
    // _richText2.setFontSize(12);
    //
    // // Add the text objects to the slide
    // slide.addShape(_text);
    // slide.addShape(_autoShape);
    //
    // // Write the output file
    // try {
    // _slideShow.write(new FileOutputStream("temp\\office\\test.pptx"));
    // } catch (FileNotFoundException e) {
    // // TODO Auto-generated catch block
    // e.printStackTrace();
    // } catch (IOException e) {
    // // TODO Auto-generated catch block
    // e.printStackTrace();
    // }
}
use of org.apache.poi.hslf.extractor.PowerPointExtractor in project Gargoyle by callakrsos.
the class DocFileParser method DocFileContentParser.
/**
 * Extracts the text of a binary Office document, choosing the extractor
 * from the file name extension ({@code .doc}, {@code .xls} or {@code .ppt},
 * matched case-insensitively).
 *
 * @param fileName path of the document to read
 * @return the extracted text, or an empty string when {@code fileName} is
 *         {@code null}, the extension is not one of the three above, or the
 *         file cannot be read or parsed
 */
public String DocFileContentParser(String fileName) {
    if (fileName == null) {
        return "";
    }
    // Decide on the extractor first so unsupported files are never opened or parsed.
    String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
    boolean isWord = lowerName.endsWith(".doc");
    boolean isExcel = lowerName.endsWith(".xls");
    boolean isPowerPoint = lowerName.endsWith(".ppt");
    if (!isWord && !isExcel && !isPowerPoint) {
        return "";
    }

    FileInputStream in = null;
    try {
        in = new FileInputStream(fileName);
        POIFSFileSystem fs = new POIFSFileSystem(in);
        if (isWord) {
            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);
            return we.getText();
        }
        if (isExcel) {
            ExcelExtractor ex = new ExcelExtractor(fs);
            ex.setFormulasNotResults(true);
            ex.setIncludeSheetNames(true);
            return ex.getText();
        }
        PowerPointExtractor extractor = new PowerPointExtractor(fs);
        return extractor.getText();
    } catch (Exception e) {
        // Best-effort extraction: keep returning "" but record the cause for diagnosis.
        LOGGER.debug("document file cant be indexed", e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails; the result is already decided.
            }
        }
    }
    return "";
}
Aggregations