use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.
the class TestExtractorFactory method testPackage.
/**
 * Checks {@code ExtractorFactory.createExtractor(OPCPackage)} for the OOXML sample files:
 * the returned extractor must be of the expected type for xlsx, docx, pptx and vsdx, must
 * produce text above a minimum length, and a plain-text file must be rejected with an
 * {@code UnsupportedFileFormatException}.
 * <p>
 * Each extractor is closed in a {@code finally} block so it is released even when an
 * assertion fails.
 */
@Test
public void testPackage() throws Exception {
    // Excel
    POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
    try {
        assertTrue(extractor instanceof XSSFExcelExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
    try {
        assertTrue(extractor.getText().length() > 200);
    } finally {
        extractor.close();
    }

    // Word
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
    try {
        assertTrue(extractor instanceof XWPFWordExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
    try {
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // PowerPoint
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
    try {
        assertTrue(extractor instanceof XSLFPowerPointExtractor);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
    try {
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // Visio
    extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
    try {
        assertTrue(extractor instanceof XDGFVisioExtractor);
        assertTrue(extractor.getText().length() > 20);
    } finally {
        extractor.close();
    }

    // Text: a non-OOXML file must be rejected
    try {
        ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
        fail("TestExtractorFactory.testPackage() failed on " + txt);
    } catch (UnsupportedFileFormatException e) {
        // Good
    } catch (Exception e) {
        // Any other exception type is unexpected: report which file triggered it, then rethrow
        System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
        throw e;
    }
}
use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.
the class TestExtractorFactory method testInputStream.
/**
 * Checks {@code ExtractorFactory.createExtractor(InputStream)} for every sample file:
 * the returned extractor must be of the expected type and must produce text above a
 * minimum length, and a plain-text stream must be rejected with an
 * {@code IllegalArgumentException}.
 * <p>
 * Each extractor is closed in a {@code finally} block so it is released even when an
 * assertion fails.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
    try {
        assertTrue(extractor instanceof ExcelExtractor);
        assertTrue(extractor.getText().length() > 200);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
    try {
        assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
        assertTrue(extractor.getText().length() > 200);
    } finally {
        extractor.close();
    }
    // TODO Support OOXML-Strict, see bug #57699
    // assertTrue(
    // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
    // instanceof XSSFExcelExtractor
    // );
    // assertTrue(
    // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
    // );

    // Word
    extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
    try {
        assertTrue(extractor.getClass().getName(), extractor instanceof WordExtractor);
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
    try {
        assertTrue(extractor.getClass().getName(), extractor instanceof Word6Extractor);
        assertTrue(extractor.getText().length() > 20);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
    try {
        assertTrue(extractor.getClass().getName(), extractor instanceof Word6Extractor);
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
    try {
        assertTrue(extractor instanceof XWPFWordExtractor);
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // PowerPoint
    extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
    try {
        assertTrue(extractor instanceof PowerPointExtractor);
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }
    extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
    try {
        assertTrue(extractor instanceof XSLFPowerPointExtractor);
        assertTrue(extractor.getText().length() > 120);
    } finally {
        extractor.close();
    }

    // Visio
    extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
    try {
        assertTrue(extractor instanceof VisioTextExtractor);
        assertTrue(extractor.getText().length() > 50);
    } finally {
        extractor.close();
    }

    // Visio - vsdx
    extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
    try {
        assertTrue(extractor instanceof XDGFVisioExtractor);
        assertTrue(extractor.getText().length() > 20);
    } finally {
        extractor.close();
    }

    // Publisher
    extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
    try {
        assertTrue(extractor instanceof PublisherTextExtractor);
        assertTrue(extractor.getText().length() > 50);
    } finally {
        extractor.close();
    }

    // Outlook msg
    extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
    try {
        assertTrue(extractor instanceof OutlookTextExtactor);
        assertTrue(extractor.getText().length() > 50);
    } finally {
        extractor.close();
    }

    // Text: a plain-text stream must be rejected
    try {
        FileInputStream stream = new FileInputStream(txt);
        try {
            ExtractorFactory.createExtractor(stream);
            fail();
        } finally {
            // The stream is closed here on every path, including the expected rejection
            IOUtils.closeQuietly(stream);
        }
    } catch (IllegalArgumentException e) {
        // Good
    }
}
use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project Gargoyle by callakrsos.
the class DocxFileParser method docxFileContentParser.
/**
 * Extracts the text of an OOXML document, choosing the extractor from the file-name suffix:
 * {@code .docx}, {@code .pptx} or {@code .xlsx}. For spreadsheets, formulas are emitted
 * instead of their results and sheet names are included.
 * <p>
 * This is a best-effort call: any exception is reported on standard output and an empty
 * string is returned. The file stream and the opened package are released on every path.
 *
 * @param fileName path of the file to read
 * @return the extracted text, or {@code ""} when the suffix is not one of the three handled
 *         ones or when any exception occurs
 */
public String docxFileContentParser(String fileName) {
    FileInputStream fs = null;
    OPCPackage d = null;
    try {
        fs = new FileInputStream(new File(fileName));
        d = OPCPackage.open(fs);
        if (fileName.endsWith(".docx")) {
            XWPFWordExtractor xw = new XWPFWordExtractor(d);
            return xw.getText();
        } else if (fileName.endsWith(".pptx")) {
            XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);
            return xp.getText();
        } else if (fileName.endsWith(".xlsx")) {
            XSSFExcelExtractor xe = new XSSFExcelExtractor(d);
            xe.setFormulasNotResults(true);
            xe.setIncludeSheetNames(true);
            return xe.getText();
        }
    } catch (Exception e) {
        System.out.println("# DocxFileParser Error :" + e.getMessage());
    } finally {
        // The text has already been computed when this runs, so releasing the
        // package here cannot affect the returned value.
        if (d != null) {
            try {
                // revert() discards the package without writing anything back
                d.revert();
            } catch (Exception ignored) {
                // cleanup is best-effort, like the rest of this method
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (Exception ignored) {
                // nothing useful can be done if closing the stream fails
            }
        }
    }
    return "";
}
use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project carbon-apimgt by wso2.
the class DocumentIndexer method fetchDocumentContent.
/**
 * Reads the textual content of a documentation artifact and returns it as a string.
 * <p>
 * For documents whose source type is {@code FILE}, the content resource is loaded from the
 * registry and text is extracted according to the file extension (PDF, doc/docx, xls/xlsx,
 * ppt/pptx, or plain text for txt/wsdl/xml). For {@code INLINE} documents, the content
 * resource stored under the inline-content directory next to the document is read as text.
 * <p>
 * Text read line by line (txt/wsdl/xml and inline content) is concatenated without line
 * separators.
 *
 * @param registry registry used to look up the document artifact and its content resource
 * @param documentResource resource whose UUID identifies the document artifact; for inline
 *        documents its path and name locate the content resource
 * @return the extracted text, or {@code null} when the source type is neither FILE nor
 *         INLINE, the file extension is not handled, or the inline content resource does
 *         not exist
 * @throws RegistryException if thrown by the registry or artifact lookups
 * @throws IOException if reading or parsing the content stream fails
 * @throws APIManagementException not thrown directly here; presumably raised by
 *         {@code APIUtil.getArtifactManager} (confirm against that method)
 */
private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);
    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        String path = documentArtifact.getAttribute(APIConstants.DOC_FILE_PATH);
        // NOTE(review): if the path does not contain the registry location, indexOf returns -1
        // and substring throws StringIndexOutOfBoundsException; left unchanged here.
        int indexOfApimgt = path.indexOf(APIConstants.APIMGT_REGISTRY_LOCATION);
        String filepath = path.substring(indexOfApimgt);
        Resource contentResource = registry.get(filepath);
        // The file name is whatever follows the document-files directory and one extra character
        int indexOfFiles = filepath.indexOf(APIConstants.DOCUMENT_FILE_DIR) + APIConstants.DOCUMENT_FILE_DIR.length() + 1;
        String fileName = filepath.substring(indexOfFiles);
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch(extension) {
                case APIConstants.PDF_EXTENSION:
                    PDFParser pdfParser = new PDFParser(new RandomAccessBufferedFileInputStream(inputStream));
                    pdfParser.parse();
                    COSDocument cosDocument = pdfParser.getDocument();
                    PDFTextStripper stripper = new PDFTextStripper();
                    PDDocument pdDocument = new PDDocument(cosDocument);
                    try {
                        contentString = stripper.getText(pdDocument);
                    } finally {
                        // Close the PDF document once its text has been extracted
                        pdDocument.close();
                    }
                    break;
                case APIConstants.DOC_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                        contentString = msWord2003Extractor.getText();
                        break;
                    }
                case APIConstants.DOCX_EXTENSION:
                    XWPFDocument doc = new XWPFDocument(inputStream);
                    XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                    contentString = msWord2007Extractor.getText();
                    break;
                case APIConstants.XLS_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        ExcelExtractor extractor = new ExcelExtractor(pfs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.XLSX_EXTENSION:
                    XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                    XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                    contentString = xssfExcelExtractor.getText();
                    break;
                case APIConstants.PPT_EXTENSION:
                    {
                        POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                        PowerPointExtractor extractor = new PowerPointExtractor(fs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.PPTX_EXTENSION:
                    XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                    XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                    contentString = xslfPowerPointExtractor.getText();
                    break;
                case APIConstants.TXT_EXTENSION:
                case APIConstants.WSDL_EXTENSION:
                case APIConstants.XML_DOC_EXTENSION:
                    // Plain-text formats: read line by line using the platform default charset.
                    // NOTE(review): lines are appended without a separator, so the last word of
                    // one line is joined to the first word of the next.
                    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                    String line;
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                    break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        String fileName = ((ResourceImpl) documentResource).getName();
        String pathToDocFile = documentResource.getPath();
        // Inline content is stored in a content sub-directory alongside the document resource
        String pathToContent = pathToDocFile.substring(0, pathToDocFile.lastIndexOf(fileName)) + APIConstants.INLINE_DOCUMENT_CONTENT_DIR + RegistryConstants.PATH_SEPARATOR + fileName;
        if (registry.resourceExists(pathToContent)) {
            Resource contentResource = registry.get(pathToContent);
            InputStream instream = null;
            BufferedReader reader = null;
            String line;
            try {
                instream = contentResource.getContentStream();
                reader = new BufferedReader(new InputStreamReader(instream));
                StringBuilder contentBuilder = new StringBuilder();
                // Same line-joining behaviour as the plain-text file case: no separators are kept
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
            } finally {
                if (reader != null) {
                    IOUtils.closeQuietly(reader);
                }
            }
        }
    }
    return contentString;
}
use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.
the class XSLFFileHandler method handleExtracting.
/**
 * Runs the inherited extraction checks, then exercises the option-taking
 * {@code getText(boolean, boolean, boolean)} overload of the PowerPoint extractor:
 * with every option enabled it must return a non-null value, and with every option
 * disabled it must return the empty string.
 *
 * @param file the file to extract text from
 * @throws Exception if the inherited checks, extractor creation or closing fails
 */
@Override
public void handleExtracting(File file) throws Exception {
    super.handleExtracting(file);

    // additionally try the other getText() methods
    XSLFPowerPointExtractor pptExtractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file);
    try {
        assertNotNull(pptExtractor);

        String textWithEverything = pptExtractor.getText(true, true, true);
        assertNotNull(textWithEverything);

        String textWithNothing = pptExtractor.getText(false, false, false);
        assertEquals("With all options disabled we should not get text", "", textWithNothing);
    } finally {
        pptExtractor.close();
    }
}
Aggregations