Search in sources :

Example 1 with XSLFPowerPointExtractor

use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.

In class TestExtractorFactory, method testPackage.

/**
 * Exercises {@code ExtractorFactory.createExtractor} with already opened
 * {@code OPCPackage} instances: each OOXML sample must produce the expected
 * extractor type and a minimum amount of text, while the {@code txt} sample
 * must be rejected with an {@code UnsupportedFileFormatException}.
 */
@Test
public void testPackage() throws Exception {
    // Excel: once opened explicitly read-only, once with the default access
    OPCPackage pkg = OPCPackage.open(xlsx.toString(), PackageAccess.READ);
    POIXMLTextExtractor textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor instanceof XSSFExcelExtractor);
    textExtractor.close();

    pkg = OPCPackage.open(xlsx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor.getText().length() > 200);
    textExtractor.close();

    // Word
    pkg = OPCPackage.open(docx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor instanceof XWPFWordExtractor);
    textExtractor.close();

    pkg = OPCPackage.open(docx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    // PowerPoint
    pkg = OPCPackage.open(pptx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor instanceof XSLFPowerPointExtractor);
    textExtractor.close();

    pkg = OPCPackage.open(pptx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    // Visio: type and text are checked on the same extractor
    pkg = OPCPackage.open(vsdx.toString());
    textExtractor = ExtractorFactory.createExtractor(pkg);
    assertTrue(textExtractor instanceof XDGFVisioExtractor);
    assertTrue(textExtractor.getText().length() > 20);
    textExtractor.close();

    // Text: must not be accepted as a package
    try {
        pkg = OPCPackage.open(txt.toString());
        ExtractorFactory.createExtractor(pkg);
        fail("TestExtractorFactory.testPackage() failed on " + txt);
    } catch (UnsupportedFileFormatException expected) {
        // Good
    } catch (Exception unexpected) {
        // Report which file triggered the unexpected failure, then let the test fail
        System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
        throw unexpected;
    }
}
Also used : UnsupportedFileFormatException(org.apache.poi.UnsupportedFileFormatException) XDGFVisioExtractor(org.apache.poi.xdgf.extractor.XDGFVisioExtractor) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) OldExcelFormatException(org.apache.poi.hssf.OldExcelFormatException) UnsupportedFileFormatException(org.apache.poi.UnsupportedFileFormatException) POIXMLException(org.apache.poi.POIXMLException) IOException(java.io.IOException) Test(org.junit.Test)

Example 2 with XSLFPowerPointExtractor

use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.

In class TestExtractorFactory, method testInputStream.

/**
 * Exercises {@code ExtractorFactory.createExtractor} with raw
 * {@code FileInputStream}s: every sample must yield the expected extractor
 * type and a minimum amount of text, and the {@code txt} sample must be
 * rejected with an {@code IllegalArgumentException}.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    POITextExtractor textExtractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
    assertTrue(textExtractor instanceof ExcelExtractor);
    assertTrue(textExtractor.getText().length() > 200);
    textExtractor.close();

    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
    assertTrue(textExtractor.getClass().getName(), textExtractor instanceof XSSFExcelExtractor);
    assertTrue(textExtractor.getText().length() > 200);
    // TODO Support OOXML-Strict, see bug #57699
    //        assertTrue(
    //                ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
    //                instanceof XSSFExcelExtractor
    //        );
    //        assertTrue(
    //                ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
    //        );
    textExtractor.close();

    // Word
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
    assertTrue(textExtractor.getClass().getName(), textExtractor instanceof WordExtractor);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
    assertTrue(textExtractor.getClass().getName(), textExtractor instanceof Word6Extractor);
    assertTrue(textExtractor.getText().length() > 20);
    textExtractor.close();

    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
    assertTrue(textExtractor.getClass().getName(), textExtractor instanceof Word6Extractor);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
    assertTrue(textExtractor instanceof XWPFWordExtractor);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    // PowerPoint
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
    assertTrue(textExtractor instanceof PowerPointExtractor);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
    assertTrue(textExtractor instanceof XSLFPowerPointExtractor);
    assertTrue(textExtractor.getText().length() > 120);
    textExtractor.close();

    // Visio
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
    assertTrue(textExtractor instanceof VisioTextExtractor);
    assertTrue(textExtractor.getText().length() > 50);
    textExtractor.close();

    // Visio - vsdx
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
    assertTrue(textExtractor instanceof XDGFVisioExtractor);
    assertTrue(textExtractor.getText().length() > 20);
    textExtractor.close();

    // Publisher
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
    assertTrue(textExtractor instanceof PublisherTextExtractor);
    assertTrue(textExtractor.getText().length() > 50);
    textExtractor.close();

    // Outlook msg
    textExtractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
    assertTrue(textExtractor instanceof OutlookTextExtactor);
    assertTrue(textExtractor.getText().length() > 50);
    textExtractor.close();

    // Text: the factory must refuse the stream; the stream is closed either way
    FileInputStream txtStream = new FileInputStream(txt);
    try {
        ExtractorFactory.createExtractor(txtStream);
        fail();
    } catch (IllegalArgumentException expected) {
        // Good
    } finally {
        IOUtils.closeQuietly(txtStream);
    }
}
Also used : XDGFVisioExtractor(org.apache.poi.xdgf.extractor.XDGFVisioExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) Word6Extractor(org.apache.poi.hwpf.extractor.Word6Extractor) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) FileInputStream(java.io.FileInputStream) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) POITextExtractor(org.apache.poi.POITextExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor) Test(org.junit.Test)

Example 3 with XSLFPowerPointExtractor

use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project Gargoyle by callakrsos.

In class DocxFileParser, method docxFileContentParser.

/**
 * Extracts the text of an OOXML file, picking the extractor from the file
 * name suffix ({@code .docx}, {@code .pptx} or {@code .xlsx}, case-sensitive).
 * For spreadsheets, formulas are emitted instead of their results and sheet
 * names are included.
 *
 * @param fileName path of the file to read
 * @return the extracted text, or an empty string when the suffix is not one
 *         of the handled ones or when any exception occurs (the exception
 *         message is printed to standard output)
 */
public String docxFileContentParser(String fileName) {
    FileInputStream fs = null;
    OPCPackage d = null;
    try {
        fs = new FileInputStream(new File(fileName));
        d = OPCPackage.open(fs);
        if (fileName.endsWith(".docx")) {
            XWPFWordExtractor xw = new XWPFWordExtractor(d);
            return xw.getText();
        } else if (fileName.endsWith(".pptx")) {
            XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);
            return xp.getText();
        } else if (fileName.endsWith(".xlsx")) {
            XSSFExcelExtractor xe = new XSSFExcelExtractor(d);
            xe.setFormulasNotResults(true);
            xe.setIncludeSheetNames(true);
            return xe.getText();
        }
    } catch (Exception e) {
        System.out.println("# DocxFileParser Error :" + e.getMessage());
    } finally {
        // Release the package and the file handle on every path; the text has
        // already been materialised as a String by the time we get here.
        if (d != null) {
            try {
                // revert() closes the package without attempting to save it
                d.revert();
            } catch (RuntimeException ignored) {
                // best-effort cleanup: must not replace the result or the original error
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (Exception ignored) {
                // best-effort cleanup: must not replace the result or the original error
            }
        }
    }
    return "";
}
Also used : XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) File(java.io.File) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) FileInputStream(java.io.FileInputStream)

Example 4 with XSLFPowerPointExtractor

use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project carbon-apimgt by wso2.

In class DocumentIndexer, method fetchDocumentContent.

/**
 * Reads the textual content of a documentation resource and returns it.
 * <p>
 * For {@code FILE} documents the referenced file is loaded from the registry
 * and converted to text according to its file name extension (PDF, Word,
 * Excel, PowerPoint, or line-based text for TXT/WSDL/XML). For {@code INLINE}
 * documents the content resource stored under the inline content directory
 * is read line by line.
 * <p>
 * Lines of text-based content are concatenated without any separator.
 * NOTE(review): this merges the last word of a line with the first word of
 * the next one, and the readers use the platform default charset - confirm
 * both are intended.
 *
 * @param registry         registry used to resolve the artifact and to load the content resource
 * @param documentResource registry resource of the document whose content is wanted
 * @return the extracted text, or {@code null} when the source type is neither
 *         {@code FILE} nor {@code INLINE}, the file extension is not handled,
 *         or the inline content resource does not exist
 * @throws RegistryException      if a registry lookup fails
 * @throws IOException            if the content cannot be read or parsed
 * @throws APIManagementException if the artifact manager cannot be obtained
 */
private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);
    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        // Reduce the stored path to the registry-relative path, then derive the file name from it
        String path = documentArtifact.getAttribute(APIConstants.DOC_FILE_PATH);
        int indexOfApimgt = path.indexOf(APIConstants.APIMGT_REGISTRY_LOCATION);
        String filepath = path.substring(indexOfApimgt);
        Resource contentResource = registry.get(filepath);
        int indexOfFiles = filepath.indexOf(APIConstants.DOCUMENT_FILE_DIR) + APIConstants.DOCUMENT_FILE_DIR.length() + 1;
        String fileName = filepath.substring(indexOfFiles);
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch(extension) {
                case APIConstants.PDF_EXTENSION:
                    {
                        RandomAccessBufferedFileInputStream pdfSource = new RandomAccessBufferedFileInputStream(inputStream);
                        COSDocument cosDocument = null;
                        PDDocument pdDocument = null;
                        try {
                            PDFParser pdfParser = new PDFParser(pdfSource);
                            pdfParser.parse();
                            cosDocument = pdfParser.getDocument();
                            pdDocument = new PDDocument(cosDocument);
                            PDFTextStripper stripper = new PDFTextStripper();
                            contentString = stripper.getText(pdDocument);
                        } finally {
                            // The parsed document and the buffered source hold resources of their own,
                            // independent of inputStream, so they are released here explicitly.
                            try {
                                if (pdDocument != null) {
                                    // closing the PDDocument also closes the COSDocument it wraps
                                    pdDocument.close();
                                } else if (cosDocument != null) {
                                    cosDocument.close();
                                }
                            } catch (IOException ignored) {
                                // best-effort cleanup: must not mask the extracted text or the original failure
                            }
                            IOUtils.closeQuietly(pdfSource);
                        }
                        break;
                    }
                case APIConstants.DOC_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                        contentString = msWord2003Extractor.getText();
                        break;
                    }
                case APIConstants.DOCX_EXTENSION:
                    XWPFDocument doc = new XWPFDocument(inputStream);
                    XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                    contentString = msWord2007Extractor.getText();
                    break;
                case APIConstants.XLS_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        ExcelExtractor extractor = new ExcelExtractor(pfs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.XLSX_EXTENSION:
                    XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                    XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                    contentString = xssfExcelExtractor.getText();
                    break;
                case APIConstants.PPT_EXTENSION:
                    {
                        POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                        PowerPointExtractor extractor = new PowerPointExtractor(fs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.PPTX_EXTENSION:
                    XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                    XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                    contentString = xslfPowerPointExtractor.getText();
                    break;
                case APIConstants.TXT_EXTENSION:
                case APIConstants.WSDL_EXTENSION:
                case APIConstants.XML_DOC_EXTENSION:
                    // The reader is not closed here; the underlying stream is closed in the finally below
                    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                    String line;
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                    break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        // Inline content lives next to the document resource, under the inline content directory
        String fileName = ((ResourceImpl) documentResource).getName();
        String pathToDocFile = documentResource.getPath();
        String pathToContent = pathToDocFile.substring(0, pathToDocFile.lastIndexOf(fileName)) + APIConstants.INLINE_DOCUMENT_CONTENT_DIR + RegistryConstants.PATH_SEPARATOR + fileName;
        if (registry.resourceExists(pathToContent)) {
            Resource contentResource = registry.get(pathToContent);
            InputStream instream = null;
            BufferedReader reader = null;
            String line;
            try {
                instream = contentResource.getContentStream();
                reader = new BufferedReader(new InputStreamReader(instream));
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
            } finally {
                if (reader != null) {
                    IOUtils.closeQuietly(reader);
                } else if (instream != null) {
                    // the reader was never created, so the raw stream has to be closed directly
                    IOUtils.closeQuietly(instream);
                }
            }
        }
    }
    return contentString;
}
Also used : GenericArtifact(org.wso2.carbon.governance.api.generic.dataobjects.GenericArtifact) GenericArtifactManager(org.wso2.carbon.governance.api.generic.GenericArtifactManager) InputStreamReader(java.io.InputStreamReader) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) RandomAccessBufferedFileInputStream(org.apache.pdfbox.io.RandomAccessBufferedFileInputStream) InputStream(java.io.InputStream) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) PDFParser(org.apache.pdfbox.pdfparser.PDFParser) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) Resource(org.wso2.carbon.registry.core.Resource) COSDocument(org.apache.pdfbox.cos.COSDocument) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) BufferedReader(java.io.BufferedReader) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) RandomAccessBufferedFileInputStream(org.apache.pdfbox.io.RandomAccessBufferedFileInputStream) PDFTextStripper(org.apache.pdfbox.text.PDFTextStripper)

Example 5 with XSLFPowerPointExtractor

use of org.apache.poi.xslf.extractor.XSLFPowerPointExtractor in project poi by apache.

In class XSLFFileHandler, method handleExtracting.

/**
 * Runs the inherited extraction checks, then additionally exercises the
 * three-argument {@code getText} overload of {@code XSLFPowerPointExtractor}:
 * with every option enabled the result must be non-null, with every option
 * disabled it must be the empty string.
 *
 * @param file the file to extract text from
 * @throws Exception if extraction fails
 */
@Override
public void handleExtracting(File file) throws Exception {
    super.handleExtracting(file);
    // additionally try the other getText() methods
    XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file);
    // Checked before entering the try: with a null extractor, close() in the finally block
    // would throw a NullPointerException and hide this assertion failure.
    assertNotNull(extractor);
    try {
        assertNotNull(extractor.getText(true, true, true));
        assertEquals("With all options disabled we should not get text", "", extractor.getText(false, false, false));
    } finally {
        extractor.close();
    }
}
Also used : XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor)

Aggregations

XSLFPowerPointExtractor (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor)9 XSSFExcelExtractor (org.apache.poi.xssf.extractor.XSSFExcelExtractor)6 XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor)6 PowerPointExtractor (org.apache.poi.hslf.extractor.PowerPointExtractor)5 XDGFVisioExtractor (org.apache.poi.xdgf.extractor.XDGFVisioExtractor)4 Test (org.junit.Test)4 IOException (java.io.IOException)3 ExcelExtractor (org.apache.poi.hssf.extractor.ExcelExtractor)3 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)3 POIFSFileSystem (org.apache.poi.poifs.filesystem.POIFSFileSystem)3 XMLSlideShow (org.apache.poi.xslf.usermodel.XMLSlideShow)3 XSSFEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor)3 FileInputStream (java.io.FileInputStream)2 POITextExtractor (org.apache.poi.POITextExtractor)2 POIXMLException (org.apache.poi.POIXMLException)2 VisioTextExtractor (org.apache.poi.hdgf.extractor.VisioTextExtractor)2 PublisherTextExtractor (org.apache.poi.hpbf.extractor.PublisherTextExtractor)2 OutlookTextExtactor (org.apache.poi.hsmf.extractor.OutlookTextExtactor)2 EventBasedExcelExtractor (org.apache.poi.hssf.extractor.EventBasedExcelExtractor)2 Word6Extractor (org.apache.poi.hwpf.extractor.Word6Extractor)2