Examples with PDFParser - org.apache.pdfbox.pdfparser.PDFParser

Example 1 with PDFParser

Use of org.apache.pdfbox.pdfparser.PDFParser in the project pdfbox by apache.

From the class SignatureOptions, method initFromRandomAccessRead.

/**
 * Stores the given random-access source and parses it as a PDF, keeping the
 * resulting document as the visual signature.
 *
 * @param rar source to read the PDF from; retained in {@code pdfSource}
 * @throws IOException if the parser fails to read or parse the source
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException {
    this.pdfSource = rar;
    final PDFParser signatureParser = new PDFParser(this.pdfSource);
    signatureParser.parse();
    this.visualSignature = signatureParser.getDocument();
}

Also used : PDFParser(org.apache.pdfbox.pdfparser.PDFParser)

Example 2 with PDFParser

Use of org.apache.pdfbox.pdfparser.PDFParser in the project carbon-apimgt by wso2.

From the class DocumentIndexer, method fetchDocumentContent.

/**
 * Reads the content of a documentation resource from the registry and returns it as text.
 *
 * <p>For documents whose source type is {@code FILE}, the attached file is loaded from the
 * registry and converted to text according to its file extension (PDF, DOC, DOCX, XLS, XLSX,
 * PPT, PPTX, or plain text / WSDL / XML). For {@code INLINE} documents, the content resource
 * stored under the inline-content directory next to the document is read. Any other source
 * type, an unhandled file extension, or a missing inline content resource yields {@code null}.
 *
 * @param registry registry used to resolve the document artifact and its content resource
 * @param documentResource registry resource that represents the document
 * @return the extracted text, or {@code null} if nothing was extracted
 * @throws RegistryException if a registry lookup fails
 * @throws IOException if the content cannot be read, parsed, or (for PDFs) released
 * @throws APIManagementException declared by this method; presumably raised while obtaining
 *         the artifact manager - confirm against {@code APIUtil.getArtifactManager}
 */
private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);
    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        String path = documentArtifact.getAttribute(APIConstants.DOC_FILE_PATH);
        // Strip everything in front of the API management registry location.
        int indexOfApimgt = path.indexOf(APIConstants.APIMGT_REGISTRY_LOCATION);
        String filepath = path.substring(indexOfApimgt);
        Resource contentResource = registry.get(filepath);
        // The file name is whatever follows "<document file dir>/" in the path.
        int indexOfFiles = filepath.indexOf(APIConstants.DOCUMENT_FILE_DIR) + APIConstants.DOCUMENT_FILE_DIR.length() + 1;
        String fileName = filepath.substring(indexOfFiles);
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch(extension) {
                case APIConstants.PDF_EXTENSION:
                    {
                        // Both the random-access source and the parsed document hold resources
                        // that are only released by close(), so they are scoped explicitly.
                        try (RandomAccessBufferedFileInputStream pdfSource = new RandomAccessBufferedFileInputStream(inputStream)) {
                            PDFParser pdfParser = new PDFParser(pdfSource);
                            pdfParser.parse();
                            COSDocument cosDocument = pdfParser.getDocument();
                            // Closing the PDDocument also closes the wrapped COSDocument.
                            try (PDDocument pdDocument = new PDDocument(cosDocument)) {
                                PDFTextStripper stripper = new PDFTextStripper();
                                contentString = stripper.getText(pdDocument);
                            }
                        }
                        break;
                    }
                case APIConstants.DOC_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                        contentString = msWord2003Extractor.getText();
                        break;
                    }
                case APIConstants.DOCX_EXTENSION:
                    XWPFDocument doc = new XWPFDocument(inputStream);
                    XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                    contentString = msWord2007Extractor.getText();
                    break;
                case APIConstants.XLS_EXTENSION:
                    {
                        POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                        ExcelExtractor extractor = new ExcelExtractor(pfs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.XLSX_EXTENSION:
                    XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                    XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                    contentString = xssfExcelExtractor.getText();
                    break;
                case APIConstants.PPT_EXTENSION:
                    {
                        POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                        PowerPointExtractor extractor = new PowerPointExtractor(fs);
                        contentString = extractor.getText();
                        break;
                    }
                case APIConstants.PPTX_EXTENSION:
                    XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                    XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                    contentString = xslfPowerPointExtractor.getText();
                    break;
                case APIConstants.TXT_EXTENSION:
                case APIConstants.WSDL_EXTENSION:
                case APIConstants.XML_DOC_EXTENSION:
                    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                    String line;
                    StringBuilder contentBuilder = new StringBuilder();
                    // Lines are concatenated as-is: line terminators are dropped, not replaced.
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                    break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        String fileName = ((ResourceImpl) documentResource).getName();
        String pathToDocFile = documentResource.getPath();
        // Inline content lives at <document dir>/<inline content dir>/<file name>.
        String pathToContent = pathToDocFile.substring(0, pathToDocFile.lastIndexOf(fileName)) + APIConstants.INLINE_DOCUMENT_CONTENT_DIR + RegistryConstants.PATH_SEPARATOR + fileName;
        if (registry.resourceExists(pathToContent)) {
            Resource contentResource = registry.get(pathToContent);
            InputStream instream = null;
            BufferedReader reader = null;
            String line;
            try {
                instream = contentResource.getContentStream();
                reader = new BufferedReader(new InputStreamReader(instream));
                StringBuilder contentBuilder = new StringBuilder();
                // Lines are concatenated as-is: line terminators are dropped, not replaced.
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
            } finally {
                if (reader != null) {
                    IOUtils.closeQuietly(reader);
                }
                // Also covers the case where the stream was opened but the reader was not.
                IOUtils.closeQuietly(instream);
            }
        }
    }
    return contentString;
}

Also used : GenericArtifact(org.wso2.carbon.governance.api.generic.dataobjects.GenericArtifact) GenericArtifactManager(org.wso2.carbon.governance.api.generic.GenericArtifactManager) InputStreamReader(java.io.InputStreamReader) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) RandomAccessBufferedFileInputStream(org.apache.pdfbox.io.RandomAccessBufferedFileInputStream) InputStream(java.io.InputStream) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) PDFParser(org.apache.pdfbox.pdfparser.PDFParser) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) Resource(org.wso2.carbon.registry.core.Resource) COSDocument(org.apache.pdfbox.cos.COSDocument) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) BufferedReader(java.io.BufferedReader) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) RandomAccessBufferedFileInputStream(org.apache.pdfbox.io.RandomAccessBufferedFileInputStream) PDFTextStripper(org.apache.pdfbox.text.PDFTextStripper)

Example 3 with PDFParser

Use of org.apache.pdfbox.pdfparser.PDFParser in the project carbon-apimgt by wso2.

From the class PDFIndexerTest, method testShouldReturnIndexedDocumentWhenParameterCorrect.

/**
 * Indexes the same file twice through a {@code PDFIndexerWrapper} built on mocked PDFBox
 * objects and checks the media type field of each resulting index document.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String mediaTypeField = "mediaType";
    final String overriddenMediaType = "application/pdf+test";

    PDFParser mockedParser = Mockito.mock(PDFParser.class);
    COSDocument failingOnClose = Mockito.mock(COSDocument.class);
    PDFTextStripper mockedStripper = Mockito.mock(PDFTextStripper.class);

    // The second document handed out by the parser throws when it is closed.
    Mockito.doThrow(IOException.class).when(failingOnClose).close();
    Mockito.when(mockedParser.getDocument()).thenReturn(new COSDocument(), failingOnClose);
    Mockito.when(mockedStripper.getText(new PDDocument())).thenReturn("");

    PDFIndexer indexer = new PDFIndexerWrapper(mockedParser, mockedStripper);

    // First pass: no media type set on file2Index, so the default one is expected.
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    boolean defaultTypeUsed = "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!defaultTypeUsed) {
        Assert.fail();
    }

    // Second pass: the media type set on file2Index is expected even though closing fails.
    file2Index.mediaType = overriddenMediaType;
    indexed = indexer.getIndexedDocument(file2Index);
    boolean overriddenTypeUsed = overriddenMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!overriddenTypeUsed) {
        Assert.fail();
    }
}

Also used : IndexDocument(org.wso2.carbon.registry.indexing.solr.IndexDocument) PDFIndexerWrapper(org.wso2.carbon.apimgt.impl.indexing.indexer.util.PDFIndexerWrapper) PDFParser(org.apache.pdfbox.pdfparser.PDFParser) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) COSDocument(org.apache.pdfbox.cos.COSDocument) PDFTextStripper(org.apache.pdfbox.text.PDFTextStripper) Test(org.junit.Test)

Example 4 with PDFParser

Use of org.apache.pdfbox.pdfparser.PDFParser in the project carina by qaprosoft.

From the class PDFUtil, method readTxtFromPDF.

/**
 * Reads PDF content in specified page range.
 *
 * <p>The stream is parsed, the text of the requested pages is extracted with position-based
 * sorting enabled, and the parsed document as well as the given stream are closed before this
 * method returns, whether or not extraction succeeded.
 *
 * @param inputStream InputStream of the PDF; it is always closed by this method
 * @param startPage Start Page
 * @param endPage End Page
 * @return PDF content
 * @throws RuntimeException if {@code inputStream} is {@code null}, if parsing or text
 *         extraction fails with an {@link IOException} (wrapped as the cause), or if
 *         extraction succeeded but releasing a resource failed
 */
public static String readTxtFromPDF(InputStream inputStream, int startPage, int endPage) {
    if (inputStream == null) {
        throw new RuntimeException("Input stream not opened");
    }
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    boolean extracted = false;
    try {
        PDFParser parser = new PDFParser(inputStream);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setSortByPosition(true);
        pdfStripper.setStartPage(startPage);
        pdfStripper.setEndPage(endPage);
        String text = pdfStripper.getText(pdDoc);
        extracted = true;
        return text;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // Close every resource independently so that one failing close() cannot prevent the
        // remaining ones from being released; remember only the first failure.
        IOException closeFailure = null;
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException e) {
                closeFailure = e;
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (IOException e) {
                if (closeFailure == null) {
                    closeFailure = e;
                }
            }
        }
        try {
            inputStream.close();
        } catch (IOException e) {
            if (closeFailure == null) {
                closeFailure = e;
            }
        }
        // Only report a close failure when extraction itself succeeded; otherwise the
        // exception already propagating from the try block would be masked.
        if (extracted && closeFailure != null) {
            throw new RuntimeException(closeFailure);
        }
    }
}

Also used : PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDFParser(org.apache.pdfbox.pdfparser.PDFParser) COSDocument(org.apache.pdfbox.cos.COSDocument) IOException(java.io.IOException) PDFTextStripper(org.apache.pdfbox.util.PDFTextStripper)

Example 5 with PDFParser

Use of org.apache.pdfbox.pdfparser.PDFParser in the project pdfbox by apache.

From the class PDDocument, method load.

/**
 * Parses a PDF. Depending on the memory settings parameter the given input
 * stream is either copied to memory or to a temporary file to enable
 * random access to the pdf.
 *
 * <p>The scratch file created for buffering is released again whenever loading does not
 * complete, regardless of whether the failure is an {@link IOException} or an unchecked
 * exception.
 *
 * @param input stream that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams
 *
 * @return loaded document
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(InputStream input, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    boolean loaded = false;
    try {
        RandomAccessRead source = scratchFile.createBuffer(input);
        PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
        parser.parse();
        PDDocument document = parser.getPDDocument();
        loaded = true;
        return document;
    } finally {
        // On success the scratch file stays open because the parser was handed it for the
        // returned document; on any failure it must be released here.
        if (!loaded) {
            IOUtils.closeQuietly(scratchFile);
        }
    }
}

Also used : RandomAccessRead(org.apache.pdfbox.io.RandomAccessRead) ScratchFile(org.apache.pdfbox.io.ScratchFile) PDFParser(org.apache.pdfbox.pdfparser.PDFParser) IOException(java.io.IOException)

Aggregations

PDFParser (org.apache.pdfbox.pdfparser.PDFParser): 9; COSDocument (org.apache.pdfbox.cos.COSDocument): 5; PDDocument (org.apache.pdfbox.pdmodel.PDDocument): 5; IOException (java.io.IOException): 4; PDFTextStripper (org.apache.pdfbox.text.PDFTextStripper): 4; ScratchFile (org.apache.pdfbox.io.ScratchFile): 3; RandomAccessBufferedFileInputStream (org.apache.pdfbox.io.RandomAccessBufferedFileInputStream): 2; RandomAccessRead (org.apache.pdfbox.io.RandomAccessRead): 2; IndexDocument (org.wso2.carbon.registry.indexing.solr.IndexDocument): 2; BufferedReader (java.io.BufferedReader): 1; File (java.io.File): 1; InputStream (java.io.InputStream): 1; InputStreamReader (java.io.InputStreamReader): 1; PrintWriter (java.io.PrintWriter): 1; HashMap (java.util.HashMap): 1; List (java.util.List): 1; RandomAccessBuffer (org.apache.pdfbox.io.RandomAccessBuffer): 1; RandomAccessFile (org.apache.pdfbox.io.RandomAccessFile): 1; PDFTextStripper (org.apache.pdfbox.util.PDFTextStripper): 1; PowerPointExtractor (org.apache.poi.hslf.extractor.PowerPointExtractor): 1