Use of org.apache.pdfbox.pdfparser.PDFParser in project pdfbox by apache.
The class SignatureOptions, method initFromRandomAccessRead.
/**
 * Parses the PDF behind the given random-access source and keeps the result
 * as the visual signature.
 *
 * <p>The source is stored in {@code pdfSource} and the parsed document in
 * {@code visualSignature}.</p>
 *
 * @param rar random-access source of the PDF to parse
 * @throws IOException propagated from creating or running the parser
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException {
    // Assigned before parsing, so the field is set even when parsing throws.
    this.pdfSource = rar;

    final PDFParser signatureParser = new PDFParser(this.pdfSource);
    signatureParser.parse();

    this.visualSignature = signatureParser.getDocument();
}
Use of org.apache.pdfbox.pdfparser.PDFParser in project carbon-apimgt by wso2.
The class DocumentIndexer, method fetchDocumentContent.
/**
 * Reads the text content of a documentation resource so that it can be used
 * as the document's raw content.
 *
 * <p>For {@code FILE} documents the attached file is loaded from the registry
 * and its text is extracted according to the file extension (PDF, DOC, DOCX,
 * XLS, XLSX, PPT, PPTX, or plain text/WSDL/XML). For {@code INLINE} documents
 * the separately stored inline content resource is read, if it exists.</p>
 *
 * @param registry         registry used to resolve the artifact and its content resources
 * @param documentResource registry resource describing the document
 * @return the extracted text, or {@code null} when the source type or file
 *         extension is not handled, or the inline content resource does not exist
 * @throws RegistryException      if a registry lookup fails
 * @throws IOException            if reading or parsing the content fails
 * @throws APIManagementException declared by this method; presumably raised while
 *                                obtaining the artifact manager (verify against APIUtil)
 */
private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);
    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        String path = documentArtifact.getAttribute(APIConstants.DOC_FILE_PATH);
        // Strip everything in front of the API-management registry location.
        // NOTE(review): indexOf returns -1 when the location is absent, which makes
        // substring throw; confirm the stored path always contains it.
        int indexOfApimgt = path.indexOf(APIConstants.APIMGT_REGISTRY_LOCATION);
        String filepath = path.substring(indexOfApimgt);
        Resource contentResource = registry.get(filepath);
        // The file name is whatever follows "<DOCUMENT_FILE_DIR>/" in the path.
        int indexOfFiles = filepath.indexOf(APIConstants.DOCUMENT_FILE_DIR) + APIConstants.DOCUMENT_FILE_DIR.length() + 1;
        String fileName = filepath.substring(indexOfFiles);
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
                case APIConstants.PDF_EXTENSION:
                    // Both the random-access source and the parsed document hold resources
                    // of their own, so they are closed explicitly here; closing only the
                    // registry stream in the outer finally does not release them.
                    try (RandomAccessBufferedFileInputStream pdfSource = new RandomAccessBufferedFileInputStream(inputStream)) {
                        PDFParser pdfParser = new PDFParser(pdfSource);
                        pdfParser.parse();
                        try (PDDocument pdDocument = new PDDocument(pdfParser.getDocument())) {
                            PDFTextStripper stripper = new PDFTextStripper();
                            contentString = stripper.getText(pdDocument);
                        }
                    }
                    break;
                case APIConstants.DOC_EXTENSION:
                {
                    POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                    WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                    contentString = msWord2003Extractor.getText();
                    break;
                }
                case APIConstants.DOCX_EXTENSION:
                    XWPFDocument doc = new XWPFDocument(inputStream);
                    XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                    contentString = msWord2007Extractor.getText();
                    break;
                case APIConstants.XLS_EXTENSION:
                {
                    POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                    ExcelExtractor extractor = new ExcelExtractor(pfs);
                    contentString = extractor.getText();
                    break;
                }
                case APIConstants.XLSX_EXTENSION:
                    XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                    XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                    contentString = xssfExcelExtractor.getText();
                    break;
                case APIConstants.PPT_EXTENSION:
                {
                    POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                    PowerPointExtractor extractor = new PowerPointExtractor(fs);
                    contentString = extractor.getText();
                    break;
                }
                case APIConstants.PPTX_EXTENSION:
                    XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                    XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                    contentString = xslfPowerPointExtractor.getText();
                    break;
                case APIConstants.TXT_EXTENSION:
                case APIConstants.WSDL_EXTENSION:
                case APIConstants.XML_DOC_EXTENSION:
                    contentString = readLinesConcatenated(inputStream);
                    break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        // Inline content lives next to the document resource, under the inline
        // content directory, in a resource with the same name.
        String fileName = ((ResourceImpl) documentResource).getName();
        String pathToDocFile = documentResource.getPath();
        String pathToContent = pathToDocFile.substring(0, pathToDocFile.lastIndexOf(fileName)) + APIConstants.INLINE_DOCUMENT_CONTENT_DIR + RegistryConstants.PATH_SEPARATOR + fileName;
        if (registry.resourceExists(pathToContent)) {
            Resource contentResource = registry.get(pathToContent);
            InputStream instream = null;
            try {
                instream = contentResource.getContentStream();
                contentString = readLinesConcatenated(instream);
            } finally {
                IOUtils.closeQuietly(instream);
            }
        }
    }
    return contentString;
}

/**
 * Reads the stream line by line and concatenates the lines; the caller closes the stream.
 *
 * <p>NOTE(review): line terminators are dropped without a replacement separator,
 * and the reader uses the platform default charset; both match the previous
 * behaviour and are kept as is.</p>
 *
 * @param stream stream to read text from
 * @return all lines of the stream joined without separators
 * @throws IOException if reading from the stream fails
 */
private String readLinesConcatenated(InputStream stream) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    StringBuilder contentBuilder = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        contentBuilder.append(line);
    }
    return contentBuilder.toString();
}
Use of org.apache.pdfbox.pdfparser.PDFParser in project carbon-apimgt by wso2.
The class PDFIndexerTest, method testShouldReturnIndexedDocumentWhenParameterCorrect.
/**
 * Checks the "mediaType" field of the indexed document in two runs: first
 * without a media type set on {@code file2Index}, then with one set while the
 * parsed document's {@code close()} is stubbed to throw.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String mediaTypeField = "mediaType";
    final String customMediaType = "application/pdf+test";

    PDFParser parserMock = Mockito.mock(PDFParser.class);
    COSDocument failingOnCloseDocument = Mockito.mock(COSDocument.class);
    PDFTextStripper stripperMock = Mockito.mock(PDFTextStripper.class);

    // The first getDocument() call yields a real document, the second one a
    // document whose close() throws.
    Mockito.doThrow(IOException.class).when(failingOnCloseDocument).close();
    Mockito.when(parserMock.getDocument()).thenReturn(new COSDocument()).thenReturn(failingOnCloseDocument);
    Mockito.when(stripperMock.getText(new PDDocument())).thenReturn("");

    PDFIndexer indexer = new PDFIndexerWrapper(parserMock, stripperMock);

    // should return the default media type when media type is not defined in file2Index
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    boolean defaultMediaTypeReturned = "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!defaultMediaTypeReturned) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if error occurs in finally block
    file2Index.mediaType = customMediaType;
    indexed = indexer.getIndexedDocument(file2Index);
    boolean customMediaTypeReturned = customMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!customMediaTypeReturned) {
        Assert.fail();
    }
}
Use of org.apache.pdfbox.pdfparser.PDFParser in project carina by qaprosoft.
The class PDFUtil, method readTxtFromPDF.
/**
 * Reads PDF content in specified page range.
 *
 * <p>The parsed documents and the given input stream are always closed before
 * this method returns or throws. Each of them is closed independently, so a
 * failure to close one no longer prevents the others from being closed, and a
 * close failure never hides an earlier read/parse failure (it is attached to
 * that failure as a suppressed exception instead).</p>
 *
 * @param inputStream InputStream with the PDF data; closed by this method
 * @param startPage Start Page
 * @param endPage End Page
 * @return PDF content
 * @throws RuntimeException if {@code inputStream} is {@code null}, if reading or
 *         parsing fails with an {@link IOException}, or if the read succeeded but
 *         closing a resource failed
 */
public static String readTxtFromPDF(InputStream inputStream, int startPage, int endPage) {
    if (inputStream == null) {
        throw new RuntimeException("Input stream not opened");
    }
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;
    // Failure raised by the read itself; kept so cleanup problems cannot mask it.
    Throwable primaryFailure = null;
    try {
        PDFParser parser = new PDFParser(inputStream);
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setSortByPosition(true);
        pdfStripper.setStartPage(startPage);
        pdfStripper.setEndPage(endPage);
        return pdfStripper.getText(pdDoc);
    } catch (IOException e) {
        RuntimeException wrapped = new RuntimeException(e);
        primaryFailure = wrapped;
        throw wrapped;
    } catch (RuntimeException e) {
        primaryFailure = e;
        throw e;
    } finally {
        // Close every resource in its own try block; remember only the first
        // close failure so the remaining resources are still released.
        IOException closeFailure = null;
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (IOException e) {
            closeFailure = e;
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (IOException e) {
            if (closeFailure == null) {
                closeFailure = e;
            }
        }
        try {
            inputStream.close();
        } catch (IOException e) {
            if (closeFailure == null) {
                closeFailure = e;
            }
        }
        if (closeFailure != null) {
            if (primaryFailure == null) {
                // The read succeeded; report the close failure as before.
                throw new RuntimeException(closeFailure);
            }
            primaryFailure.addSuppressed(closeFailure);
        }
    }
}
Use of org.apache.pdfbox.pdfparser.PDFParser in project pdfbox by apache.
The class PDDocument, method load.
/**
 * Loads a PDF from an input stream. The stream content is first buffered
 * (in memory or in a temporary file, as dictated by the memory settings) so
 * that the parser gets random access to the PDF.
 *
 * @param input stream that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams
 *
 * @return loaded document
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(InputStream input, String password, InputStream keyStore, String alias, MemoryUsageSetting memUsageSetting) throws IOException {
    final ScratchFile buffer = new ScratchFile(memUsageSetting);
    try {
        final PDFParser pdfParser =
                new PDFParser(buffer.createBuffer(input), password, keyStore, alias, buffer);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    } catch (IOException loadFailure) {
        // No document is returned on this path, so the scratch file is closed here
        // before the failure is passed on to the caller.
        IOUtils.closeQuietly(buffer);
        throw loadFailure;
    }
}
Aggregations