Use of org.apache.pdfbox.util.PDFTextStripper in project camel by apache.
Class PdfProducer, method doExtractText.
/**
 * Extracts the plain text of the PDF document carried in the in-message body.
 * <p>
 * When the document reports itself as encrypted, the decryption material is read from the
 * {@code DECRYPTION_MATERIAL_HEADER_NAME} header and used to open the document before the
 * text is stripped.
 *
 * @param exchange the exchange whose in-message body is converted to a {@code PDDocument}
 * @return the text returned by {@code PDFTextStripper.getText} for the document
 * @throws IllegalArgumentException if the document is encrypted and the decryption material
 *                                  header is missing
 * @throws IOException              declared by the signature; the stripper is created and run here
 */
private String doExtractText(Exchange exchange) throws IOException, CryptographyException, InvalidPasswordException, BadSecurityHandlerException {
    LOG.debug("Got {} operation, going to extract text from provided pdf.", pdfConfiguration.getOperation());

    final PDDocument pdf = exchange.getIn().getBody(PDDocument.class);
    if (pdf.isEncrypted()) {
        final DecryptionMaterial material =
                exchange.getIn().getHeader(DECRYPTION_MATERIAL_HEADER_NAME, DecryptionMaterial.class);
        if (material != null) {
            pdf.openProtection(material);
        } else {
            // An encrypted document cannot be read without the caller supplying the material.
            throw new IllegalArgumentException(String.format(
                    "%s header is expected for %s operation on encrypted document",
                    DECRYPTION_MATERIAL_HEADER_NAME, pdfConfiguration.getOperation()));
        }
    }

    return new PDFTextStripper().getText(pdf);
}
Use of org.apache.pdfbox.util.PDFTextStripper in project camel by apache.
Class PdfAppendTest, method testAppend.
/**
 * Sends a one-page PDF (in the {@code PDF_DOCUMENT_HEADER_NAME} header) together with the
 * text to append to {@code direct:start}, then checks that the PDF received by the result
 * endpoint has two pages and contains both the original and the appended text.
 */
@Test
public void testAppend() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";

    // Build a single A4 page containing the original text.
    PDDocument document = new PDDocument();
    PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
    document.addPage(page);
    PDPageContentStream contentStream = new PDPageContentStream(document, page);
    contentStream.setFont(PDType1Font.HELVETICA, 12);
    contentStream.beginText();
    contentStream.moveTextPositionByAmount(20, 400);
    contentStream.drawString(originalText);
    contentStream.endText();
    contentStream.close();

    // Register the expectations before sending, which is the usual mock endpoint ordering.
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                try {
                    PDFTextStripper pdfTextStripper = new PDFTextStripper();
                    String text = pdfTextStripper.getText(doc);
                    assertEquals(2, doc.getNumberOfPages());
                    assertThat(text, containsString(originalText));
                    assertThat(text, containsString(textToAppend));
                } finally {
                    // Release the re-loaded document even when an assertion fails.
                    doc.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });

    template.sendBodyAndHeader("direct:start", textToAppend, PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, document);

    resultEndpoint.assertIsSatisfied();
}
Use of org.apache.pdfbox.util.PDFTextStripper in project camel by apache.
Class PdfAppendTest, method testAppendEncrypted.
/**
 * Same scenario as a plain append, but the input PDF is protected with a 128-bit
 * {@code StandardProtectionPolicy}. The encrypted document and a
 * {@code StandardDecryptionMaterial} built from the user password are passed as headers
 * to {@code direct:start}; the PDF received by the result endpoint must have two pages
 * and contain both the original and the appended text.
 */
@Test
public void testAppendEncrypted() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";
    final String ownerPass = "ownerPass";
    final String userPass = "userPass";

    // Build a single A4 page containing the original text.
    PDDocument document = new PDDocument();
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    try {
        PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.setFont(PDType1Font.HELVETICA, 12);
        contentStream.beginText();
        contentStream.moveTextPositionByAmount(20, 400);
        contentStream.drawString(originalText);
        contentStream.endText();
        contentStream.close();

        AccessPermission accessPermission = new AccessPermission();
        accessPermission.setCanExtractContent(false);
        StandardProtectionPolicy protectionPolicy = new StandardProtectionPolicy(ownerPass, userPass, accessPermission);
        protectionPolicy.setEncryptionKeyLength(128);
        document.protect(protectionPolicy);
        document.save(output);
    } finally {
        // The plain source document is only needed to produce the encrypted bytes.
        document.close();
    }

    // Encryption happens after saving.
    PDDocument encryptedDocument = PDDocument.load(new ByteArrayInputStream(output.toByteArray()));
    Map<String, Object> headers = new HashMap<String, Object>();
    headers.put(PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, encryptedDocument);
    headers.put(PdfHeaderConstants.DECRYPTION_MATERIAL_HEADER_NAME, new StandardDecryptionMaterial(userPass));

    // Register the expectations before sending, which is the usual mock endpoint ordering.
    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                try {
                    PDFTextStripper pdfTextStripper = new PDFTextStripper();
                    String text = pdfTextStripper.getText(doc);
                    assertEquals(2, doc.getNumberOfPages());
                    assertThat(text, containsString(originalText));
                    assertThat(text, containsString(textToAppend));
                } finally {
                    // Release the re-loaded document even when an assertion fails.
                    doc.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });

    template.sendBodyAndHeaders("direct:start", textToAppend, headers);

    resultEndpoint.assertIsSatisfied();
}
Use of org.apache.pdfbox.util.PDFTextStripper in project OpenOLAT by OpenOLAT.
Class PdfBoxExtractor, method extractTextFromPdf.
/**
 * Reads the given PDF leaf and returns its title and text content.
 * <p>
 * An encrypted document is first decrypted with an empty password. If that fails, a
 * {@code FileContent} whose title and content are both the leaf name is returned instead
 * of propagating the failure. Both the document and the input stream are closed before
 * the method returns or throws.
 *
 * @param leaf the file to read the PDF from
 * @return the extracted title and text, or the leaf name when the PDF cannot be decrypted
 * @throws IOException if loading, text extraction or closing the resources fails
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    BufferedInputStream bis = null;
    try {
        bis = new BufferedInputStream(leaf.getInputStream());
        document = PDDocument.load(bis);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (Exception e) {
                // Deliberate best effort: fall back to the file name as the only content.
                log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                writer.append(leaf.getName());
                writer.close();
                return new FileContent(leaf.getName(), writer.toString());
            }
        }
        String title = getTitle(document);
        if (log.isDebug()) {
            log.debug("readContent PDDocument loaded");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
        stripper.writeText(document, writer);
        writer.close();
        return new FileContent(title, writer.toString());
    } finally {
        // Nested so that a failure while closing the document cannot leak the stream.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
    }
}
Use of org.apache.pdfbox.util.PDFTextStripper in project portfolio by buchen.
Class PDFInputFile, method parse.
/**
 * Loads the PDF returned by {@code getFile()} and stores its author and text in the
 * {@code author} and {@code text} fields.
 * <p>
 * The author falls back to an empty string when the document information has none. The
 * text is extracted with position sorting enabled. The document is closed when the
 * method completes, whether normally or with an exception.
 *
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public void parse() throws IOException {
    try (PDDocument pdf = PDDocument.load(getFile())) {
        String documentAuthor = pdf.getDocumentInformation().getAuthor();
        if (documentAuthor == null) {
            documentAuthor = ""; //$NON-NLS-1$
        }
        author = documentAuthor;

        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        text = stripper.getText(pdf);
    }
}
Aggregations