use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project pdfbox by apache.
the class ExtractText method startExtraction.
/**
 * Starts the text extraction.
 *
 * <p>Parses the command line, loads the PDF, writes its text to the console or to an
 * output file, and then does the same for embedded PDF attachments found in the root
 * node of the embedded-files name tree.</p>
 *
 * <p>If no output file is given, its name is derived from the PDF file name by replacing
 * the last four characters (assumed to be an extension such as ".pdf") with ".txt" or
 * ".html". Names of four characters or fewer simply get the extension appended.</p>
 *
 * @param args the commandline arguments.
 * @throws IOException if there is an error reading the document or extracting the text.
 */
public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = STD_ENCODING;
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals(PASSWORD)) {
            i++;
            if (i >= args.length) {
                usage();
                // usage() presumably terminates; return so args[i] is never read out of bounds
                return;
            }
            password = args[i];
        } else if (args[i].equals(ENCODING)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            encoding = args[i];
        } else if (args[i].equals(START_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            startPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(HTML)) {
            toHTML = true;
            ext = ".html";
        } else if (args[i].equals(SORT)) {
            sort = true;
        } else if (args[i].equals(IGNORE_BEADS)) {
            separateBeads = false;
        } else if (args[i].equals(DEBUG)) {
            debug = true;
        } else if (args[i].equals(END_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            endPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(CONSOLE)) {
            toConsole = true;
        } else {
            // First positional argument is the input PDF, any later one the output file
            if (pdfFile == null) {
                pdfFile = args[i];
            } else {
                outputFile = args[i];
            }
        }
    }
    if (pdfFile == null) {
        usage();
        return;
    }
    Writer output = null;
    PDDocument document = null;
    try {
        long startTime = startProcessing("Loading PDF " + pdfFile);
        if (outputFile == null) {
            // Short names used to leave outputFile null, which made FileOutputStream throw
            // a NullPointerException; fall back to appending the extension instead.
            String baseName = pdfFile.length() > 4
                    ? pdfFile.substring(0, pdfFile.length() - 4)
                    : pdfFile;
            outputFile = new File(baseName + ext).getAbsolutePath();
        }
        document = PDDocument.load(new File(pdfFile), password);
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract text");
        }
        stopProcessing("Time for loading: ", startTime);
        if (toConsole) {
            output = new OutputStreamWriter(System.out, encoding);
        } else {
            if (toHTML && !STD_ENCODING.equals(encoding)) {
                encoding = STD_ENCODING;
                System.out.println("The encoding parameter is ignored when writing html output.");
            }
            output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
        }
        PDFTextStripper stripper;
        if (toHTML) {
            stripper = new PDFText2HTML();
        } else {
            stripper = new PDFTextStripper();
        }
        stripper.setSortByPosition(sort);
        stripper.setShouldSeparateByBeads(separateBeads);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);
        startTime = startProcessing("Starting text extraction");
        if (debug) {
            System.err.println("Writing to " + outputFile);
        }
        // Extract text for main document:
        stripper.writeText(document, output);
        // ... also for any embedded PDFs:
        writeEmbeddedPdfText(document, stripper, output);
        stopProcessing("Time for extraction: ", startTime);
    } finally {
        IOUtils.closeQuietly(output);
        IOUtils.closeQuietly(document);
    }
}

/**
 * Writes the text of every embedded file with subtype "application/pdf" to {@code output}.
 *
 * <p>Only the names stored directly on the root node of the embedded-files name tree are
 * visited; entries held in kid nodes are not processed.</p>
 *
 * @param document the document whose attachments are inspected.
 * @param stripper the configured stripper, reused for every embedded PDF.
 * @param output the writer receiving the extracted text.
 * @throws IOException if an embedded PDF cannot be read or its text cannot be written.
 */
private void writeEmbeddedPdfText(PDDocument document, PDFTextStripper stripper, Writer output) throws IOException {
    PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
    if (names == null) {
        return;
    }
    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
    if (embeddedFiles == null) {
        return;
    }
    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
    if (embeddedFileNames == null) {
        return;
    }
    for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
        if (debug) {
            System.err.println("Processing embedded file " + ent.getKey() + ":");
        }
        PDComplexFileSpecification spec = ent.getValue();
        PDEmbeddedFile file = spec.getEmbeddedFile();
        if (file != null && "application/pdf".equals(file.getSubtype())) {
            if (debug) {
                System.err.println(" is PDF (size=" + file.getSize() + ")");
            }
            try (InputStream fis = file.createInputStream();
                    PDDocument subDoc = PDDocument.load(fis)) {
                stripper.writeText(subDoc, output);
            }
        }
    }
}
use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project pdfbox by apache.
the class EmbeddedFiles method doIt.
/**
 * Creates a one-page PDF that shows a short hint text and carries a single embedded
 * text file, then saves it.
 *
 * <p>The attachment is registered in a kid node of the embedded-files name tree, which
 * is then set on the document catalog.</p>
 *
 * @param file The file to write the PDF to.
 *
 * @throws IOException If there is an error writing the data.
 */
public void doIt(String file) throws IOException {
    try (// the document
    PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);
        PDFont font = PDType1Font.HELVETICA_BOLD;
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
            contentStream.beginText();
            contentStream.setFont(font, 12);
            contentStream.newLineAtOffset(100, 700);
            contentStream.showText("Go to Document->File Attachments to View Embedded Files");
            contentStream.endText();
        }
        // embedded files are stored in a named tree
        PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
        // first create the file specification, which holds the embedded file
        PDComplexFileSpecification fs = new PDComplexFileSpecification();
        fs.setFile("Test.txt");
        // create a dummy file stream, this would probably normally be a FileInputStream
        byte[] data = "This is the contents of the embedded file".getBytes("ISO-8859-1");
        ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
        PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
        // now let's set some of the optional parameters
        // (the subtype is the MIME type of the attachment; it was misspelled "test/plain")
        ef.setSubtype("text/plain");
        ef.setSize(data.length);
        ef.setCreationDate(new GregorianCalendar());
        fs.setEmbeddedFile(ef);
        // create a new tree node and add the embedded file
        PDEmbeddedFilesNameTreeNode treeNode = new PDEmbeddedFilesNameTreeNode();
        treeNode.setNames(Collections.singletonMap("My first attachment", fs));
        // add the new node as kid to the root node
        List<PDEmbeddedFilesNameTreeNode> kids = new ArrayList<>();
        kids.add(treeNode);
        efTree.setKids(kids);
        // add the tree to the document catalog
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        names.setEmbeddedFiles(efTree);
        doc.getDocumentCatalog().setNames(names);
        doc.save(file);
    }
}
use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project mustangproject by ZUGFeRD.
the class ZUGFeRDImporter method extractLowLevel.
/**
 * Extracts a ZUGFeRD invoice from a PDF document represented by an input stream. Errors are reported via exception handling.
 *
 * <p>The XMP metadata is read into {@code xmpString}; if the document has no metadata,
 * "no-xmlpart" is logged and nothing else is extracted. Embedded files are then passed to
 * {@code extractFiles}, taken from the root node of the embedded-files name tree or,
 * when the root holds no names, from its direct kids.</p>
 *
 * @param pdfStream an input stream of a PDF file
 * @throws IOException if the PDF cannot be loaded or its metadata or attachments cannot be read
 */
private void extractLowLevel(InputStream pdfStream) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfStream)) {
        // Check the catalog before anything dereferences it.
        if (doc.getDocumentCatalog() == null || doc.getDocumentCatalog().getMetadata() == null) {
            Logger.getLogger(ZUGFeRDImporter.class.getName()).log(Level.INFO, "no-xmlpart");
            return;
        }
        // Close the metadata stream once it has been converted.
        try (InputStream xmp = doc.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
            xmpString = convertStreamToString(xmp);
        }
        final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles();
        if (etn == null) {
            return;
        }
        final Map<String, PDComplexFileSpecification> efMap = etn.getNames();
        if (efMap != null) {
            // see https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/
            extractFiles(efMap);
            return;
        }
        // The root holds no names itself: look one level down.
        final List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids();
        if (kids == null) {
            // A name tree node without a Kids array yields null, not an empty list.
            return;
        }
        for (final PDNameTreeNode<PDComplexFileSpecification> node : kids) {
            final Map<String, PDComplexFileSpecification> namesL = node.getNames();
            if (namesL != null) {
                extractFiles(namesL);
            }
        }
    }
}
use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project mustangproject by ZUGFeRD.
the class ZUGFeRDExporter method PDFAttachGenericFile.
/**
 * Embeds an external file (generic - any type allowed) in the PDF.
 *
 * <p>The file specification is added to the names of the root node of the document's
 * embedded-files name tree (keeping the names already stored there) and appended to the
 * catalog's {@code AF} array, which is created if absent.</p>
 *
 * @param doc
 *            PDDocument to attach the file to.
 * @param filename
 *            name of the file that will become attachment name in the PDF
 * @param relationship
 *            how the file relates to the content, e.g. "Alternative"
 * @param description
 *            Human-readable description of the file content
 * @param subType
 *            type of the data e.g. could be "text/xml" - mime like
 * @param data
 *            the binary data of the file/attachment
 * @throws java.io.IOException
 *             propagated from the PDFBox calls that create the embedded file stream
 *             and read the existing name tree
 */
public void PDFAttachGenericFile(PDDocument doc, String filename, String relationship, String description, String subType, byte[] data) throws IOException {
    PDComplexFileSpecification fs = new PDComplexFileSpecification();
    fs.setFile(filename);
    COSDictionary dict = fs.getCOSObject();
    dict.setName("AFRelationship", relationship);
    dict.setString("UF", filename);
    dict.setString("Desc", description);
    ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
    PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
    ef.setSubtype(subType);
    ef.setSize(data.length);
    ef.setCreationDate(new GregorianCalendar());
    ef.setModDate(GregorianCalendar.getInstance());
    fs.setEmbeddedFile(ef);
    // In addition make sure the embedded file is set under /UF
    dict = fs.getCOSObject();
    COSDictionary efDict = (COSDictionary) dict.getDictionaryObject(COSName.EF);
    COSBase lowerLevelFile = efDict.getItem(COSName.F);
    efDict.setItem(COSName.UF, lowerLevelFile);
    // now add the entry to the embedded file tree and set in the document.
    PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
    if (efTree == null) {
        efTree = new PDEmbeddedFilesNameTreeNode();
    }
    // Copy the existing names into a fresh map before adding the new entry.
    Map<String, PDComplexFileSpecification> namesMap = new HashMap<String, PDComplexFileSpecification>();
    Map<String, PDComplexFileSpecification> oldNamesMap = efTree.getNames();
    if (oldNamesMap != null) {
        namesMap.putAll(oldNamesMap);
    }
    namesMap.put(filename, fs);
    efTree.setNames(namesMap);
    names.setEmbeddedFiles(efTree);
    doc.getDocumentCatalog().setNames(names);
    // AF entry (Array) in catalog with the FileSpec.
    // getDictionaryObject resolves an indirect reference, so an existing array is found
    // even when the catalog only stores a reference to it.
    COSDictionary catalogDict = doc.getDocumentCatalog().getCOSObject();
    COSBase existingAf = catalogDict.getDictionaryObject(COSName.getPDFName("AF"));
    COSArray cosArray = existingAf instanceof COSArray ? (COSArray) existingAf : new COSArray();
    cosArray.add(fs);
    catalogDict.setItem("AF", cosArray);
}
use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project tika by apache.
the class AbstractPDF2XHTML method extractEmbeddedDocuments.
/**
 * Passes the document's embedded files to {@code processEmbeddedDocNames}.
 *
 * <p>The names stored on the root node of the embedded-files name tree are used when
 * present. Otherwise each direct kid of the root is asked for its names; deeper levels
 * of the tree are not searched.</p>
 *
 * @param document the PDF whose attachments are processed
 */
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode root =
            new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (root == null) {
        return;
    }
    Map<String, PDComplexFileSpecification> rootNames = root.getNames();
    if (rootNames != null) {
        // Names sit directly on the root: nothing below it is consulted.
        processEmbeddedDocNames(rootNames);
        return;
    }
    List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
        Map<String, PDComplexFileSpecification> childNames = child.getNames();
        if (childNames == null) {
            continue;
        }
        processEmbeddedDocNames(childNames);
    }
}
Aggregations