Search in sources :

Example 1 with PDNameTreeNode

use of org.apache.pdfbox.pdmodel.common.PDNameTreeNode in project mustangproject by ZUGFeRD.

the class ZUGFeRDImporter method extractLowLevel.

/**
 * Extracts a ZUGFeRD invoice from a PDF document represented by an input stream. Errors are reported via exception handling.
 * <p>
 * The document's XMP metadata is stored in {@code xmpString}. Embedded files are then looked up in the
 * embedded-files name tree: the names on the root node are passed to {@code extractFiles}, or, when the
 * root carries no names, the names of each direct kid node. If the document has no catalog or no
 * metadata, "no-xmlpart" is logged and nothing is extracted.
 *
 * @param pdfStream an input stream of a PDF file
 * @throws IOException propagated from loading or reading the PDF document
 */
private void extractLowLevel(InputStream pdfStream) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfStream)) {
        if (doc.getDocumentCatalog() == null || doc.getDocumentCatalog().getMetadata() == null) {
            Logger.getLogger(ZUGFeRDImporter.class.getName()).log(Level.INFO, "no-xmlpart");
            return;
        }
        // Close the metadata stream once it has been read into the string.
        try (InputStream xmp = doc.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
            xmpString = convertStreamToString(xmp);
        }
        // Built only after the catalog is known to be non-null.
        final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles();
        if (etn == null) {
            return;
        }
        // see https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/
        final Map<String, PDComplexFileSpecification> efMap = etn.getNames();
        if (efMap != null) {
            extractFiles(efMap);
            return;
        }
        // The root node has no names of its own: fall back to its direct kids.
        final List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids();
        if (kids == null) {
            // Neither names nor kids: there is nothing to extract.
            return;
        }
        for (final PDNameTreeNode<PDComplexFileSpecification> node : kids) {
            final Map<String, PDComplexFileSpecification> namesL = node.getNames();
            // A kid may be an intermediate node without names; skip it instead of passing null on.
            if (namesL != null) {
                extractFiles(namesL);
            }
        }
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDNameTreeNode(org.apache.pdfbox.pdmodel.common.PDNameTreeNode)

Example 2 with PDNameTreeNode

use of org.apache.pdfbox.pdmodel.common.PDNameTreeNode in project tika by apache.

the class AbstractPDF2XHTML method extractEmbeddedDocuments.

/**
 * Walks the embedded-files name tree of the given document and hands every name map it finds
 * to {@code processEmbeddedDocNames}.
 * <p>
 * If the root node carries names, only those are processed. Otherwise the direct kids of the
 * root are visited and each kid's names (when present) are processed. A document without an
 * embedded-files tree, or a root with neither names nor kids, results in no work.
 *
 * @param document the PDF document whose embedded files are looked up
 */
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException {
    final PDEmbeddedFilesNameTreeNode root =
            new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (root == null) {
        return;
    }

    // Names stored directly on the root take precedence over anything below it.
    final Map<String, PDComplexFileSpecification> rootNames = root.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames);
        return;
    }

    final List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
    if (children == null) {
        return;
    }
    for (final PDNameTreeNode<PDComplexFileSpecification> child : children) {
        final Map<String, PDComplexFileSpecification> childNames = child.getNames();
        if (childNames == null) {
            continue;
        }
        processEmbeddedDocNames(childNames);
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDNameTreeNode(org.apache.pdfbox.pdmodel.common.PDNameTreeNode)

Example 3 with PDNameTreeNode

use of org.apache.pdfbox.pdmodel.common.PDNameTreeNode in project pdfbox by apache.

the class ExtractEmbeddedFiles method main.

/**
 * This is the main method.
 * <p>
 * Expects exactly one argument, the path of a PDF file; otherwise the usage text is printed and
 * the JVM exits with status 1. Files found in the document's embedded-files name tree (root names,
 * or the names of the root's direct kids) and in file-attachment annotations on any page are
 * extracted using the PDF's parent directory as the path prefix.
 *
 * @param args The command line arguments.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        usage();
        System.exit(1);
    } else {
        File pdfFile = new File(args[0]);
        // getParent() is null for a bare file name such as "doc.pdf"; concatenating it would
        // yield the literal prefix "null/". Resolve against the working directory in that case.
        String parentDir = pdfFile.getParent();
        if (parentDir == null) {
            parentDir = pdfFile.getAbsoluteFile().getParent();
        }
        String filePath = parentDir + System.getProperty("file.separator");
        try (PDDocument document = PDDocument.load(pdfFile)) {
            PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog());
            PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
            if (efTree != null) {
                Map<String, PDComplexFileSpecification> names = efTree.getNames();
                if (names != null) {
                    extractFiles(names, filePath);
                } else {
                    List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
                    // A root node may have neither names nor kids.
                    if (kids != null) {
                        for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
                            names = node.getNames();
                            // Skip kids that carry no names of their own.
                            if (names != null) {
                                extractFiles(names, filePath);
                            }
                        }
                    }
                }
            }
            // extract files from annotations
            for (PDPage page : document.getPages()) {
                for (PDAnnotation annotation : page.getAnnotations()) {
                    if (annotation instanceof PDAnnotationFileAttachment) {
                        PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
                        PDFileSpecification fileSpec = annotationFileAttachment.getFile();
                        if (fileSpec instanceof PDComplexFileSpecification) {
                            PDComplexFileSpecification complexFileSpec = (PDComplexFileSpecification) fileSpec;
                            PDEmbeddedFile embeddedFile = getEmbeddedFile(complexFileSpec);
                            extractFile(filePath, complexFileSpec.getFilename(), embeddedFile);
                        }
                    }
                }
            }
        }
    }
}
Also used : PDPage(org.apache.pdfbox.pdmodel.PDPage) PDAnnotationFileAttachment(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment) PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) PDAnnotation(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) File(java.io.File) PDNameTreeNode(org.apache.pdfbox.pdmodel.common.PDNameTreeNode)

Aggregations

PDDocumentNameDictionary (org.apache.pdfbox.pdmodel.PDDocumentNameDictionary)3 PDEmbeddedFilesNameTreeNode (org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode)3 PDNameTreeNode (org.apache.pdfbox.pdmodel.common.PDNameTreeNode)3 PDComplexFileSpecification (org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)3 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 File (java.io.File)1 InputStream (java.io.InputStream)1 PDPage (org.apache.pdfbox.pdmodel.PDPage)1 PDEmbeddedFile (org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile)1 PDFileSpecification (org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification)1 PDAnnotation (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation)1 PDAnnotationFileAttachment (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment)1