Search in sources :

Example 1 with PDEmbeddedFilesNameTreeNode

use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project pdfbox by apache.

the class ExtractText method startExtraction.

/**
 * Starts the text extraction.
 *
 * <p>Parses the command line, loads the PDF, checks that text extraction is permitted and
 * writes the extracted text either to the console or to an output file. Text of embedded
 * files whose subtype is {@code application/pdf} is appended after the main document's text.
 *
 * <p>Any argument that is not a recognised option is taken as the input PDF (first such
 * argument) or as the output file (any later one; the last one wins). When no output file
 * is given, the name is derived from the input name by replacing its last four characters
 * with the extension for the selected format.
 *
 * @param args the commandline arguments.
 * @throws IOException if there is an error reading the document or extracting the text,
 *         if the document does not permit content extraction, or if no output file name
 *         is available when writing to a file.
 */
public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = STD_ENCODING;
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals(PASSWORD)) {
            i++;
            if (i >= args.length) {
                // Option value is missing: report usage and stop instead of indexing past the array.
                usage();
                return;
            }
            password = args[i];
        } else if (args[i].equals(ENCODING)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            encoding = args[i];
        } else if (args[i].equals(START_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            startPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(HTML)) {
            toHTML = true;
            ext = ".html";
        } else if (args[i].equals(SORT)) {
            sort = true;
        } else if (args[i].equals(IGNORE_BEADS)) {
            separateBeads = false;
        } else if (args[i].equals(DEBUG)) {
            debug = true;
        } else if (args[i].equals(END_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
                return;
            }
            endPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(CONSOLE)) {
            toConsole = true;
        } else {
            // Positional arguments: first is the input PDF, anything after it the output file.
            if (pdfFile == null) {
                pdfFile = args[i];
            } else {
                outputFile = args[i];
            }
        }
    }
    if (pdfFile == null) {
        usage();
    } else {
        Writer output = null;
        PDDocument document = null;
        try {
            long startTime = startProcessing("Loading PDF " + pdfFile);
            // Derive "<name><ext>" from "<name>.pdf"-style inputs; shorter names yield no default.
            if (outputFile == null && pdfFile.length() > 4) {
                outputFile = new File(pdfFile.substring(0, pdfFile.length() - 4) + ext).getAbsolutePath();
            }
            document = PDDocument.load(new File(pdfFile), password);
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException("You do not have permission to extract text");
            }
            stopProcessing("Time for loading: ", startTime);
            if (toConsole) {
                output = new OutputStreamWriter(System.out, encoding);
            } else {
                if (toHTML && !STD_ENCODING.equals(encoding)) {
                    encoding = STD_ENCODING;
                    System.out.println("The encoding parameter is ignored when writing html output.");
                }
                if (outputFile == null) {
                    // Fail with a clear message rather than a NullPointerException from FileOutputStream.
                    throw new IOException("No output file given and none could be derived from '" + pdfFile + "'");
                }
                output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
            }
            PDFTextStripper stripper;
            if (toHTML) {
                stripper = new PDFText2HTML();
            } else {
                stripper = new PDFTextStripper();
            }
            stripper.setSortByPosition(sort);
            stripper.setShouldSeparateByBeads(separateBeads);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            startTime = startProcessing("Starting text extraction");
            if (debug) {
                System.err.println("Writing to " + outputFile);
            }
            // Extract text for main document:
            stripper.writeText(document, output);
            // ... also for any embedded PDFs:
            // NOTE(review): only the names stored directly on the root node are visited;
            // entries held in kid nodes are not processed here - confirm whether that is intended.
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentNameDictionary names = catalog.getNames();
            if (names != null) {
                PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                if (embeddedFiles != null) {
                    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
                    if (embeddedFileNames != null) {
                        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
                            if (debug) {
                                System.err.println("Processing embedded file " + ent.getKey() + ":");
                            }
                            PDComplexFileSpecification spec = ent.getValue();
                            PDEmbeddedFile file = spec.getEmbeddedFile();
                            if (file != null && "application/pdf".equals(file.getSubtype())) {
                                if (debug) {
                                    System.err.println("  is PDF (size=" + file.getSize() + ")");
                                }
                                // The embedded document and its stream are closed after each extraction.
                                try (InputStream fis = file.createInputStream();
                                    PDDocument subDoc = PDDocument.load(fis)) {
                                    stripper.writeText(subDoc, output);
                                }
                            }
                        }
                    }
                }
            }
            stopProcessing("Time for extraction: ", startTime);
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(document);
        }
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) InputStream(java.io.InputStream) AccessPermission(org.apache.pdfbox.pdmodel.encryption.AccessPermission) IOException(java.io.IOException) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentCatalog(org.apache.pdfbox.pdmodel.PDDocumentCatalog) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) File(java.io.File) Map(java.util.Map) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) PDFTextStripper(org.apache.pdfbox.text.PDFTextStripper)

Example 2 with PDEmbeddedFilesNameTreeNode

use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project pdfbox by apache.

the class EmbeddedFiles method doIt.

/**
 * Creates a one-page PDF that contains a single embedded text file attachment and saves it.
 *
 * <p>The attachment is registered under the name "My first attachment" in a kid node of the
 * document's embedded-files name tree.
 *
 * @param file The file to write the PDF to.
 *
 * @throws IOException If there is an error writing the data.
 */
public void doIt(String file) throws IOException {
    // the document
    try (PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);
        PDFont font = PDType1Font.HELVETICA_BOLD;
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
            contentStream.beginText();
            contentStream.setFont(font, 12);
            contentStream.newLineAtOffset(100, 700);
            contentStream.showText("Go to Document->File Attachments to View Embedded Files");
            contentStream.endText();
        }
        // embedded files are stored in a named tree
        PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
        // first create the file specification, which holds the embedded file
        PDComplexFileSpecification fs = new PDComplexFileSpecification();
        fs.setFile("Test.txt");
        // create a dummy file stream, this would probably normally be a FileInputStream
        byte[] data = "This is the contents of the embedded file".getBytes("ISO-8859-1");
        ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
        PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
        // now set some of the optional parameters;
        // the content is plain text, so the MIME subtype is "text/plain"
        ef.setSubtype("text/plain");
        ef.setSize(data.length);
        ef.setCreationDate(new GregorianCalendar());
        fs.setEmbeddedFile(ef);
        // create a new tree node and add the embedded file
        PDEmbeddedFilesNameTreeNode treeNode = new PDEmbeddedFilesNameTreeNode();
        treeNode.setNames(Collections.singletonMap("My first attachment", fs));
        // add the new node as kid to the root node
        List<PDEmbeddedFilesNameTreeNode> kids = new ArrayList<>();
        kids.add(treeNode);
        efTree.setKids(kids);
        // add the tree to the document catalog
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        names.setEmbeddedFiles(efTree);
        doc.getDocumentCatalog().setNames(names);
        doc.save(file);
    }
}
Also used : PDFont(org.apache.pdfbox.pdmodel.font.PDFont) PDPage(org.apache.pdfbox.pdmodel.PDPage) PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) GregorianCalendar(java.util.GregorianCalendar) ArrayList(java.util.ArrayList) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) ByteArrayInputStream(java.io.ByteArrayInputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDPageContentStream(org.apache.pdfbox.pdmodel.PDPageContentStream)

Example 3 with PDEmbeddedFilesNameTreeNode

use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project mustangproject by ZUGFeRD.

the class ZUGFeRDImporter method extractLowLevel.

/**
 * Extracts a ZUGFeRD invoice from a PDF document represented by an input stream. Errors are reported via exception handling.
 *
 * <p>The document's XMP metadata is stored in {@code xmpString}; the embedded files found in
 * the name tree (either directly on the root node or on its kid nodes) are handed to
 * {@code extractFiles}. If the document has no metadata, "no-xmlpart" is logged and nothing
 * is extracted.
 *
 * @param pdfStream an input stream of a PDF file
 * @throws IOException if the PDF or its metadata cannot be read
 */
private void extractLowLevel(InputStream pdfStream) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfStream)) {
        // Check the catalog before anything dereferences it.
        if (doc.getDocumentCatalog() == null || doc.getDocumentCatalog().getMetadata() == null) {
            Logger.getLogger(ZUGFeRDImporter.class.getName()).log(Level.INFO, "no-xmlpart");
            return;
        }
        // Close the metadata stream once it has been converted; a second close by the
        // helper, should it close the stream itself, is harmless.
        try (InputStream xmp = doc.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
            xmpString = convertStreamToString(xmp);
        }
        final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles();
        if (etn == null) {
            return;
        }
        final Map<String, PDComplexFileSpecification> efMap = etn.getNames();
        if (efMap != null) {
            // see https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/
            extractFiles(efMap);
            return;
        }
        // No names on the root node: look at the kid nodes instead.
        final List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids();
        if (kids == null) {
            return;
        }
        for (final PDNameTreeNode<PDComplexFileSpecification> node : kids) {
            final Map<String, PDComplexFileSpecification> namesL = node.getNames();
            // A kid without a names map has nothing to extract.
            if (namesL != null) {
                extractFiles(namesL);
            }
        }
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDNameTreeNode(org.apache.pdfbox.pdmodel.common.PDNameTreeNode)

Example 4 with PDEmbeddedFilesNameTreeNode

use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project mustangproject by ZUGFeRD.

the class ZUGFeRDExporter method PDFAttachGenericFile.

/**
 * Embeds an external file (generic - any type allowed) in the PDF.
 *
 * <p>The file specification is added to the document's embedded-files name tree under
 * {@code filename} (existing root-level entries are kept) and appended to the catalog's
 * {@code /AF} array, which is created if it does not exist yet.
 *
 * @param doc
 *            PDDocument to attach the file to.
 * @param filename
 *            name of the file that will become attachment name in the PDF
 * @param relationship
 *            how the file relates to the content, e.g. "Alternative"
 * @param description
 *            Human-readable description of the file content
 * @param subType
 *            type of the data e.g. could be "text/xml" - mime like
 * @param data
 *            the binary data of the file/attachment
 * @throws java.io.IOException
 *            if the embedded file cannot be created from the data
 */
public void PDFAttachGenericFile(PDDocument doc, String filename, String relationship, String description, String subType, byte[] data) throws IOException {
    PDComplexFileSpecification fs = new PDComplexFileSpecification();
    fs.setFile(filename);
    COSDictionary dict = fs.getCOSObject();
    dict.setName("AFRelationship", relationship);
    dict.setString("UF", filename);
    dict.setString("Desc", description);
    ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
    PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
    ef.setSubtype(subType);
    ef.setSize(data.length);
    ef.setCreationDate(new GregorianCalendar());
    ef.setModDate(GregorianCalendar.getInstance());
    fs.setEmbeddedFile(ef);
    // In addition make sure the embedded file is set under /UF
    dict = fs.getCOSObject();
    COSDictionary efDict = (COSDictionary) dict.getDictionaryObject(COSName.EF);
    COSBase lowerLevelFile = efDict.getItem(COSName.F);
    efDict.setItem(COSName.UF, lowerLevelFile);
    // now add the entry to the embedded file tree and set in the document.
    PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
    if (efTree == null) {
        efTree = new PDEmbeddedFilesNameTreeNode();
    }
    // Copy the existing entries so previously attached files are preserved.
    Map<String, PDComplexFileSpecification> namesMap = new HashMap<String, PDComplexFileSpecification>();
    Map<String, PDComplexFileSpecification> oldNamesMap = efTree.getNames();
    if (oldNamesMap != null) {
        namesMap.putAll(oldNamesMap);
    }
    namesMap.put(filename, fs);
    efTree.setNames(namesMap);
    names.setEmbeddedFiles(efTree);
    doc.getDocumentCatalog().setNames(names);
    // AF entry (Array) in catalog with the FileSpec.
    // getDictionaryObject resolves an indirect reference to the array, whereas getItem
    // would return the reference object itself and make the cast fail.
    COSDictionary catalogDict = doc.getDocumentCatalog().getCOSObject();
    COSArray cosArray = (COSArray) catalogDict.getDictionaryObject("AF");
    if (cosArray == null) {
        cosArray = new COSArray();
    }
    cosArray.add(fs);
    catalogDict.setItem("AF", cosArray);
}
Also used : COSDictionary(org.apache.pdfbox.cos.COSDictionary) COSArray(org.apache.pdfbox.cos.COSArray) ByteArrayInputStream(java.io.ByteArrayInputStream) PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) HashMap(java.util.HashMap) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) GregorianCalendar(java.util.GregorianCalendar) COSBase(org.apache.pdfbox.cos.COSBase) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary)

Example 5 with PDEmbeddedFilesNameTreeNode

use of org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode in project tika by apache.

the class AbstractPDF2XHTML method extractEmbeddedDocuments.

/**
 * Hands the embedded files of a PDF to {@code processEmbeddedDocNames}.
 *
 * <p>If the root node of the embedded-files name tree has its own names map, only that map
 * is processed. Otherwise each kid node's names map (when present) is processed in turn.
 *
 * @param document the PDF whose embedded files should be processed
 * @throws IOException declared for the PDF access and the delegated processing
 * @throws SAXException declared for the delegated processing
 * @throws TikaException declared for the delegated processing
 */
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode root =
            new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (root == null) {
        // no embedded-files tree at all
        return;
    }

    // Map of attachment name to file specification stored directly on the root.
    Map<String, PDComplexFileSpecification> rootNames = root.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames);
        return;
    }

    // Root carries no names: fall back to the names of its kids.
    List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
    if (children != null) {
        for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
            Map<String, PDComplexFileSpecification> childNames = child.getNames();
            if (childNames == null) {
                continue;
            }
            processEmbeddedDocNames(childNames);
        }
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDNameTreeNode(org.apache.pdfbox.pdmodel.common.PDNameTreeNode)

Aggregations

PDDocumentNameDictionary (org.apache.pdfbox.pdmodel.PDDocumentNameDictionary)10 PDEmbeddedFilesNameTreeNode (org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode)10 PDComplexFileSpecification (org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)10 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)8 PDEmbeddedFile (org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile)8 PDDocumentCatalog (org.apache.pdfbox.pdmodel.PDDocumentCatalog)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 File (java.io.File)4 InputStream (java.io.InputStream)4 FileOutputStream (java.io.FileOutputStream)3 PDNameTreeNode (org.apache.pdfbox.pdmodel.common.PDNameTreeNode)3 Test (org.junit.Test)3 GregorianCalendar (java.util.GregorianCalendar)2 Map (java.util.Map)2 PDPage (org.apache.pdfbox.pdmodel.PDPage)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1