Search in sources :

Example 1 with PDEmbeddedFile

use of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile in project pdfbox by apache.

the class ExtractText method startExtraction.

/**
 * Starts the text extraction.
 *
 * <p>Parses the command line, loads the PDF, and writes its text (plain text or HTML) either to
 * the console or to an output file. Afterwards, the text of every embedded file whose subtype is
 * {@code application/pdf} is written to the same output.</p>
 *
 * <p>If no output file is given, its name is derived from the PDF file name by replacing the
 * last four characters (the assumed {@code .pdf} suffix) with the output extension; names of
 * four characters or fewer simply get the extension appended.</p>
 *
 * @param args the commandline arguments.
 * @throws IOException if there is an error reading the document or extracting the text, or if
 *         the document's access permissions do not allow content extraction.
 */
public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = STD_ENCODING;
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    // NOTE(review): the option branches below call usage() when a value is missing and then
    // still read args[i]; this presumably relies on usage() not returning (e.g. exiting the
    // JVM) - confirm, otherwise the read indexes past the end of args.
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals(PASSWORD)) {
            i++;
            if (i >= args.length) {
                usage();
            }
            password = args[i];
        } else if (args[i].equals(ENCODING)) {
            i++;
            if (i >= args.length) {
                usage();
            }
            encoding = args[i];
        } else if (args[i].equals(START_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
            }
            startPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(HTML)) {
            toHTML = true;
            ext = ".html";
        } else if (args[i].equals(SORT)) {
            sort = true;
        } else if (args[i].equals(IGNORE_BEADS)) {
            separateBeads = false;
        } else if (args[i].equals(DEBUG)) {
            debug = true;
        } else if (args[i].equals(END_PAGE)) {
            i++;
            if (i >= args.length) {
                usage();
            }
            endPage = Integer.parseInt(args[i]);
        } else if (args[i].equals(CONSOLE)) {
            toConsole = true;
        } else {
            // First positional argument is the input PDF, any later one the output file
            // (a later positional argument overwrites an earlier output file name).
            if (pdfFile == null) {
                pdfFile = args[i];
            } else {
                outputFile = args[i];
            }
        }
    }
    if (pdfFile == null) {
        usage();
    } else {
        Writer output = null;
        PDDocument document = null;
        try {
            long startTime = startProcessing("Loading PDF " + pdfFile);
            if (outputFile == null) {
                // Swap the assumed 4-character ".pdf" suffix for the output extension. Names
                // too short to carry that suffix keep their full name, so outputFile is never
                // left null (which used to end in a NullPointerException when opening it).
                String baseName = pdfFile.length() > 4
                        ? pdfFile.substring(0, pdfFile.length() - 4)
                        : pdfFile;
                outputFile = new File(baseName + ext).getAbsolutePath();
            }
            document = PDDocument.load(new File(pdfFile), password);
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException("You do not have permission to extract text");
            }
            stopProcessing("Time for loading: ", startTime);
            if (toConsole) {
                output = new OutputStreamWriter(System.out, encoding);
            } else {
                if (toHTML && !STD_ENCODING.equals(encoding)) {
                    encoding = STD_ENCODING;
                    System.out.println("The encoding parameter is ignored when writing html output.");
                }
                output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
            }
            PDFTextStripper stripper;
            if (toHTML) {
                stripper = new PDFText2HTML();
            } else {
                stripper = new PDFTextStripper();
            }
            stripper.setSortByPosition(sort);
            stripper.setShouldSeparateByBeads(separateBeads);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            startTime = startProcessing("Starting text extraction");
            if (debug) {
                System.err.println("Writing to " + outputFile);
            }
            // Extract text for main document:
            stripper.writeText(document, output);
            // ... also for any embedded PDFs:
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentNameDictionary names = catalog.getNames();
            if (names != null) {
                PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                if (embeddedFiles != null) {
                    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
                    if (embeddedFileNames != null) {
                        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
                            if (debug) {
                                System.err.println("Processing embedded file " + ent.getKey() + ":");
                            }
                            PDComplexFileSpecification spec = ent.getValue();
                            PDEmbeddedFile file = spec.getEmbeddedFile();
                            // Only attachments declared as PDF are parsed; everything else is skipped.
                            if (file != null && "application/pdf".equals(file.getSubtype())) {
                                if (debug) {
                                    System.err.println("  is PDF (size=" + file.getSize() + ")");
                                }
                                // The embedded document and its stream are closed per attachment;
                                // the same stripper settings and output writer are reused.
                                try (InputStream fis = file.createInputStream();
                                    PDDocument subDoc = PDDocument.load(fis)) {
                                    stripper.writeText(subDoc, output);
                                }
                            }
                        }
                    }
                }
            }
            stopProcessing("Time for extraction: ", startTime);
        } finally {
            // Closed even when loading or extraction fails; close errors are ignored.
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(document);
        }
    }
}
Also used : PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) InputStream(java.io.InputStream) AccessPermission(org.apache.pdfbox.pdmodel.encryption.AccessPermission) IOException(java.io.IOException) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentCatalog(org.apache.pdfbox.pdmodel.PDDocumentCatalog) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) File(java.io.File) Map(java.util.Map) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) PDFTextStripper(org.apache.pdfbox.text.PDFTextStripper)

Example 2 with PDEmbeddedFile

use of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile in project pdfbox by apache.

the class EmbeddedFiles method doIt.

/**
 * Creates a one-page PDF that carries a small text file as an embedded attachment and saves it.
 *
 * <p>The page only shows a hint on where to find the attachment. The attachment itself is
 * registered in the document catalog's embedded-files name tree under the name
 * "My first attachment".</p>
 *
 * @param file The file to write the PDF to.
 *
 * @throws IOException If there is an error writing the data.
 */
public void doIt(String file) throws IOException {
    try (// the document
    PDDocument doc = new PDDocument()) {
        PDPage page = new PDPage();
        doc.addPage(page);
        PDFont font = PDType1Font.HELVETICA_BOLD;
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
            contentStream.beginText();
            contentStream.setFont(font, 12);
            contentStream.newLineAtOffset(100, 700);
            contentStream.showText("Go to Document->File Attachments to View Embedded Files");
            contentStream.endText();
        }
        // embedded files are stored in a named tree
        PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
        // first create the file specification, which holds the embedded file
        PDComplexFileSpecification fs = new PDComplexFileSpecification();
        fs.setFile("Test.txt");
        // create a dummy file stream, this would probably normally be a FileInputStream
        byte[] data = "This is the contents of the embedded file".getBytes("ISO-8859-1");
        ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
        PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
        // now let's set some of the optional parameters
        // the subtype is the MIME type of the attachment; the content is plain text
        ef.setSubtype("text/plain");
        ef.setSize(data.length);
        ef.setCreationDate(new GregorianCalendar());
        fs.setEmbeddedFile(ef);
        // create a new tree node and add the embedded file
        PDEmbeddedFilesNameTreeNode treeNode = new PDEmbeddedFilesNameTreeNode();
        treeNode.setNames(Collections.singletonMap("My first attachment", fs));
        // add the new node as kid to the root node
        List<PDEmbeddedFilesNameTreeNode> kids = new ArrayList<>();
        kids.add(treeNode);
        efTree.setKids(kids);
        // add the tree to the document catalog
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        names.setEmbeddedFiles(efTree);
        doc.getDocumentCatalog().setNames(names);
        doc.save(file);
    }
}
Also used : PDFont(org.apache.pdfbox.pdmodel.font.PDFont) PDPage(org.apache.pdfbox.pdmodel.PDPage) PDEmbeddedFilesNameTreeNode(org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) GregorianCalendar(java.util.GregorianCalendar) ArrayList(java.util.ArrayList) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) PDDocumentNameDictionary(org.apache.pdfbox.pdmodel.PDDocumentNameDictionary) ByteArrayInputStream(java.io.ByteArrayInputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDPageContentStream(org.apache.pdfbox.pdmodel.PDPageContentStream)

Example 3 with PDEmbeddedFile

use of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile in project pdfbox by apache.

the class ExtractEmbeddedFiles method extractFile.

/**
 * Writes the content of one embedded file to disk.
 *
 * <p>The target path is the plain string concatenation {@code filePath + filename}. Because the
 * name originates from the PDF being processed, it is treated as untrusted: the write is refused
 * when the resolved target would land outside the directory that {@code filePath} points into
 * (for example via {@code ../} segments).</p>
 *
 * @param filePath path prefix prepended to the file name (not joined with a separator).
 * @param filename name of the embedded file as stored in the PDF.
 * @param embeddedFile the embedded file whose bytes are written.
 * @throws IOException if the name escapes the output directory or the file cannot be written.
 */
private static void extractFile(String filePath, String filename, PDEmbeddedFile embeddedFile) throws IOException {
    String embeddedFilename = filePath + filename;
    File file = new File(embeddedFilename);
    // filePath is only a prefix, so the directory it designates is the parent of any file
    // whose name starts with that prefix; the placeholder suffix makes that parent resolvable
    // whether or not the prefix ends with a separator.
    File outputDir = new File(filePath + "x").getCanonicalFile().getParentFile();
    if (outputDir != null && !file.getCanonicalFile().toPath().startsWith(outputDir.toPath())) {
        throw new IOException("Refusing to write embedded file outside of the output directory: " + filename);
    }
    System.out.println("Writing " + embeddedFilename);
    try (FileOutputStream fos = new FileOutputStream(file)) {
        fos.write(embeddedFile.toByteArray());
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) File(java.io.File)

Example 4 with PDEmbeddedFile

use of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile in project pdfbox by apache.

the class ExtractEmbeddedFiles method extractFiles.

/**
 * Extracts every entry of the given name map to disk.
 *
 * <p>For each entry the map key is used as the file name, the embedded file is looked up from
 * the entry's file specification via {@code getEmbeddedFile}, and the result is handed to
 * {@code extractFile} together with the path prefix.</p>
 *
 * @param names embedded file specifications keyed by name.
 * @param filePath path prefix passed through to {@code extractFile}.
 * @throws IOException if writing one of the files fails.
 */
private static void extractFiles(Map<String, PDComplexFileSpecification> names, String filePath) throws IOException {
    for (Map.Entry<String, PDComplexFileSpecification> named : names.entrySet()) {
        PDEmbeddedFile attachment = getEmbeddedFile(named.getValue());
        extractFile(filePath, named.getKey(), attachment);
    }
}
Also used : PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)

Example 5 with PDEmbeddedFile

use of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile in project mustangproject by ZUGFeRD.

the class OXExporterFromA3 method PDFAttachGenericFile.

/**
 * Embeds an external file (generic - any type allowed) in the PDF.
 *
 * <p>The file is wrapped in a file specification, added to the document's embedded-files name
 * tree under {@code filename} (entries already present in the tree's names are kept), and
 * appended to the catalog's {@code AF} array, which is created if it does not exist yet.</p>
 *
 * @param doc          PDDocument to attach the file to.
 * @param filename     name of the file that will become attachment name in the PDF
 * @param relationship how the file relates to the content, e.g. "Alternative"
 * @param description  Human-readable description of the file content
 * @param subType      type of the data e.g. could be "text/xml" - mime like
 * @param data         the binary data of the file/attachment
 * @throws IOException if the embedded file cannot be created, or if the catalog's existing
 *                     {@code AF} entry is neither an array nor a reference to one
 */
public void PDFAttachGenericFile(PDDocument doc, String filename, String relationship, String description, String subType, byte[] data) throws IOException {
    fileAttached = true;
    PDComplexFileSpecification fs = new PDComplexFileSpecification();
    fs.setFile(filename);
    COSDictionary dict = fs.getCOSObject();
    dict.setName("AFRelationship", relationship);
    dict.setString("UF", filename);
    dict.setString("Desc", description);
    ByteArrayInputStream fakeFile = new ByteArrayInputStream(data);
    PDEmbeddedFile ef = new PDEmbeddedFile(doc, fakeFile);
    ef.setSubtype(subType);
    ef.setSize(data.length);
    // One explicit Gregorian timestamp for both dates: GregorianCalendar.getInstance() is the
    // inherited static Calendar.getInstance(), whose calendar system depends on the default
    // locale, and taking "now" twice could yield two slightly different instants.
    GregorianCalendar now = new GregorianCalendar();
    ef.setCreationDate(now);
    ef.setModDate(now);
    fs.setEmbeddedFile(ef);
    // In addition make sure the embedded file is set under /UF
    dict = fs.getCOSObject();
    COSDictionary efDict = (COSDictionary) dict.getDictionaryObject(COSName.EF);
    COSBase lowerLevelFile = efDict.getItem(COSName.F);
    efDict.setItem(COSName.UF, lowerLevelFile);
    // now add the entry to the embedded file tree and set in the document.
    PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
    if (efTree == null) {
        efTree = new PDEmbeddedFilesNameTreeNode();
    }
    // Copy the existing names into a fresh mutable map before adding the new attachment.
    Map<String, PDComplexFileSpecification> oldNamesMap = efTree.getNames();
    Map<String, PDComplexFileSpecification> namesMap =
            oldNamesMap == null ? new HashMap<>() : new HashMap<>(oldNamesMap);
    namesMap.put(filename, fs);
    efTree.setNames(namesMap);
    names.setEmbeddedFiles(efTree);
    doc.getDocumentCatalog().setNames(names);
    // AF entry (Array) in catalog with the FileSpec
    COSDictionary catalogDict = doc.getDocumentCatalog().getCOSObject();
    COSBase afEntry = catalogDict.getItem("AF");
    if (afEntry == null) {
        // no AF array yet: create one holding just this file specification
        COSArray cosArray = new COSArray();
        cosArray.add(fs);
        catalogDict.setItem("AF", cosArray);
    } else if (afEntry instanceof COSArray) {
        // direct array: append and store it back under the same key
        COSArray cosArray = (COSArray) afEntry;
        cosArray.add(fs);
        catalogDict.setItem("AF", cosArray);
    } else if ((afEntry instanceof COSObject) && ((COSObject) afEntry).getObject() instanceof COSArray) {
        // indirect reference to an array: append to the referenced array in place
        COSArray cosArray = (COSArray) ((COSObject) afEntry).getObject();
        cosArray.add(fs);
    } else {
        throw new IOException("Unexpected object type for PDFDocument/Catalog/COSDictionary/Item(AF)");
    }
}
Also used : PDEmbeddedFile(org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)

Aggregations

PDEmbeddedFile (org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile)13 PDComplexFileSpecification (org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)12 PDDocumentNameDictionary (org.apache.pdfbox.pdmodel.PDDocumentNameDictionary)8 PDEmbeddedFilesNameTreeNode (org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode)8 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)7 File (java.io.File)5 PDDocumentCatalog (org.apache.pdfbox.pdmodel.PDDocumentCatalog)5 FileOutputStream (java.io.FileOutputStream)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 InputStream (java.io.InputStream)3 Test (org.junit.Test)3 GregorianCalendar (java.util.GregorianCalendar)2 Map (java.util.Map)2 PDPage (org.apache.pdfbox.pdmodel.PDPage)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1