Search in sources :

Example 1 with PDFObjectStreamParser

use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in project pdfbox by apache.

the class DecompressObjectstreams method main.

/**
 * Rewrites a PDF so that the objects held inside its object streams are attached
 * directly to their pool entries and the object streams themselves are removed.
 * This is a very simple program, so everything is in the main method.
 *
 * <p>The first argument is the input file. The optional second argument is the output
 * file; when it is absent the output name is derived from the input name by replacing a
 * trailing ".pdf" (any letter case) with ".unc.pdf", or by appending ".unc.pdf" when the
 * input name has no such suffix.
 *
 * @param args arguments to the program
 */
public static void main(String[] args) {
    // suppress the Dock icon on OS X
    System.setProperty("apple.awt.UIElement", "true");
    if (args.length < 1) {
        usage();
        // Do not rely on usage() terminating the JVM: without this return the
        // args[0] access below would fail with ArrayIndexOutOfBoundsException.
        return;
    }
    String inputFilename = args[0];
    String outputFilename;
    if (args.length > 1) {
        outputFilename = args[1];
    } else {
        if (inputFilename.matches(".*\\.[pP][dD][fF]$")) {
            outputFilename = inputFilename.replaceAll("\\.[pP][dD][fF]$", ".unc.pdf");
        } else {
            outputFilename = inputFilename + ".unc.pdf";
        }
    }
    PDDocument doc = null;
    try {
        doc = PDDocument.load(new File(inputFilename));
        for (COSObject objStream : doc.getDocument().getObjectsByType(COSName.OBJ_STM)) {
            COSStream stream = (COSStream) objStream.getObject();
            PDFObjectStreamParser sp = new PDFObjectStreamParser(stream, doc.getDocument());
            sp.parse();
            // Copy every object parsed out of the stream onto its entry in the document pool.
            for (COSObject next : sp.getObjects()) {
                COSObjectKey key = new COSObjectKey(next);
                COSObject obj = doc.getDocument().getObjectFromPool(key);
                obj.setObject(next.getObject());
            }
            // The stream's content now lives in the pool, so the container is dropped.
            doc.getDocument().removeObject(new COSObjectKey(objStream));
        }
        doc.save(outputFilename);
    } catch (Exception e) {
        System.err.println("Error processing file: " + e.getMessage());
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                // Closing is best effort, but a failure is reported rather than hidden.
                System.err.println("Error closing file: " + e.getMessage());
            }
        }
    }
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSStream(org.apache.pdfbox.cos.COSStream) COSObject(org.apache.pdfbox.cos.COSObject) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) File(java.io.File) PDFObjectStreamParser(org.apache.pdfbox.pdfparser.PDFObjectStreamParser)

Example 2 with PDFObjectStreamParser

use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in project pdfbox by apache.

the class COSDocument method dereferenceObjectStreams.

/**
 * Walks over all objects of type ObjStm, parses each of them and copies the objects
 * found inside onto the matching entries of the object pool.
 *
 * <p>A pool entry is only overwritten when it is missing, when it has no object yet, or
 * when the xref table says the entry belongs to the object stream being processed.
 *
 * @throws IOException If there is an error parsing the stream.
 */
public void dereferenceObjectStreams() throws IOException {
    for (COSObject objStream : getObjectsByType(COSName.OBJ_STM)) {
        PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objStream.getObject(), this);
        parser.parse();
        for (COSObject parsed : parser.getObjects()) {
            COSObjectKey key = new COSObjectKey(parsed);
            boolean alreadyLoaded = objectPool.get(key) != null && objectPool.get(key).getObject() != null;
            // xrefTable stores negated objNr of objStream for objects in objStreams;
            // a loaded entry is left alone unless it is owned by this very stream
            if (alreadyLoaded && !(xrefTable.containsKey(key) && xrefTable.get(key) == -objStream.getObjectNumber())) {
                continue;
            }
            getObjectFromPool(key).setObject(parsed.getObject());
        }
    }
}
Also used : PDFObjectStreamParser(org.apache.pdfbox.pdfparser.PDFObjectStreamParser)

Example 3 with PDFObjectStreamParser

use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in project pdfbox by apache.

the class PreflightParser method parseObjectDynamically.

/**
 * Returns the object stored under the given object number and generation, parsing it
 * from the source first when its pool entry has no object yet. Syntax problems found
 * while parsing are recorded through {@code addValidationError}.
 *
 * <p>The xref table value for the key decides how the object is obtained:
 * <ul>
 * <li>no entry: the pool entry is set to {@code COSNull.NULL};</li>
 * <li>zero: a validation error is recorded and no object is set;</li>
 * <li>positive: the value is used as an offset into the source and the indirect object
 * is parsed from there;</li>
 * <li>negative: the negated value is used as the number of an object stream, which is
 * parsed and whose objects are registered in the pool.</li>
 * </ul>
 *
 * @param objNr object number of the object to resolve
 * @param objGenNr generation number of the object to resolve
 * @param requireExistingNotCompressedObj if true, a missing xref entry is reported as a
 *        validation error and a {@code SyntaxValidationException} is thrown
 * @return the object held by the pool entry after parsing
 * @throws IOException if the object read at the offset does not match the requested key,
 *         if a stream is not preceded by a dictionary, or if the object does not end
 *         with 'endobj'
 */
@Override
protected COSBase parseObjectDynamically(long objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException {
    // ---- create object key and get object (container) from pool
    final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
    final COSObject pdfObject = document.getObjectFromPool(objKey);
    if (pdfObject.getObject() == null) {
        // not previously parsed
        // ---- read offset or object stream object number from xref table
        Long offsetOrObjstmObNr = document.getXrefTable().get(objKey);
        // sanity test to circumvent loops with broken documents
        // NOTE(review): only a missing entry is rejected here; a negative (compressed)
        // entry passes this check although the message mentions it - confirm intent.
        if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null))) {
            addValidationError(new ValidationError(ERROR_SYNTAX_MISSING_OFFSET, "Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration()));
            throw new SyntaxValidationException("Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration(), validationResult);
        }
        if (offsetOrObjstmObNr == null) {
            // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
            pdfObject.setObject(COSNull.NULL);
        } else if (offsetOrObjstmObNr == 0) {
            // NOTE(review): only the error is recorded; no object is set in this branch.
            addValidationError(new ValidationError(ERROR_SYNTAX_INVALID_OFFSET, "Object {" + objKey.getNumber() + ":" + objKey.getGeneration() + "} has an offset of 0"));
        } else if (offsetOrObjstmObNr > 0) {
            // offset of indirect object in file
            // ---- go to object start
            source.seek(offsetOrObjstmObNr);
            // ---- we must have an indirect object
            long readObjNr;
            int readObjGen;
            // remember where the header starts so it can be re-read if the strict match fails
            long offset = source.getPosition();
            String line = readLine();
            // strict header form: number, one whitespace, generation, one whitespace, 'obj'
            Pattern pattern = Pattern.compile("(\\d+)\\s(\\d+)\\sobj");
            Matcher matcher = pattern.matcher(line);
            if (matcher.matches()) {
                readObjNr = Long.parseLong(matcher.group(1));
                readObjGen = Integer.parseInt(matcher.group(2));
            } else {
                // the header deviates from the strict form: record it, then read it again token by token
                addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, "Single space expected [offset=" + offset + "; key=" + offsetOrObjstmObNr.toString() + "; line=" + line + "; object=" + pdfObject.toString() + "]"));
                // reset source cursor to read object information
                source.seek(offset);
                readObjNr = readObjectNumber();
                readObjGen = readGenerationNumber();
                // skip spaces between Object Generation number and the 'obj' keyword
                skipSpaces();
                // the next characters must spell out OBJ_MARKER exactly
                for (char c : OBJ_MARKER) {
                    if (source.read() != c) {
                        addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, "Expected pattern '" + new String(OBJ_MARKER) + " but missed at character '" + c + "'"));
                        throw new SyntaxValidationException("Expected pattern '" + new String(OBJ_MARKER) + " but missed at character '" + c + "'", validationResult);
                    }
                }
            }
            // ---- consistency check
            if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) {
                throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + " points to wrong object: " + readObjNr + ":" + readObjGen);
            }
            skipSpaces();
            COSBase pb = parseDirObject();
            skipSpaces();
            // position of the keyword following the object body; used for the EOL check below
            long endObjectOffset = source.getPosition();
            String endObjectKey = readString();
            if (endObjectKey.equals("stream")) {
                // rewind so that parseCOSStream starts reading at the 'stream' keyword
                source.seek(endObjectOffset);
                if (pb instanceof COSDictionary) {
                    COSStream stream = parseCOSStream((COSDictionary) pb);
                    if (securityHandler != null) {
                        securityHandler.decryptStream(stream, objNr, objGenNr);
                    }
                    pb = stream;
                } else {
                    // the combination of a dict and the stream/endstream forms a complete stream object
                    throw new IOException("Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ").");
                }
                skipSpaces();
                endObjectOffset = source.getPosition();
                endObjectKey = readString();
                // we have case with a second 'endstream' before endobj
                if (!endObjectKey.startsWith("endobj")) {
                    if (endObjectKey.startsWith("endstream")) {
                        // drop the 9 characters of 'endstream' and look at what follows on the line
                        endObjectKey = endObjectKey.substring(9).trim();
                        if (endObjectKey.length() == 0) {
                            // no other characters in extra endstream line
                            // read next line
                            endObjectKey = readString();
                        }
                    }
                }
            } else if (securityHandler != null) {
                securityHandler.decrypt(pb, objNr, objGenNr);
            }
            // the parsed object is stored before the 'endobj' check, so it stays set even if that check throws
            pdfObject.setObject(pb);
            if (!endObjectKey.startsWith("endobj")) {
                throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'.");
            } else {
                // step back to just before the recorded keyword position to test for an EOL there,
                // then restore the current position
                offset = source.getPosition();
                source.seek(endObjectOffset - 1);
                if (!nextIsEOL()) {
                    addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, "EOL expected before the 'endobj' keyword at offset " + source.getPosition()));
                }
                source.seek(offset);
            }
            if (!nextIsEOL()) {
                addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, "EOL expected after the 'endobj' keyword at offset " + source.getPosition()));
            }
        } else {
            // xref value is object nr of object stream containing object to be parsed;
            // since our object was not found it means object stream was not parsed so far
            final int objstmObjNr = (int) (-offsetOrObjstmObNr);
            // the object stream itself is resolved with generation 0 and must have an xref entry
            final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true);
            if (objstmBaseObj instanceof COSStream) {
                // parse object stream
                PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document);
                parser.parse();
                // register all objects which are referenced to be contained in object stream
                for (COSObject next : parser.getObjects()) {
                    COSObjectKey stmObjKey = new COSObjectKey(next);
                    Long offset = document.getXrefTable().get(stmObjKey);
                    // only take objects whose xref entry points back at this object stream
                    if (offset != null && offset == -objstmObjNr) {
                        COSObject stmObj = document.getObjectFromPool(stmObjKey);
                        stmObj.setObject(next.getObject());
                    }
                }
            }
        }
    }
    return pdfObject.getObject();
}
Also used : Pattern(java.util.regex.Pattern) COSStream(org.apache.pdfbox.cos.COSStream) COSDictionary(org.apache.pdfbox.cos.COSDictionary) Matcher(java.util.regex.Matcher) SyntaxValidationException(org.apache.pdfbox.preflight.exception.SyntaxValidationException) COSString(org.apache.pdfbox.cos.COSString) IOException(java.io.IOException) PDFObjectStreamParser(org.apache.pdfbox.pdfparser.PDFObjectStreamParser) COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSObject(org.apache.pdfbox.cos.COSObject) COSBase(org.apache.pdfbox.cos.COSBase) ValidationError(org.apache.pdfbox.preflight.ValidationResult.ValidationError)

Aggregations

PDFObjectStreamParser (org.apache.pdfbox.pdfparser.PDFObjectStreamParser)3 COSObject (org.apache.pdfbox.cos.COSObject)2 COSObjectKey (org.apache.pdfbox.cos.COSObjectKey)2 COSStream (org.apache.pdfbox.cos.COSStream)2 File (java.io.File)1 IOException (java.io.IOException)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 COSBase (org.apache.pdfbox.cos.COSBase)1 COSDictionary (org.apache.pdfbox.cos.COSDictionary)1 COSString (org.apache.pdfbox.cos.COSString)1 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)1 ValidationError (org.apache.pdfbox.preflight.ValidationResult.ValidationError)1 SyntaxValidationException (org.apache.pdfbox.preflight.exception.SyntaxValidationException)1