Search in sources :

Example 1 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method rebuildTrailer.

/**
 * Reconstructs the trailer dictionary from the results of a brute force object
 * search, for use when startxref can't be found.
 *
 * @return the rebuilt trailer dictionary, or null if no brute force object offsets are available
 *
 * @throws IOException if something went wrong
 */
private final COSDictionary rebuildTrailer() throws IOException {
    bfSearchForObjects();
    if (bfSearchCOSObjectKeyOffsets == null) {
        // nothing to rebuild from; the flag is still raised
        trailerWasRebuild = true;
        return null;
    }
    // start from a clean resolver and feed it every object found by the brute force search
    xrefTrailerResolver.reset();
    xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE);
    for (Entry<COSObjectKey, Long> found : bfSearchCOSObjectKeyOffsets.entrySet()) {
        xrefTrailerResolver.setXRef(found.getKey(), found.getValue());
    }
    xrefTrailerResolver.setStartxref(0);
    final COSDictionary rebuilt = xrefTrailerResolver.getTrailer();
    getDocument().setTrailer(rebuilt);
    if (!bfSearchForTrailer(rebuilt)) {
        // no trailer found in the file: collect the parts of the trailer dictionary one by one
        // (an encryption dictionary, if existing, is lost; "Algorithm 2" from the
        // PDF specification can't be run because of the missing ID)
        for (Entry<COSObjectKey, Long> candidate : bfSearchCOSObjectKeyOffsets.entrySet()) {
            final COSObjectKey candidateKey = candidate.getKey();
            final COSDictionary dictionary = retrieveCOSDictionary(candidateKey, candidate.getValue());
            if (dictionary != null) {
                if (isCatalog(dictionary)) {
                    // document catalog
                    rebuilt.setItem(COSName.ROOT, document.getObjectFromPool(candidateKey));
                } else if (isInfo(dictionary)) {
                    // info dictionary
                    rebuilt.setItem(COSName.INFO, document.getObjectFromPool(candidateKey));
                }
            }
        }
    }
    trailerWasRebuild = true;
    return rebuilt;
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSDictionary(org.apache.pdfbox.cos.COSDictionary)

Example 2 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method bfSearchForObjStreams.

/**
 * Brute force search for all object streams.
 *
 * <p>The method works in two passes. The first pass scans the source, starting at
 * MINIMUM_SEARCH_OFFSET, for the object stream marker and then walks backwards to
 * locate the "objNr genNr obj" header of the enclosing object. The second pass
 * parses the header section of every object stream whose offset matches the one
 * recorded by the brute force object search and registers each contained object
 * number in bfSearchCOSObjectKeyOffsets with the negated stream object number as
 * its value.</p>
 *
 * <p>Corrupt streams (missing or negative /First or /N values, unreadable data,
 * too few header entries, non-numeric object numbers) are skipped instead of
 * aborting the search. The original source position is restored before a normal
 * return.</p>
 *
 * @throws IOException if something went wrong
 */
private void bfSearchForObjStreams() throws IOException {
    HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
    long originOffset = source.getPosition();
    source.seek(MINIMUM_SEARCH_OFFSET);
    char[] string = " obj".toCharArray();
    while (!source.isEOF()) {
        // search for the object stream marker
        if (isString(OBJ_STREAM)) {
            long currentPosition = source.getPosition();
            // search backwards for the beginning of the object
            long newOffset = -1;
            COSObjectKey streamObjectKey = null;
            boolean objFound = false;
            // step back in windows of 10 bytes, at most 39 windows
            for (int i = 1; i < 40 && !objFound; i++) {
                long currentOffset = currentPosition - (i * 10);
                if (currentOffset > 0) {
                    source.seek(currentOffset);
                    for (int j = 0; j < 10; j++) {
                        if (isString(string)) {
                            // the char right before " obj" should be the generation number
                            long tempOffset = currentOffset - 1;
                            source.seek(tempOffset);
                            int genID = source.peek();
                            // is the next char a digit?
                            if (isDigit(genID)) {
                                tempOffset--;
                                source.seek(tempOffset);
                                if (isSpace()) {
                                    int length = 0;
                                    source.seek(--tempOffset);
                                    // walk backwards over the digits of the object number
                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) {
                                        source.seek(--tempOffset);
                                        length++;
                                    }
                                    if (length > 0) {
                                        // skip the non-digit char preceding the object number
                                        source.read();
                                        newOffset = source.getPosition();
                                        long objNumber = readObjectNumber();
                                        int genNumber = readGenerationNumber();
                                        streamObjectKey = new COSObjectKey(objNumber, genNumber);
                                        bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
                                    }
                                }
                            }
                            LOG.debug("Dictionary start for object stream -> " + newOffset);
                            objFound = true;
                            break;
                        } else {
                            currentOffset++;
                            source.read();
                        }
                    }
                }
            }
            source.seek(currentPosition + OBJ_STREAM.length);
        }
        source.read();
    }
    // add all found compressed objects to the brute force search result
    for (Entry<Long, COSObjectKey> streamOffsetsEntry : bfSearchObjStreamsOffsets.entrySet()) {
        Long offset = streamOffsetsEntry.getKey();
        Long bfOffset = bfSearchCOSObjectKeyOffsets.get(streamOffsetsEntry.getValue());
        // incomplete object stream found?
        if (bfOffset == null) {
            LOG.warn("Skipped incomplete object stream:" + streamOffsetsEntry.getValue() + " at " + offset);
            continue;
        }
        // check if the object was overwritten
        if (offset.equals(bfOffset)) {
            source.seek(offset);
            long stmObjNumber = readObjectNumber();
            readGenerationNumber();
            readExpectedString(OBJ_MARKER, true);
            int nrOfObjects = 0;
            byte[] numbersBytes = null;
            COSStream stream = null;
            COSInputStream is = null;
            try {
                COSDictionary dict = parseCOSDictionary();
                int offsetFirstStream = dict.getInt(COSName.FIRST);
                nrOfObjects = dict.getInt(COSName.N);
                // skip the stream if required values are missing or invalid; any negative
                // value is rejected so that the array allocation below can't fail
                if (offsetFirstStream < 0 || nrOfObjects < 0) {
                    continue;
                }
                stream = parseCOSStream(dict);
                is = stream.createInputStream();
                numbersBytes = new byte[offsetFirstStream];
                long isResult = is.read(numbersBytes);
                if (Long.compare(isResult, numbersBytes.length) != 0) {
                    LOG.debug("Tried reading " + numbersBytes.length + " bytes but only " + isResult + " bytes read");
                }
            } catch (IOException exception) {
                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset, exception);
                continue;
            } finally {
                if (is != null) {
                    is.close();
                }
                if (stream != null) {
                    stream.close();
                }
            }
            int start = 0;
            // skip spaces, without running past the end of an empty or all-blank header
            while (start < numbersBytes.length && numbersBytes[start] == 32) {
                start++;
            }
            String numbersStr = new String(numbersBytes, start, numbersBytes.length - start, "ISO-8859-1");
            numbersStr = numbersStr.replaceAll("\n", " ").replaceAll("  ", " ");
            String[] numbers = numbersStr.split(" ");
            // the header holds pairs of "object number, offset"; compare in long to avoid int overflow
            if (numbers.length < (long) nrOfObjects * 2) {
                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset);
                continue;
            }
            for (int i = 0; i < nrOfObjects; i++) {
                try {
                    long objNumber = Long.parseLong(numbers[i * 2]);
                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
                    Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
                    if (existingOffset == null || offset > existingOffset) {
                        // a negative value marks the object as compressed within this object stream
                        bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
                    }
                } catch (NumberFormatException exception) {
                    // a single unparsable entry must not abort the whole brute force search
                    LOG.debug("Skipped corrupt object key in stream: " + stmObjNumber);
                }
            }
        }
    }
    source.seek(originOffset);
}
Also used : COSStream(org.apache.pdfbox.cos.COSStream) COSInputStream(org.apache.pdfbox.cos.COSInputStream) COSDictionary(org.apache.pdfbox.cos.COSDictionary) HashMap(java.util.HashMap) IOException(java.io.IOException) COSObjectKey(org.apache.pdfbox.cos.COSObjectKey)

Example 3 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method retrieveCOSDictionary.

/**
 * Looks up the brute force search offset recorded for the given object and, if
 * there is one, delegates to the key/offset variant of this method.
 *
 * @param object the object whose key is used for the lookup
 * @return the result of the key/offset variant, or null if no offset is recorded for the object
 * @throws IOException if the delegated retrieval fails
 */
private COSDictionary retrieveCOSDictionary(COSObject object) throws IOException {
    final COSObjectKey objectKey = new COSObjectKey(object);
    final Long knownOffset = bfSearchCOSObjectKeyOffsets.get(objectKey);
    return knownOffset == null ? null : retrieveCOSDictionary(objectKey, knownOffset);
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey)

Example 4 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method parseXrefTable.

/**
 * This will parse the xref table from the stream and add it to the state.
 * In-use entries ("n") are registered with the xref trailer resolver; free
 * entries ("f") are skipped.
 *
 * @param startByteOffset the offset to start at
 * @return false on parsing error (no "xref" keyword, an empty table, or a
 * subsection header whose object id or count isn't a number)
 * @throws IOException If an IO error occurs or a table entry is corrupt.
 */
protected boolean parseXrefTable(long startByteOffset) throws IOException {
    if (source.peek() != 'x') {
        return false;
    }
    String xref = readString();
    if (!xref.trim().equals("xref")) {
        return false;
    }
    // check for trailer after xref: read ahead, then push the bytes back
    String str = readString();
    byte[] b = str.getBytes(ISO_8859_1);
    source.rewind(b.length);
    // signal start of new XRef
    xrefTrailerResolver.nextXrefObj(startByteOffset, XRefType.TABLE);
    if (str.startsWith("trailer")) {
        LOG.warn("skipping empty xref table");
        return false;
    }
    // Xref tables can have multiple sections. Each starts with a starting object id and a count.
    while (true) {
        String currentLine = readLine();
        String[] splitString = currentLine.split("\\s");
        if (splitString.length != 2) {
            LOG.warn("Unexpected XRefTable Entry: " + currentLine);
            break;
        }
        // first obj id
        long currObjID;
        try {
            currObjID = Long.parseLong(splitString[0]);
        } catch (NumberFormatException exception) {
            // report a corrupt header as a parsing error instead of leaking an unchecked exception
            LOG.warn("XRefTable: invalid ID for the first object: " + currentLine);
            return false;
        }
        // the number of objects in the xref table
        int count;
        try {
            count = Integer.parseInt(splitString[1]);
        } catch (NumberFormatException exception) {
            LOG.warn("XRefTable: invalid number of objects: " + currentLine);
            return false;
        }
        skipSpaces();
        for (int i = 0; i < count; i++) {
            if (source.isEOF() || isEndOfName((char) source.peek())) {
                break;
            }
            // a 't' is taken as the start of the trailer keyword
            if (source.peek() == 't') {
                break;
            }
            currentLine = readLine();
            splitString = currentLine.split("\\s");
            if (splitString.length < 3) {
                LOG.warn("invalid xref line: " + currentLine);
                break;
            }
            /* This supports the corrupt table as reported in
                 * PDFBOX-474 (XXXX XXX XX n) */
            if (splitString[splitString.length - 1].equals("n")) {
                try {
                    long currOffset = Long.parseLong(splitString[0]);
                    int currGenID = Integer.parseInt(splitString[1]);
                    COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
                    xrefTrailerResolver.setXRef(objKey, currOffset);
                } catch (NumberFormatException e) {
                    throw new IOException(e);
                }
            } else if (!splitString[2].equals("f")) {
                throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
            }
            currObjID++;
            skipSpaces();
        }
        skipSpaces();
        // another subsection follows only if the next char is a digit
        if (!isDigit()) {
            break;
        }
    }
    return true;
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) IOException(java.io.IOException)

Example 5 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method parseObjectDynamically.

/**
 * Parses the indirect object identified by the given object and generation number,
 * unless the pooled object already carries a value, and returns that value.
 *
 * @param objNr object number of object to be parsed
 * @param objGenNr object generation number of object to be parsed
 * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref
 * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream
 * (this is used to circumvent being stuck in a loop in a malicious PDF)
 *
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
protected COSBase parseObjectDynamically(long objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException {
    // ---- build the key and fetch the object container from the pool
    final COSObjectKey key = new COSObjectKey(objNr, objGenNr);
    final COSObject pooled = document.getObjectFromPool(key);
    if (pooled.getObject() != null) {
        // already parsed earlier
        return pooled.getObject();
    }
    // ---- positive value: file offset; non-positive value: negated object stream number
    Long xrefValue = document.getXrefTable().get(key);
    if (xrefValue == null && isLenient) {
        // the xref table may be broken -> fall back to the brute force search results
        xrefValue = getBFCOSObjectOffsets().get(key);
        if (xrefValue != null) {
            LOG.debug("Set missing offset " + xrefValue + " for object " + key);
            document.getXrefTable().put(key, xrefValue);
        }
    }
    final boolean undefined = xrefValue == null;
    // sanity test to circumvent loops with broken documents
    if (requireExistingNotCompressedObj && (undefined || xrefValue <= 0)) {
        throw new IOException("Object must be defined and must not be compressed object: " + key.getNumber() + ":" + key.getGeneration());
    }
    if (undefined) {
        // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
        pooled.setObject(COSNull.NULL);
    } else if (xrefValue > 0) {
        // the object lives directly in the file at this offset
        parseFileObject(xrefValue, key, pooled);
    } else {
        // the object is compressed: parse the object stream containing it,
        // which has not been parsed so far since the object was not found
        parseObjectStream((int) -xrefValue);
    }
    return pooled.getObject();
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSObject(org.apache.pdfbox.cos.COSObject) IOException(java.io.IOException)

Aggregations

COSObjectKey (org.apache.pdfbox.cos.COSObjectKey): 39, COSObject (org.apache.pdfbox.cos.COSObject): 25, IOException (java.io.IOException): 16, COSDocument (org.apache.pdfbox.cos.COSDocument): 13, COSBase (org.apache.pdfbox.cos.COSBase): 12, COSDictionary (org.apache.pdfbox.cos.COSDictionary): 8, COSStream (org.apache.pdfbox.cos.COSStream): 7, ValidationError (org.apache.pdfbox.preflight.ValidationResult.ValidationError): 7, COSArray (org.apache.pdfbox.cos.COSArray): 6, COSString (org.apache.pdfbox.cos.COSString): 6, Test (org.junit.Test): 6, ArrayList (java.util.ArrayList): 4, HashMap (java.util.HashMap): 4, COSNumber (org.apache.pdfbox.cos.COSNumber): 4, COSInteger (org.apache.pdfbox.cos.COSInteger): 3, COSName (org.apache.pdfbox.cos.COSName): 3, InputStream (java.io.InputStream): 2, Matcher (java.util.regex.Matcher): 2, Pattern (java.util.regex.Pattern): 2, PDFObjectStreamParser (org.apache.pdfbox.pdfparser.PDFObjectStreamParser): 2