Search in sources :

Example 21 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class BaseParser method parseCOSArray.

/**
 * This will parse a PDF array object.
 *
 * <p>Elements are read one by one with {@code parseDirObject()} until a closing {@code ']'}
 * or the end of the source is seen. When an element comes back as a {@link COSObject}, the two
 * entries added just before it are expected to be the object number and the generation
 * number (both {@link COSInteger}); they are removed from the array again and replaced by the
 * pooled object for that key. If those integers are missing, the element is treated as a
 * corrupt reference and skipped.
 *
 * @return The parsed PDF array. If an "endobj"/"endstream" keyword is met after a corrupt
 *         element, the entries collected so far are returned.
 *
 * @throws IOException If there is an error parsing the stream.
 */
protected COSArray parseCOSArray() throws IOException {
    readExpectedChar('[');
    COSArray po = new COSArray();
    COSBase pbo;
    skipSpaces();
    int i;
    while (((i = seqSource.peek()) > 0) && ((char) i != ']')) {
        pbo = parseDirObject();
        if (pbo instanceof COSObject) {
            // We have to check if the expected values are there or not PDFBOX-385.
            // The size guard matters: with fewer than two preceding entries the lookup
            // would ask for index -1 and fail instead of reporting a corrupt reference.
            if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                COSInteger genNumber = (COSInteger) po.remove(po.size() - 1);
                if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                    COSInteger number = (COSInteger) po.remove(po.size() - 1);
                    COSObjectKey key = new COSObjectKey(number.longValue(), genNumber.intValue());
                    pbo = getObjectFromPool(key);
                } else {
                    // the object reference is somehow wrong
                    pbo = null;
                }
            } else {
                pbo = null;
            }
        }
        if (pbo != null) {
            po.add(pbo);
        } else {
            // it could be a bad object in the array which is just skipped
            LOG.warn("Corrupt object reference at offset " + seqSource.getPosition());
            // This could also be an "endobj" or "endstream" which means we can assume that
            // the array has ended.
            String isThisTheEnd = readString();
            // push the token back so that the caller can still read the keyword
            seqSource.unread(isThisTheEnd.getBytes(ISO_8859_1));
            if (ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd)) {
                return po;
            }
        }
        skipSpaces();
    }
    // read ']'
    seqSource.read();
    skipSpaces();
    return po;
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSInteger(org.apache.pdfbox.cos.COSInteger) COSArray(org.apache.pdfbox.cos.COSArray) COSObject(org.apache.pdfbox.cos.COSObject) COSBase(org.apache.pdfbox.cos.COSBase) COSString(org.apache.pdfbox.cos.COSString)

Example 22 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class BaseParser method parseCOSDictionaryValue.

/**
 * This will parse a PDF dictionary value.
 *
 * <p>The value is first read as a direct object. If it is a number and the next character is
 * a digit, the input is treated as an indirect reference of the form
 * {@code <object number> <generation number> R} and the referenced object is fetched from the
 * object pool.
 *
 * @return The parsed direct object, or the pooled object when the value is an indirect
 *         reference.
 *
 * @throws IOException If there is an error parsing the dictionary object, or if the object
 *         number or the generation number of a reference is not an integer.
 */
private COSBase parseCOSDictionaryValue() throws IOException {
    long numOffset = seqSource.getPosition();
    COSBase value = parseDirObject();
    skipSpaces();
    // proceed if the given object is a number and the following is a number as well
    if (!(value instanceof COSNumber) || !isDigit()) {
        return value;
    }
    // read the remaining information of the object number
    long genOffset = seqSource.getPosition();
    COSBase generationNumber = parseDirObject();
    skipSpaces();
    readExpectedChar('R');
    if (!(value instanceof COSInteger)) {
        throw new IOException("expected number, actual=" + value + " at offset " + numOffset);
    }
    if (!(generationNumber instanceof COSInteger)) {
        // report the token that actually failed the check, not the object number
        throw new IOException("expected number, actual=" + generationNumber + " at offset " + genOffset);
    }
    COSObjectKey key = new COSObjectKey(((COSInteger) value).longValue(), ((COSInteger) generationNumber).intValue());
    // dereference the object
    return getObjectFromPool(key);
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSInteger(org.apache.pdfbox.cos.COSInteger) COSNumber(org.apache.pdfbox.cos.COSNumber) COSBase(org.apache.pdfbox.cos.COSBase) IOException(java.io.IOException)

Example 23 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method validateXrefOffsets.

/**
 * Checks the offsets of the given xref map with {@code checkObjectKey}.
 *
 * <p>Entries whose offset is {@code null} or negative are not checked. The check stops at the
 * first entry that fails.
 *
 * @param xrefOffset map of object keys to offsets; {@code null} is accepted and counts as valid
 * @return {@code false} as soon as one checked entry fails, {@code true} otherwise
 * @throws IOException if {@code checkObjectKey} fails with an I/O error
 */
private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException {
    if (xrefOffset != null) {
        for (Entry<COSObjectKey, Long> entry : xrefOffset.entrySet()) {
            Long offset = entry.getValue();
            // see type 2 entry in xref stream
            if (offset == null || offset < 0) {
                continue;
            }
            COSObjectKey key = entry.getKey();
            if (!checkObjectKey(key, offset)) {
                LOG.debug("Stop checking xref offsets as at least one (" + key + ") couldn't be dereferenced");
                return false;
            }
        }
    }
    return true;
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey)

Example 24 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method bfSearchForObjects.

/**
 * Brute force search for every object in the pdf.
 *
 * <p>The source is scanned byte by byte, starting at {@code MINIMUM_SEARCH_OFFSET}, for the
 * pattern {@code <digits> <whitespace> <digit> <whitespace> OBJ_MARKER}. Each hit is recorded
 * in {@code bfSearchCOSObjectKeyOffsets} as object key to offset of the first digit of the
 * object number. The map is replaced by a fresh one on every call. An object is only stored
 * once the next object header has been found; the last one is stored after the loop under
 * the condition described there. The position of {@code source} is restored before returning.
 *
 * <p>NOTE(review): only a single digit is read for the generation number, so multi-digit
 * generation numbers are not handled by this scan.
 *
 * @throws IOException if something went wrong
 */
private void bfSearchForObjects() throws IOException {
    // determine the upper bound of the scan (lastEOFMarker) first
    bfSearchForLastEOFMarker();
    bfSearchCOSObjectKeyOffsets = new HashMap<>();
    // remember where the caller left the source so that it can be restored at the end
    long originOffset = source.getPosition();
    long currentOffset = MINIMUM_SEARCH_OFFSET;
    // MIN_VALUE marks "no object header seen yet"
    long lastObjectId = Long.MIN_VALUE;
    int lastGenID = Integer.MIN_VALUE;
    long lastObjOffset = Long.MIN_VALUE;
    // "endobj" is matched in pieces: 'e' (the byte just read) + "ndo" + "bj"
    char[] endobjString = "ndo".toCharArray();
    char[] endobjRemainingString = "bj".toCharArray();
    boolean endOfObjFound = false;
    do {
        source.seek(currentOffset);
        int nextChar = source.read();
        currentOffset++;
        // a whitespace directly followed by the object marker: candidate for an object header
        if (isWhitespace(nextChar) && isString(OBJ_MARKER)) {
            // currentOffset - 1 is the whitespace just read, so this is the byte before it
            long tempOffset = currentOffset - 2;
            source.seek(tempOffset);
            int genID = source.peek();
            // is the next char a digit?
            if (isDigit(genID)) {
                // convert the ASCII digit to its numeric value ('0' == 48)
                genID -= 48;
                tempOffset--;
                source.seek(tempOffset);
                // the generation digit has to be preceded by whitespace
                if (isWhitespace()) {
                    // walk backwards over the whitespace between object number and generation
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace()) {
                        source.seek(--tempOffset);
                    }
                    boolean objectIDFound = false;
                    // walk backwards over the digits of the object number
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) {
                        source.seek(--tempOffset);
                        objectIDFound = true;
                    }
                    if (objectIDFound) {
                        // skip the byte where the backward scan stopped, then read forwards
                        source.read();
                        long objectId = readObjectNumber();
                        if (lastObjOffset > 0) {
                            // add the former object ID only if there was a subsequent object ID
                            bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
                        }
                        lastObjectId = objectId;
                        lastGenID = genID;
                        // offset of the byte following the one where the backward scan stopped
                        lastObjOffset = tempOffset + 1;
                        // continue scanning behind the object marker
                        currentOffset += OBJ_MARKER.length - 1;
                        endOfObjFound = false;
                    }
                }
            }
        } else // We could possibly implement a more intelligent algorithm if necessary
        if (nextChar == 'e' && isString(endobjString)) {
            // "endo" matched so far
            currentOffset += endobjString.length;
            source.seek(currentOffset);
            if (source.isEOF()) {
                // the source ends right after "endo"; this is counted as an end marker
                endOfObjFound = true;
                continue;
            }
            if (isString(endobjRemainingString)) {
                // complete "endobj" found
                currentOffset += endobjRemainingString.length;
                endOfObjFound = true;
                continue;
            }
        }
    } while (currentOffset < lastEOFMarker && !source.isEOF());
    if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0) {
        // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
        // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
        bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
    }
    bfSearchForObjStreams();
    // reestablish origin position
    source.seek(originOffset);
}
Also used : COSObjectKey(org.apache.pdfbox.cos.COSObjectKey)

Example 25 with COSObjectKey

use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.

the class COSParser method parseDictObjects.

/**
 * Will parse every object necessary to load a single page from the pdf document. We try our
 * best to order objects according to offset in file before reading to minimize seek operations.
 *
 * <p>Object references found in {@code dict} (and, transitively, in every parsed object) are
 * collected in a map sorted by file offset. Objects with a negative xref value are grouped
 * under the offset of the object stream they belong to. References without a usable offset
 * are resolved to {@link COSNull#NULL}. In lenient mode missing offsets are looked up in the
 * brute-force offset map; a compressed object whose object stream cannot be located that way
 * either is skipped with a warning.
 *
 * @param dict the COSObject from the parent pages.
 * @param excludeObjects dictionary object reference entries with these names will not be parsed
 *
 * @throws IOException if something went wrong, in particular if (in non-lenient mode) the
 *         object stream of a compressed object has no valid offset, or if an object stream
 *         offset is already taken by a regular object
 */
protected void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException {
    // ---- create queue for objects waiting for further parsing
    final Queue<COSBase> toBeParsedList = new LinkedList<>();
    // offset ordered object map
    final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<>();
    // in case of compressed objects offset points to stmObj
    final Set<Long> parsedObjects = new HashSet<>();
    final Set<Long> addedObjects = new HashSet<>();
    addExcludedToList(excludeObjects, dict, parsedObjects);
    addNewToList(toBeParsedList, dict.getValues(), addedObjects);
    // ---- go through objects to be parsed
    while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) {
        // -- first get all COSObject from other kind of objects and
        // put them in objToBeParsed; afterwards toBeParsedList is empty
        COSBase baseObj;
        while ((baseObj = toBeParsedList.poll()) != null) {
            if (baseObj instanceof COSDictionary) {
                addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects);
            } else if (baseObj instanceof COSArray) {
                for (COSBase cosBase : ((COSArray) baseObj)) {
                    addNewToList(toBeParsedList, cosBase, addedObjects);
                }
            } else if (baseObj instanceof COSObject) {
                COSObject obj = (COSObject) baseObj;
                long objId = getObjectId(obj);
                COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber(), obj.getGenerationNumber());
                if (!parsedObjects.contains(objId)) {
                    Long fileOffset = document.getXrefTable().get(objKey);
                    if (fileOffset == null && isLenient) {
                        Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
                        fileOffset = bfCOSObjectKeyOffsets.get(objKey);
                        if (fileOffset != null) {
                            LOG.debug("Set missing " + fileOffset + " for object " + objKey);
                            document.getXrefTable().put(objKey, fileOffset);
                        }
                    }
                    // it is allowed that object references point to null, thus we have to test
                    if (fileOffset != null && fileOffset != 0) {
                        if (fileOffset > 0) {
                            objToBeParsed.put(fileOffset, Collections.singletonList(obj));
                        } else {
                            // negative offset means we have a compressed
                            // object within object stream;
                            // get offset of object stream
                            // (no narrowing cast: the object number is kept as a long)
                            COSObjectKey key = new COSObjectKey(-fileOffset, 0);
                            fileOffset = document.getXrefTable().get(key);
                            if ((fileOffset == null) || (fileOffset <= 0)) {
                                if (isLenient) {
                                    Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
                                    fileOffset = bfCOSObjectKeyOffsets.get(key);
                                    if (fileOffset != null) {
                                        LOG.debug("Set missing " + fileOffset + " for object " + key);
                                        document.getXrefTable().put(key, fileOffset);
                                    }
                                } else {
                                    throw new IOException("Invalid object stream xref object reference for key '" + objKey + "': " + fileOffset);
                                }
                            }
                            if (fileOffset == null) {
                                // lenient mode and the brute-force search did not find the
                                // object stream either; a null key must not reach the TreeMap
                                LOG.warn("Object stream " + key + " of compressed object " + objKey + " could not be located, object is skipped");
                                continue;
                            }
                            List<COSObject> stmObjects = objToBeParsed.get(fileOffset);
                            if (stmObjects == null) {
                                stmObjects = new ArrayList<>();
                                objToBeParsed.put(fileOffset, stmObjects);
                            } else // java does not have a test for immutable
                            if (!(stmObjects instanceof ArrayList)) {
                                // the offset is already taken by the singleton list of a regular object
                                throw new IOException(obj + " cannot be assigned to offset " + fileOffset + ", this belongs to " + stmObjects.get(0));
                            }
                            stmObjects.add(obj);
                        }
                    } else {
                        // NULL object
                        COSObject pdfObject = document.getObjectFromPool(objKey);
                        pdfObject.setObject(COSNull.NULL);
                    }
                }
            }
        }
        // resulting object will be added to toBeParsedList
        if (objToBeParsed.isEmpty()) {
            break;
        }
        // parse the object(s) at the lowest pending offset
        for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) {
            COSBase parsedObj = parseObjectDynamically(obj, false);
            if (parsedObj != null) {
                obj.setObject(parsedObj);
                addNewToList(toBeParsedList, parsedObj, addedObjects);
                parsedObjects.add(getObjectId(obj));
            }
        }
    }
}
Also used : COSDictionary(org.apache.pdfbox.cos.COSDictionary) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TreeMap(java.util.TreeMap) LinkedList(java.util.LinkedList) COSObjectKey(org.apache.pdfbox.cos.COSObjectKey) COSArray(org.apache.pdfbox.cos.COSArray) COSObject(org.apache.pdfbox.cos.COSObject) COSBase(org.apache.pdfbox.cos.COSBase) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) HashSet(java.util.HashSet)

Aggregations

COSObjectKey (org.apache.pdfbox.cos.COSObjectKey)39 COSObject (org.apache.pdfbox.cos.COSObject)25 IOException (java.io.IOException)16 COSDocument (org.apache.pdfbox.cos.COSDocument)13 COSBase (org.apache.pdfbox.cos.COSBase)12 COSDictionary (org.apache.pdfbox.cos.COSDictionary)8 COSStream (org.apache.pdfbox.cos.COSStream)7 ValidationError (org.apache.pdfbox.preflight.ValidationResult.ValidationError)7 COSArray (org.apache.pdfbox.cos.COSArray)6 COSString (org.apache.pdfbox.cos.COSString)6 Test (org.junit.Test)6 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 COSNumber (org.apache.pdfbox.cos.COSNumber)4 COSInteger (org.apache.pdfbox.cos.COSInteger)3 COSName (org.apache.pdfbox.cos.COSName)3 InputStream (java.io.InputStream)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 PDFObjectStreamParser (org.apache.pdfbox.pdfparser.PDFObjectStreamParser)2