use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class BaseParser method parseCOSArray.
/**
 * This will parse a PDF array object.
 * <p>
 * Elements are read one after another until the closing {@code ']'} is seen. When an element is
 * parsed as a {@link COSObject} (an indirect reference marker), the two elements added to the array
 * immediately before it are expected to be the object number and the generation number; they are
 * removed again and replaced by the referenced object taken from the object pool. If those two
 * integers are not present (including the case where the array holds fewer than two elements so
 * far), the reference is treated as corrupt: a warning is logged and the element is skipped.
 * <p>
 * If a corrupt element turns out to be followed by an "endobj" or "endstream" keyword, the array
 * is assumed to have ended and whatever has been collected so far is returned.
 *
 * @return The parsed PDF array.
 *
 * @throws IOException If there is an error parsing the stream.
 */
protected COSArray parseCOSArray() throws IOException {
    readExpectedChar('[');
    COSArray po = new COSArray();
    COSBase pbo;
    skipSpaces();
    int i;
    while (((i = seqSource.peek()) > 0) && ((char) i != ']')) {
        pbo = parseDirObject();
        if (pbo instanceof COSObject) {
            // We have to check if the expected values are there or not PDFBOX-385.
            // The size checks keep a malformed array (reference marker with fewer than two
            // preceding elements) from indexing at -1; it is handled as a corrupt reference below.
            if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                COSInteger genNumber = (COSInteger) po.remove(po.size() - 1);
                if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                    COSInteger number = (COSInteger) po.remove(po.size() - 1);
                    COSObjectKey key = new COSObjectKey(number.longValue(), genNumber.intValue());
                    pbo = getObjectFromPool(key);
                } else {
                    // the object reference is somehow wrong
                    pbo = null;
                }
            } else {
                pbo = null;
            }
        }
        if (pbo != null) {
            po.add(pbo);
        } else {
            // it could be a bad object in the array which is just skipped
            LOG.warn("Corrupt object reference at offset " + seqSource.getPosition());
            // This could also be an "endobj" or "endstream" which means we can assume that
            // the array has ended.
            String isThisTheEnd = readString();
            // push the look-ahead back so the caller still sees the keyword
            seqSource.unread(isThisTheEnd.getBytes(ISO_8859_1));
            if (ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd)) {
                return po;
            }
        }
        skipSpaces();
    }
    // read ']'
    seqSource.read();
    skipSpaces();
    return po;
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class BaseParser method parseCOSDictionaryValue.
/**
 * This will parse a PDF dictionary value.
 * <p>
 * A value is either a direct object or an indirect reference of the form
 * {@code <object number> <generation number> R}. A direct object is returned as parsed. For an
 * indirect reference the referenced object is looked up in the object pool and returned instead.
 *
 * @return The parsed direct object, or the pooled object for an indirect reference.
 *
 * @throws IOException If there is an error parsing the dictionary object, in particular if the
 * object number or the generation number of a reference is not an integer.
 */
private COSBase parseCOSDictionaryValue() throws IOException {
    long numOffset = seqSource.getPosition();
    COSBase value = parseDirObject();
    skipSpaces();
    // proceed if the given object is a number and the following is a number as well
    if (!(value instanceof COSNumber) || !isDigit()) {
        return value;
    }
    // read the remaining information of the object number
    long genOffset = seqSource.getPosition();
    COSBase generationNumber = parseDirObject();
    skipSpaces();
    readExpectedChar('R');
    if (!(value instanceof COSInteger)) {
        throw new IOException("expected number, actual=" + value + " at offset " + numOffset);
    }
    if (!(generationNumber instanceof COSInteger)) {
        // report the offending generation number, not the (valid) object number
        throw new IOException("expected number, actual=" + generationNumber + " at offset " + genOffset);
    }
    COSObjectKey key = new COSObjectKey(((COSInteger) value).longValue(), ((COSInteger) generationNumber).intValue());
    // dereference the object
    return getObjectFromPool(key);
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method validateXrefOffsets.
/**
 * Checks the given xref entries against the file content.
 * <p>
 * Every entry with a non-null, non-negative offset is passed to {@code checkObjectKey}; checking
 * stops at the first entry that fails. Entries with a null or negative offset are not checked.
 *
 * @param xrefOffset the map of object keys to offsets to be validated, may be null
 * @return false as soon as one entry fails the check; true otherwise, including for a null map
 * @throws IOException if something went wrong while checking an entry
 */
private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException {
    if (xrefOffset != null) {
        for (Entry<COSObjectKey, Long> entry : xrefOffset.entrySet()) {
            Long offset = entry.getValue();
            // negative offsets are left alone, see type 2 entry in xref stream
            if (offset == null || offset < 0) {
                continue;
            }
            COSObjectKey key = entry.getKey();
            if (!checkObjectKey(key, offset)) {
                LOG.debug("Stop checking xref offsets as at least one (" + key + ") couldn't be dereferenced");
                return false;
            }
        }
    }
    return true;
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method bfSearchForObjects.
/**
* Brute force search for every object in the pdf.
* <p>
* The source is scanned from MINIMUM_SEARCH_OFFSET up to lastEOFMarker (or the end of the source).
* Each time a whitespace followed by OBJ_MARKER is found, the bytes in front of it are read
* backwards to recover the generation number and the object number. The resulting key/offset pairs
* are collected in bfSearchCOSObjectKeyOffsets, which is replaced by a fresh map on every call.
* An object is only stored once the next object header has been found; the last object is stored
* after the scan if the conditions checked at the end of this method hold.
* <p>
* The position of the source is restored before returning.
*
* @throws IOException if something went wrong
*/
private void bfSearchForObjects() throws IOException {
// NOTE(review): presumably this determines lastEOFMarker, which bounds the scan below - confirm
bfSearchForLastEOFMarker();
bfSearchCOSObjectKeyOffsets = new HashMap<>();
// remember where the source was positioned so it can be restored at the end
long originOffset = source.getPosition();
long currentOffset = MINIMUM_SEARCH_OFFSET;
// data of the most recently found object header; MIN_VALUE means "none found yet"
long lastObjectId = Long.MIN_VALUE;
int lastGenID = Integer.MIN_VALUE;
long lastObjOffset = Long.MIN_VALUE;
// "endobj" is matched in three steps: the char 'e', then "ndo", then "bj"
char[] endobjString = "ndo".toCharArray();
char[] endobjRemainingString = "bj".toCharArray();
boolean endOfObjFound = false;
do {
source.seek(currentOffset);
int nextChar = source.read();
// currentOffset now points to the byte following nextChar
currentOffset++;
// NOTE(review): isString(...) presumably tests the bytes at the current source position - confirm
if (isWhitespace(nextChar) && isString(OBJ_MARKER)) {
// currentOffset - 1 is the whitespace just read, so this is the byte right in front of it
long tempOffset = currentOffset - 2;
source.seek(tempOffset);
int genID = source.peek();
// is the char in front of the whitespace a digit?
if (isDigit(genID)) {
// 48 is the ASCII code of '0': turn the digit char into its numeric value;
// only this single digit is used as generation number
genID -= 48;
tempOffset--;
source.seek(tempOffset);
if (isWhitespace()) {
// walk backwards over the whitespace separating object number and generation number
while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace()) {
source.seek(--tempOffset);
}
boolean objectIDFound = false;
// walk backwards over the digits of the object number
while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) {
source.seek(--tempOffset);
objectIDFound = true;
}
if (objectIDFound) {
// NOTE(review): presumably consumes the byte the backwards scan stopped on, so that
// readObjectNumber() starts at the first digit (tempOffset + 1) - confirm
source.read();
long objectId = readObjectNumber();
if (lastObjOffset > 0) {
// add the former object ID only if there was a subsequent object ID
bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
}
lastObjectId = objectId;
lastGenID = genID;
lastObjOffset = tempOffset + 1;
// continue the scan behind the marker that was just matched
currentOffset += OBJ_MARKER.length - 1;
// a new object has started, its end has not been seen yet
endOfObjFound = false;
}
}
}
} else // We could possibly implement a more intelligent algorithm if necessary
if (nextChar == 'e' && isString(endobjString)) {
currentOffset += endobjString.length;
source.seek(currentOffset);
// the source ends right after "endo": treated as the end of the object
if (source.isEOF()) {
endOfObjFound = true;
continue;
}
if (isString(endobjRemainingString)) {
currentOffset += endobjRemainingString.length;
endOfObjFound = true;
continue;
}
}
} while (currentOffset < lastEOFMarker && !source.isEOF());
if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0) {
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
}
bfSearchForObjStreams();
// reestablish origin position
source.seek(originOffset);
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method parseDictObjects.
/**
 * Will parse every object necessary to load a single page from the pdf document. We try our
 * best to order objects according to offset in file before reading to minimize seek operations.
 * <p>
 * Object references are collected from the given dictionary and, transitively, from every object
 * parsed along the way. References with a positive xref offset are parsed at that offset. A
 * negative offset denotes a compressed object; such objects are grouped under the offset of the
 * object stream that contains them. A missing or zero offset marks a null object.
 * <p>
 * In lenient mode missing offsets are looked up in the brute force search results. If the object
 * stream of a compressed object cannot be located there either, the object is skipped with a
 * warning and left unparsed.
 *
 * @param dict the COSObject from the parent pages.
 * @param excludeObjects dictionary object reference entries with these names will not be parsed
 *
 * @throws IOException if something went wrong
 */
protected void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException {
    // ---- create queue for objects waiting for further parsing
    final Queue<COSBase> toBeParsedList = new LinkedList<>();
    // offset ordered object map
    final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<>();
    // in case of compressed objects offset points to stmObj
    final Set<Long> parsedObjects = new HashSet<>();
    final Set<Long> addedObjects = new HashSet<>();
    addExcludedToList(excludeObjects, dict, parsedObjects);
    addNewToList(toBeParsedList, dict.getValues(), addedObjects);
    // ---- go through objects to be parsed
    while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) {
        // -- first get all COSObject from other kind of objects and
        // put them in objToBeParsed; afterwards toBeParsedList is empty
        COSBase baseObj;
        while ((baseObj = toBeParsedList.poll()) != null) {
            if (baseObj instanceof COSDictionary) {
                addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects);
            } else if (baseObj instanceof COSArray) {
                for (COSBase cosBase : ((COSArray) baseObj)) {
                    addNewToList(toBeParsedList, cosBase, addedObjects);
                }
            } else if (baseObj instanceof COSObject) {
                COSObject obj = (COSObject) baseObj;
                long objId = getObjectId(obj);
                COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber(), obj.getGenerationNumber());
                if (!parsedObjects.contains(objId)) {
                    Long fileOffset = document.getXrefTable().get(objKey);
                    if (fileOffset == null && isLenient) {
                        Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
                        fileOffset = bfCOSObjectKeyOffsets.get(objKey);
                        if (fileOffset != null) {
                            LOG.debug("Set missing " + fileOffset + " for object " + objKey);
                            document.getXrefTable().put(objKey, fileOffset);
                        }
                    }
                    // it is allowed that object references point to null, thus we have to test
                    if (fileOffset != null && fileOffset != 0) {
                        if (fileOffset > 0) {
                            objToBeParsed.put(fileOffset, Collections.singletonList(obj));
                        } else {
                            // negative offset means we have a compressed
                            // object within object stream;
                            // get offset of object stream
                            // (the object number is a long, so it must not be narrowed to int)
                            COSObjectKey key = new COSObjectKey(-fileOffset, 0);
                            fileOffset = document.getXrefTable().get(key);
                            if ((fileOffset == null) || (fileOffset <= 0)) {
                                if (isLenient) {
                                    Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
                                    fileOffset = bfCOSObjectKeyOffsets.get(key);
                                    if (fileOffset != null) {
                                        LOG.debug("Set missing " + fileOffset + " for object " + key);
                                        document.getXrefTable().put(key, fileOffset);
                                    }
                                } else {
                                    throw new IOException("Invalid object stream xref object reference for key '" + objKey + "': " + fileOffset);
                                }
                            }
                            if (fileOffset == null) {
                                // the brute force search did not find the object stream either;
                                // a null key must not reach the TreeMap (it would throw a
                                // NullPointerException), so the object is skipped instead
                                LOG.warn("Object stream " + key + " of compressed object " + objKey + " not found, object is skipped");
                                continue;
                            }
                            List<COSObject> stmObjects = objToBeParsed.get(fileOffset);
                            if (stmObjects == null) {
                                stmObjects = new ArrayList<>();
                                objToBeParsed.put(fileOffset, stmObjects);
                            } else if (!(stmObjects instanceof ArrayList)) {
                                // java does not have a test for immutable; a list which is not an
                                // ArrayList is the singleton list of an uncompressed object
                                throw new IOException(obj + " cannot be assigned to offset " + fileOffset + ", this belongs to " + stmObjects.get(0));
                            }
                            stmObjects.add(obj);
                        }
                    } else {
                        // NULL object
                        COSObject pdfObject = document.getObjectFromPool(objKey);
                        pdfObject.setObject(COSNull.NULL);
                    }
                }
            }
        }
        // ---- read first COSObject with smallest offset
        // resulting object will be added to toBeParsedList
        if (objToBeParsed.isEmpty()) {
            break;
        }
        for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) {
            COSBase parsedObj = parseObjectDynamically(obj, false);
            if (parsedObj != null) {
                obj.setObject(parsedObj);
                addNewToList(toBeParsedList, parsedObj, addedObjects);
                parsedObjects.add(getObjectId(obj));
            }
        }
    }
}
Aggregations