use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method rebuildTrailer.
/**
 * Reconstructs the trailer dictionary from the results of a brute force object search.
 * Intended for documents whose startxref can't be found.
 *
 * @return the rebuilt trailer dictionary, or null if the brute force search left no offset map
 *
 * @throws IOException if something went wrong
 */
private final COSDictionary rebuildTrailer() throws IOException {
    bfSearchForObjects();
    if (bfSearchCOSObjectKeyOffsets == null) {
        // nothing to rebuild from; the attempt is still recorded
        trailerWasRebuild = true;
        return null;
    }
    // start from a clean resolver and register every object the search has found
    xrefTrailerResolver.reset();
    xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE);
    for (Entry<COSObjectKey, Long> found : bfSearchCOSObjectKeyOffsets.entrySet()) {
        xrefTrailerResolver.setXRef(found.getKey(), found.getValue());
    }
    xrefTrailerResolver.setStartxref(0);
    final COSDictionary rebuilt = xrefTrailerResolver.getTrailer();
    getDocument().setTrailer(rebuilt);
    if (!bfSearchForTrailer(rebuilt)) {
        // no trailer found by brute force: collect the individual trailer entries instead
        for (Entry<COSObjectKey, Long> found : bfSearchCOSObjectKeyOffsets.entrySet()) {
            final COSDictionary candidate = retrieveCOSDictionary(found.getKey(), found.getValue());
            if (candidate != null) {
                if (isCatalog(candidate)) {
                    // document catalog
                    rebuilt.setItem(COSName.ROOT, document.getObjectFromPool(found.getKey()));
                } else if (isInfo(candidate)) {
                    // info dictionary
                    rebuilt.setItem(COSName.INFO, document.getObjectFromPool(found.getKey()));
                }
                // encryption dictionary, if existing, is lost
                // We can't run "Algorithm 2" from PDF specification because of missing ID
            }
        }
    }
    trailerWasRebuild = true;
    return rebuilt;
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method bfSearchForObjStreams.
/**
 * Brute force search for all object streams.
 *
 * <p>First pass: scans the source from MINIMUM_SEARCH_OFFSET for the OBJ_STREAM marker. For every
 * hit it looks backwards (up to 39 windows of 10 bytes) for the " obj" keyword and, if a generation
 * and an object number precede it, records the object's start offset together with its key.
 *
 * <p>Second pass: for every recorded object stream that is still the current definition of its key
 * in the brute force offset map, the header of the stream (the first /First bytes, holding
 * "object number / offset" pairs) is read and each contained object number is registered with
 * generation 0 and the negated object number of the containing stream as its value.
 *
 * <p>Corrupt streams (missing or negative /First or /N, unreadable data, too few or non-numeric
 * header entries) are skipped instead of aborting the search. The source position is restored
 * before returning normally.
 *
 * @throws IOException if something went wrong
 */
private void bfSearchForObjStreams() throws IOException {
    Map<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
    long originOffset = source.getPosition();
    source.seek(MINIMUM_SEARCH_OFFSET);
    char[] string = " obj".toCharArray();
    while (!source.isEOF()) {
        // search for the object stream marker
        if (isString(OBJ_STREAM)) {
            long currentPosition = source.getPosition();
            // search backwards for the beginning of the object
            long newOffset = -1;
            boolean objFound = false;
            for (int i = 1; i < 40 && !objFound; i++) {
                long currentOffset = currentPosition - (i * 10);
                if (currentOffset > 0) {
                    source.seek(currentOffset);
                    for (int j = 0; j < 10; j++) {
                        if (isString(string)) {
                            long tempOffset = currentOffset - 1;
                            source.seek(tempOffset);
                            int genID = source.peek();
                            // is the char right before " obj" a digit (generation number)?
                            if (isDigit(genID)) {
                                tempOffset--;
                                source.seek(tempOffset);
                                if (isSpace()) {
                                    int length = 0;
                                    source.seek(--tempOffset);
                                    // walk backwards over the digits of the object number
                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) {
                                        source.seek(--tempOffset);
                                        length++;
                                    }
                                    if (length > 0) {
                                        // step over the non-digit char in front of the object number
                                        source.read();
                                        newOffset = source.getPosition();
                                        long objNumber = readObjectNumber();
                                        int genNumber = readGenerationNumber();
                                        bfSearchObjStreamsOffsets.put(newOffset, new COSObjectKey(objNumber, genNumber));
                                    }
                                }
                            }
                            LOG.debug("Dictionary start for object stream -> " + newOffset);
                            objFound = true;
                            break;
                        } else {
                            currentOffset++;
                            source.read();
                        }
                    }
                }
            }
            source.seek(currentPosition + OBJ_STREAM.length);
        }
        source.read();
    }
    // add all found compressed objects to the brute force search result
    for (Entry<Long, COSObjectKey> streamOffsetsEntry : bfSearchObjStreamsOffsets.entrySet()) {
        Long offset = streamOffsetsEntry.getKey();
        Long bfOffset = bfSearchCOSObjectKeyOffsets.get(streamOffsetsEntry.getValue());
        // incomplete object stream found?
        if (bfOffset == null) {
            LOG.warn("Skipped incomplete object stream:" + streamOffsetsEntry.getValue() + " at " + offset);
            continue;
        }
        // check if the object was overwritten
        if (!offset.equals(bfOffset)) {
            continue;
        }
        source.seek(offset);
        long stmObjNumber = readObjectNumber();
        readGenerationNumber();
        readExpectedString(OBJ_MARKER, true);
        int nrOfObjects = 0;
        byte[] numbersBytes = null;
        int numbersLength = 0;
        COSStream stream = null;
        COSInputStream is = null;
        try {
            COSDictionary dict = parseCOSDictionary();
            int offsetFirstStream = dict.getInt(COSName.FIRST);
            nrOfObjects = dict.getInt(COSName.N);
            // skip the stream if required values are missing or nonsensical
            // (a negative /First would otherwise fail the array allocation below)
            if (offsetFirstStream < 0 || nrOfObjects < 0) {
                continue;
            }
            stream = parseCOSStream(dict);
            is = stream.createInputStream();
            numbersBytes = new byte[offsetFirstStream];
            // a single read() may deliver fewer bytes than requested, so read until full or EOF
            while (numbersLength < numbersBytes.length) {
                int count = is.read(numbersBytes, numbersLength, numbersBytes.length - numbersLength);
                if (count < 0) {
                    break;
                }
                numbersLength += count;
            }
            if (numbersLength != numbersBytes.length) {
                LOG.debug("Tried reading " + numbersBytes.length + " bytes but only " + numbersLength + " bytes read");
            }
        } catch (IOException exception) {
            LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset, exception);
            continue;
        } finally {
            // make sure the stream is closed even if closing the input stream fails
            try {
                if (is != null) {
                    is.close();
                }
            } finally {
                if (stream != null) {
                    stream.close();
                }
            }
        }
        // only the bytes actually read are interpreted; surrounding whitespace is dropped and
        // the entries are split on whitespace runs so that repeated separators or CR/LF line
        // endings don't yield empty or dirty tokens
        String numbersStr = new String(numbersBytes, 0, numbersLength, "ISO-8859-1").trim();
        String[] numbers = numbersStr.split("\\s+");
        // compare as long: nrOfObjects * 2 could overflow an int
        if (numbers.length < 2L * nrOfObjects) {
            LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset);
            continue;
        }
        for (int i = 0; i < nrOfObjects; i++) {
            long objNumber;
            try {
                // even indices hold the object numbers, odd indices the offsets within the stream
                objNumber = Long.parseLong(numbers[i * 2]);
            } catch (NumberFormatException exception) {
                LOG.debug("Skipped corrupt object key in stream: " + stmObjNumber, exception);
                continue;
            }
            COSObjectKey objKey = new COSObjectKey(objNumber, 0);
            Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
            if (existingOffset == null || offset > existingOffset) {
                // negative value = object number of the object stream containing the object
                bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
            }
        }
    }
    source.seek(originOffset);
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method retrieveCOSDictionary.
/**
 * Resolves the dictionary of the given object via the brute force offset map.
 *
 * @param object the object whose key is looked up in the brute force search results
 * @return whatever the key/offset overload of this method returns for the object, or null if the
 * brute force search has no offset for it
 * @throws IOException if something went wrong
 */
private COSDictionary retrieveCOSDictionary(COSObject object) throws IOException {
    final COSObjectKey objectKey = new COSObjectKey(object);
    final Long knownOffset = bfSearchCOSObjectKeyOffsets.get(objectKey);
    return knownOffset == null ? null : retrieveCOSDictionary(objectKey, knownOffset);
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method parseXrefTable.
/**
 * This will parse the xref table from the stream and add it to the state.
 * Every in-use ("n") entry is registered with the xref/trailer resolver; free ("f") entries
 * are skipped.
 *
 * @param startByteOffset the offset to start at
 * @return false on parsing error: the table doesn't start with "xref", it is empty (directly
 * followed by the trailer) or a subsection header isn't numeric
 * @throws IOException If an IO error occurs or a table entry is corrupt.
 */
protected boolean parseXrefTable(long startByteOffset) throws IOException {
    if (source.peek() != 'x') {
        return false;
    }
    String xref = readString();
    if (!xref.trim().equals("xref")) {
        return false;
    }
    // check for trailer after xref; the peeked string is pushed back afterwards
    String str = readString();
    byte[] b = str.getBytes(ISO_8859_1);
    source.rewind(b.length);
    // signal start of new XRef
    xrefTrailerResolver.nextXrefObj(startByteOffset, XRefType.TABLE);
    if (str.startsWith("trailer")) {
        LOG.warn("skipping empty xref table");
        return false;
    }
    // Xref tables can have multiple sections. Each starts with a starting object id and a count.
    while (true) {
        String currentLine = readLine();
        String[] splitString = currentLine.split("\\s");
        if (splitString.length != 2) {
            LOG.warn("Unexpected XRefTable Entry: " + currentLine);
            break;
        }
        // first obj id
        long currObjID;
        try {
            currObjID = Long.parseLong(splitString[0]);
        } catch (NumberFormatException exception) {
            // report a malformed header as a parsing error instead of leaking an unchecked exception
            LOG.warn("XRefTable: invalid ID for the first object: " + currentLine);
            return false;
        }
        // the number of objects in the xref table
        int count;
        try {
            count = Integer.parseInt(splitString[1]);
        } catch (NumberFormatException exception) {
            LOG.warn("XRefTable: invalid number of objects: " + currentLine);
            return false;
        }
        skipSpaces();
        for (int i = 0; i < count; i++) {
            if (source.isEOF() || isEndOfName((char) source.peek())) {
                break;
            }
            // a 't' ends the section early (presumably the start of "trailer")
            if (source.peek() == 't') {
                break;
            }
            currentLine = readLine();
            splitString = currentLine.split("\\s");
            if (splitString.length < 3) {
                LOG.warn("invalid xref line: " + currentLine);
                break;
            }
            /* This supports the corrupt table as reported in
             * PDFBOX-474 (XXXX XXX XX n) */
            if (splitString[splitString.length - 1].equals("n")) {
                try {
                    long currOffset = Long.parseLong(splitString[0]);
                    int currGenID = Integer.parseInt(splitString[1]);
                    COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
                    xrefTrailerResolver.setXRef(objKey, currOffset);
                } catch (NumberFormatException e) {
                    throw new IOException(e);
                }
            } else if (!splitString[2].equals("f")) {
                throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
            }
            currObjID++;
            skipSpaces();
        }
        skipSpaces();
        // another subsection follows only if the next char is a digit
        if (!isDigit()) {
            break;
        }
    }
    return true;
}
use of org.apache.pdfbox.cos.COSObjectKey in project pdfbox by apache.
the class COSParser method parseObjectDynamically.
/**
 * Parses a single indirect object on demand and stores it in the document's object pool.
 * If the pooled object already has a value, that value is returned without parsing.
 *
 * @param objNr object number of object to be parsed
 * @param objGenNr object generation number of object to be parsed
 * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined in xref
 * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream
 * (this is used to circumvent being stuck in a loop in a malicious PDF)
 *
 * @return the parsed object (which is also added to document object)
 *
 * @throws IOException If an IO error occurs.
 */
protected COSBase parseObjectDynamically(long objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException {
    final COSObjectKey key = new COSObjectKey(objNr, objGenNr);
    final COSObject pooled = document.getObjectFromPool(key);
    if (pooled.getObject() != null) {
        // previously parsed
        return pooled.getObject();
    }
    // positive value: file offset; non-positive value: negated number of the containing object stream
    Long location = document.getXrefTable().get(key);
    if (location == null && isLenient) {
        // the xref table may be broken -> fall back to the brute force search results
        Map<COSObjectKey, Long> bruteForceOffsets = getBFCOSObjectOffsets();
        location = bruteForceOffsets.get(key);
        if (location != null) {
            LOG.debug("Set missing offset " + location + " for object " + key);
            document.getXrefTable().put(key, location);
        }
    }
    // sanity test to circumvent loops with broken documents
    final boolean definedAtFileOffset = location != null && location > 0;
    if (requireExistingNotCompressedObj && !definedAtFileOffset) {
        throw new IOException("Object must be defined and must not be compressed object: " + key.getNumber() + ":" + key.getGeneration());
    }
    if (location == null) {
        // undefined object -> NULL object (Spec. 1.7, chap. 3.2.9)
        pooled.setObject(COSNull.NULL);
    } else if (location > 0) {
        // plain indirect object located at a file offset
        parseFileObject(location, key, pooled);
    } else {
        // the object lives in an object stream that has not been parsed so far
        parseObjectStream((int) -location);
    }
    return pooled.getObject();
}
Aggregations