Use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in the Apache PDFBox project.
From class DecompressObjectstreams, method main.
/**
 * Command-line entry point. Loads a PDF, copies every object parsed out of its
 * object streams into the document's object pool, removes the object streams
 * themselves, and saves the result.
 *
 * <p>Arguments: {@code <input> [<output>]}. When no output name is given, a trailing
 * {@code .pdf} extension (any letter case) on the input name is replaced by
 * {@code .unc.pdf}; if the input name has no such extension, {@code .unc.pdf} is appended.
 *
 * <p>Errors raised while processing or closing the document are reported on
 * standard error and are not propagated to the caller.
 *
 * @param args arguments to the program
 */
public static void main(String[] args) {
    // suppress the Dock icon on OS X
    System.setProperty("apple.awt.UIElement", "true");
    if (args.length < 1) {
        usage();
        // Never read args[0] without an argument, even if usage() returns instead of exiting.
        return;
    }
    String inputFilename = args[0];
    String outputFilename;
    if (args.length > 1) {
        outputFilename = args[1];
    } else {
        if (inputFilename.matches(".*\\.[pP][dD][fF]$")) {
            outputFilename = inputFilename.replaceAll("\\.[pP][dD][fF]$", ".unc.pdf");
        } else {
            outputFilename = inputFilename + ".unc.pdf";
        }
    }
    PDDocument doc = null;
    try {
        doc = PDDocument.load(new File(inputFilename));
        for (COSObject objStream : doc.getDocument().getObjectsByType(COSName.OBJ_STM)) {
            COSStream stream = (COSStream) objStream.getObject();
            PDFObjectStreamParser sp = new PDFObjectStreamParser(stream, doc.getDocument());
            sp.parse();
            // Store each object found inside the stream under its own key in the pool.
            for (COSObject next : sp.getObjects()) {
                COSObjectKey key = new COSObjectKey(next);
                COSObject obj = doc.getDocument().getObjectFromPool(key);
                obj.setObject(next.getObject());
            }
            // The stream's contents now live in the pool, so drop the stream object itself.
            doc.getDocument().removeObject(new COSObjectKey(objStream));
        }
        doc.save(outputFilename);
    } catch (Exception e) {
        System.err.println("Error processing file: " + e.getMessage());
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                // Closing is best-effort, but a failure should be visible rather than silently dropped.
                System.err.println("Error closing file: " + e.getMessage());
            }
        }
    }
}
Use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in the Apache PDFBox project.
From class COSDocument, method dereferenceObjectStreams.
/**
 * Expands every object stream (type ObjStm) known to this document: each stream is
 * parsed and the objects it contains are stored in the object pool.
 *
 * <p>A parsed object is stored only when the pool holds no resolved object for its key,
 * or when the xref table records that the key belongs to the object stream currently
 * being processed.
 *
 * @throws IOException If there is an error parsing the stream.
 */
public void dereferenceObjectStreams() throws IOException {
    for (COSObject objStream : getObjectsByType(COSName.OBJ_STM)) {
        PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objStream.getObject(), this);
        parser.parse();
        for (COSObject parsed : parser.getObjects()) {
            COSObjectKey key = new COSObjectKey(parsed);
            boolean unresolved = objectPool.get(key) == null || objectPool.get(key).getObject() == null;
            if (!unresolved) {
                // xrefTable stores negated objNr of objStream for objects in objStreams
                boolean ownedByThisStream = xrefTable.containsKey(key)
                        && xrefTable.get(key) == -objStream.getObjectNumber();
                if (!ownedByThisStream) {
                    // already resolved and not claimed by this stream: leave the pooled object alone
                    continue;
                }
            }
            getObjectFromPool(key).setObject(parsed.getObject());
        }
    }
}
Use of org.apache.pdfbox.pdfparser.PDFObjectStreamParser in the Apache PDFBox project.
From class PreflightParser, method parseObjectDynamically.
/**
 * Returns the object stored in the pooled container for the given object number and
 * generation, parsing it on demand when the container is still empty.
 *
 * <p>The xref table entry for the key decides how the object is obtained:
 * <ul>
 * <li>no entry: the container is set to {@code COSNull.NULL};</li>
 * <li>entry equal to 0: only a validation error is recorded, the container is left untouched;</li>
 * <li>positive entry: treated as a file offset; the indirect object is parsed from there,
 * and deviations in the object delimiters are recorded as validation errors;</li>
 * <li>negative entry: treated as the negated object number of an object stream, which is
 * parsed (recursively through this method) and whose contained objects are registered in the pool.</li>
 * </ul>
 *
 * @param objNr object number of the object to resolve
 * @param objGenNr generation number of the object to resolve
 * @param requireExistingNotCompressedObj when {@code true}, a missing xref entry is reported
 *        as a validation error and aborts parsing with an exception
 * @return the object held by the pooled container after resolution; this is {@code null}
 *         when the container was empty and the branch taken did not fill it
 *         (e.g. the offset-0 branch)
 * @throws IOException if the object found at the xref offset has a different number or
 *         generation, a stream keyword is not preceded by a dictionary, or the object does
 *         not end with {@code endobj}; SyntaxValidationException is also thrown from here
 *         (presumably an IOException subclass, given the signature; confirm)
 */
@Override
protected COSBase parseObjectDynamically(long objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException {
// ---- create object key and get object (container) from pool
final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
final COSObject pdfObject = document.getObjectFromPool(objKey);
if (pdfObject.getObject() == null) {
// not previously parsed
// ---- read offset or object stream object number from xref table
Long offsetOrObjstmObNr = document.getXrefTable().get(objKey);
// sanity test to circumvent loops with broken documents
// NOTE(review): only a missing entry is rejected here; a negative (compressed) entry
// passes this check although the message mentions compressed objects. Confirm this is intended.
if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null))) {
addValidationError(new ValidationError(ERROR_SYNTAX_MISSING_OFFSET, "Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration()));
throw new SyntaxValidationException("Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration(), validationResult);
}
if (offsetOrObjstmObNr == null) {
// not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
pdfObject.setObject(COSNull.NULL);
} else if (offsetOrObjstmObNr == 0) {
// offset 0 is only reported; nothing is stored in the container on this path
addValidationError(new ValidationError(ERROR_SYNTAX_INVALID_OFFSET, "Object {" + objKey.getNumber() + ":" + objKey.getGeneration() + "} has an offset of 0"));
} else if (offsetOrObjstmObNr > 0) {
// offset of indirect object in file
// ---- go to object start
source.seek(offsetOrObjstmObNr);
// ---- we must have an indirect object
long readObjNr;
int readObjGen;
// remember where the header starts so it can be re-read leniently below
long offset = source.getPosition();
String line = readLine();
// strict form: the whole line must be "<number> <generation> obj" with exactly one
// whitespace character between the parts
Pattern pattern = Pattern.compile("(\\d+)\\s(\\d+)\\sobj");
Matcher matcher = pattern.matcher(line);
if (matcher.matches()) {
readObjNr = Long.parseLong(matcher.group(1));
readObjGen = Integer.parseInt(matcher.group(2));
} else {
// header is not in the strict form: record the error, then parse it token by token
addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, "Single space expected [offset=" + offset + "; key=" + offsetOrObjstmObNr.toString() + "; line=" + line + "; object=" + pdfObject.toString() + "]"));
// reset source cursor to read object information
source.seek(offset);
readObjNr = readObjectNumber();
readObjGen = readGenerationNumber();
// skip spaces between Object Generation number and the 'obj' keyword
skipSpaces();
// the next characters must spell out OBJ_MARKER exactly
for (char c : OBJ_MARKER) {
if (source.read() != c) {
addValidationError(new ValidationError(ERROR_SYNTAX_OBJ_DELIMITER, "Expected pattern '" + new String(OBJ_MARKER) + " but missed at character '" + c + "'"));
throw new SyntaxValidationException("Expected pattern '" + new String(OBJ_MARKER) + " but missed at character '" + c + "'", validationResult);
}
}
}
// ---- consistency check
// the object found at the xref offset must be the one that was asked for
if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) {
throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + " points to wrong object: " + readObjNr + ":" + readObjGen);
}
skipSpaces();
COSBase pb = parseDirObject();
skipSpaces();
// position of the keyword that follows the object body ('stream' or 'endobj')
long endObjectOffset = source.getPosition();
String endObjectKey = readString();
if (endObjectKey.equals("stream")) {
// rewind so that parseCOSStream sees the 'stream' keyword itself
source.seek(endObjectOffset);
if (pb instanceof COSDictionary) {
COSStream stream = parseCOSStream((COSDictionary) pb);
if (securityHandler != null) {
securityHandler.decryptStream(stream, objNr, objGenNr);
}
pb = stream;
} else {
// the combination of a dict and the stream/endstream forms a complete stream object
throw new IOException("Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ").");
}
skipSpaces();
endObjectOffset = source.getPosition();
endObjectKey = readString();
// we have case with a second 'endstream' before endobj
if (!endObjectKey.startsWith("endobj")) {
if (endObjectKey.startsWith("endstream")) {
// drop the extra 'endstream' (9 characters) and keep whatever follows it
endObjectKey = endObjectKey.substring(9).trim();
if (endObjectKey.length() == 0) {
// no other characters in extra endstream line
// read next line
// NOTE(review): endObjectOffset is not updated here, so it still refers to the
// position of the extra 'endstream'. Confirm the EOL check below is meant to use it.
endObjectKey = readString();
}
}
}
} else if (securityHandler != null) {
// non-stream object: decrypt in place when a security handler is present
securityHandler.decrypt(pb, objNr, objGenNr);
}
// the parsed object is stored before the 'endobj' check, so it stays in the container
// even if that check throws
pdfObject.setObject(pb);
if (!endObjectKey.startsWith("endobj")) {
throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'.");
} else {
// step back to just before the recorded keyword position to test for an EOL there,
// then restore the current position
offset = source.getPosition();
source.seek(endObjectOffset - 1);
if (!nextIsEOL()) {
addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, "EOL expected before the 'endobj' keyword at offset " + source.getPosition()));
}
source.seek(offset);
}
if (!nextIsEOL()) {
addValidationError(new ValidationError(PreflightConstants.ERROR_SYNTAX_OBJ_DELIMITER, "EOL expected after the 'endobj' keyword at offset " + source.getPosition()));
}
} else {
// negative xref value (null, 0 and positive values were handled above)
// xref value is object nr of object stream containing object to be parsed;
// since our object was not found it means object stream was not parsed so far
final int objstmObjNr = (int) (-offsetOrObjstmObNr);
// resolve the object stream itself; it is looked up with generation 0 and must have an xref entry
final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true);
if (objstmBaseObj instanceof COSStream) {
// parse object stream
PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document);
parser.parse();
// register all objects which are referenced to be contained in object stream
for (COSObject next : parser.getObjects()) {
COSObjectKey stmObjKey = new COSObjectKey(next);
Long offset = document.getXrefTable().get(stmObjKey);
// only accept objects whose xref entry points at this object stream
if (offset != null && offset == -objstmObjNr) {
COSObject stmObj = document.getObjectFromPool(stmObjKey);
stmObj.setObject(next.getObject());
}
}
}
}
}
return pdfObject.getObject();
}
Aggregations