Search in sources :

Example 6 with Operator

use of org.apache.pdfbox.contentstream.operator.Operator in project pdfbox by apache.

the class PDFStreamEngine method processStreamOperators.

/**
 * Walks through the tokens of the given content stream and dispatches each operator together
 * with the operands that were collected since the previous operator.
 *
 * @param contentStream the content stream to parse.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private void processStreamOperators(PDContentStream contentStream) throws IOException {
    PDFStreamParser parser = new PDFStreamParser(contentStream);
    List<COSBase> operands = new ArrayList<>();
    for (Object token = parser.parseNextToken(); token != null; token = parser.parseNextToken()) {
        if (token instanceof Operator) {
            processOperator((Operator) token, operands);
            // A new list is allocated instead of clearing the old one, so the list that was
            // handed to processOperator is never modified by this loop afterwards.
            operands = new ArrayList<>();
            continue;
        }
        // Everything that is not an operator is an operand; a COSObject is replaced by the
        // object it wraps.
        COSBase operand = token instanceof COSObject
                ? ((COSObject) token).getObject()
                : (COSBase) token;
        operands.add(operand);
    }
}
Also used : Operator(org.apache.pdfbox.contentstream.operator.Operator) PDFStreamParser(org.apache.pdfbox.pdfparser.PDFStreamParser) COSObject(org.apache.pdfbox.cos.COSObject) ArrayList(java.util.ArrayList) COSBase(org.apache.pdfbox.cos.COSBase) COSObject(org.apache.pdfbox.cos.COSObject) PDFormXObject(org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject)

Example 7 with Operator

use of org.apache.pdfbox.contentstream.operator.Operator in project pdfbox by apache.

the class PDFStreamParser method parseNextToken.

/**
 * This will parse the next token in the stream.
 *
 * <p>Leading whitespace is skipped, then the first character of the token selects how it is
 * parsed: dictionaries, strings, arrays and names are delegated to the {@code parseCOS*}
 * helpers; {@code null}, {@code true} and {@code false} become {@code COSNull.NULL} and
 * {@code COSBoolean} constants; numbers become {@code COSNumber}; everything else is turned
 * into an {@link Operator}. The inline image operators {@code BI} and {@code ID} are handled
 * specially: the image parameters and the raw image bytes are attached to the returned
 * operator.
 *
 * @return The next token in the stream or null if there are no more tokens in the stream.
 *         Null is also returned when an empty operator is read (treated as a corrupt stream).
 *
 * @throws IOException If an io error occurs while parsing the stream.
 */
public Object parseNextToken() throws IOException {
    Object retval;
    skipSpaces();
    int nextByte = seqSource.peek();
    // NOTE(review): the value is narrowed to byte before the comparison, so a peeked value of
    // 0xFF compares equal to -1 as well and ends parsing just like end of stream does.
    // Confirm whether that is intended.
    if (((byte) nextByte) == -1) {
        return null;
    }
    char c = (char) nextByte;
    switch(c) {
        case '<':
            {
                // "<<" starts a dictionary, a single "<" starts a (hex) string
                // pull off first left bracket
                int leftBracket = seqSource.read();
                // check for second left bracket
                c = (char) seqSource.peek();
                // put back first bracket
                seqSource.unread(leftBracket);
                if (c == '<') {
                    retval = parseCOSDictionary();
                } else {
                    retval = parseCOSString();
                }
                break;
            }
        case '[':
            {
                // array
                retval = parseCOSArray();
                break;
            }
        case '(':
            // string
            retval = parseCOSString();
            break;
        case '/':
            // name
            retval = parseCOSName();
            break;
        case 'n':
            {
                // either the null keyword or an operator whose name starts with 'n'
                String nullString = readString();
                if (nullString.equals("null")) {
                    retval = COSNull.NULL;
                } else {
                    retval = Operator.getOperator(nullString);
                }
                break;
            }
        case 't':
        case 'f':
            {
                // either a boolean keyword or an operator whose name starts with 't' or 'f'
                String next = readString();
                if (next.equals("true")) {
                    retval = COSBoolean.TRUE;
                    // redundant: the break after the if/else chain would be reached anyway
                    break;
                } else if (next.equals("false")) {
                    retval = COSBoolean.FALSE;
                } else {
                    retval = Operator.getOperator(next);
                }
                break;
            }
        case 'R':
            {
                // A lone "R" yields a COSObject wrapping null; any other word starting with
                // 'R' is an operator.
                // NOTE(review): presumably the "R" of an indirect reference - confirm.
                String line = readString();
                if (line.equals("R")) {
                    retval = new COSObject(null);
                } else {
                    retval = Operator.getOperator(line);
                }
                break;
            }
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '-':
        case '+':
        case '.':
            {
                /* We will be filling buf with the rest of the number.  Only
                 * allow 1 "." and "-" and "+" at start of number. */
                StringBuilder buf = new StringBuilder();
                buf.append(c);
                // consume the first character, which so far has only been peeked
                seqSource.read();
                // Ignore double negative (this is consistent with Adobe Reader)
                if (c == '-' && seqSource.peek() == c) {
                    seqSource.read();
                }
                // true until a '.' has been seen; a second '.' terminates the number
                boolean dotNotRead = c != '.';
                // keep consuming digits, the first '.', and any '-' (the latter is dropped)
                while (Character.isDigit(c = (char) seqSource.peek()) || dotNotRead && c == '.' || c == '-') {
                    if (c != '-') {
                        // PDFBOX-4064: ignore "-" in the middle of a number
                        buf.append(c);
                    }
                    seqSource.read();
                    if (dotNotRead && c == '.') {
                        dotNotRead = false;
                    }
                }
                retval = COSNumber.get(buf.toString());
                break;
            }
        case 'B':
            {
                // any operator starting with 'B'; "BI" (begin inline image) gets extra handling
                String next = readString();
                retval = Operator.getOperator(next);
                if (next.equals("BI")) {
                    Operator beginImageOP = (Operator) retval;
                    COSDictionary imageParams = new COSDictionary();
                    beginImageOP.setImageParameters(imageParams);
                    Object nextToken = null;
                    // read name/value pairs recursively until a token that is not a name shows up
                    while ((nextToken = parseNextToken()) instanceof COSName) {
                        Object value = parseNextToken();
                        imageParams.setItem((COSName) nextToken, (COSBase) value);
                    }
                    // final token will be the image data, maybe??
                    // (the 'I' case below returns an "ID" operator that carries the image bytes)
                    if (nextToken instanceof Operator) {
                        Operator imageData = (Operator) nextToken;
                        if (imageData.getImageData() == null || imageData.getImageData().length == 0) {
                            LOG.warn("empty inline image at stream offset " + seqSource.getPosition());
                        }
                        // copy the image bytes onto the BI operator that is returned to the caller
                        beginImageOP.setImageData(imageData.getImageData());
                    }
                }
                break;
            }
        case 'I':
            {
                // Special case for ID operator
                // two characters are read unconditionally and must spell "ID"
                String id = "" + (char) seqSource.read() + (char) seqSource.read();
                if (!id.equals("ID")) {
                    throw new IOException("Error: Expected operator 'ID' actual='" + id + "'");
                }
                ByteArrayOutputStream imageData = new ByteArrayOutputStream();
                if (isWhitespace()) {
                    // pull off the whitespace character
                    seqSource.read();
                }
                // two-byte sliding window used to detect the terminating "EI"
                int lastByte = seqSource.read();
                int currentByte = seqSource.read();
                // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561
                // Copy bytes until "EI" is found (followed by a space or line break and, per
                // hasNoFollowingBinData, no further binary data) or the end of the source.
                while (!(lastByte == 'E' && currentByte == 'I' && hasNextSpaceOrReturn() && hasNoFollowingBinData(seqSource)) && !seqSource.isEOF()) {
                    imageData.write(lastByte);
                    lastByte = currentByte;
                    currentByte = seqSource.read();
                }
                // the EI operator isn't unread, as it won't be processed anyway
                retval = Operator.getOperator("ID");
                // save the image data to the operator, so that it can be accessed later
                ((Operator) retval).setImageData(imageData.toByteArray());
                break;
            }
        case ']':
            {
                // some ']' around without its previous '['
                // this means a PDF is somewhat corrupt but we will continue to parse.
                seqSource.read();
                // must be a better solution than null...
                retval = COSNull.NULL;
                break;
            }
        default:
            {
                // we must be an operator
                String operator = readOperator();
                if (operator.trim().length() == 0) {
                    // we have a corrupt stream, stop reading here
                    // (callers treat a null token as the end of the stream)
                    retval = null;
                } else {
                    retval = Operator.getOperator(operator);
                }
            }
    }
    return retval;
}
Also used : Operator(org.apache.pdfbox.contentstream.operator.Operator) COSDictionary(org.apache.pdfbox.cos.COSDictionary) COSName(org.apache.pdfbox.cos.COSName) COSObject(org.apache.pdfbox.cos.COSObject) COSBase(org.apache.pdfbox.cos.COSBase) COSObject(org.apache.pdfbox.cos.COSObject) IOException(java.io.IOException) ByteArrayOutputStream(java.io.ByteArrayOutputStream)

Example 8 with Operator

use of org.apache.pdfbox.contentstream.operator.Operator in project pdfbox by apache.

the class PDType3CharProc method getGlyphBBox.

/**
 * Determines the bounding box of this glyph from the operands of a leading d1 operator.
 * Only the first operator of the stream is inspected.
 *
 * @return the bounding box of this glyph, or null if the stream contains no operator, if the
 * first operator is not d1, or if d1 is not preceded by exactly six numeric operands.
 * @throws IOException If an io error occurs while parsing the stream.
 */
public PDRectangle getGlyphBBox() throws IOException {
    PDFStreamParser parser = new PDFStreamParser(this);
    List<COSBase> operands = new ArrayList<>();
    Object token;
    while ((token = parser.parseNextToken()) != null) {
        if (token instanceof COSObject) {
            operands.add(((COSObject) token).getObject());
            continue;
        }
        if (!(token instanceof Operator)) {
            operands.add((COSBase) token);
            continue;
        }
        // The first operator decides the outcome; nothing after it is ever read.
        boolean isD1 = ((Operator) token).getName().equals("d1");
        if (!isD1 || operands.size() != 6) {
            return null;
        }
        for (COSBase operand : operands) {
            if (!(operand instanceof COSNumber)) {
                return null;
            }
        }
        // Operands 2..5 hold the lower-left and upper-right corners of the box.
        float lowerLeftX = ((COSNumber) operands.get(2)).floatValue();
        float lowerLeftY = ((COSNumber) operands.get(3)).floatValue();
        float upperRightX = ((COSNumber) operands.get(4)).floatValue();
        float upperRightY = ((COSNumber) operands.get(5)).floatValue();
        return new PDRectangle(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
    }
    return null;
}
Also used : Operator(org.apache.pdfbox.contentstream.operator.Operator) PDFStreamParser(org.apache.pdfbox.pdfparser.PDFStreamParser) COSObject(org.apache.pdfbox.cos.COSObject) ArrayList(java.util.ArrayList) COSNumber(org.apache.pdfbox.cos.COSNumber) COSBase(org.apache.pdfbox.cos.COSBase) COSObject(org.apache.pdfbox.cos.COSObject) PDRectangle(org.apache.pdfbox.pdmodel.common.PDRectangle)

Example 9 with Operator

use of org.apache.pdfbox.contentstream.operator.Operator in project pdfbox by apache.

the class RemoveAllText method createTokensWithoutText.

/**
 * Parses the given content stream and returns its tokens with every text-showing operator
 * (TJ, Tj, ' and ") and the operands belonging to it left out.
 *
 * @param contentStream the content stream to parse.
 * @return the remaining tokens in their original order.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
    PDFStreamParser parser = new PDFStreamParser(contentStream);
    List<Object> newTokens = new ArrayList<>();
    for (Object token = parser.parseNextToken(); token != null; token = parser.parseNextToken()) {
        if (token instanceof Operator) {
            int operandCount = textOperandCount(((Operator) token).getName());
            if (operandCount > 0) {
                // drop the operands that were already collected and skip the operator itself
                removeTrailingOperands(newTokens, operandCount);
                continue;
            }
        }
        newTokens.add(token);
    }
    return newTokens;
}

/**
 * Returns the number of operands of a text-showing operator, or 0 if the given name is not a
 * text-showing operator.
 */
private static int textOperandCount(String operatorName) {
    if ("TJ".equals(operatorName) || "Tj".equals(operatorName) || "'".equals(operatorName)) {
        // a single operand: the string (or the array of strings and adjustments for TJ)
        return 1;
    }
    if ("\"".equals(operatorName)) {
        // three operands: word spacing, character spacing and the string
        return 3;
    }
    return 0;
}

/**
 * Removes up to {@code count} tokens from the end of the list. Stops early at an operator or
 * when the list is empty, so a malformed stream neither raises an exception nor loses an
 * unrelated operator.
 */
private static void removeTrailingOperands(List<Object> tokens, int count) {
    for (int removed = 0; removed < count && !tokens.isEmpty(); removed++) {
        int lastIndex = tokens.size() - 1;
        if (tokens.get(lastIndex) instanceof Operator) {
            break;
        }
        tokens.remove(lastIndex);
    }
}
Also used : Operator(org.apache.pdfbox.contentstream.operator.Operator) PDFStreamParser(org.apache.pdfbox.pdfparser.PDFStreamParser) ArrayList(java.util.ArrayList) PDXObject(org.apache.pdfbox.pdmodel.graphics.PDXObject) PDFormXObject(org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject)

Aggregations

Operator (org.apache.pdfbox.contentstream.operator.Operator)9 COSBase (org.apache.pdfbox.cos.COSBase)6 PDFStreamParser (org.apache.pdfbox.pdfparser.PDFStreamParser)6 ArrayList (java.util.ArrayList)5 COSObject (org.apache.pdfbox.cos.COSObject)5 IOException (java.io.IOException)3 COSDictionary (org.apache.pdfbox.cos.COSDictionary)2 COSFloat (org.apache.pdfbox.cos.COSFloat)2 COSName (org.apache.pdfbox.cos.COSName)2 PDFormXObject (org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 Map (java.util.Map)1 COSArray (org.apache.pdfbox.cos.COSArray)1 COSBoolean (org.apache.pdfbox.cos.COSBoolean)1 COSInteger (org.apache.pdfbox.cos.COSInteger)1 COSNumber (org.apache.pdfbox.cos.COSNumber)1 COSString (org.apache.pdfbox.cos.COSString)1 ScratchFile (org.apache.pdfbox.io.ScratchFile)1 PDPage (org.apache.pdfbox.pdmodel.PDPage)1