Search in sources :

Example 1 with TextPosition

use of org.apache.pdfbox.text.TextPosition in project pdfbox by apache.

The method writeString of the class DrawPrintTextLocations.

/**
 * Replaces the default PDFTextStripper output: for every text position, a line of
 * metrics is printed to standard output and two outlines are drawn on the graphics
 * context, a red one based on the extraction "height" and a blue one based on the
 * font bounding box.
 *
 * @param string the text of the chunk; not used by this implementation
 * @param textPositions the text positions to report and outline
 * @throws IOException propagated from the font and matrix calls made here
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition glyph : textPositions) {
        System.out.println("String[" + glyph.getXDirAdj() + "," + glyph.getYDirAdj()
                + " fs=" + glyph.getFontSize()
                + " xscale=" + glyph.getXScale()
                + " height=" + glyph.getHeightDir()
                + " space=" + glyph.getWidthOfSpace()
                + " width=" + glyph.getWidthDirAdj() + "]" + glyph.getUnicode());

        // Maps glyph space to user space. Despite its name, getTextMatrix() yields
        // the Text Rendering Matrix rather than the Text Matrix.
        AffineTransform glyphToUser = glyph.getTextMatrix().createAffineTransform();

        // Red outline: uses the "height" value, which is not a real height but a
        // text-extraction heuristic (1/2 of the bounding box height, starting at y=0).
        float heuristicWidth = glyph.getWidthDirAdj() / glyph.getTextMatrix().getScalingFactorX();
        float heuristicHeight = glyph.getHeightDir() / glyph.getTextMatrix().getScalingFactorY();
        Rectangle2D.Float heuristicBox = new Rectangle2D.Float(0, 0, heuristicWidth, heuristicHeight);
        drawOutline(glyphToUser.createTransformedShape(heuristicBox), Color.red);

        // Blue outline: real vertical bounds taken from the font bounding box y values.
        // Usually this height matches what is highlighted when marking text in Adobe Reader.
        PDFont glyphFont = glyph.getFont();
        BoundingBox fontBox = glyphFont.getBoundingBox();
        // Advance width and bbox height are in glyph space.
        // todo: should iterate all chars
        float advance = glyphFont.getWidth(glyph.getCharacterCodes()[0]);
        Rectangle2D.Float fontRect =
                new Rectangle2D.Float(0, fontBox.getLowerLeftY(), advance, fontBox.getHeight());

        // The transform is extended in place here, after the red outline has been built.
        if (!(glyphFont instanceof PDType3Font)) {
            // bbox and font matrix are already scaled to 1000
            glyphToUser.scale(1 / 1000f, 1 / 1000f);
        } else {
            // bbox and font matrix are unscaled
            glyphToUser.concatenate(glyphFont.getFontMatrix().createAffineTransform());
        }
        drawOutline(glyphToUser.createTransformedShape(fontRect), Color.blue);
    }
}

/**
 * Applies the flip and rotate transforms to a shape, in that order, and draws
 * its outline in the given colour.
 *
 * @param userSpaceShape the shape to transform and draw
 * @param color the colour set on the graphics context before drawing
 */
private void drawOutline(Shape userSpaceShape, Color color) {
    Shape flipped = flipAT.createTransformedShape(userSpaceShape);
    Shape rotated = rotateAT.createTransformedShape(flipped);
    g2d.setColor(color);
    g2d.draw(rotated);
}
Also used : PDFont(org.apache.pdfbox.pdmodel.font.PDFont) Shape(java.awt.Shape) PDType3Font(org.apache.pdfbox.pdmodel.font.PDType3Font) TextPosition(org.apache.pdfbox.text.TextPosition) BoundingBox(org.apache.fontbox.util.BoundingBox) Rectangle2D(java.awt.geom.Rectangle2D) AffineTransform(java.awt.geom.AffineTransform)

Example 2 with TextPosition

use of org.apache.pdfbox.text.TextPosition in project pdfbox by apache.

The method getTitle of the class PDFText2HTML.

/**
 * Attempts to guess the title of the document, using either the document
 * properties or the first lines of text.
 *
 * <p>If the document information carries a non-empty title, that is returned.
 * Otherwise the characters of each article are scanned in order and those whose
 * font size is above 13 are accumulated. The accumulated text is returned as soon
 * as the font size changes or more than 64 characters have been collected, and
 * also when the text ends while a candidate title is still being collected.
 *
 * @return the title, or an empty string if no candidate text was found.
 */
protected String getTitle() {
    String titleGuess = document.getDocumentInformation().getTitle();
    if (titleGuess != null && !titleGuess.isEmpty()) {
        return titleGuess;
    }

    float lastFontSize = -1.0f;
    StringBuilder titleText = new StringBuilder();
    for (List<TextPosition> article : getCharactersByArticle()) {
        for (TextPosition position : article) {
            float currentFontSize = position.getFontSize();
            // A font-size change, or more than 64 collected characters, ends the
            // candidate title. 64 is arbitrary.
            if (Float.compare(currentFontSize, lastFontSize) != 0 || titleText.length() > 64) {
                if (titleText.length() > 0) {
                    return titleText.toString();
                }
                lastFontSize = currentFontSize;
            }
            if (currentFontSize > 13.0f) {
                // most body text is 12pt
                titleText.append(position.getUnicode());
            }
        }
    }
    // The text ended before a font-size change or the length cap closed the run:
    // return whatever was collected instead of discarding it (empty if nothing was).
    return titleText.toString();
}
Also used : TextPosition(org.apache.pdfbox.text.TextPosition) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

TextPosition (org.apache.pdfbox.text.TextPosition): 2, Shape (java.awt.Shape): 1, AffineTransform (java.awt.geom.AffineTransform): 1, Rectangle2D (java.awt.geom.Rectangle2D): 1, ArrayList (java.util.ArrayList): 1, List (java.util.List): 1, BoundingBox (org.apache.fontbox.util.BoundingBox): 1, PDFont (org.apache.pdfbox.pdmodel.font.PDFont): 1, PDType3Font (org.apache.pdfbox.pdmodel.font.PDType3Font): 1