Example use of org.apache.pdfbox.text.TextPosition in the Apache PDFBox project.
From the class DrawPrintTextLocations, method writeString.
/**
 * Overrides the default PDFTextStripper handling of a run of text: for every
 * text position it prints the position's metrics to standard output and draws
 * two outlines on the graphics context, a red one and a blue one.
 *
 * @param string the text of the run (not used by this implementation)
 * @param textPositions the individual positions to print and outline
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition glyph : textPositions) {
        System.out.println("String[" + glyph.getXDirAdj() + "," + glyph.getYDirAdj()
                + " fs=" + glyph.getFontSize()
                + " xscale=" + glyph.getXScale()
                + " height=" + glyph.getHeightDir()
                + " space=" + glyph.getWidthOfSpace()
                + " width=" + glyph.getWidthDirAdj()
                + "]" + glyph.getUnicode());

        // glyph space -> user space
        // note: getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
        AffineTransform glyphToUser = glyph.getTextMatrix().createAffineTransform();

        // Red outline: uses the "height" value (not a real height, but used for text
        // extraction heuristics, it is 1/2 of the bounding box height and starts at y=0).
        float heuristicWidth = glyph.getWidthDirAdj() / glyph.getTextMatrix().getScalingFactorX();
        float heuristicHeight = glyph.getHeightDir() / glyph.getTextMatrix().getScalingFactorY();
        Rectangle2D.Float outline = new Rectangle2D.Float(0, 0, heuristicWidth, heuristicHeight);
        drawGlyphOutline(glyphToUser.createTransformedShape(outline), Color.red);

        // Blue outline: the real vertical bounds, based on the font bounding box y values.
        // Usually, the height is identical to what you see when marking text in Adobe Reader.
        PDFont glyphFont = glyph.getFont();
        BoundingBox fontBox = glyphFont.getBoundingBox();
        // advance width, bbox height (glyph space)
        // todo: should iterate all chars
        float advance = glyphFont.getWidth(glyph.getCharacterCodes()[0]);
        outline = new Rectangle2D.Float(0, fontBox.getLowerLeftY(), advance, fontBox.getHeight());
        if (!(glyphFont instanceof PDType3Font)) {
            // bbox and font matrix are already scaled to 1000
            float perThousand = 1 / 1000f;
            glyphToUser.scale(perThousand, perThousand);
        } else {
            // bbox and font matrix are unscaled
            glyphToUser.concatenate(glyphFont.getFontMatrix().createAffineTransform());
        }
        drawGlyphOutline(glyphToUser.createTransformedShape(outline), Color.blue);
    }
}

/**
 * Applies the flip and then the rotate transform to a shape and draws the
 * result on the graphics context in the given colour.
 */
private void drawGlyphOutline(Shape userSpaceShape, Color color) {
    Shape flipped = flipAT.createTransformedShape(userSpaceShape);
    Shape rotated = rotateAT.createTransformedShape(flipped);
    g2d.setColor(color);
    g2d.draw(rotated);
}
Example use of org.apache.pdfbox.text.TextPosition in the Apache PDFBox project.
From the class PDFText2HTML, method getTitle.
/**
 * This method will attempt to guess the title of the document using
 * either the document properties or the first lines of text.
 * <p>
 * If the document information carries a non-empty title, that title is
 * returned. Otherwise the characters of each article are scanned in order:
 * characters whose font size is above 13 are collected, and the collected
 * text is returned as soon as the font size changes or more than 64
 * characters have been gathered. If the text ends before either of those
 * happens, whatever has been collected so far is returned.
 *
 * @return returns the title, or an empty string if no candidate was found.
 */
protected String getTitle() {
    String titleGuess = document.getDocumentInformation().getTitle();
    if (titleGuess != null && titleGuess.length() > 0) {
        return titleGuess;
    }

    // -1 never equals a font size seen below, so the first character always
    // registers as a size change and initialises lastFontSize.
    float lastFontSize = -1.0f;
    StringBuilder titleText = new StringBuilder();
    for (List<TextPosition> article : getCharactersByArticle()) {
        for (TextPosition position : article) {
            float currentFontSize = position.getFontSize();
            // 64 is arbitrary
            if (Float.compare(currentFontSize, lastFontSize) != 0 || titleText.length() > 64) {
                if (titleText.length() > 0) {
                    return titleText.toString();
                }
                lastFontSize = currentFontSize;
            }
            if (currentFontSize > 13.0f) {
                // most body text is 12pt
                titleText.append(position.getUnicode());
            }
        }
    }
    // The text ended while a title candidate was still being collected: keep it
    // instead of discarding it. An empty builder still yields "".
    return titleText.toString();
}
Aggregations