use of org.apache.pdfbox.text.PDFTextStripperByArea in project pdfbox by apache.
the class ExtractTextByArea method main.
/**
 * Prints the text found inside a fixed rectangular region of the first page of a PDF document.
 *
 * <p>Exactly one command line argument is expected: the path of the PDF file to load. With any
 * other number of arguments {@code usage()} is called and no document is opened.
 *
 * @param args The command line arguments.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        usage();
        return;
    }

    final String regionName = "class1";
    // Hard-coded extraction area (presumably x, y, width, height as for java.awt.Rectangle).
    Rectangle region = new Rectangle(10, 280, 275, 60);

    File pdfFile = new File(args[0]);
    // The document is closed automatically when the try block is left.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
        areaStripper.setSortByPosition(true);
        areaStripper.addRegion(regionName, region);

        // Only the first page of the document is examined.
        areaStripper.extractRegions(document.getPage(0));

        System.out.println("Text in the area:" + region);
        System.out.println(areaStripper.getTextForRegion(regionName));
    }
}
use of org.apache.pdfbox.text.PDFTextStripperByArea in project xwiki-platform by xwiki.
the class PDFTest method getDestinationText.
/**
 * Extracts the text located in the rectangle that {@code getRectangleBelowDestination} computes
 * for the given destination, on the destination's own page.
 *
 * @param destination the destination whose nearby text is wanted
 * @return the extracted text with leading and trailing whitespace removed
 * @throws Exception if the stripper cannot be created or the region extraction fails
 */
private String getDestinationText(PDPageXYZDestination destination) throws Exception {
    final String regionName = "destination";

    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    areaStripper.addRegion(regionName, getRectangleBelowDestination(destination));
    areaStripper.extractRegions(destination.getPage());

    String regionText = areaStripper.getTextForRegion(regionName);
    return regionText.trim();
}
use of org.apache.pdfbox.text.PDFTextStripperByArea in project xwiki-platform by xwiki.
the class PDFTest method extractLinks.
/**
 * Maps the text found under each link annotation of a page to that link's action.
 *
 * <p>Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html
 *
 * @param page the page whose link annotations are inspected
 * @return the link actions keyed by the trimmed text extracted from each link's rectangle; when
 *         two links yield the same text, the one appearing later in the annotation list wins
 * @throws Exception if the stripper cannot be created or the page cannot be processed
 */
private Map<String, PDAction> extractLinks(PDPage page) throws Exception {
    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    List<PDAnnotation> pageAnnotations = page.getAnnotations();

    // Pass 1: register one extraction region per link annotation, named after the annotation's
    // index in the list so that pass 2 can look it up again.
    for (int index = 0; index < pageAnnotations.size(); index++) {
        PDAnnotation candidate = pageAnnotations.get(index);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        PDRectangle linkBox = ((PDAnnotationLink) candidate).getRectangle();
        float left = linkBox.getLowerLeftX();
        float top = linkBox.getUpperRightY();
        // Reposition the link rectangle to match text space. Only unrotated pages are handled:
        // the y value is flipped against the media box height. For any other rotation the
        // coordinates are used unchanged.
        if (page.getRotation() == 0) {
            top = page.getMediaBox().getHeight() - top;
        }
        Rectangle2D.Float textArea =
            new Rectangle2D.Float(left, top, linkBox.getWidth(), linkBox.getHeight());
        areaStripper.addRegion(String.valueOf(index), textArea);
    }

    areaStripper.extractRegions(page);

    // Pass 2: pair the text extracted for each link region with the link's action.
    Map<String, PDAction> actionsByLabel = new HashMap<String, PDAction>();
    for (int index = 0; index < pageAnnotations.size(); index++) {
        PDAnnotation candidate = pageAnnotations.get(index);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        String label = areaStripper.getTextForRegion(String.valueOf(index)).trim();
        actionsByLabel.put(label, ((PDAnnotationLink) candidate).getAction());
    }
    return actionsByLabel;
}
use of org.apache.pdfbox.text.PDFTextStripperByArea in project pdfbox by apache.
the class PrintURLs method main.
/**
 * This will output all URLs and the texts in the annotation rectangle of a document.
 * <br>
 * see usage() for commandline
 *
 * <p>For every page, each annotation for which {@code getActionURI} returns a non-null action
 * gets a text extraction region covering its rectangle. After extraction, one line per such
 * annotation is printed with the page number, the trimmed region text and the URI.
 *
 * @param args Command line arguments; exactly one is expected, the path of the PDF file.
 *
 * @throws IOException If there is an error extracting the URLs.
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        usage();
        return;
    }

    // try-with-resources closes the document on every exit path and, unlike a manual
    // finally block, does not let a failing close() hide an exception thrown in the body.
    try (PDDocument doc = PDDocument.load(new File(args[0]))) {
        int pageNum = 0;
        for (PDPage page : doc.getPages()) {
            pageNum++;
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            List<PDAnnotation> annotations = page.getAnnotations();
            // The rotation is the same for every annotation on the page, so read it once.
            int rotation = page.getRotation();

            // first setup text extraction regions, named after the annotation index
            for (int j = 0; j < annotations.size(); j++) {
                PDAnnotation annot = annotations.get(j);
                if (getActionURI(annot) == null) {
                    continue;
                }
                PDRectangle rect = annot.getRectangle();
                // need to reposition link rectangle to match text space
                float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                float width = rect.getWidth();
                float height = rect.getHeight();
                if (rotation == 0) {
                    PDRectangle pageSize = page.getMediaBox();
                    // area stripper uses java coordinates, not PDF coordinates
                    y = pageSize.getHeight() - y;
                }
                // Rotated pages are not repositioned: the rectangle is used as is.
                // please send us a sample file
                Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                stripper.addRegion(String.valueOf(j), awtRect);
            }

            stripper.extractRegions(page);

            // then print the text extracted for each URI annotation next to its target
            for (int j = 0; j < annotations.size(); j++) {
                PDAnnotation annot = annotations.get(j);
                PDActionURI uri = getActionURI(annot);
                if (uri != null) {
                    String urlText = stripper.getTextForRegion(String.valueOf(j));
                    System.out.println("Page " + pageNum + ":'" + urlText.trim() + "'=" + uri.getURI());
                }
            }
        }
    }
}
Aggregations