Examples with PDFTextStripperByArea - org.apache.pdfbox.text.PDFTextStripperByArea

Example 1 with PDFTextStripperByArea

Use of org.apache.pdfbox.text.PDFTextStripperByArea in the pdfbox project by apache.

From the class ExtractTextByArea, method main.

/**
 * Prints the text found inside a fixed rectangular area of the first page of a PDF document.
 *
 * @param args The command line arguments; exactly one is expected and it is used as the
 *             path of the PDF file to load. Any other count prints the usage instead.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    // Anything other than exactly one argument: show usage and stop.
    if (args.length != 1) {
        usage();
        return;
    }

    final String regionName = "class1";
    try (PDDocument pdf = PDDocument.load(new File(args[0]))) {
        PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
        areaStripper.setSortByPosition(true);

        // Area of interest, given as (x, y, width, height).
        Rectangle area = new Rectangle(10, 280, 275, 60);
        areaStripper.addRegion(regionName, area);

        // Only the page at index 0 is examined.
        PDPage page = pdf.getPage(0);
        areaStripper.extractRegions(page);

        String extracted = areaStripper.getTextForRegion(regionName);
        System.out.println("Text in the area:" + area);
        System.out.println(extracted);
    }
}

Also used : PDPage(org.apache.pdfbox.pdmodel.PDPage) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) Rectangle(java.awt.Rectangle) File(java.io.File) PDFTextStripperByArea(org.apache.pdfbox.text.PDFTextStripperByArea)

Example 2 with PDFTextStripperByArea

Use of org.apache.pdfbox.text.PDFTextStripperByArea in the xwiki-platform project by xwiki.

From the class PDFTest, method getDestinationText.

/**
 * Extracts the text located in the rectangle that {@code getRectangleBelowDestination}
 * computes for the given destination, on the destination's own page.
 *
 * @param destination the destination whose page and rectangle are used for the extraction
 * @return the extracted text with leading and trailing whitespace removed
 * @throws Exception if creating the stripper or extracting the region fails
 */
private String getDestinationText(PDPageXYZDestination destination) throws Exception {
    // Single key used both to register the region and to read its text back.
    final String regionName = "destination";

    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    areaStripper.addRegion(regionName, getRectangleBelowDestination(destination));
    areaStripper.extractRegions(destination.getPage());

    String regionText = areaStripper.getTextForRegion(regionName);
    return regionText.trim();
}

Also used : PDFTextStripperByArea(org.apache.pdfbox.text.PDFTextStripperByArea)

Example 3 with PDFTextStripperByArea

Use of org.apache.pdfbox.text.PDFTextStripperByArea in the xwiki-platform project by xwiki.

From the class PDFTest, method extractLinks.

/**
 * Collects the link annotations of a page, keyed by the text found inside each link's rectangle.
 * <p>
 * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html
 *
 * @param page the page whose annotations are inspected
 * @return a map from the trimmed text under each link annotation to that link's action; links
 *         that yield the same text share one entry, and the later annotation wins
 * @throws Exception if creating the stripper or extracting the regions fails
 */
private Map<String, PDAction> extractLinks(PDPage page) throws Exception {
    PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
    List<PDAnnotation> pageAnnotations = page.getAnnotations();

    // Pass 1: register one extraction region per link annotation, named after its list index.
    for (int i = 0; i < pageAnnotations.size(); i++) {
        PDAnnotation candidate = pageAnnotations.get(i);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        PDRectangle box = ((PDAnnotationLink) candidate).getRectangle();

        // The link rectangle has to be repositioned to match text space.
        float left = box.getLowerLeftX();
        float top = box.getUpperRightY();
        float boxWidth = box.getWidth();
        float boxHeight = box.getHeight();
        if (page.getRotation() == 0) {
            // Flip the vertical coordinate against the media box height.
            top = page.getMediaBox().getHeight() - top;
        }
        // Any other rotation (including 90) leaves the coordinates untouched.

        areaStripper.addRegion(String.valueOf(i), new Rectangle2D.Float(left, top, boxWidth, boxHeight));
    }

    areaStripper.extractRegions(page);

    // Pass 2: read the text of every region back and pair it with the link's action.
    Map<String, PDAction> actionsByLabel = new HashMap<String, PDAction>();
    for (int i = 0; i < pageAnnotations.size(); i++) {
        PDAnnotation candidate = pageAnnotations.get(i);
        if (!(candidate instanceof PDAnnotationLink)) {
            continue;
        }
        String regionText = areaStripper.getTextForRegion(String.valueOf(i));
        actionsByLabel.put(regionText.trim(), ((PDAnnotationLink) candidate).getAction());
    }
    return actionsByLabel;
}

Also used : HashMap(java.util.HashMap) PDAnnotation(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation) Rectangle2D(java.awt.geom.Rectangle2D) PDAction(org.apache.pdfbox.pdmodel.interactive.action.PDAction) PDRectangle(org.apache.pdfbox.pdmodel.common.PDRectangle) PDAnnotationLink(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink) PDFTextStripperByArea(org.apache.pdfbox.text.PDFTextStripperByArea)

Example 4 with PDFTextStripperByArea

Use of org.apache.pdfbox.text.PDFTextStripperByArea in the pdfbox project by apache.

From the class PrintURLs, method main.

/**
 * This will output all URLs and the texts in the annotation rectangle of a document.
 * <br>
 * see usage() for commandline
 * <p>
 * For every page, one text-extraction region is registered per annotation for which
 * {@code getActionURI} returns a non-null action; after extraction, one line per such
 * annotation is printed in the form {@code Page N:'text'=uri}.
 * <p>
 * The document is opened in a try-with-resources statement, so it is always closed and a
 * failure while closing can no longer hide an exception thrown during processing.
 *
 * @param args Command line arguments; exactly one is expected and it is used as the path
 *             of the PDF file to load. Any other count prints the usage instead.
 *
 * @throws IOException If there is an error extracting the URLs.
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        usage();
        return;
    }

    try (PDDocument doc = PDDocument.load(new File(args[0]))) {
        int pageNum = 0;
        for (PDPage page : doc.getPages()) {
            pageNum++;
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            List<PDAnnotation> annotations = page.getAnnotations();

            // first setup text extraction regions, named after the annotation's list index
            for (int j = 0; j < annotations.size(); j++) {
                PDAnnotation annot = annotations.get(j);
                if (getActionURI(annot) == null) {
                    continue;
                }
                PDRectangle rect = annot.getRectangle();
                // need to reposition link rectangle to match text space
                float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                float width = rect.getWidth();
                float height = rect.getHeight();
                int rotation = page.getRotation();
                if (rotation == 0) {
                    PDRectangle pageSize = page.getMediaBox();
                    // area stripper uses java coordinates, not PDF coordinates
                    y = pageSize.getHeight() - y;
                }
                // Rotated pages are not adjusted: do nothing
                // please send us a sample file
                Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
                stripper.addRegion(String.valueOf(j), awtRect);
            }

            stripper.extractRegions(page);

            // then print the extracted text next to the URI of each matching annotation
            for (int j = 0; j < annotations.size(); j++) {
                PDAnnotation annot = annotations.get(j);
                PDActionURI uri = getActionURI(annot);
                if (uri != null) {
                    String urlText = stripper.getTextForRegion(String.valueOf(j));
                    System.out.println("Page " + pageNum + ":'" + urlText.trim() + "'=" + uri.getURI());
                }
            }
        }
    }
}

Also used : PDPage(org.apache.pdfbox.pdmodel.PDPage) PDAnnotation(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation) Rectangle2D(java.awt.geom.Rectangle2D) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) PDRectangle(org.apache.pdfbox.pdmodel.common.PDRectangle) File(java.io.File) PDFTextStripperByArea(org.apache.pdfbox.text.PDFTextStripperByArea) PDActionURI(org.apache.pdfbox.pdmodel.interactive.action.PDActionURI)

Aggregations

PDFTextStripperByArea (org.apache.pdfbox.text.PDFTextStripperByArea): 4; Rectangle2D (java.awt.geom.Rectangle2D): 2; File (java.io.File): 2; PDDocument (org.apache.pdfbox.pdmodel.PDDocument): 2; PDPage (org.apache.pdfbox.pdmodel.PDPage): 2; PDRectangle (org.apache.pdfbox.pdmodel.common.PDRectangle): 2; PDAnnotation (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation): 2; Rectangle (java.awt.Rectangle): 1; HashMap (java.util.HashMap): 1; PDAction (org.apache.pdfbox.pdmodel.interactive.action.PDAction): 1; PDActionURI (org.apache.pdfbox.pdmodel.interactive.action.PDActionURI): 1; PDAnnotationLink (org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink): 1