Search in sources :

Example 6 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class PageXmlDaoTest method main.

/**
 * Manual smoke test: builds the JAXB transcript for the first transcript of the first page
 * of a fake document, renumbers the ids of its text regions and prints the ids of all
 * top-level text regions. Exits early (status 0) when the page data is incomplete.
 */
public static void main(String[] args) {
    TrpDoc fakeDoc = FakeDocProvider.create(false);
    TrpTranscriptMetadata transcriptMd = fakeDoc.getPages().get(0).getTranscripts().get(0);
    try {
        JAXBPageTranscript pageTranscript = new JAXBPageTranscript(transcriptMd);
        pageTranscript.build();
        // Alternative way of building the transcript, kept for reference:
        // JAXBPageTranscript transcript = TrpPageTranscriptBuilder.build(md);
        // Dump of the source document as a String, kept for reference:
        // DOMSource domSource = new DOMSource(transcript.getSourceDoc());
        // StringWriter writer = new StringWriter();
        // StreamResult result = new StreamResult(writer);
        // TransformerFactory tf = TransformerFactory.newInstance();
        // Transformer transformer = tf.newTransformer();
        // transformer.transform(domSource, result);
        // System.out.println("XML IN String format is: \n" + writer.toString());

        // sanity checks on the unmarshalled JAXB element
        PcGtsType pcGts = pageTranscript.getPageData();
        if (pcGts == null) {
            System.out.println("page XML is null");
            System.exit(0);
        }
        PageType pageElement = pcGts.getPage();
        System.out.println(pcGts.getMetadata());
        if (pageElement == null) {
            System.out.println("pagetype element is null");
            System.exit(0);
        }
        if (pageElement.getTextRegionOrImageRegionOrLineDrawingRegion() == null) {
            System.out.println("Region list is null");
            System.exit(0);
        }

        // give every text region a running number as id
        int nextId = 0;
        for (TextRegionType textRegion : pageTranscript.getPage().getTextRegions(true)) {
            textRegion.setId(String.valueOf(nextId));
            nextId++;
        }

        // print the ids of the top-level text regions
        List<TrpRegionType> topLevelRegions = pageElement.getTextRegionOrImageRegionOrLineDrawingRegion();
        for (RegionType region : topLevelRegions) {
            if (!(region instanceof TextRegionType)) {
                continue;
            }
            System.out.println(((TextRegionType) region).getId());
        }
    } catch (IllegalArgumentException | IOException e) {
        e.printStackTrace();
    }
    // Former experiment with the PRIMA page transcript, kept for reference:
    // try {
    // PrimaPageTranscript ppt = PageXmlDao.getPrimaPageTranscript(md);
    // Page page = ppt.getPageData();
    //
    // System.out.println(page.getImageFilename());
    // Region r = page.getLayout().getRegion("tempReg357564684568544579089");
    // System.out.println(r.getType().getName());
    // // System.out.println(page.getLayout().getParentChildRelation(r.getType(), r.getId().toString()).getRelationType());
    //
    // IdRegister idr = r.getIdRegister();
    // System.out.println(idr);
    // } catch (IllegalArgumentException e) {
    // e.printStackTrace();
    // } catch (MalformedURLException e) {
    // e.printStackTrace();
    // } catch (UnsupportedFormatVersionException e) {
    // e.printStackTrace();
    // }
}
Also used : TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) PageType(eu.transkribus.core.model.beans.pagecontent.PageType)

Example 7 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addPage.

/**
 * Adds one document page to the PDF: loads the page image, optionally blackens regions marked
 * for blackening, sizes the PDF page according to the image resolution and writes the image
 * together with its text, optionally followed by an extra plain-text page.
 *
 * @param imgUrl location of the page image; if nothing is found there, the same location with
 *               "view" replaced by "orig" in the file part is tried
 * @param doc document handed to the title page; no title page is written if null
 * @param pc page content whose regions are exported
 * @param addAdditionalPlainTextPage if true and the page has at least one text region, a further
 *               page with uniform text is appended
 * @param imageOnly passed on to the text/image writer to suppress the text layer
 * @param md image metadata supplying the resolution; if null the resolution stored in the image is used
 * @param doBlackening if true, unknown regions flagged as blackening are painted black on the image
 * @param cache export cache passed on to the text writers
 * @throws IOException if the image cannot be read, decoded or re-encoded as JPEG
 */
@SuppressWarnings("unused")
public void addPage(URL imgUrl, TrpDoc doc, PcGtsType pc, boolean addAdditionalPlainTextPage, boolean imageOnly, FimgStoreImgMd md, boolean doBlackening, ExportCache cache) throws MalformedURLException, IOException, DocumentException, JAXBException, URISyntaxException {
    imgOnly = imageOnly;
    extraTextPage = addAdditionalPlainTextPage;
    // FIXME use this only on cropped (printspace) images!!
    java.awt.Rectangle printspace = null;
    // if(pc.getPage() != null && pc.getPage().getPrintSpace() != null){
    // java.awt.Polygon psPoly = PageXmlUtils.buildPolygon(pc.getPage().getPrintSpace().getCoords());
    // printspace = psPoly.getBounds();
    // }
    BufferedImage imgBuffer = null;
    try (InputStream input = imgUrl.openStream()) {
        imgBuffer = ImageIO.read(input);
    } catch (FileNotFoundException e) {
        logger.error("File was not found at url " + imgUrl);
        URL origUrl = new URL(imgUrl.getProtocol(), imgUrl.getHost(), imgUrl.getFile().replace("view", "orig"));
        logger.debug("try orig file location " + origUrl);
        try (InputStream input = origUrl.openStream()) {
            imgBuffer = ImageIO.read(input);
        }
    }
    if (imgBuffer == null) {
        // ImageIO.read returns null when no registered reader can decode the stream;
        // fail with a clear message instead of a NullPointerException below
        throw new IOException("Could not decode image at url " + imgUrl);
    }
    Graphics2D graph = imgBuffer.createGraphics();
    graph.setColor(Color.BLACK);
    List<TrpRegionType> regions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    // regions should be sorted after their reading order at this point - so no need to resort
    // Collections.sort(regions, new TrpElementCoordinatesComparator<RegionType>());
    int nrOfTextRegions = 0;
    for (RegionType r : regions) {
        // used later to decide if new page is necessary if there is at least one text region
        if (r instanceof TextRegionType) {
            nrOfTextRegions++;
        } else if (r instanceof UnknownRegionType && doBlackening) {
            UnknownRegionType urt = (UnknownRegionType) r;
            ITrpShapeType trpShape = (ITrpShapeType) r;
            boolean isBlackening = RegionTypeUtil.isBlackening(trpShape);
            if (isBlackening) {
                // paint the bounding box of the region black directly on the image
                // Rectangle blackRect = (Rectangle) PageXmlUtils.buildPolygon(urt.getCoords().getPoints()).getBounds();
                Rectangle blackRect = urt.getBoundingBox();
                graph.fillRect((int) blackRect.getMinX(), (int) blackRect.getMinY(), (int) blackRect.getWidth(), (int) blackRect.getHeight());
            }
        }
    }
    graph.dispose();
    // re-encode the (possibly blackened) image as JPEG for iText
    byte[] imageBytes;
    try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
        if (!ImageIO.write(imgBuffer, "JPEG", baos)) {
            // write returns false when no appropriate writer exists for this image type
            throw new IOException("Could not encode image as JPEG: " + imgUrl);
        }
        imageBytes = baos.toByteArray();
    }
    Image img = Image.getInstance(imageBytes);
    imgBuffer.flush();
    imgBuffer = null;
    /*
     * take resolution from metadata of image store, values in img are not always set
     */
    if (md != null) {
        double resolutionX = (float) md.getXResolution();
        double resolutionY = (float) md.getYResolution();
        // logger.debug("Dpi: " + md.getXResolution());
        img.setDpi((int) resolutionX, (int) resolutionY);
    }
    // else{
    //
    // Image img = Image.getInstance(imgUrl);
    // }
    int cutoffLeft = 0;
    int cutoffTop = 0;
    if (printspace == null) {
        /*
         * 1 point per cm = 2.54 dpi
         * img.getPlainWidth() = horizontal size in pixel
         * img.getPlainHeight() = vertical size in pixel
         * img.getDpiX() = resolution of x direction
         * Size in cm: img.getDpiX() / (img.getDpiX()/2.54)
         */
        // logger.debug("Horizontal size in cm: img.getPlainWidth() / (img.getDpiX()/2,54): " + img.getPlainWidth() / (img.getDpiX()/2.54));
        // logger.debug("Vertical size in cm: img.getPlainHeight() / (img.getDpiY()/2,54): " + img.getPlainHeight() / (img.getDpiY()/2.54));
        setPageSize(img);
    } else {
        int width = (int) printspace.getWidth();
        int height = (int) printspace.getHeight();
        setPageSize(new com.itextpdf.text.Rectangle(width, height));
        cutoffLeft = printspace.x;
        cutoffTop = printspace.y;
    }
    float xSize;
    float ySize;
    /*
     * calculate size of image with respect to Dpi of the image and the default points of PDF which is 72
     * PDF also uses the same basic measurement unit as PostScript: 72 points == 1 inch
     */
    if (img.getDpiX() > 72f) {
        xSize = (float) (img.getPlainWidth() / img.getDpiX() * 72);
        ySize = (float) (img.getPlainHeight() / img.getDpiY() * 72);
        scaleFactorX = scaleFactorY = (float) (72f / img.getDpiX());
    } else {
        // no usable resolution: fall back to 300 dpi
        xSize = (float) (img.getPlainWidth() / 300 * 72);
        ySize = (float) (img.getPlainHeight() / 300 * 72);
        scaleFactorX = scaleFactorY = 72f / 300;
    }
    /*
     * construct the grid for the added page
     */
    for (int i = 0; i <= 12; i++) {
        twelfthPoints[i][0] = i * (img.getPlainWidth() / 12);
        twelfthPoints[i][1] = i * (img.getPlainHeight() / 12);
    }
    // TODO use scaleToFit instead?
    img.scaleAbsolute(xSize, ySize);
    img.setAbsolutePosition(0, 0);
    /*
     * calculate physical size of image in inch and assign text size dependent on these values
     */
    if (img.getScaledWidth() / 72f < 9 && img.getScaledHeight() / 72f < 12) {
        lineMeanHeight = 12 / scaleFactorY;
    } else {
        lineMeanHeight = 17 / scaleFactorY;
    }
    if (doc != null && createTitle) {
        addTitlePage(doc);
        // logger.debug("page number " + getPageNumber());
        // Parity check: the former "% 1" was always 0, so the extra page was never added.
        if (getPageNumber() % 2 != 0) {
            logger.debug("odd page number -> add one new page");
            document.newPage();
            // necessary that an empty page can be created
            writer.setPageEmpty(false);
        }
    }
    document.newPage();
    addTextAndImage(pc, cutoffLeft, cutoffTop, img, imageOnly, cache);
    if (addAdditionalPlainTextPage) {
        if (nrOfTextRegions > 0) {
            logger.debug("add uniform text");
            document.newPage();
            addUniformText(pc, cutoffLeft, cutoffTop, cache);
        }
    }
}
Also used : Rectangle(java.awt.Rectangle) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) InputStream(java.io.InputStream) FileNotFoundException(java.io.FileNotFoundException) Rectangle(java.awt.Rectangle) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BufferedImage(java.awt.image.BufferedImage) Image(com.itextpdf.text.Image) BufferedImage(java.awt.image.BufferedImage) URL(java.net.URL) Point(java.awt.Point) ITrpShapeType(eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType) Graphics2D(java.awt.Graphics2D) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)

Example 8 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addTextAndImage.

/**
 * Writes the content of one page below the existing direct content: unless {@code imageOnly}
 * is set, the text of all table and text regions goes into the OCR layer (in reading order);
 * afterwards the image is added in the image layer and, if enabled, tags are highlighted.
 * Note that the region list of the page is sorted in place.
 */
private void addTextAndImage(PcGtsType pc, int cutoffLeft, int cutoffTop, Image img, boolean imageOnly, ExportCache cache) throws DocumentException, IOException {
    lineAndColorList.clear();
    PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);
    // BaseFont bf = BaseFont.createFont(BaseFont.TIMES_ROMAN, "UTF-8", BaseFont.NOT_EMBEDDED);
    if (!imageOnly) {
        canvas.beginLayer(ocrLayer);
        canvas.setFontAndSize(bfArial, 32);
        List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        /*
         * sort by reading order, which is more trustworthy at this point; other orderings are
         * not transitive and occasionally trigger "Comparison violates its general contract"
         */
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));
        for (RegionType region : pageRegions) {
            // TODO add paths for tables etc.
            if (region instanceof TrpTableRegionType) {
                exportTable(region, canvas, cutoffLeft, cutoffTop, false, cache);
                continue;
            }
            if (region instanceof TextRegionType) {
                TextRegionType textRegion = (TextRegionType) region;
                addTextFromTextRegion(textRegion, canvas, cutoffLeft, cutoffTop, bfArial, cache);
            }
        }
        // scale after calculating lineMeanHeightForAllRegions
        // lineMeanHeight = lineMeanHeight/scaleFactorX;
        canvas.endLayer();
    }
    canvas.beginLayer(imgLayer);
    canvas.addImage(img);
    canvas.endLayer();
    if (highlightTags) {
        // draw tag lines
        highlightAllTagsOnImg(lineAndColorList, canvas, cutoffLeft, cutoffTop);
    }
    // addTocLinks(doc, page,cutoffTop);
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte)

Example 9 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method hasSmallerColumn.

/*
 * Checks if there is at least one other text region containing text that lies to the left of
 * the given one: its vertical centre must fall inside the vertical extent of the given region
 * (or vice versa) and its right edge must be left of the horizontal centre of the given region.
 * A region completely contained in the other one should therefore have no effect.
 *
 * Side effect: smallerRegionMaxX is reset to 0 and then set to the largest right edge of all
 * such regions found.
 *
 * Returns true if at least one such region exists; always false if the list has one element.
 */
private boolean hasSmallerColumn(List<TrpRegionType> regions, TextRegionType regionToCompare) throws DocumentException, IOException {
    // java.awt.Rectangle compareBlock = PageXmlUtils.buildPolygon(regionToCompare.getCoords().getPoints()).getBounds();
    java.awt.Rectangle compareBlock = regionToCompare.getBoundingBox();
    float compareMinX = (float) compareBlock.getMinX();
    float compareMinY = (float) compareBlock.getMinY();
    float compareMaxX = (float) compareBlock.getMaxX();
    float compareMaxY = (float) compareBlock.getMaxY();
    float compareMeanX = compareMinX + (compareMaxX - compareMinX) / 2;
    float compareMeanY = compareMinY + (compareMaxY - compareMinY) / 2;
    boolean foundSmallerColumn = false;
    smallerRegionMaxX = 0;
    if (regions.size() == 1) {
        return false;
    }
    for (RegionType r : regions) {
        // TODO add paths for tables etc.
        if (!(r instanceof TextRegionType)) {
            continue;
        }
        // skip the region itself; ids are compared by value (null-safe), not by reference
        if (java.util.Objects.equals(r.getId(), regionToCompare.getId())) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;
        // empty regions and regions containing only empty lines can be ignored
        boolean textFound = false;
        for (TextLineType tlt : tr.getTextLine()) {
            TrpTextLineType l = (TrpTextLineType) tlt;
            if (!l.getUnicodeText().isEmpty()) {
                textFound = true;
                break;
            }
        }
        if (!textFound) {
            continue;
        }
        // logger.debug("tr id " + tr.getId());
        // java.awt.Rectangle block = PageXmlUtils.buildPolygon(tr.getCoords().getPoints()).getBounds();
        java.awt.Rectangle block = tr.getBoundingBox();
        float maxX = (float) block.getMaxX();
        float minY = (float) block.getMinY();
        float maxY = (float) block.getMaxY();
        float meanY = minY + (maxY - minY) / 2;
        boolean verticallyOverlapping = (meanY > compareMinY && meanY < compareMaxY) || (compareMeanY > minY && compareMeanY < maxY);
        if (verticallyOverlapping && maxX < compareMeanX) {
            // to find the biggest maxX if there are several smaller columns
            if (maxX > smallerRegionMaxX) {
                smallerRegionMaxX = maxX;
            }
            foundSmallerColumn = true;
        }
    }
    return foundSmallerColumn;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) Rectangle(java.awt.Rectangle) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)

Example 10 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method setContent.

/**
 * Builds the TEI facsimile and text sections for the given pages and appends both, facsimile
 * first, to {@code sbTotal}. Only pages whose index is contained in {@code pageIndices} are
 * processed (all pages if it is null). The region list of every processed page is sorted by
 * reading order in place.
 *
 * @param pages the pages of the document to export
 * @throws InterruptedException if the progress monitor reports that the export was canceled
 */
@Override
protected void setContent(List<TrpPage> pages) throws JAXBException, InterruptedException {
    SebisStringBuilder facsimileSb = new SebisStringBuilder();
    SebisStringBuilder textSb = new SebisStringBuilder();
    textSb.incIndent();
    textSb.addLine("<text>");
    textSb.incIndent();
    textSb.addLine("<body>");
    // sbText.incIndent();
    // text = tei.createElementNS(TEI_NS, "text");
    // body = tei.createElementNS(TEI_NS, "body");
    int pagesToExport;
    if (pageIndices != null) {
        pagesToExport = pageIndices.size();
    } else {
        pagesToExport = pages.size();
    }
    if (monitor != null) {
        monitor.beginTask("Creating TEI", pagesToExport);
    }
    int processed = 0;
    for (int pageIdx = 0; pageIdx < pages.size(); ++pageIdx) {
        boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (processed + 1));
        }
        TrpPage page = pages.get(pageIdx);
        logger.debug("1Processing page " + page.getPageNr() + ": " + page.getUrl() + " - XML=" + page.getCurrentTranscript().getUrl());
        // check buffer for transcript or unmarshal the page XML
        PcGtsType pageContent = this.getPcGtsTypeForPage(page);
        if (pars.hasZones()) {
            // one facsimile element per page; these end up in front of the text section
            openFacsimileElement(facsimileSb, page, pageContent);
        }
        // one page-break element per page as child of the body element
        writePageBreak(textSb, page, pageContent);
        // append all text-regions / lines / words in reading order
        List<TrpRegionType> pageRegions = pageContent.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));
        for (TrpRegionType region : pageRegions) {
            // System.out.println(r.getClass());
            if (region instanceof TextRegionType) {
                if (pars.hasZones()) {
                    writeZonesForTextRegion(facsimileSb, (TrpTextRegionType) region, page.getPageNr());
                }
                writeTextForTextRegion(textSb, (TrpTextRegionType) region, page.getPageNr());
            } else if (pars.hasZones()) {
                // all other region types only contribute a zone
                String facsId = FACS_ID_PREFIX + page.getPageNr();
                writeZoneForShape(facsimileSb, region, facsId, true);
            }
        }
        if (pars.hasZones()) {
            closeFacsimilieElement(facsimileSb);
        }
        ++processed;
        if (monitor != null) {
            monitor.worked(processed);
        }
    }
    // text.appendChild(body);
    // root.appendChild(text);
    // sbText.decIndent();
    textSb.addLine("</body>");
    textSb.decIndent();
    textSb.addLine("</text>");
    textSb.decIndent();
    sbTotal.sb.append(facsimileSb.toString());
    sbTotal.sb.append(textSb.toString());
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) SebisStringBuilder(eu.transkribus.core.util.SebisStringBuilder) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Point(java.awt.Point)

Aggregations

RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)18 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)16 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)15 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)15 TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)7 TableRegionType (eu.transkribus.core.model.beans.pagecontent.TableRegionType)6 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)5 UnknownRegionType (eu.transkribus.core.model.beans.pagecontent.UnknownRegionType)5 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)4 Point (java.awt.Point)4 Rectangle (java.awt.Rectangle)4 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)2 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)2 ITrpShapeType (eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Image (com.itextpdf.text.Image)1