Search in sources :

Example 1 with TrpTableRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addUniformText.

/**
 * Writes the text of all table and text regions of a page into the layer {@code ocrLayer}
 * of the PDF, using {@code bfArial} at size 10.
 *
 * <p>Regions are sorted in place by reading order first. Table regions are delegated to
 * {@code exportTable}; for text regions a common x start position for the text block is
 * chosen before the region is handed to {@code addUniformTextFromTextRegion}. Regions of
 * any other type are ignored.
 *
 * @param pc         page content whose regions are exported
 * @param cutoffLeft passed through unchanged to the region writers
 * @param cutoffTop  passed through unchanged to the region writers
 * @param cache      export cache, passed through unchanged to the region writers
 * @throws DocumentException propagated from the region writers
 * @throws IOException       propagated from the region writers
 */
private void addUniformText(PcGtsType pc, int cutoffLeft, int cutoffTop, ExportCache cache) throws DocumentException, IOException {
    PdfContentByte cb = writer.getDirectContentUnder();
    cb.setColorFill(BaseColor.BLACK);
    cb.setColorStroke(BaseColor.BLACK);
    cb.beginLayer(ocrLayer);
    cb.setFontAndSize(bfArial, 10);
    List<TrpRegionType> regions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    /*
     * Use the reading order comparator for sorting: reading order is currently the more
     * trustworthy criterion. Coordinate based sorting is not transitive and occasionally
     * fails with a "Comparison violates its general contract" exception.
     */
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    for (TrpRegionType r : regions) {
        // TODO add paths for tables etc.
        if (r instanceof TrpTableRegionType) {
            exportTable(r, cb, cutoffLeft, cutoffTop, true, cache);
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            double minX = tr.getBoundingBox().getMinX();
            double maxX = tr.getBoundingBox().getMaxX();
            double trWidth = tr.getBoundingBox().getWidth();
            // x position at which every line of this region starts
            float textBlockXStart;
            if (isOnlyRegionInThisRow(regions, tr)) {
                logger.debug("only one region in this row!!");
                if (minX < twelfthPoints[1][0] && (twelfthPoints[1][0] < maxX && trWidth > twelfthPoints[2][0])) {
                    // region starts left of the first grid point but reaches across it and is wide enough: indent
                    textBlockXStart = twelfthPoints[1][0];
                } else if (tr.getTextLine().size() == 1) {
                    // a text region containing only one line is probably a headline
                    textBlockXStart = getPrintregionStartX((float) minX, maxX);
                } else if (twelfthPoints[2][0] < maxX && trWidth > twelfthPoints[3][0]) {
                    textBlockXStart = twelfthPoints[2][0];
                } else {
                    textBlockXStart = (float) minX;
                }
            } else {
                logger.debug("several columns found, minX of text region is : " + minX);
                // the averaged baseline start (plus a fixed offset) is used for all lines of the region
                textBlockXStart = getAverageBeginningOfBaselines(tr);
                textBlockXStart += 40;
            }
            addUniformTextFromTextRegion(tr, cb, cutoffLeft, cutoffTop, bfArial, textBlockXStart, cache);
        }
    }
    cb.endLayer();
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte) Point(java.awt.Point)

Example 2 with TrpTableRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method exportTable.

/**
 * Writes the text of all cells of a table region into the PDF.
 *
 * <p>The cells are first arranged row by row in maps keyed by column index. For every cell
 * with a row span greater than one, a {@code null} placeholder is registered for the same
 * column in the following row. Placeholders carry no cell and are skipped when writing.
 *
 * @param r              the region to export; must be a {@link TrpTableRegionType}
 * @param cb             content byte the text is written to
 * @param cutoffLeft     passed through unchanged to the text writers
 * @param cutoffTop      passed through unchanged to the text writers
 * @param addUniformText if {@code true} cells are written with {@code addUniformTextFromTextRegion},
 *                       otherwise with {@code addTextFromTextRegion}
 * @param cache          export cache, passed through unchanged to the text writers
 * @throws IOException       propagated from the text writers
 * @throws DocumentException propagated from the text writers
 */
private void exportTable(RegionType r, PdfContentByte cb, int cutoffLeft, int cutoffTop, boolean addUniformText, ExportCache cache) throws IOException, DocumentException {
    logger.debug("is table");
    TrpTableRegionType table = (TrpTableRegionType) r;
    int rows = table.getNRows();
    List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
    for (int k = 0; k < rows; k++) {
        allRowCells.add(table.getRowCells(k));
    }
    List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
    HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
    for (List<TrpTableCellType> rowCells : allRowCells) {
        HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
        /*
         * Fill up all cells which are not set in TRP (needed for vertical cell merge).
         * nextRowMap already holds the columns occupied by cells of the row above that span
         * vertically into this row - they got merged in the table but have to be considered here.
         */
        currRowMap.putAll(nextRowMap);
        nextRowMap.clear();
        for (TrpTableCellType cell : rowCells) {
            currRowMap.put(cell.getCol(), cell);
            if (cell.getRowSpan() > 1) {
                nextRowMap.put(cell.getCol(), null);
            }
        }
        allRows.add(currRowMap);
    }
    for (HashMap<Integer, TrpTableCellType> row : allRows) {
        for (TrpTableCellType cell : row.values()) {
            if (cell == null) {
                // placeholder of a vertically merged cell: there is no text region to write
                continue;
            }
            if (addUniformText) {
                float textBlockXStart = getAverageBeginningOfBaselines(cell);
                textBlockXStart += 40;
                addUniformTextFromTextRegion(cell, cb, cutoffLeft, cutoffTop, bfArial, textBlockXStart, cache);
            } else {
                addTextFromTextRegion(cell, cb, cutoffLeft, cutoffTop, bfArial, cache);
            }
        }
    }
}
Also used : TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) Point(java.awt.Point)

Example 3 with TrpTableRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addTextAndImage.

/**
 * Adds the page image and, unless {@code imageOnly} is set, the text of the page's table
 * and text regions to the PDF.
 *
 * <p>The text goes into the layer {@code ocrLayer} (font {@code bfArial}, size 32) and the
 * image into the layer {@code imgLayer}. The regions are sorted in place by reading order
 * before they are written. If {@code highlightTags} is set, the tags collected in
 * {@code lineAndColorList} are highlighted afterwards.
 *
 * @param pc         page content whose regions are exported
 * @param cutoffLeft passed through unchanged to the text writers and the tag highlighting
 * @param cutoffTop  passed through unchanged to the text writers and the tag highlighting
 * @param img        image added to the image layer
 * @param imageOnly  if {@code true} no text is written at all
 * @param cache      export cache, passed through unchanged to the text writers
 * @throws DocumentException propagated from the PDF writing calls
 * @throws IOException       propagated from the text writers
 */
private void addTextAndImage(PcGtsType pc, int cutoffLeft, int cutoffTop, Image img, boolean imageOnly, ExportCache cache) throws DocumentException, IOException {
    lineAndColorList.clear();

    PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);

    boolean withText = !imageOnly;
    if (withText) {
        canvas.beginLayer(ocrLayer);
        canvas.setFontAndSize(bfArial, 32);

        List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        // Reading order is the more reliable sort criterion at this point; other orderings are
        // not transitive and occasionally trigger "Comparison violates its general contract".
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));

        for (int idx = 0; idx < pageRegions.size(); idx++) {
            RegionType region = pageRegions.get(idx);
            // TODO add paths for tables etc.
            if (region instanceof TrpTableRegionType) {
                exportTable(region, canvas, cutoffLeft, cutoffTop, false, cache);
                continue;
            }
            if (region instanceof TextRegionType) {
                addTextFromTextRegion((TextRegionType) region, canvas, cutoffLeft, cutoffTop, bfArial, cache);
            }
        }
        canvas.endLayer();
    }

    // the image always gets its own layer, independent of the text export
    canvas.beginLayer(imgLayer);
    canvas.addImage(img);
    canvas.endLayer();

    // draw tag lines
    if (highlightTags) {
        highlightAllTagsOnImg(lineAndColorList, canvas, cutoffLeft, cutoffTop);
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte)

Example 4 with TrpTableRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.

the class DocxBuilder method writeDocxForTranscriptWithTables.

/**
 * Writes the regions of a page, tables included, to the main document part of a docx.
 *
 * <p>The regions are sorted in place by reading order. Each table region is converted with
 * {@code getDocxTable} and followed by a paragraph containing a line break; each text region
 * that contains text other than line feeds is exported with {@code exportTextRegion}.
 * Regions of any other type are ignored. A failure while building a table is logged and the
 * export continues with the next region.
 *
 * @param mdp                main document part the content is appended to
 * @param trpPage            page whose regions are exported
 * @param wordBased          passed through to {@code getDocxTable} and {@code exportTextRegion}
 * @param preserveLineBreaks not evaluated by this method
 */
private static void writeDocxForTranscriptWithTables(MainDocumentPart mdp, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    for (int j = 0; j < regions.size(); ++j) {
        TrpRegionType r = regions.get(j);
        if (r instanceof TrpTableRegionType) {
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();
            double maxX = table.getBoundingBox().getMaxX();
            double minX = table.getBoundingBox().getMinX();
            int tablesize = (int) (maxX - minX);
            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }
            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * Fill up all cells which are not set in TRP (needed for vertical cell merge).
                 * nextRowMap already holds the columns occupied by cells of the row above that span
                 * vertically into this row - they got merged in the table but have to be considered here.
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                }
                allRows.add(currRowMap);
            }
            try {
                Tbl thisTable = getDocxTable(wordMLPackage, wordBased, rows, cols, allRows, tablesize, mdp);
                mdp.addObject(thisTable);
            } catch (Exception e) {
                // best effort: a broken table must not abort the export, but the failure has to be visible in the log
                logger.error("Could not create docx table: " + e.getMessage(), e);
            }
            // this Br element is used to break the current line and continue on the next one
            Br br = factory.createBr();
            org.docx4j.wml.P p = factory.createP();
            mdp.addObject(p);
            p.getContent().add(br);
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            /*
             * create one paragraph for each text region
             * but only if there is some text in it
             */
            String helper = tr.getUnicodeText().replaceAll("\n", "");
            if (!helper.isEmpty()) {
                exportTextRegion(tr, wordBased, null, mdp);
            }
        }
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) CustomTagList(eu.transkribus.core.model.beans.customtags.CustomTagList) List(java.util.List) ArrayList(java.util.ArrayList) Tbl(org.docx4j.wml.Tbl) P(org.docx4j.wml.P) TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) Docx4JException(org.docx4j.openpackaging.exceptions.Docx4JException) BigInteger(java.math.BigInteger) Br(org.docx4j.wml.Br) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)

Example 5 with TrpTableRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.

the class TrpXlsxTableBuilder method writeXlsxForTables.

/**
 * Exports all tables found on the selected pages of a document into one xlsx file.
 *
 * <p>For every table region the cells are arranged row by row in maps keyed by column index
 * and handed to {@code createTable}. Afterwards the columns of every sheet are auto sized,
 * based on the number of cells in the sheet's first row, and the workbook is written to
 * {@code exportFile}. The static workbook {@code wb} is replaced by a new instance on every call.
 *
 * @param doc         document whose pages are searched for tables
 * @param exportFile  target file of the workbook
 * @param pageIndices indices of the pages to export, or {@code null} to export all pages
 * @param monitor     progress monitor, may be {@code null}
 * @param cache       cache of already loaded transcripts, may be {@code null}
 * @throws NoTablesException    if the workbook contains no sheet after processing all pages
 * @throws IOException          if loading a transcript or writing the file fails
 * @throws InterruptedException if the monitor reports that the export was canceled
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    // number of pages processed so far
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        // TrpTableRegionType is contained in the regions too
        List<TrpRegionType> regions = trpPage.getRegions();
        for (TrpRegionType r : regions) {
            if (!(r instanceof TrpTableRegionType)) {
                continue;
            }
            tableId++;
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();
            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }
            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * Fill up all cells which are not set in TRP (needed for vertical cell merge).
                 * nextRowMap already holds the columns occupied by cells of the row above that span
                 * vertically into this row - they got merged in the table but have to be considered here.
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    // FIXME: only a span of one additional row or column is considered, larger spans are not
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                    if (cell.getColSpan() > 1) {
                        currRowMap.put(cell.getCol() + 1, null);
                    }
                }
                allRows.add(currRowMap);
            }
            createTable(rows, cols, allRows, tableId);
        }
        // Progress is counted once per page: the task was sized with the number of pages,
        // so it must also advance for pages that contain no regions at all.
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        ++c;
        if (monitor != null) {
            // NOTE(review): passes the running page count; confirm whether this monitor expects
            // the absolute amount or the increment since the last call
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            // the first row determines how many columns get resized
            Row headerRow = rowIterator.next();
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }
    // no sheet means no tables at all; checked before the target file is created
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }
    // try-with-resources closes the stream even when writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) NoTablesException(eu.transkribus.core.model.builder.NoTablesException) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) ArrayList(java.util.ArrayList) List(java.util.List) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) TrpPage(eu.transkribus.core.model.beans.TrpPage) IOException(java.io.IOException) FileOutputStream(java.io.FileOutputStream) Row(org.apache.poi.ss.usermodel.Row)

Aggregations

TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)6 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)5 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)4 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)4 ArrayList (java.util.ArrayList)4 List (java.util.List)4 TrpTableCellType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType)3 IOException (java.io.IOException)3 HashMap (java.util.HashMap)3 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)2 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)2 UnknownRegionType (eu.transkribus.core.model.beans.pagecontent.UnknownRegionType)2 Point (java.awt.Point)2 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)1 TrpPage (eu.transkribus.core.model.beans.TrpPage)1 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)1 CustomTagList (eu.transkribus.core.model.beans.customtags.CustomTagList)1 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)1 ITrpShapeType (eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType)1 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)1