Search in sources :

Example 11 with TrpRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.

the class DocxBuilder method writeDocxForTranscriptWithTables.

/**
 * Writes all regions of one page into the given docx main document part.
 * <p>
 * The list returned by {@code trpPage.getRegions()} is sorted in place with a
 * {@code TrpElementReadingOrderComparator} and then processed region by region:
 * <ul>
 * <li>table regions are converted with {@code getDocxTable(...)} and followed by a
 * paragraph that contains a single break element,</li>
 * <li>text regions are passed to {@code exportTextRegion(...)}, but only when their
 * text is non-empty after removing line feeds,</li>
 * <li>all other region types are ignored.</li>
 * </ul>
 *
 * @param mdp the main document part that receives the generated objects
 * @param trpPage the page whose regions are exported
 * @param wordBased forwarded unchanged to the table and text region export helpers
 * @param preserveLineBreaks not evaluated by this method
 */
private static void writeDocxForTranscriptWithTables(MainDocumentPart mdp, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    for (int j = 0; j < regions.size(); ++j) {
        TrpRegionType r = regions.get(j);
        if (r instanceof TrpTableRegionType) {
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();
            // the horizontal extent of the bounding box is handed to getDocxTable as table size
            double maxX = table.getBoundingBox().getMaxX();
            double minX = table.getBoundingBox().getMinX();
            int tablesize = (int) (maxX - minX);
            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }
            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * Fill up all cells which are not set in TRP (needed for vertical cell merge).
                 * nextRowMap already holds a null placeholder for every column whose cell in the
                 * row above spans vertically - those cells got merged in the table but still
                 * have to be considered here.
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    // a placeholder is registered for the directly following row only
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                }
                allRows.add(currRowMap);
            }
            try {
                Tbl thisTable = getDocxTable(wordMLPackage, wordBased, rows, cols, allRows, tablesize, mdp);
                mdp.addObject(thisTable);
            } catch (Exception e) {
                // best effort: a table that cannot be built is reported and skipped, the export goes on
                logger.error("Could not write table to docx: " + e.getMessage(), e);
            }
            // this Br element is used to break the current line and continue on the next one
            Br br = factory.createBr();
            org.docx4j.wml.P p = factory.createP();
            mdp.addObject(p);
            p.getContent().add(br);
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            /*
             * create one paragraph for each text region
             * but only if there is some text in it
             */
            String helper = tr.getUnicodeText().replace("\n", "");
            if (!helper.isEmpty()) {
                exportTextRegion(tr, wordBased, null, mdp);
            }
        }
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) CustomTagList(eu.transkribus.core.model.beans.customtags.CustomTagList) List(java.util.List) ArrayList(java.util.ArrayList) Tbl(org.docx4j.wml.Tbl) P(org.docx4j.wml.P) TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) Docx4JException(org.docx4j.openpackaging.exceptions.Docx4JException) BigInteger(java.math.BigInteger) Br(org.docx4j.wml.Br) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)

Example 12 with TrpRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.

the class TrpXlsxTableBuilder method writeXlsxForTables.

/**
 * Exports all table regions of the selected pages of a document into one xlsx workbook.
 * <p>
 * A new {@code XSSFWorkbook} is stored in {@code wb}. For every selected page the transcript
 * is taken from the cache if available, otherwise built from the page's current transcript.
 * Each table region is handed to {@code createTable(...)} together with one column-to-cell
 * map per row. Finally the columns of every sheet are auto-sized and the workbook is written
 * to the export file.
 *
 * @param doc the document to export
 * @param exportFile the target file; its path is opened for writing
 * @param pageIndices indices of the pages to export, or {@code null} to export all pages
 * @param monitor optional progress monitor, may be {@code null}
 * @param cache optional cache of already loaded transcripts, may be {@code null}
 * @throws NoTablesException if the workbook contains no sheet after all pages were processed
 * @throws IOException if writing the workbook fails
 * @throws InterruptedException if the monitor reports cancellation
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    // TrpTableRegionType is contained in the regions too
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        List<TrpRegionType> regions = trpPage.getRegions();
        for (int j = 0; j < regions.size(); ++j) {
            TrpRegionType r = regions.get(j);
            if (r instanceof TrpTableRegionType) {
                tableId++;
                logger.debug("is table");
                TrpTableRegionType table = (TrpTableRegionType) r;
                int cols = table.getNCols();
                int rows = table.getNRows();
                List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
                for (int k = 0; k < rows; k++) {
                    allRowCells.add(table.getRowCells(k));
                }
                List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
                HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
                for (List<TrpTableCellType> rowCells : allRowCells) {
                    HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                    /*
                     * Fill up all cells which are not set in TRP (needed for vertical cell merge).
                     * nextRowMap already holds a null placeholder for every column whose cell in
                     * the row above spans vertically - those cells got merged in the table but
                     * still have to be considered here.
                     */
                    currRowMap.putAll(nextRowMap);
                    nextRowMap.clear();
                    for (TrpTableCellType cell : rowCells) {
                        currRowMap.put(cell.getCol(), cell);
                        // only one row or col span is considered -> FIXME: do it for all spans
                        if (cell.getRowSpan() > 1) {
                            nextRowMap.put(cell.getCol(), null);
                        }
                        if (cell.getColSpan() > 1) {
                            currRowMap.put(cell.getCol() + 1, null);
                        }
                    }
                    allRows.add(currRowMap);
                }
                createTable(rows, cols, allRows, tableId);
            }
        }
        // progress is counted once per processed page: the task was sized with totalPages
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        ++c;
        if (monitor != null) {
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns; the number of columns is taken from the first row of each sheet
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            // get the number of cells in the header row
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }
    // means no tables at all; checked before the export file is opened
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }
    // try-with-resources closes the stream even when writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) NoTablesException(eu.transkribus.core.model.builder.NoTablesException) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) ArrayList(java.util.ArrayList) List(java.util.List) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) TrpPage(eu.transkribus.core.model.beans.TrpPage) IOException(java.io.IOException) FileOutputStream(java.io.FileOutputStream) Row(org.apache.poi.ss.usermodel.Row)

Example 13 with TrpRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.

the class TrpTxtBuilder method writeTxtForSinglePage.

/**
 * Appends the text of one page to the given file.
 * <p>
 * The list returned by {@code trpPage.getRegions()} is sorted in place with a
 * {@code TrpElementReadingOrderComparator}. For every text region the text of its lines
 * (or of the single words, if {@code wordBased} is set and the line has words) is collected,
 * followed by one line separator entry per region that has at least one line. The collected
 * entries are written with {@code Files.write} using CREATE and APPEND, so existing file
 * content is kept. Table regions are skipped.
 *
 * @param file the target file
 * @param trpPage the page whose text is exported
 * @param wordBased if {@code true}, lines that contain words are exported word by word
 * @param preserveLineBreaks not evaluated by this method
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    List<String> content = new ArrayList<String>();
    for (int j = 0; j < regions.size(); ++j) {
        TrpRegionType r = regions.get(j);
        if (r instanceof TrpTableRegionType) {
            /*
             * TODO: for simple txt export: how to handle tables
             */
            continue;
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            List<TextLineType> lines = tr.getTextLine();
            for (int i = 0; i < lines.size(); ++i) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
                String textOfCurrLine = trpL.getUnicodeText();
                if (wordBased && trpL.getWord().size() > 0) {
                    for (WordType word : trpL.getWord()) {
                        content.add(((ITrpShapeType) word).getUnicodeText());
                    }
                } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                    // compare by content: a reference comparison with "" does not detect empty text
                    content.add(textOfCurrLine);
                }
            }
            if (lines.size() > 0) {
                // separator entry after each region that has lines
                content.add(System.lineSeparator());
            }
        }
    }
    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // best effort: the failure is reported, the caller is not interrupted
        logger.error("Could not write txt file " + file.getAbsolutePath() + ": " + e.getMessage(), e);
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ITrpShapeType(eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) ArrayList(java.util.ArrayList) List(java.util.List)

Example 14 with TrpRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method removeAllLines.

/**
 * Clears the text line list of every text region of the given page content.
 * Does nothing when {@code hasRegions(pc)} is false. Regions that are not a
 * {@code TextRegionType} are left untouched.
 *
 * @param pc the page content to modify
 */
public static void removeAllLines(PcGtsType pc) {
    if (hasRegions(pc)) {
        List<TrpRegionType> allRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        for (RegionType region : allRegions) {
            if (!(region instanceof TextRegionType)) {
                continue;
            }
            TextRegionType textRegion = (TextRegionType) region;
            logger.debug("Clearing text region: " + textRegion.getId());
            textRegion.getTextLine().clear();
        }
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)

Example 15 with TrpRegionType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method cutPolysAtImgBorder.

/**
 * If regions overlap the image border, reset off-limit coordinates to min/max.
 * <p>
 * Every point of a region's coordinate string ("x,y" pairs separated by whitespace) is
 * clamped to {@code [0, imageWidth]} and {@code [0, imageHeight]}, and the result is written
 * back as a single-space separated string. Regions without coordinates, or whose point
 * string is null or blank, are skipped and left unchanged.
 *
 * @param pc the page content whose region coordinates are clamped
 * @throws NumberFormatException if a coordinate value is not an integer
 */
public static void cutPolysAtImgBorder(PcGtsType pc) {
    final int maxX = pc.getPage().getImageWidth();
    final int maxY = pc.getPage().getImageHeight();
    List<TrpRegionType> regions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    if (regions == null || regions.isEmpty()) {
        return;
    }
    for (RegionType r : regions) {
        CoordsType c = r.getCoords();
        if (c == null) {
            // region without a coords element: nothing to clamp
            continue;
        }
        final String pointsStr = c.getPoints();
        if (pointsStr == null) {
            continue;
        }
        final String trimmedPoints = pointsStr.trim();
        if (trimmedPoints.isEmpty()) {
            continue;
        }
        // split on any whitespace run so that leading or repeated blanks yield no empty tokens
        final String[] coordsArr = trimmedPoints.split("\\s+");
        StringBuilder sb = new StringBuilder(trimmedPoints.length());
        for (int i = 0; i < coordsArr.length; i++) {
            final String[] xy = coordsArr[i].split(",");
            final int x = Integer.parseInt(xy[0]);
            final int y = Integer.parseInt(xy[1]);
            if (i > 0) {
                sb.append(" ");
            }
            sb.append(x < 0 ? 0 : (x > maxX ? maxX : x));
            sb.append(",");
            sb.append(y < 0 ? 0 : (y > maxY ? maxY : y));
        }
        c.setPoints(sb.toString());
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) CoordsType(eu.transkribus.core.model.beans.pagecontent.CoordsType)

Aggregations

TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)16 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)15 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)14 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)13 TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)8 TableRegionType (eu.transkribus.core.model.beans.pagecontent.TableRegionType)5 UnknownRegionType (eu.transkribus.core.model.beans.pagecontent.UnknownRegionType)5 IOException (java.io.IOException)4 ArrayList (java.util.ArrayList)4 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)3 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)3 Point (java.awt.Point)3 Rectangle (java.awt.Rectangle)3 HashMap (java.util.HashMap)3 List (java.util.List)3 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)2 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)2 TrpPage (eu.transkribus.core.model.beans.TrpPage)2 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)2