use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.
the class TrpPdfDocument method addUniformText.
/**
 * Writes the text of all page regions into the OCR layer, using one common
 * x start position per text region.
 *
 * <p>Regions are processed in reading order. Table regions are delegated to
 * {@code exportTable} (with uniform text enabled); for text regions a start x
 * is derived from the region's bounding box and the {@code twelfthPoints}
 * grid, and the region is then handed to {@code addUniformTextFromTextRegion}.
 * Other region types are ignored.
 *
 * <p>NOTE(review): the region list obtained from the page is sorted in place;
 * if that list is the page's live list, the page model is reordered too - confirm.
 *
 * @param pc         page content whose regions are exported
 * @param cutoffLeft passed through unchanged to the region export helpers
 * @param cutoffTop  passed through unchanged to the region export helpers
 * @param cache      export cache, passed through to the region export helpers
 * @throws DocumentException propagated from the PDF writing helpers
 * @throws IOException       propagated from the PDF writing helpers
 */
private void addUniformText(PcGtsType pc, int cutoffLeft, int cutoffTop, ExportCache cache) throws DocumentException, IOException {
    PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);

    canvas.beginLayer(ocrLayer);
    canvas.setFontAndSize(bfArial, 10);

    List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    /*
     * Sort by reading order: at this time reading order is more trustable, and
     * coordinate based sorting is not transitive and occasionally fails with
     * "Comparison violates its general contract".
     */
    Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));

    for (TrpRegionType region : pageRegions) {
        // TODO add paths for tables etc.
        if (region instanceof TrpTableRegionType) {
            exportTable(region, canvas, cutoffLeft, cutoffTop, true, cache);
            continue;
        }
        if (!(region instanceof TrpTextRegionType)) {
            continue;
        }

        TrpTextRegionType textRegion = (TrpTextRegionType) region;
        double minX = textRegion.getBoundingBox().getMinX();
        double maxX = textRegion.getBoundingBox().getMaxX();
        double regionWidth = textRegion.getBoundingBox().getWidth();

        float textBlockXStart;
        if (!isOnlyRegionInThisRow(pageRegions, textRegion)) {
            logger.debug("several columns found, minX of text region is : " + minX);
            // the averaged baseline start (plus a fixed offset) is used as start point for all lines of the region
            textBlockXStart = getAverageBeginningOfBaselines(textRegion);
            textBlockXStart += 40;
        } else {
            logger.debug("only one region in this row!!");
            boolean spansFirstGridPoint = minX < twelfthPoints[1][0]
                    && twelfthPoints[1][0] < maxX
                    && regionWidth > twelfthPoints[2][0];
            if (spansFirstGridPoint) {
                // indent start of text block under certain preconditions
                textBlockXStart = twelfthPoints[1][0];
            } else if (textRegion.getTextLine().size() == 1) {
                // a text region containing only one line is probably a headline
                textBlockXStart = getPrintregionStartX((float) (minX), textRegion.getBoundingBox().getMaxX());
            } else if (twelfthPoints[2][0] < maxX && regionWidth > twelfthPoints[3][0]) {
                textBlockXStart = twelfthPoints[2][0];
            } else {
                textBlockXStart = (float) minX;
            }
        }

        addUniformTextFromTextRegion(textRegion, canvas, cutoffLeft, cutoffTop, bfArial, textBlockXStart, cache);
    }

    canvas.endLayer();
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.
the class TrpPdfDocument method exportTable.
/**
 * Writes the text of every cell of a table region to the given PDF content.
 *
 * <p>The cells are first arranged per row in a column-indexed map. A cell with
 * a row span greater than one leaves a {@code null} placeholder at its column
 * in the following row (unless that row defines its own cell there). Such
 * placeholders carry no text and are skipped during export, so they are never
 * passed to the text writing helpers.
 *
 * @param r              the region to export; it is cast to {@code TrpTableRegionType} unconditionally
 * @param cb             PDF content the cell text is written to
 * @param cutoffLeft     passed through unchanged to the text writing helpers
 * @param cutoffTop      passed through unchanged to the text writing helpers
 * @param addUniformText if {@code true} each cell is written via {@code addUniformTextFromTextRegion}
 *                       with a start x of the cell's average baseline beginning plus 40,
 *                       otherwise via {@code addTextFromTextRegion}
 * @param cache          export cache, passed through to the text writing helpers
 * @throws IOException       propagated from the text writing helpers
 * @throws DocumentException propagated from the text writing helpers
 */
private void exportTable(RegionType r, PdfContentByte cb, int cutoffLeft, int cutoffTop, boolean addUniformText, ExportCache cache) throws IOException, DocumentException {
    logger.debug("is table");
    TrpTableRegionType table = (TrpTableRegionType) r;
    int rows = table.getNRows();

    List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
    for (int k = 0; k < rows; k++) {
        allRowCells.add(table.getRowCells(k));
    }

    List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
    HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
    for (List<TrpTableCellType> rowCells : allRowCells) {
        HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
        /*
         * fill up all cells which are not set in TRP (needed for vertical cell merge):
         * nextRowMap already contains the columns covered by a vertically spanning
         * cell of the row above - they got merged in the table but have to be
         * considered here
         */
        currRowMap.putAll(nextRowMap);
        nextRowMap.clear();
        for (TrpTableCellType cell : rowCells) {
            currRowMap.put(cell.getCol(), cell);
            if (cell.getRowSpan() > 1) {
                nextRowMap.put(cell.getCol(), null);
            }
        }
        allRows.add(currRowMap);
    }

    for (HashMap<Integer, TrpTableCellType> rowMap : allRows) {
        for (TrpTableCellType cell : rowMap.values()) {
            if (cell == null) {
                // placeholder of a vertically merged cell: nothing to write
                continue;
            }
            if (addUniformText) {
                float textBlockXStart = getAverageBeginningOfBaselines(cell);
                textBlockXStart += 40;
                addUniformTextFromTextRegion(cell, cb, cutoffLeft, cutoffTop, bfArial, textBlockXStart, cache);
            } else {
                addTextFromTextRegion(cell, cb, cutoffLeft, cutoffTop, bfArial, cache);
            }
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.
the class TrpPdfDocument method addTextAndImage.
/**
 * Adds the page image to the image layer and, unless {@code imageOnly} is
 * set, the text of all page regions to the OCR layer beforehand.
 *
 * <p>Regions are processed in reading order: table regions go to
 * {@code exportTable} (without uniform text), text regions go to
 * {@code addTextFromTextRegion}, all other region types are ignored. If
 * {@code highlightTags} is set, tags are highlighted on the image afterwards
 * using the entries of {@code lineAndColorList}, which is cleared at the
 * start of this method.
 *
 * @param pc         page content whose regions are exported
 * @param cutoffLeft passed through unchanged to the export helpers
 * @param cutoffTop  passed through unchanged to the export helpers
 * @param img        image added to the image layer
 * @param imageOnly  if {@code true} no text is written, only the image
 * @param cache      export cache, passed through to the export helpers
 * @throws DocumentException propagated from the PDF writing calls
 * @throws IOException       propagated from the PDF writing calls
 */
private void addTextAndImage(PcGtsType pc, int cutoffLeft, int cutoffTop, Image img, boolean imageOnly, ExportCache cache) throws DocumentException, IOException {
    lineAndColorList.clear();

    PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);

    if (!imageOnly) {
        canvas.beginLayer(ocrLayer);
        canvas.setFontAndSize(bfArial, 32);

        List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        /*
         * Sort by reading order: at this time reading order is more trustable, and
         * coordinate based sorting is not transitive and occasionally fails with
         * "Comparison violates its general contract".
         */
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));

        for (RegionType region : pageRegions) {
            // TODO add paths for tables etc.
            if (region instanceof TrpTableRegionType) {
                exportTable(region, canvas, cutoffLeft, cutoffTop, false, cache);
                continue;
            }
            if (region instanceof TextRegionType) {
                addTextFromTextRegion((TextRegionType) region, canvas, cutoffLeft, cutoffTop, bfArial, cache);
            }
        }

        canvas.endLayer();
    }

    // the image goes into its own layer
    canvas.beginLayer(imgLayer);
    canvas.addImage(img);
    canvas.endLayer();

    // draw tag lines
    if (highlightTags) {
        highlightAllTagsOnImg(lineAndColorList, canvas, cutoffLeft, cutoffTop);
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.
the class DocxBuilder method writeDocxForTranscriptWithTables.
/**
 * Writes all table and text regions of a page, in reading order, to the given
 * main document part.
 *
 * <p>For each table region a docx table is built via {@code getDocxTable} and
 * added to the document, followed by a paragraph containing a line break. A
 * failure while building or adding a table is logged and the export continues
 * with the next region. Each text region that contains any text besides line
 * feeds is exported via {@code exportTextRegion}.
 *
 * <p>NOTE(review): the region list obtained from the page is sorted in place;
 * if that list is the page's live list, the page model is reordered too - confirm.
 *
 * @param mdp                main document part the content is added to
 * @param trpPage            page whose regions are exported
 * @param wordBased          passed through to {@code getDocxTable} and {@code exportTextRegion}
 * @param preserveLineBreaks currently not read by this method
 */
private static void writeDocxForTranscriptWithTables(MainDocumentPart mdp, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));

    for (int j = 0; j < regions.size(); ++j) {
        TrpRegionType r = regions.get(j);
        if (r instanceof TrpTableRegionType) {
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();

            // the horizontal extent of the table's bounding box is handed to getDocxTable as table size
            double maxX = table.getBoundingBox().getMaxX();
            double minX = table.getBoundingBox().getMinX();
            int tablesize = (int) (maxX - minX);

            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }

            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * fill up all cells which are not set in TRP (needed for vertical cell merge):
                 * nextRowMap already contains the columns covered by a vertically spanning
                 * cell of the row above - they got merged in the table but have to be
                 * considered here
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                }
                allRows.add(currRowMap);
            }

            try {
                Tbl thisTable = getDocxTable(wordMLPackage, wordBased, rows, cols, allRows, tablesize, mdp);
                mdp.addObject(thisTable);
            } catch (Exception e) {
                // best effort: report through the logger and continue with the remaining regions
                logger.error("Could not create docx table: " + e.getMessage(), e);
            }

            // this Br element is used to break the current line and go to the next one
            Br br = factory.createBr();
            org.docx4j.wml.P p = factory.createP();
            mdp.addObject(p);
            p.getContent().add(br);
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            /*
             * create one paragraph for each text region
             * but only if there is some text in it
             */
            String textWithoutLineFeeds = tr.getUnicodeText().replace("\n", "");
            if (!textWithoutLineFeeds.isEmpty()) {
                exportTextRegion(tr, wordBased, null, mdp);
            }
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType in project TranskribusCore by Transkribus.
the class TrpXlsxTableBuilder method writeXlsxForTables.
/**
 * Exports all table regions of the selected pages of a document to an xlsx file.
 *
 * <p>For every selected page the transcript is taken from the cache if
 * available, otherwise built from the page's current transcript. Each table
 * region found is handed to {@code createTable} with a running table id.
 * Afterwards the columns of every sheet are auto-sized based on the number of
 * cells in the sheet's first row, and the workbook is written to
 * {@code exportFile}.
 *
 * <p>Progress is reported once per processed page.
 *
 * @param doc         document whose pages are exported
 * @param exportFile  target file of the workbook
 * @param pageIndices indices of the pages to export, or {@code null} for all pages
 * @param monitor     progress monitor, may be {@code null}
 * @param cache       cache of already loaded transcripts, may be {@code null}
 * @throws NoTablesException    if the workbook contains no sheet after processing all pages;
 *                              no file is written in that case
 * @throws IOException          if building a transcript or writing the file fails
 * @throws InterruptedException if the monitor reports cancellation
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    // TrpTableRegionType is contained in the regions too
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }

    wb = new XSSFWorkbook();
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }

        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }

        TrpPageType trpPage = tr.getPage();
        List<TrpRegionType> regions = trpPage.getRegions();
        for (TrpRegionType r : regions) {
            if (!(r instanceof TrpTableRegionType)) {
                continue;
            }
            tableId++;
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();

            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }

            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * fill up all cells which are not set in TRP (needed for vertical cell merge):
                 * nextRowMap already contains the columns covered by a vertically spanning
                 * cell of the row above - they got merged in the table but have to be
                 * considered here
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    // only one row or col span is considered -> FIXME: do it for all spans
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                    if (cell.getColSpan() > 1) {
                        currRowMap.put(cell.getCol() + 1, null);
                    }
                }
                allRows.add(currRowMap);
            }
            createTable(rows, cols, allRows, tableId);
        }

        // progress bookkeeping happens once per page, independent of how many regions the page has
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        ++c;
        if (monitor != null) {
            monitor.worked(c);
        }
    }

    // means no tables at all; checked before the target file is opened so no empty file is created
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }

    /*
     * auto size the columns, based on the number of cells in the first (header) row of each sheet
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }

    // try-with-resources closes the stream even if writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Aggregations