use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method getTextRegions.
/**
 * Collects the text regions of a page.
 * <p>
 * Top-level regions that are text regions are added directly. For table regions,
 * all entries returned by {@code getTableCell()} are added instead. Regions of any
 * other type and {@code null} entries are skipped.
 *
 * @param pc the page document whose regions are inspected
 * @return a new list with the regions found; empty (never {@code null}) if the page
 *         has no regions
 */
public static List<TextRegionType> getTextRegions(PcGtsType pc) {
    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    final List<TextRegionType> result = new ArrayList<>();
    if (pageRegions != null) {
        for (RegionType candidate : pageRegions) {
            // instanceof is false for null, so null entries fall through both checks
            if (candidate instanceof TextRegionType) {
                result.add((TextRegionType) candidate);
            }
            if (candidate instanceof TableRegionType) {
                result.addAll(((TableRegionType) candidate).getTableCell());
            }
        }
    }
    return result;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method removeExcludedRegions.
/**
 * Removes from the page's region list every region whose ID is not contained in
 * {@code regIds}.
 * <p>
 * Note that {@code regIds} therefore holds the IDs of the regions that are
 * <em>kept</em>; everything else is treated as excluded. The list returned by the
 * page is modified in place. {@code null} entries in the region list are left
 * untouched.
 *
 * @param pc     the page document whose regions are filtered
 * @param regIds IDs of the regions that must remain on the page
 */
public static void removeExcludedRegions(PcGtsType pc, List<String> regIds) {
    List<TrpRegionType> regions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    if (regions == null || regions.isEmpty()) {
        return;
    }
    int i = 0;
    while (i < regions.size()) {
        RegionType r = regions.get(i);
        if (r == null || regIds.contains(r.getId())) {
            // keep this entry and move on
            i++;
            continue;
        }
        logger.debug("Removing excluded region: " + r.getId());
        // Remove by index: removes exactly the inspected element and avoids the
        // equals()-based linear search of remove(Object). The index is not advanced
        // because the next element has shifted into position i.
        regions.remove(i);
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.
the class FinereaderUtils method replaceBadChars.
/**
 * Replaces certain systematic errors in the OCR text of a page, in place.
 * <p>
 * For every text region, the region text, each line text and each word text are
 * passed through {@code replaceChars}. Region texts, line texts and the last word of
 * each line use {@code regexRepl}; all other words use {@code repl}. Afterwards the
 * texts of adjacent levels (words vs. line, lines vs. region) are compared with all
 * spaces removed; mismatches are written to the debug log and the overall outcome
 * is logged at info level.
 * <p>
 * Regions and lines without text, and regions without lines, are skipped unchanged.
 *
 * TODO add parameters to pass custom maps from a search/replace dialog!?
 *
 * @param pc the page document whose text content is modified
 * @return the same {@code pc} instance that was passed in
 */
public static PcGtsType replaceBadChars(PcGtsType pc) {
    List<TrpRegionType> regs = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    boolean success = true;
    for (RegionType r : regs) {
        if (!isTextRegion(r)) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;
        // "||" is required here: with "&&" a null TextEquiv was dereferenced
        // and a null Unicode string was not skipped
        if (tr.getTextEquiv() == null || tr.getTextEquiv().getUnicode() == null) {
            // no text at all
            continue;
        }
        final String textblockBefore = tr.getTextEquiv().getUnicode();
        final String textblockAfter = replaceChars(textblockBefore, regexRepl);

        List<TextLineType> lines = tr.getTextLine();
        if (lines == null || lines.isEmpty()) {
            // region without lines: region text is left as it is
            continue;
        }
        tr.getTextEquiv().setUnicode(textblockAfter);

        // line texts joined by "\n", used to cross-check against the region text
        StringBuilder linesBefore = new StringBuilder();
        StringBuilder linesAfter = new StringBuilder();
        boolean isFirstLine = true;
        for (TextLineType l : lines) {
            if (l.getTextEquiv() == null || l.getTextEquiv().getUnicode() == null) {
                // empty line
                continue;
            }
            final String textlineBefore = l.getTextEquiv().getUnicode();
            final String textlineAfter = replaceChars(textlineBefore, regexRepl);
            if (!isFirstLine) {
                linesBefore.append("\n");
                linesAfter.append("\n");
            }
            linesBefore.append(textlineBefore);
            linesAfter.append(textlineAfter);
            isFirstLine = false;
            l.getTextEquiv().setUnicode(textlineAfter);

            List<WordType> words = l.getWord();
            if (words == null || words.isEmpty()) {
                // no words to process or to cross-check; continue with next line
                continue;
            }
            // word texts joined by " ", used to cross-check against the line text
            boolean isFirstWord = true;
            StringBuilder wordsBefore = new StringBuilder();
            StringBuilder wordsAfter = new StringBuilder();
            for (int i = 0; i < words.size(); i++) {
                WordType w = words.get(i);
                if (w.getTextEquiv() == null || w.getTextEquiv().getUnicode() == null) {
                    continue;
                }
                final String wordText = w.getTextEquiv().getUnicode();
                final String wordTextAfter;
                if (i < words.size() - 1) {
                    // use general replacement map for all words
                    wordTextAfter = replaceChars(wordText, repl);
                } else {
                    // use regex map for EOL words
                    wordTextAfter = replaceChars(wordText, regexRepl);
                }
                if (!isFirstWord) {
                    wordsBefore.append(" ");
                    wordsAfter.append(" ");
                }
                wordsBefore.append(wordText);
                wordsAfter.append(wordTextAfter);
                isFirstWord = false;
                w.getTextEquiv().setUnicode(wordTextAfter);
            }
            // line and concatenated words must agree when spaces are ignored
            boolean lineSuccess = textlineBefore.replace(" ", "").equals(wordsBefore.toString().replace(" ", ""));
            lineSuccess &= textlineAfter.replace(" ", "").equals(wordsAfter.toString().replace(" ", ""));
            if (!lineSuccess) {
                logger.debug("Line before: " + textlineBefore);
                logger.debug("Words before : " + wordsBefore.toString());
                logger.debug("Line after: " + textlineAfter);
                logger.debug("Words after : " + wordsAfter.toString());
            }
            success &= lineSuccess;
            // TODO propagate words -> lines -> regions
        }
        // region and concatenated lines must agree when spaces are ignored
        boolean regionSuccess = textblockBefore.replace(" ", "").equals(linesBefore.toString().replace(" ", ""));
        regionSuccess &= textblockAfter.replace(" ", "").equals(linesAfter.toString().replace(" ", ""));
        if (!regionSuccess) {
            logger.debug("\nblock:\n");
            logger.debug(textblockAfter);
            logger.debug("\nblock from lines:\n");
            logger.debug(linesAfter.toString());
        }
        success &= regionSuccess;
    }
    logger.info("Bad character replacement: " + (success ? "SUCCESS" : "FAILURE"));
    return pc;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.
the class TrpPdfDocument method addUniformText.
/**
 * Writes the text of a page into the {@code ocrLayer} layer of the writer's
 * "direct content under" canvas, using {@code bfArial} at size 10.
 * <p>
 * The region list returned by the page is sorted in place by reading order. Table
 * regions are handed to {@code exportTable}; text regions are handed to
 * {@code addUniformTextFromTextRegion} together with a start X that is derived from
 * the region's bounding box and the {@code twelfthPoints} grid. Other region types
 * are ignored.
 *
 * @param pc         the page document to export
 * @param cutoffLeft passed through to the table/text-region export
 * @param cutoffTop  passed through to the table/text-region export
 * @param cache      passed through to the table/text-region export
 * @throws DocumentException declared by the signature; raised by the PDF calls used here
 * @throws IOException       declared by the signature; raised by the PDF calls used here
 */
private void addUniformText(PcGtsType pc, int cutoffLeft, int cutoffTop, ExportCache cache) throws DocumentException, IOException {
    PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);
    canvas.beginLayer(ocrLayer);
    canvas.setFontAndSize(bfArial, 10);

    List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    /*
     * Sort by reading order: at this point the reading order is more trustworthy, and
     * the coordinate-based ordering is not transitive and occasionally triggers a
     * "Comparison violates its general contract" exception.
     */
    Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));

    for (TrpRegionType region : pageRegions) {
        // TODO add paths for tables etc.
        if (region instanceof TrpTableRegionType) {
            exportTable(region, canvas, cutoffLeft, cutoffTop, true, cache);
            continue;
        }
        if (!(region instanceof TrpTextRegionType)) {
            continue;
        }
        TrpTextRegionType textRegion = (TrpTextRegionType) region;
        double left = textRegion.getBoundingBox().getMinX();
        double right = textRegion.getBoundingBox().getMaxX();
        double width = textRegion.getBoundingBox().getWidth();

        float startX;
        if (!isOnlyRegionInThisRow(pageRegions, textRegion)) {
            logger.debug("several columns found, minX of text region is : " + left);
            // the average baseline start is used as start point for all lines of the region
            startX = getAverageBeginningOfBaselines(textRegion);
            startX += 40;
        } else {
            logger.debug("only one region in this row!!");
            boolean spansFirstGridLine = left < twelfthPoints[1][0]
                    && (twelfthPoints[1][0] < right && width > twelfthPoints[2][0]);
            if (spansFirstGridLine) {
                // indent start of text block under certain preconditions
                startX = twelfthPoints[1][0];
            } else if (textRegion.getTextLine().size() == 1) {
                // a text region with a single line is probably a headline
                startX = getPrintregionStartX((float) (left), right);
            } else if (twelfthPoints[2][0] < right && width > twelfthPoints[3][0]) {
                startX = twelfthPoints[2][0];
            } else {
                startX = (float) left;
            }
        }
        addUniformTextFromTextRegion(textRegion, canvas, cutoffLeft, cutoffTop, bfArial, startX, cache);
    }
    canvas.endLayer();
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType in project TranskribusCore by Transkribus.
the class TrpPdfDocument method isOnlyRegionInThisRow.
/**
 * Checks whether {@code regionToCompare} has no other text-bearing text region
 * beside it, i.e. in the same horizontal band of the page.
 * <p>
 * Another region counts as being in the same row when the vertical centre of one
 * bounding box lies strictly between the top and bottom of the other bounding box.
 * Regions that are not text regions, that have the same ID as
 * {@code regionToCompare}, that have no lines, or whose lines all have empty text
 * are ignored.
 *
 * @param regions         all regions of the page
 * @param regionToCompare the region to test
 * @return {@code true} if {@code regions} has exactly one element or no other
 *         relevant region overlaps vertically; {@code false} otherwise
 */
private boolean isOnlyRegionInThisRow(List<TrpRegionType> regions, TextRegionType regionToCompare) {
    java.awt.Rectangle compareBlock = regionToCompare.getBoundingBox();
    float compareMinY = (float) compareBlock.getMinY();
    float compareMaxY = (float) compareBlock.getMaxY();
    float compareMeanY = compareMinY + (compareMaxY - compareMinY) / 2;

    if (regions.size() == 1) {
        return true;
    }

    final String compareId = regionToCompare.getId();
    for (RegionType r : regions) {
        // TODO add paths for tables etc.
        if (!(r instanceof TextRegionType)) {
            continue;
        }
        // Compare IDs by content, not by reference; two null IDs count as equal.
        final String id = r.getId();
        final boolean sameId = (id == null) ? (compareId == null) : id.equals(compareId);
        if (sameId) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;

        // regions without lines, or with only empty lines, can be ignored
        boolean textFound = false;
        for (TextLineType tlt : tr.getTextLine()) {
            TrpTextLineType l = (TrpTextLineType) tlt;
            if (!l.getUnicodeText().isEmpty()) {
                textFound = true;
                break;
            }
        }
        if (!textFound) {
            continue;
        }

        java.awt.Rectangle block = tr.getBoundingBox();
        float minY = (float) block.getMinY();
        float maxY = (float) block.getMaxY();
        float meanY = minY + (maxY - minY) / 2;
        if ((meanY > compareMinY && meanY < compareMaxY) || (compareMeanY > minY && compareMeanY < maxY)) {
            return false;
        }
    }
    return true;
}
Aggregations