use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method findLinesByBaseline.
/**
 * Collects every text line of the page for which {@code doesIntersect(line, baseline)} holds.
 * <p>
 * Lines are gathered from all regions returned by {@link #getTextRegions(PcGtsType)}. When two or
 * more lines match, the result is sorted with a {@code TrpElementCoordinatesComparator}
 * constructed with {@code true}.
 *
 * @param pc the page whose text regions are searched
 * @param baseline the baseline passed to {@code doesIntersect} for each line
 * @return a mutable list of the matching lines; empty if none match
 */
public static List<TextLineType> findLinesByBaseline(PcGtsType pc, String baseline) {
    final List<TextLineType> hits = new LinkedList<>();
    for (TextRegionType region : getTextRegions(pc)) {
        for (TextLineType line : region.getTextLine()) {
            if (doesIntersect(line, baseline)) {
                hits.add(line);
            }
        }
    }
    // Nothing to order with zero or one match.
    if (hits.size() < 2) {
        return hits;
    }
    Collections.sort(hits, new TrpElementCoordinatesComparator<TextLineType>(true));
    return hits;
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method getTextRegions.
/**
 * Returns the text regions found among the page's top-level regions.
 * <p>
 * A top-level region that is a {@code TextRegionType} is added directly; for a
 * {@code TableRegionType} all of its table cells are added. Null entries and regions of any
 * other type are ignored. Nested regions are not descended into.
 *
 * @param pc the page to inspect
 * @return a new mutable list; empty when the page has no regions
 */
public static List<TextRegionType> getTextRegions(PcGtsType pc) {
    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    final List<TextRegionType> result = new ArrayList<>();
    if (pageRegions != null) {
        for (RegionType candidate : pageRegions) {
            // instanceof is false for null, so null entries fall through both checks.
            if (candidate instanceof TextRegionType) {
                result.add((TextRegionType) candidate);
            }
            if (candidate instanceof TableRegionType) {
                result.addAll(((TableRegionType) candidate).getTableCell());
            }
        }
    }
    return result;
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method getLines.
/**
 * Gathers the text lines of all regions returned by {@code PageXmlUtils.getTextRegions(pc)},
 * in region order.
 *
 * @param pc the page to read lines from
 * @return a new mutable list holding every line of every such region
 */
public static List<TextLineType> getLines(PcGtsType pc) {
    final List<TextLineType> collected = new ArrayList<>();
    PageXmlUtils.getTextRegions(pc).forEach(region -> collected.addAll(region.getTextLine()));
    return collected;
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class FinereaderUtils method replaceBadChars.
/**
 * Replaces certain systematic OCR errors in the text of a page, modifying the page in place.
 * <p>
 * For every top-level text region that has text and at least one line, the region text and
 * each line text are rewritten with {@code replaceChars(..., regexRepl)}. Within a line, every
 * word except the last is rewritten with the general map {@code repl}; the last word of the
 * line uses {@code regexRepl}. Regions, lines and words without a text equivalent (or without
 * Unicode content) are skipped.
 * <p>
 * As a sanity check, the whitespace-stripped word texts are compared against the line text and
 * the line texts against the region text, both before and after replacement. Mismatches are
 * logged at debug level and the overall outcome at info level; a mismatch does not abort
 * processing or throw.
 *
 * TODO add parameters to pass custom maps from a search/replace dialog!?
 *
 * @param pc the page whose text is corrected
 * @return the same {@code pc} instance that was passed in
 */
public static PcGtsType replaceBadChars(PcGtsType pc) {
    List<TrpRegionType> regs = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    boolean success = true;
    for (RegionType r : regs) {
        if (!isTextRegion(r)) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;
        // Must be OR: with AND a null TextEquiv was dereferenced inside the guard itself,
        // and a null Unicode string slipped through to the replacement below.
        if (tr.getTextEquiv() == null || tr.getTextEquiv().getUnicode() == null) {
            // no text at all
            continue;
        }
        final String textblockBefore = tr.getTextEquiv().getUnicode();
        final String textblockAfter = replaceChars(textblockBefore, regexRepl);
        // iterate lines
        List<TextLineType> lines = tr.getTextLine();
        if (lines == null || lines.isEmpty()) {
            // Regions without lines are left untouched, including their region text.
            continue;
        }
        // setRegionText
        tr.getTextEquiv().setUnicode(textblockAfter);
        // Line texts joined with '\n', used only for the consistency check after the loop.
        StringBuilder linesBefore = new StringBuilder();
        StringBuilder linesAfter = new StringBuilder();
        boolean isFirstLine = true;
        for (TextLineType l : lines) {
            // Same OR guard as for the region (see above).
            if (l.getTextEquiv() == null || l.getTextEquiv().getUnicode() == null) {
                // empty line
                continue;
            }
            final String textlineBefore = l.getTextEquiv().getUnicode();
            final String textlineAfter = replaceChars(textlineBefore, regexRepl);
            linesBefore.append(isFirstLine ? textlineBefore : "\n" + textlineBefore);
            linesAfter.append(isFirstLine ? textlineAfter : "\n" + textlineAfter);
            isFirstLine = false;
            l.getTextEquiv().setUnicode(textlineAfter);
            // iterate words
            List<WordType> words = l.getWord();
            if (words == null || words.isEmpty()) {
                // continue with next line
                continue;
            }
            // Word texts joined with ' ', used only for the per-line consistency check.
            boolean isFirstWord = true;
            StringBuilder wordsBefore = new StringBuilder();
            StringBuilder wordsAfter = new StringBuilder();
            for (int i = 0; i < words.size(); i++) {
                WordType w = words.get(i);
                if (w.getTextEquiv() == null || w.getTextEquiv().getUnicode() == null) {
                    continue;
                }
                final String wordText = w.getTextEquiv().getUnicode();
                final String wordTextAfter;
                if (i < words.size() - 1) {
                    // use general replacement map for all words
                    wordTextAfter = replaceChars(wordText, repl);
                } else {
                    // use regex map for EOL words
                    wordTextAfter = replaceChars(wordText, regexRepl);
                }
                wordsBefore.append(isFirstWord ? wordText : " " + wordText);
                wordsAfter.append(isFirstWord ? wordTextAfter : " " + wordTextAfter);
                isFirstWord = false;
                w.getTextEquiv().setUnicode(wordTextAfter);
            }
            // Compare ignoring spaces: word concatenation must reproduce the line text.
            boolean lineSuccess = textlineBefore.replace(" ", "").equals(wordsBefore.toString().replace(" ", ""));
            lineSuccess &= textlineAfter.replace(" ", "").equals(wordsAfter.toString().replace(" ", ""));
            if (!lineSuccess) {
                logger.debug("Line before: " + textlineBefore);
                logger.debug("Words before : " + wordsBefore.toString());
                logger.debug("Line after: " + textlineAfter);
                logger.debug("Words after : " + wordsAfter.toString());
            }
            success &= lineSuccess;
            // TODO propagate words -> lines -> regions
        }
        // Compare ignoring spaces: joined line texts must reproduce the region text.
        boolean regionSuccess = textblockBefore.replace(" ", "").equals(linesBefore.toString().replace(" ", ""));
        regionSuccess &= textblockAfter.replace(" ", "").equals(linesAfter.toString().replace(" ", ""));
        if (!regionSuccess) {
            logger.debug("\nblock:\n");
            logger.debug(textblockAfter);
            logger.debug("\nblock from lines:\n");
            logger.debug(linesAfter.toString());
        }
        success &= regionSuccess;
    }
    logger.info("Bad character replacement: " + (success ? "SUCCESS" : "FAILURE"));
    return pc;
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtilsTest method testGetTextRegions.
/**
 * Manual check for {@code PageXmlUtils.getTextRegions}: unmarshals a transcript from a fixed
 * URL and prints class name, id and line count of each returned region to standard output.
 * Requires network access to the hard-coded host; nothing is asserted.
 *
 * @throws Exception if the URL is malformed or unmarshalling fails
 */
public static void testGetTextRegions() throws Exception {
    final URL source = new URL("https://dbis-thure.uibk.ac.at/f/Get?id=VCLTRLDSWETCXIHQNHKOPRLS");
    final PcGtsType page = PageXmlUtils.unmarshal(source);
    for (TextRegionType region : PageXmlUtils.getTextRegions(page)) {
        final String typeName = region.getClass().getSimpleName();
        System.out.println("tr: " + typeName + " id: " + region.getId() + " n-lines: " + region.getTextLine().size());
    }
}
Aggregations