use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method extractStats.
/**
 * Collects simple transcription statistics for a page.
 * <p>
 * Counts the text regions returned by {@code PageXmlUtils.getTextRegions(page)}, their lines
 * and the words of those lines, and how many of each carry non-blank text. In addition, the
 * number of whitespace-separated tokens is summed up over the transcribed regions and over the
 * transcribed lines.
 * <p>
 * Tokens are delimited by runs of whitespace (spaces, tabs, line breaks) after trimming, so
 * leading/trailing or repeated blanks do not produce phantom words and multi-line region text
 * is not under-counted.
 *
 * @param page the page content to analyse
 * @return a statistics object with all counters set
 */
public static TrpTranscriptStatistics extractStats(PcGtsType page) {
    int nrOfTranscribedRegions = 0;
    int nrOfWordsInRegions = 0;
    int nrOfLines = 0;
    int nrOfTranscribedLines = 0;
    int nrOfWordsInLines = 0;
    int nrOfWords = 0;
    int nrOfTranscribedWords = 0;

    List<TextRegionType> regs = PageXmlUtils.getTextRegions(page);
    int nrOfRegions = regs.size();

    for (TextRegionType r : regs) {
        String regionText = r.getTextEquiv() == null ? null : r.getTextEquiv().getUnicode();
        if (regionText != null && !regionText.trim().isEmpty()) {
            nrOfTranscribedRegions++;
            // trimmed and non-empty => splitting on whitespace runs yields no empty tokens
            nrOfWordsInRegions += regionText.trim().split("\\s+").length;
        }

        List<TextLineType> lines = r.getTextLine();
        nrOfLines += lines.size();
        for (TextLineType l : lines) {
            String lineText = l.getTextEquiv() == null ? null : l.getTextEquiv().getUnicode();
            if (lineText != null && !lineText.trim().isEmpty()) {
                nrOfTranscribedLines++;
                nrOfWordsInLines += lineText.trim().split("\\s+").length;
            }

            List<WordType> words = l.getWord();
            nrOfWords += words.size();
            for (WordType w : words) {
                String wordText = w.getTextEquiv() == null ? null : w.getTextEquiv().getUnicode();
                if (wordText != null && !wordText.trim().isEmpty()) {
                    nrOfTranscribedWords++;
                }
            }
        }
    }

    TrpTranscriptStatistics s = new TrpTranscriptStatistics();
    s.setNrOfRegions(nrOfRegions);
    s.setNrOfTranscribedRegions(nrOfTranscribedRegions);
    s.setNrOfWordsInRegions(nrOfWordsInRegions);
    s.setNrOfLines(nrOfLines);
    s.setNrOfTranscribedLines(nrOfTranscribedLines);
    s.setNrOfWordsInLines(nrOfWordsInLines);
    s.setNrOfWords(nrOfWords);
    s.setNrOfTranscribedWords(nrOfTranscribedWords);
    return s;
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class PageXmlUtils method copyTextContent.
/**
 * Carries text content over from one page to another, matching text regions by ID.
 * <p>
 * Every text region of {@code origPc} that has at least one text line is remembered under its
 * ID. Each text region of {@code newPc} whose ID was remembered is then handed, together with
 * its counterpart, to {@code copyTextRegionContent}. Nothing happens unless both pages have
 * regions according to {@code hasRegions}.
 *
 * @param origPc the page to take the text content from
 * @param newPc  the page whose matching text regions receive the content
 */
public static void copyTextContent(PcGtsType origPc, PcGtsType newPc) {
    boolean bothHaveRegions = hasRegions(origPc) && hasRegions(newPc);
    if (bothHaveRegions) {
        List<TrpRegionType> sourceRegions = origPc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        List<TrpRegionType> targetRegions = newPc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();

        // index the original text regions that actually contain lines by their ID
        Map<String, TextRegionType> sourcesById = new HashMap<>();
        for (RegionType candidate : sourceRegions) {
            if (candidate instanceof TextRegionType) {
                TextRegionType source = (TextRegionType) candidate;
                if (source.getTextLine() != null && !source.getTextLine().isEmpty()) {
                    sourcesById.put(source.getId(), source);
                }
            }
        }

        // for every new text region with a known ID, pull the content from its old counterpart
        for (RegionType candidate : targetRegions) {
            if (candidate instanceof TextRegionType) {
                TextRegionType target = (TextRegionType) candidate;
                // only non-null regions are stored, so a null lookup means "no counterpart"
                TextRegionType source = sourcesById.get(target.getId());
                if (source != null) {
                    copyTextRegionContent(source, target);
                }
            }
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class FinereaderUtils method addTextStyleToWords.
/**
 * Marks speaker names at the beginning of text lines as letter-spaced and writes the modified
 * page XML back to the file it was read from.
 * <p>
 * For every page of the document the current transcript is unmarshalled, the words of each
 * line are sorted with a {@code TrpElementCoordinatesComparator}, and the leading word(s) are
 * compared (via {@code isMatch}) against a fixed list of names. On the first name that matches
 * completely, the text style of the matching words is set to letter-spaced.
 *
 * @param doc the document whose page transcripts are processed
 * @throws JAXBException         declared for the JAXB (un)marshalling calls
 * @throws FileNotFoundException declared for the page XML file access
 */
public static void addTextStyleToWords(TrpDoc doc) throws JAXBException, FileNotFoundException {
    /*
     * Original request (translated from German):
     * I have put the book into the following folder; its speaker labels should be marked
     * as "letter-spaced" automatically (provided this can be done with reasonable effort):
     * ftp://ftp.uibk.ac.at/private/x3061015_20140902_78e054475d7532953c204ce6d392d8e9/Andy_Barbara_Bettina/zu_bearbeiten/
     * This concerns the following names, provided they stand at the beginning of a line:
     * Ernst, Albrecht, Preising, Marschall, Pappenheim, Pienzenau, Bern, Törring,
     * Nothafft von Wernberg, Frauenhoven, Hans von Läubelfing, Caspar Bernauer, Agnes,
     * Theobald, Knippeldollinger, Bürgermeister, Barbara, Martha, Stachus, Der Kastellan,
     * Herold, Legat
     * FIXME Der Herold
     * FIXME Der Legat
     */
    String[] names = { "Ernst", "Albrecht", "Preising", "Marschall", "Pappenheim", "Pienzenau", "Bern", "Törring", "Nothafft von Wernberg", "Frauenhoven", "Hans von Läubelfing", "Caspar Bernauer", "Agnes", "Theobald", "Knippeldollinger", "Bürgermeister", "Barbara", "Martha", "Stachus", "Der Kastellan", "Der Herold", "Der Legat" };

    // nameList.get(n) holds the words of the n-th name, nameStartList.get(n) its first word
    List<String[]> nameList = new ArrayList<>(names.length);
    List<String> nameStartList = new ArrayList<>(names.length);
    for (int n = 0; n < names.length; n++) {
        String[] parts = names[n].split(" ");
        StringBuilder partsStr = new StringBuilder("{ ");
        for (String part : parts) {
            partsStr.append(part).append("|");
        }
        System.out.println(n + "\t- splitting: " + partsStr + "}");
        nameList.add(parts);
        nameStartList.add(parts[0]);
    }

    TrpElementCoordinatesComparator<WordType> wordComp = new TrpElementCoordinatesComparator<WordType>();
    for (TrpPage p : doc.getPages()) {
        System.out.println("Processing page: " + p.getPageNr());
        URL url = p.getCurrentTranscript().getUrl();
        final String xmlPath = FileUtils.toFile(url).getAbsolutePath();
        File xmlFile = new File(xmlPath);
        PcGtsType pc = JaxbUtils.unmarshal(xmlFile, PcGtsType.class);
        List<TextRegionType> regions = PageXmlUtils.getTextRegions(pc);
        for (TextRegionType r : regions) {
            for (TextLineType l : r.getTextLine()) {
                List<WordType> words = l.getWord();
                if (words == null || words.isEmpty()) {
                    continue;
                }
                // sorts the line's own word list in place, so the new order is also written back
                Collections.sort(words, wordComp);
                markLeadingName(words, nameStartList, nameList);
            }
        }
        // write the (possibly modified) page XML back to the file it was read from
        JaxbUtils.marshalToFile(pc, xmlFile);
    }
}

/**
 * Checks whether the given (sorted, non-empty) words of a line start with one of the names and
 * marks the words of the first completely matching name as letter-spaced.
 */
private static void markLeadingName(List<WordType> words, List<String> nameStartList, List<String[]> nameList) {
    WordType first = words.get(0);
    String firstText = wordTextOrNull(first);
    if (firstText == null) {
        return;
    }

    // every name whose first word matches the first word of the line is a candidate
    List<Integer> candidates = new ArrayList<>();
    for (int j = 0; j < nameStartList.size(); j++) {
        if (isMatch(firstText, nameStartList.get(j))) {
            candidates.add(j);
        }
    }

    for (Integer index : candidates) {
        String[] name = nameList.get(index);
        if (name.length == 1) {
            // NOTE(review): getTextStyle() is assumed to be non-null here and below - confirm
            first.getTextStyle().setLetterSpaced(true);
            // always report the name's only word; a shared index may have been advanced before
            System.out.println("OK: " + name[0]);
            return;
        }

        List<WordType> matched = new ArrayList<>(name.length);
        matched.add(first);
        StringBuilder nameStr = new StringBuilder(firstText).append(" ");
        boolean isName = true;
        // check subsequent words; the index is local so candidates cannot influence each other
        for (int k = 1; k < name.length; k++) {
            if (k >= words.size()) {
                // line is shorter than the name: cannot match
                System.out.println("NEGATIVE: " + nameStr + "<end of line> != " + name[k]);
                isName = false;
                break;
            }
            WordType next = words.get(k);
            String nextText = wordTextOrNull(next);
            if (nextText != null && isMatch(nextText, name[k])) {
                nameStr.append(nextText).append(" ");
                matched.add(next);
            } else {
                System.out.println("NEGATIVE: " + nameStr + nextText + " != " + name[k]);
                isName = false;
                break;
            }
        }
        if (isName) {
            System.out.println("OK : " + nameStr);
            for (WordType w : matched) {
                w.getTextStyle().setLetterSpaced(true);
            }
            return;
        }
    }
}

/** Returns the Unicode text of the word, or {@code null} if the word has no text equivalent. */
private static String wordTextOrNull(WordType w) {
    return w.getTextEquiv() == null ? null : w.getTextEquiv().getUnicode();
}
use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.
the class TrpPageTypeUtils method assignUniqueIDs.
/**
 * Renumbers the IDs of the text regions of a page, and of the lines and words inside them,
 * following the current order of the elements.
 * <p>
 * Text regions get {@code r1, r2, ...}; the lines of a region get the region ID followed by
 * {@code l1, l2, ...}; the words of a line get the line ID followed by {@code w1, w2, ...}.
 * Regions that are not text regions are skipped and do not consume a number.
 *
 * @param page the page whose elements are renumbered in place
 */
public static void assignUniqueIDs(PageType page) {
    int regionNo = 0;
    for (RegionType candidate : page.getTextRegionOrImageRegionOrLineDrawingRegion()) {
        if (!(candidate instanceof TextRegionType)) {
            continue;
        }
        regionNo++;
        TextRegionType textRegion = (TextRegionType) candidate;
        final String regionId = "r" + regionNo;
        textRegion.setId(regionId);

        int lineNo = 0;
        for (TextLineType line : textRegion.getTextLine()) {
            lineNo++;
            final String lineId = regionId + "l" + lineNo;
            line.setId(lineId);

            int wordNo = 0;
            for (WordType word : line.getWord()) {
                wordNo++;
                word.setId(lineId + "w" + wordNo);
            }
        }
    }
}
Aggregations