use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpTxtBuilder method writeTxtForSinglePage.
/**
 * Appends the text of one page to {@code file}.
 * <p>
 * The page's regions are sorted in place with a
 * {@link TrpElementReadingOrderComparator}. Table regions are skipped (plain
 * text export of tables is not implemented yet); regions that are neither
 * tables nor text regions are ignored. For each text region, one entry per
 * line (or, if {@code wordBased} is set and the line has words, one entry per
 * word) is collected, followed by an extra line separator entry after every
 * region that has at least one line. All entries are then written with
 * {@code CREATE} and {@code APPEND}, so existing file content is kept.
 *
 * @param file               target file; created if it does not exist
 * @param trpPage            page whose regions are exported
 * @param wordBased          if true, the text of the words is used for every line that has words
 * @param preserveLineBreaks currently unused; kept so that callers do not have to change
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));

    List<String> content = new ArrayList<String>();
    for (TrpRegionType r : regions) {
        if (r instanceof TrpTableRegionType) {
            // TODO: for simple txt export: how to handle tables
            continue;
        }
        if (!(r instanceof TrpTextRegionType)) {
            continue;
        }

        List<TextLineType> lines = ((TrpTextRegionType) r).getTextLine();
        for (TextLineType line : lines) {
            TrpTextLineType trpL = (TrpTextLineType) line;
            String textOfCurrLine = trpL.getUnicodeText();
            if (wordBased && trpL.getWord().size() > 0) {
                for (WordType word : trpL.getWord()) {
                    String wordText = ((ITrpShapeType) word).getUnicodeText();
                    // Files.write would emit the literal "null" for a null entry
                    if (wordText != null) {
                        content.add(wordText);
                    }
                }
            } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                // compare by value: the former reference comparison (!= "") let
                // empty and null line texts through
                content.add(textOfCurrLine);
            }
        }

        // separate regions by an additional line break
        if (lines.size() > 0) {
            content.add(System.lineSeparator());
        }
    }

    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // best effort as before, but report through the logger instead of stderr
        logger.error("Could not write text export to " + file.getAbsolutePath(), e);
    }
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class PageXmlUtils method extractStats.
/**
 * Computes transcription statistics for a page.
 * <p>
 * Counts the text regions, lines and words of the page, how many of each carry
 * a non-blank Unicode text, and the number of whitespace-separated tokens in
 * the transcribed region texts and line texts.
 *
 * @param page the page to analyse
 * @return a new statistics object holding the counts
 */
public static TrpTranscriptStatistics extractStats(PcGtsType page) {
    int nrOfTranscribedRegions = 0;
    int nrOfWordsInRegions = 0;
    int nrOfLines = 0;
    int nrOfTranscribedLines = 0;
    int nrOfWordsInLines = 0;
    int nrOfWords = 0;
    int nrOfTranscribedWords = 0;

    List<TextRegionType> regs = PageXmlUtils.getTextRegions(page);
    int nrOfRegions = regs.size();

    for (TextRegionType r : regs) {
        if (r.getTextEquiv() != null && r.getTextEquiv().getUnicode() != null) {
            String regionText = r.getTextEquiv().getUnicode().trim();
            if (!regionText.isEmpty()) {
                nrOfTranscribedRegions += 1;
                // TODO use tokenizer here
                // Split on any whitespace run: splitting on a single blank counted
                // empty tokens for repeated blanks and did not separate words at
                // line breaks or tabs inside the region text.
                nrOfWordsInRegions += regionText.split("\\s+").length;
            }
        }

        List<TextLineType> lines = r.getTextLine();
        nrOfLines += lines.size();
        for (TextLineType l : lines) {
            if (l.getTextEquiv() != null && l.getTextEquiv().getUnicode() != null) {
                String lineText = l.getTextEquiv().getUnicode().trim();
                if (!lineText.isEmpty()) {
                    nrOfTranscribedLines += 1;
                    // TODO use tokenizer here
                    nrOfWordsInLines += lineText.split("\\s+").length;
                }
            }

            List<WordType> words = l.getWord();
            nrOfWords += words.size();
            for (WordType w : words) {
                if (w.getTextEquiv() != null && w.getTextEquiv().getUnicode() != null && !w.getTextEquiv().getUnicode().trim().isEmpty()) {
                    nrOfTranscribedWords += 1;
                }
            }
        }
    }

    TrpTranscriptStatistics s = new TrpTranscriptStatistics();
    s.setNrOfLines(nrOfLines);
    s.setNrOfRegions(nrOfRegions);
    s.setNrOfTranscribedLines(nrOfTranscribedLines);
    s.setNrOfTranscribedWords(nrOfTranscribedWords);
    s.setNrOfTranscribedRegions(nrOfTranscribedRegions);
    s.setNrOfWords(nrOfWords);
    s.setNrOfWordsInLines(nrOfWordsInLines);
    s.setNrOfWordsInRegions(nrOfWordsInRegions);
    return s;
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class PageXmlUtils method removeAllIndexedTags.
/**
 * Removes the indexed tags from the custom tag lists of every text region
 * returned by {@code getTextRegions(true)}, of every line in those regions and
 * of every word in those lines.
 * <p>
 * The page, its lines and its words are cast to their {@code Trp*} subtypes,
 * so a page that was not built from those types fails with a
 * {@link ClassCastException}.
 *
 * @param pc the PAGE document whose page is processed in place
 */
public static void removeAllIndexedTags(PcGtsType pc) {
    final TrpPageType page = (TrpPageType) pc.getPage();
    for (TrpTextRegionType region : page.getTextRegions(true)) {
        region.getCustomTagList().removeIndexedTags();
        for (TextLineType rawLine : region.getTextLine()) {
            final TrpTextLineType line = (TrpTextLineType) rawLine;
            line.getCustomTagList().removeIndexedTags();
            for (WordType rawWord : line.getWord()) {
                ((TrpWordType) rawWord).getCustomTagList().removeIndexedTags();
            }
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class ExportCache method storeCustomTagMapForDoc.
/**
 * Walks over the selected pages of a document and hands every text line (and,
 * if requested, every word) to {@code getTagsForShapeElement}.
 * <p>
 * Before processing, the blackening flag of this cache is set and the stored
 * tags are cleared. For each page the cached transcript is used if present;
 * otherwise a new transcript is created from the page's current transcript
 * metadata. The transcript is built before its regions are read.
 *
 * @param doc         document whose pages are processed
 * @param wordBased   if true, the words of each line are processed in addition to the line itself
 * @param pageIndices zero-based indices of the pages to process; {@code null} processes all pages
 * @param monitor     progress monitor used for cancellation and progress reporting; may be {@code null}
 * @param blackening  value stored in the blackening flag of this cache
 * @throws JAXBException declared by the signature; presumably raised while building a transcript
 * @throws IOException declared by the signature; presumably raised while building a transcript
 * @throws InterruptedException if the monitor reports that the user canceled
 */
public void storeCustomTagMapForDoc(TrpDoc doc, boolean wordBased, Set<Integer> pageIndices, IProgressMonitor monitor, boolean blackening) throws JAXBException, IOException, InterruptedException {
    doBlackening = blackening;
    tags.clear();

    final List<TrpPage> pages = doc.getPages();
    final int pageCount = pages.size();
    int processed = 0;

    for (int pageIdx = 0; pageIdx < pageCount; pageIdx++) {
        final boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null && monitor.isCanceled()) {
            throw new InterruptedException("User canceled the export");
        }

        // pageTranscripts get fetched before the custom tag map is stored,
        // so normally a cached transcript exists for the page
        JAXBPageTranscript transcript = (pageTranscripts == null) ? null : pageTranscripts.get(pageIdx);
        if (transcript == null) {
            TrpTranscriptMetadata md = pages.get(pageIdx).getCurrentTranscript();
            transcript = new JAXBPageTranscript(md);
        } else {
            transcript.getPageData();
        }
        transcript.build();

        final TrpPageType trpPage = transcript.getPage();
        logger.debug("get tags for page " + (pageIdx + 1) + "/" + doc.getNPages());

        for (TrpTextRegionType region : trpPage.getTextRegions(true)) {
            for (TextLineType rawLine : region.getTextLine()) {
                final TrpTextLineType line = (TrpTextLineType) rawLine;
                final List<WordType> words = line.getWord();
                // the line itself is always processed, words only on request
                getTagsForShapeElement(line);
                if (!wordBased) {
                    continue;
                }
                for (WordType rawWord : words) {
                    getTagsForShapeElement((TrpWordType) rawWord);
                }
            }
        }

        if (monitor != null) {
            monitor.setTaskName("Loaded tags for page " + (pageIdx + 1));
            processed += 1;
            monitor.worked(processed);
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class FinereaderUtils method addTextStyleToWords.
/**
 * Marks speaker names that stand at the beginning of a text line as
 * letter-spaced and writes the modified PAGE XML back to its file.
 * <p>
 * For every page of the document the current transcript is unmarshalled, the
 * words of each line are sorted with a {@link TrpElementCoordinatesComparator}
 * and the leading words are compared (via {@code isMatch}) with a fixed list
 * of names. When all parts of a name match the leading words, the text style
 * of each of these words is set to letter-spaced.
 *
 * @param doc the document whose page transcripts are updated on disk
 * @throws JAXBException declared by the signature; presumably raised by (un)marshalling
 * @throws FileNotFoundException declared by the signature; presumably raised when a transcript file is missing
 */
public static void addTextStyleToWords(TrpDoc doc) throws JAXBException, FileNotFoundException {
    /*
     * Original request (translated from German): the book in which the speaker
     * labels should automatically be marked as "letter-spaced" (provided this
     * is feasible with reasonable effort) was placed in the following folder:
     * ftp://ftp.uibk.ac.at/private/x3061015_20140902_78e054475d7532953c204ce6d392d8e9/Andy_Barbara_Bettina/zu_bearbeiten/
     * This concerns the names listed below, provided they stand at the
     * beginning of a line. "Herold" and "Legat" were requested as such but
     * are matched as "Der Herold" and "Der Legat" (FIXME in the original note).
     */
    String[] names = { "Ernst", "Albrecht", "Preising", "Marschall", "Pappenheim", "Pienzenau", "Bern", "Törring", "Nothafft von Wernberg", "Frauenhoven", "Hans von Läubelfing", "Caspar Bernauer", "Agnes", "Theobald", "Knippeldollinger", "Bürgermeister", "Barbara", "Martha", "Stachus", "Der Kastellan", "Der Herold", "Der Legat" };

    // nameList holds each name split into its words, nameStartList the first word of each name
    List<String[]> nameList = new ArrayList<>(names.length);
    List<String> nameStartList = new ArrayList<>(names.length);
    for (int i = 0; i < names.length; i++) {
        String[] parts = names[i].split(" ");
        StringBuilder partsStr = new StringBuilder("{ ");
        for (String t : parts) {
            partsStr.append(t).append("|");
        }
        System.out.println(i + "\t- splitting: " + partsStr + "}");
        nameList.add(parts);
        nameStartList.add(parts[0]);
    }

    TrpElementCoordinatesComparator<WordType> wordComp = new TrpElementCoordinatesComparator<WordType>();
    for (TrpPage p : doc.getPages()) {
        System.out.println("Processing page: " + p.getPageNr());
        URL url = p.getCurrentTranscript().getUrl();
        final String xmlPath = FileUtils.toFile(url).getAbsolutePath();
        File xmlFile = new File(xmlPath);
        PcGtsType pc = JaxbUtils.unmarshal(xmlFile, PcGtsType.class);

        for (TextRegionType r : PageXmlUtils.getTextRegions(pc)) {
            for (TextLineType l : r.getTextLine()) {
                List<WordType> words = l.getWord();
                if (words == null || words.isEmpty()) {
                    continue;
                }
                // note: sorts the line's word list in place
                Collections.sort(words, wordComp);

                WordType w1 = words.get(0);
                String firstText = w1.getTextEquiv() != null ? w1.getTextEquiv().getUnicode() : null;
                if (firstText == null) {
                    continue;
                }

                // collect all names whose first word matches the first word of the line
                List<Integer> candidatesIndex = new ArrayList<>();
                for (int j = 0; j < nameStartList.size(); j++) {
                    if (isMatch(firstText, nameStartList.get(j))) {
                        candidatesIndex.add(j);
                    }
                }

                for (Integer index : candidatesIndex) {
                    String[] name = nameList.get(index);
                    if (name.length == 1) {
                        // Done.
                        // NOTE(review): getTextStyle() is assumed to be non-null here, as in the original - confirm
                        w1.getTextStyle().setLetterSpaced(true);
                        // always report the (only) name part; the former shared index could
                        // point past the array after a failed multi-word candidate
                        System.out.println("OK: " + name[0]);
                        break;
                    }

                    List<WordType> wordList = new ArrayList<>(name.length);
                    wordList.add(w1);
                    String nameStr = firstText + " ";
                    boolean isName = true;
                    // check subsequent words; k is local so that candidates do not influence each other
                    for (int k = 1; k < name.length; k++) {
                        if (k >= words.size()) {
                            // line is shorter than the name
                            System.out.println("NEGATIVE: " + nameStr + "<end of line> != " + name[k]);
                            isName = false;
                            break;
                        }
                        WordType wk = words.get(k);
                        String wkText = wk.getTextEquiv() != null ? wk.getTextEquiv().getUnicode() : null;
                        if (wkText != null && isMatch(wkText, name[k])) {
                            nameStr += wkText + " ";
                            wordList.add(wk);
                        } else {
                            System.out.println("NEGATIVE: " + nameStr + wkText + " != " + name[k]);
                            isName = false;
                            break;
                        }
                    }
                    if (isName) {
                        System.out.println("OK : " + nameStr);
                        for (WordType w : wordList) {
                            w.getTextStyle().setLetterSpaced(true);
                        }
                        break;
                    }
                }
            }
        }

        // write the modified page XML back to the transcript file
        JaxbUtils.marshalToFile(pc, xmlFile);
    }
}
Aggregations