use of eu.transkribus.core.model.beans.pagecontent.TextLineType in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method writeTextForTextRegion.
/**
 * Writes the text content of a single text region into the given builder.
 * <p>
 * A region without any text lines is skipped with a warning. Otherwise the
 * region is opened via {@code writeTextRegion}, every line is emitted and the
 * region is closed via {@code closeTextRegion}. Depending on
 * {@code commonPars.isWriteTextOnWordLevel()} a line is either written as a
 * whole, or its words are written one by one between the strings returned by
 * {@code getLineOrWordStart} and {@code getLineOrWordEnd}.
 *
 * @param sb     builder receiving the output
 * @param r      region whose lines are written
 * @param pageNr page number, appended to {@code FACS_ID_PREFIX} to form the facsimile id
 */
void writeTextForTextRegion(SebisStringBuilder sb, TextRegionType r, int pageNr) {
    if (r.getTextLine().isEmpty()) {
        logger.warn("skipping empty region: " + r.getId());
        return;
    }

    final String facsId = FACS_ID_PREFIX + pageNr;
    writeTextRegion(sb, r, facsId);

    for (TextLineType line : r.getTextLine()) {
        TrpTextLineType trpLine = (TrpTextLineType) line;

        // Line level: the line is written in one go.
        if (!commonPars.isWriteTextOnWordLevel()) {
            writeLineOrWord(sb, trpLine, facsId);
            continue;
        }

        // Word level: line start, then each word, then line end, all one indent deeper.
        String lineStart = getLineOrWordStart(trpLine, facsId);
        sb.incIndent();
        sb.addLine(lineStart);
        // TODO (kept from original): write text for words?
        for (WordType word : trpLine.getWord()) {
            writeLineOrWord(sb, (TrpWordType) word, facsId);
        }
        String lineEnd = getLineOrWordEnd(trpLine, facsId);
        sb.addLine(lineEnd);
        sb.decIndent();
    }

    closeTextRegion(sb);
}
use of eu.transkribus.core.model.beans.pagecontent.TextLineType in project TranskribusCore by Transkribus.
the class DocxBuilder method exportTextRegion.
/**
 * Appends the text of all lines of a text region to a docx paragraph.
 * <p>
 * If no paragraph is passed in, a new one is created and added to the
 * document part. A line carrying a paragraph tag causes a fresh paragraph to
 * be started for the lines that follow it.
 *
 * @param tr        region whose lines are exported
 * @param wordBased if {@code true}, lines that contain words are exported word by word
 * @param p         paragraph to append to, or {@code null} to start a new one
 * @param mdp       document part that receives newly created paragraphs
 */
private static void exportTextRegion(TrpTextRegionType tr, boolean wordBased, P p, MainDocumentPart mdp) {
    if (p == null) {
        p = factory.createP();
        mdp.addObject(p);
    }

    List<TextLineType> lines = tr.getTextLine();
    for (int idx = 0; idx < lines.size(); idx++) {
        TrpTextLineType line = (TrpTextLineType) lines.get(idx);

        try {
            boolean useWords = wordBased && !line.getWord().isEmpty();
            if (useWords) {
                getFormattedTextForLineElement(line.getWord(), p, mdp);
            } else {
                getFormattedTextForShapeElement(line, p, mdp);
            }
        } catch (Exception e) {
            // NOTE(review): a failing line is only printed to stderr and the export
            // continues with the next line; consider proper logging here.
            e.printStackTrace();
        }

        // Per the original note, the user can mark a new paragraph inside a text
        // region with the pilcrow sign (U+00B6); such lines carry a paragraph tag.
        if (line.getCustomTagList().containsParagraphTag()) {
            p = factory.createP();
            mdp.addObject(p);
        } else if (preserveLineBreaks && idx + 1 != lines.size()) {
            // Break after every line except the last one of the region; omitted
            // entirely when dense lines are requested (preserveLineBreaks == false).
            Br lineBreak = factory.createBr();
            p.getContent().add(lineBreak);
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent.TextLineType in project TranskribusCore by Transkribus.
the class TrpRtfBuilder method getRtfParagraphsForTranscript.
// public static void writeRtfForElement(Rtf rtf, ITrpShapeType element, boolean wordBased, File file, boolean append) throws IOException, JAXBException {
// element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
//
// RtfText text = RtfText.text(element.getUnicodeText());
// text = formatRtfText(text, element.getTextStyle());
//
//
//
// if (element instanceof TextLineType || element instanceof TextRegionType) {// TODO words vs lines and regions
// rtf.p(text);
// } else if (element instanceof TrpWordType) {
// // rtf.p(texts);
// }
//
//
// // cl.getCustomTagAndContinuations(tag)
//
//
//
// }
/**
 * Builds one RTF paragraph per text region of the given page.
 * <p>
 * Each line of a region becomes one {@code RtfText} followed by a line feed.
 * When {@code wordBased} is set and the line contains words, the text is
 * assembled from the words; otherwise it is taken from the line itself.
 * <p>
 * Only regions returned by {@code trpPage.getTextRegions(true)} are exported.
 * Right-to-left output (right-aligned paragraphs) was sketched in an earlier
 * revision but never enabled, so paragraphs are always created with default
 * alignment.
 *
 * @param trpPage   page whose text regions are exported
 * @param wordBased whether to prefer word-level text where words exist
 * @return an array with one paragraph per text region, in region order
 * @throws IOException   declared for callers; propagated from the text helpers if they raise it
 * @throws JAXBException declared for callers; propagated from the text helpers if they raise it
 */
public static RtfPara[] getRtfParagraphsForTranscript(TrpPageType trpPage, boolean wordBased) throws IOException, JAXBException {
    List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
    RtfPara[] paras = new RtfPara[textRegions.size()];

    for (int j = 0; j < textRegions.size(); ++j) {
        List<TextLineType> lines = textRegions.get(j).getTextLine();
        RtfText[] linesTexts = new RtfText[lines.size()];

        for (int i = 0; i < lines.size(); ++i) {
            TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
            RtfText lineText = (wordBased && !trpL.getWord().isEmpty())
                    ? getRtfTextForLineFromWords(trpL)
                    : getRtfTextForShapeElement(trpL);
            // terminate every line so lines stay separated inside the paragraph
            linesTexts[i] = RtfText.text(lineText, "\n");
        }

        paras[j] = RtfPara.p(linesTexts);
    }

    return paras;
}
use of eu.transkribus.core.model.beans.pagecontent.TextLineType in project TranskribusCore by Transkribus.
the class PageXmlUtils method getFulltextFromLines.
/**
 * Concatenates the unicode text of all text lines of a page.
 * <p>
 * The lines of each region are sorted with a
 * {@code TrpElementCoordinatesComparator} before their text is appended; every
 * line text is followed by a single space (including the last one). Lines
 * without a text equivalent or without unicode content are skipped.
 * <p>
 * NOTE(review): the sort is applied to the list returned by
 * {@code getTextLine()} itself, so the region's line list may be reordered as
 * a side effect - confirm that callers expect this.
 *
 * @param pc page to read the text from
 * @return the collected line texts, each followed by a space; empty if there is no text
 */
public static String getFulltextFromLines(PcGtsType pc) {
    List<TextRegionType> regions = PageXmlUtils.getTextRegions(pc);
    TrpElementCoordinatesComparator<TextLineType> lineOrder = new TrpElementCoordinatesComparator<>();
    StringBuilder fulltext = new StringBuilder();

    for (TextRegionType region : regions) {
        List<TextLineType> regionLines = region.getTextLine();
        if (regionLines == null || regionLines.isEmpty()) {
            continue;
        }

        Collections.sort(regionLines, lineOrder);
        for (TextLineType line : regionLines) {
            if (line.getTextEquiv() == null || line.getTextEquiv().getUnicode() == null) {
                continue;
            }
            fulltext.append(line.getTextEquiv().getUnicode()).append(" ");
        }
    }

    return fulltext.toString();
}
use of eu.transkribus.core.model.beans.pagecontent.TextLineType in project TranskribusCore by Transkribus.
the class PageXmlUtils method extractStats.
/**
 * Collects transcription statistics for a page.
 * <p>
 * Counts text regions, lines and words, how many of each carry non-blank
 * unicode text, and the number of whitespace-separated tokens found in the
 * region texts and in the line texts.
 * <p>
 * Tokens are counted on the trimmed text and split on runs of whitespace, so
 * leading or repeated blanks do not produce empty tokens and words separated
 * by line breaks or tabs are counted individually.
 *
 * @param page page to analyse
 * @return a statistics object filled with the collected counters
 */
public static TrpTranscriptStatistics extractStats(PcGtsType page) {
    int nrOfTranscribedRegions = 0;
    int nrOfWordsInRegions = 0;
    int nrOfLines = 0;
    int nrOfTranscribedLines = 0;
    int nrOfWordsInLines = 0;
    int nrOfWords = 0;
    int nrOfTranscribedWords = 0;

    List<TextRegionType> regs = PageXmlUtils.getTextRegions(page);
    int nrOfRegions = regs.size();

    for (TextRegionType r : regs) {
        String regionText = r.getTextEquiv() != null ? r.getTextEquiv().getUnicode() : null;
        if (regionText != null && !regionText.trim().isEmpty()) {
            nrOfTranscribedRegions += 1;
            // TODO use tokenizer here
            nrOfWordsInRegions += regionText.trim().split("\\s+").length;
        }

        List<TextLineType> lines = r.getTextLine();
        nrOfLines += lines.size();
        for (TextLineType l : lines) {
            String lineText = l.getTextEquiv() != null ? l.getTextEquiv().getUnicode() : null;
            if (lineText != null && !lineText.trim().isEmpty()) {
                nrOfTranscribedLines += 1;
                // TODO use tokenizer here
                nrOfWordsInLines += lineText.trim().split("\\s+").length;
            }

            List<WordType> words = l.getWord();
            nrOfWords += words.size();
            for (WordType w : words) {
                String wordText = w.getTextEquiv() != null ? w.getTextEquiv().getUnicode() : null;
                if (wordText != null && !wordText.trim().isEmpty()) {
                    nrOfTranscribedWords += 1;
                }
            }
        }
    }

    TrpTranscriptStatistics s = new TrpTranscriptStatistics();
    s.setNrOfLines(nrOfLines);
    s.setNrOfRegions(nrOfRegions);
    s.setNrOfTranscribedLines(nrOfTranscribedLines);
    s.setNrOfTranscribedWords(nrOfTranscribedWords);
    s.setNrOfTranscribedRegions(nrOfTranscribedRegions);
    s.setNrOfWords(nrOfWords);
    s.setNrOfWordsInLines(nrOfWordsInLines);
    s.setNrOfWordsInRegions(nrOfWordsInRegions);
    return s;
}
Aggregations