use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.
the class DocxBuilder method exportTextRegion.
/**
 * Appends the text lines of a text region to a docx paragraph.
 *
 * <p>Each line is rendered from its words when {@code wordBased} is set and the
 * line has words, otherwise from the line shape itself. A line carrying a
 * paragraph tag (the ¶ marker, U+00B6) closes the current paragraph and opens a
 * new one in the document.
 *
 * @param tr        the text region whose lines are exported
 * @param wordBased if {@code true}, prefer the words of a line over the line text
 * @param p         paragraph to write into; when {@code null} a new paragraph is
 *                  created and added to {@code mdp}
 * @param mdp       the main document part that receives newly created paragraphs
 */
private static void exportTextRegion(TrpTextRegionType tr, boolean wordBased, P p, MainDocumentPart mdp) {
    if (p == null) {
        p = factory.createP();
        mdp.addObject(p);
    }

    List<TextLineType> regionLines = tr.getTextLine();
    int idx = 0;
    while (idx < regionLines.size()) {
        TrpTextLineType currentLine = (TrpTextLineType) regionLines.get(idx);

        try {
            if (!wordBased || currentLine.getWord().isEmpty()) {
                getFormattedTextForShapeElement(currentLine, p, mdp);
            } else {
                getFormattedTextForLineElement(currentLine.getWord(), p, mdp);
            }
        } catch (Exception e) {
            // A failing line is reported and skipped; the remaining lines are still exported.
            e.printStackTrace();
        }

        boolean startsNewParagraph = currentLine.getCustomTagList().containsParagraphTag();
        boolean isLastLine = (idx + 1 == regionLines.size());
        if (startsNewParagraph) {
            // Following lines go into a fresh paragraph.
            p = factory.createP();
            mdp.addObject(p);
        } else if (preserveLineBreaks && !isLastLine) {
            // Line break between text lines, but none after the final line.
            Br lineBreak = factory.createBr();
            p.getContent().add(lineBreak);
        }

        idx++;
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.
the class TrpRtfBuilder method getRtfParagraphsForTranscript.
// public static void writeRtfForElement(Rtf rtf, ITrpShapeType element, boolean wordBased, File file, boolean append) throws IOException, JAXBException {
// element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
//
// RtfText text = RtfText.text(element.getUnicodeText());
// text = formatRtfText(text, element.getTextStyle());
//
//
//
// if (element instanceof TextLineType || element instanceof TextRegionType) {// TODO words vs lines and regions
// rtf.p(text);
// } else if (element instanceof TrpWordType) {
// // rtf.p(texts);
// }
//
//
// // cl.getCustomTagAndContinuations(tag)
//
//
//
// }
/**
 * Builds one RTF paragraph per text region of the given page.
 *
 * <p>Text regions are fetched with {@code trpPage.getTextRegions(true)}. Every
 * line of a region becomes one {@link RtfText} followed by a {@code "\n"}; the
 * lines of a region are then combined into a single {@link RtfPara}.
 *
 * @param trpPage   the page whose text regions are exported
 * @param wordBased if {@code true}, a line that has words is rendered from its
 *                  words; otherwise (or if the line has no words) from the line itself
 * @return an array with one paragraph per text region, in the order returned by
 *         {@code getTextRegions(true)}
 * @throws IOException   declared for callers; kept for interface compatibility
 * @throws JAXBException declared for callers; kept for interface compatibility
 */
public static RtfPara[] getRtfParagraphsForTranscript(TrpPageType trpPage, boolean wordBased) throws IOException, JAXBException {
    List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
    RtfPara[] paras = new RtfPara[textRegions.size()];

    for (int j = 0; j < textRegions.size(); ++j) {
        List<TextLineType> lines = textRegions.get(j).getTextLine();
        RtfText[] linesTexts = new RtfText[lines.size()];

        for (int i = 0; i < lines.size(); ++i) {
            TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
            RtfText lineText = (wordBased && trpL.getWord().size() > 0)
                    ? getRtfTextForLineFromWords(trpL)
                    : getRtfTextForShapeElement(trpL);
            // Terminate every line with a newline inside the paragraph.
            linesTexts[i] = RtfText.text(lineText, "\n");
        }

        // TODO: right-to-left scripts would need a right-aligned paragraph here;
        // this was never implemented (the former rtl flag was a constant false).
        paras[j] = RtfPara.p(linesTexts);
    }
    return paras;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.
the class TrpTxtBuilder method writeTxtForSinglePage.
/**
 * Appends the plain text of one page to a text file.
 *
 * <p>The regions of the page are sorted with a
 * {@code TrpElementReadingOrderComparator} and then processed in that order.
 * Table regions are skipped. For each text region the text of every line (or
 * of every word of the line, when {@code wordBased} is set and the line has
 * words) is collected as one entry; a region with at least one line is followed
 * by an extra line-separator entry. All entries are then written with
 * {@link Files#write}, creating the file if necessary and appending otherwise.
 *
 * @param file               target text file
 * @param trpPage            page whose regions are exported
 * @param wordBased          if {@code true}, export the words of a line instead of the line text
 * @param preserveLineBreaks currently not evaluated by this method
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));

    List<String> content = new ArrayList<String>();
    for (TrpRegionType r : regions) {
        if (r instanceof TrpTableRegionType) {
            // TODO: for simple txt export: how to handle tables
            continue;
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            List<TextLineType> lines = tr.getTextLine();
            for (int i = 0; i < lines.size(); ++i) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
                String textOfCurrLine = trpL.getUnicodeText();
                if (wordBased && trpL.getWord().size() > 0) {
                    for (WordType word : trpL.getWord()) {
                        content.add(((ITrpShapeType) word).getUnicodeText());
                    }
                } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                    // Compare by content, not by reference: `!= ""` let null and
                    // non-interned empty strings through, and Files.write renders
                    // a null entry as the literal text "null".
                    content.add(textOfCurrLine);
                }
            }
            if (!lines.isEmpty()) {
                // Separate regions from each other.
                content.add(System.lineSeparator());
            }
        }
    }

    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // Best effort as before (no rethrow), but report through the logger instead of stderr.
        logger.error("Could not write txt export to " + file.getAbsolutePath(), e);
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.
the class KlosterTeiToPageParser method parsePage.
/**
 * Converts the lines following a TEI {@code pb} (page break) node into a PAGE
 * document with a single text region.
 *
 * <p>Starting at the next sibling of {@code pbNode}, every {@code lb} element
 * up to the next {@code pb} node becomes one text line. Its bounding box is
 * read from the {@code xmlns:x/y/w/h} attributes and its text from the node
 * that directly follows the {@code lb} element (trailing blanks and line breaks
 * stripped). A horizontal baseline is placed at 70% of the line height. The
 * region covers the union of all line boxes, or the whole page if no line was
 * found.
 *
 * @param pbNode the {@code pb} element; must carry the attributes {@code facs},
 *               {@code n}, {@code xmlns:h} and {@code xmlns:w}
 * @param save   if {@code true} and at least one line was found, the PAGE XML is
 *               written below {@code PAGE_DIR} and the image is copied from
 *               {@code DIR} to {@code DST_DIR}
 * @throws IOException           if copying the image fails
 * @throws JAXBException         if marshalling the PAGE XML fails
 * @throws NumberFormatException if a numeric attribute is missing or malformed
 */
static void parsePage(Node pbNode, boolean save) throws IOException, JAXBException {
    Element pb = (Element) pbNode;
    String imgFn = pb.getAttribute("facs");
    int pageN = Integer.parseInt(pb.getAttribute("n"));
    int pageHeight = Integer.parseInt(pb.getAttribute("xmlns:h"));
    int pageWidth = Integer.parseInt(pb.getAttribute("xmlns:w"));

    // NOTE(review): the literal "imgfn" is passed here rather than the imgFn
    // variable read above - looks unintended; confirm before changing.
    PcGtsType page = PageXmlUtils.createEmptyPcGtsType("imgfn", pageWidth, pageHeight);
    TrpTextRegionType region = new TrpTextRegionType();
    region.setId("region_1");
    System.out.println("page data: imgFn = " + imgFn + " n = " + pageN + " pageWidth = " + pageWidth + " pageHeight = " + pageHeight);

    // Running bounding box over all lines of the page.
    int minX = 999999, minY = 999999, maxX = -1, maxY = -1;
    int lineCount = 0;

    for (Node sibling = pbNode.getNextSibling(); sibling != null; sibling = sibling.getNextSibling()) {
        if (sibling.getNodeName().equals("pb")) {
            // The next page starts here.
            break;
        }
        if (sibling.getNodeType() != Node.ELEMENT_NODE || !sibling.getNodeName().equals("lb")) {
            continue;
        }

        Element lb = (Element) sibling;
        int n = Integer.parseInt(lb.getAttribute("n"));
        int x = Integer.parseInt(lb.getAttribute("xmlns:x"));
        int y = Integer.parseInt(lb.getAttribute("xmlns:y"));
        int w = Integer.parseInt(lb.getAttribute("xmlns:w"));
        int h = Integer.parseInt(lb.getAttribute("xmlns:h"));

        minX = Math.min(minX, x);
        minY = Math.min(minY, y);
        maxX = Math.max(maxX, x + w);
        maxY = Math.max(maxY, y + h);

        // The line text is the node right after the <lb/>. An <lb/> that is the
        // last sibling has no such node; treat it as an empty line instead of
        // failing with a NullPointerException.
        Node textNode = sibling.getNextSibling();
        String txt = (textNode != null) ? textNode.getTextContent() : "";
        txt = StringUtils.stripEnd(txt, " \r\n");
        System.out.format("line: n = %d, txt = %s, coords = [%d,%d,%d,%d]\n", n, txt, x, y, w, h);

        TrpTextLineType line = new TrpTextLineType();
        line.setCoords(bbToCoords(x, y, w, h));
        TextEquivType te = new TextEquivType();
        te.setUnicode(txt);
        line.setTextEquiv(te);
        line.setId("line_" + (++lineCount));

        // create baseline: horizontal, at 70% of the line height
        TrpBaselineType bl = new TrpBaselineType();
        int yBl = (int) (y + 0.7 * h);
        bl.setPoints(x + "," + yBl + " " + (x + w) + "," + yBl);
        line.setBaseline(bl);

        region.getTextLine().add(line);
    }

    if (!region.getTextLine().isEmpty()) {
        region.setCoords(bbToCoords(minX, minY, maxX - minX, maxY - minY));
    } else {
        // No lines: let the region span the whole page.
        region.setCoords(bbToCoords(0, 0, pageWidth, pageHeight));
    }
    page.getPage().getTextRegionOrImageRegionOrLineDrawingRegion().add(region);

    if (save && !region.getTextLine().isEmpty()) {
        File xmlFile = new File(PAGE_DIR + FilenameUtils.getBaseName(imgFn) + ".xml");
        PageXmlUtils.marshalToFile(page, xmlFile);
        FileUtils.copyFile(new File(DIR + imgFn), new File(DST_DIR + imgFn));
        System.out.println("written page to: " + xmlFile.getAbsolutePath());
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.
the class PageXmlUtils method removeAllIndexedTags.
/**
 * Removes the indexed tags from the custom tag lists of every text region,
 * every line in those regions and every word in those lines.
 *
 * <p>Text regions are obtained via {@code getTextRegions(true)} on the page of
 * the given document.
 *
 * @param pc the PAGE document; its page must be a {@link TrpPageType}
 */
public static void removeAllIndexedTags(PcGtsType pc) {
    TrpPageType page = (TrpPageType) pc.getPage();
    for (TrpTextRegionType region : page.getTextRegions(true)) {
        region.getCustomTagList().removeIndexedTags();

        for (TextLineType rawLine : region.getTextLine()) {
            TrpTextLineType line = (TrpTextLineType) rawLine;
            line.getCustomTagList().removeIndexedTags();

            for (WordType rawWord : line.getWord()) {
                ((TrpWordType) rawWord).getCustomTagList().removeIndexedTags();
            }
        }
    }
}
Aggregations