Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PageXmlUtils, method createPcGtsTypeFromText.
/**
 * Creates a PAGE document for an image and fills it with the given plain text.
 * <p>
 * The page contains exactly one text region ("region_1") whose coordinates are
 * the rectangle (0, 0, imageWidth, imageHeight) of the freshly created page.
 * Every line and word created here reuses those same coordinates, because no
 * layout information is available for plain text.
 * <ul>
 * <li>REGION_BASED: the complete text is stored on the region.</li>
 * <li>LINE_BASED: the text is split at line breaks; each piece becomes a line
 * ("line_1", "line_2", ...) carrying its text.</li>
 * <li>WORD_BASED: as LINE_BASED, but each line is further split at single
 * spaces into words ("word_1", ...) and the text is stored on the words.</li>
 * </ul>
 * Note that {@link String#split(String)} drops trailing empty pieces, so
 * trailing line breaks never produce lines, regardless of skipEmptyLines.
 *
 * @param imgFileName file name of the image, passed on to createEmptyPcGtsType
 * @param dim image dimension, passed on to createEmptyPcGtsType
 * @param text the text to insert; must not be null for LINE_BASED/WORD_BASED
 * @param level the transcription level; null is treated as LINE_BASED
 * @param skipEmptyLines if true, runs of line breaks act as one separator and
 *        no line element is created for an empty piece
 * @return the created document
 * @throws IOException if level is not REGION_BASED, LINE_BASED or WORD_BASED
 */
public static PcGtsType createPcGtsTypeFromText(final String imgFileName, Dimension dim, String text, TranscriptionLevel level, boolean skipEmptyLines) throws IOException {
    // create empty page
    PcGtsType pcGtsType = createEmptyPcGtsType(imgFileName, dim);
    TrpPageType page = (TrpPageType) pcGtsType.getPage();

    // create and add text region with size of image
    Rectangle r = new Rectangle(0, 0, page.getImageWidth(), page.getImageHeight());
    String defaultCoords = PointStrUtils.pointsToString(r);
    TrpTextRegionType region = new TrpTextRegionType(page);
    region.setId("region_1");
    region.setCoordinates(defaultCoords, null);
    page.getTextRegionOrImageRegionOrLineDrawingRegion().add(region);

    if (level == null) {
        level = TranscriptionLevel.LINE_BASED;
    }
    if (level != TranscriptionLevel.REGION_BASED && level != TranscriptionLevel.LINE_BASED && level != TranscriptionLevel.WORD_BASED) {
        throw new IOException("Invalid TranscriptionLevel: " + level);
    }

    if (level == TranscriptionLevel.REGION_BASED) {
        region.setUnicodeText(text, null);
        return pcGtsType;
    }

    String splitRegex = skipEmptyLines ? "[\\r\\n]+" : "\\r?\\n";
    String[] lines = text.split(splitRegex);
    // raw number of pieces; an empty leading piece may still be dropped below
    logger.debug("nr of lines = " + lines.length);
    int lc = 1;
    for (String lineText : lines) {
        // split() keeps an empty first piece when the text starts with a line
        // break (or is empty), so the regex alone does not skip every empty line
        if (skipEmptyLines && lineText.isEmpty()) {
            continue;
        }
        TrpTextLineType line = new TrpTextLineType(region);
        line.setId("line_" + (lc++));
        line.setCoordinates(defaultCoords, null);
        region.getTextLine().add(line);
        if (level == TranscriptionLevel.LINE_BASED) {
            line.setUnicodeText(lineText, null);
        } else if (level == TranscriptionLevel.WORD_BASED) {
            int wc = 1;
            for (String wordText : lineText.split(" ")) {
                // TODO: better word splitting??
                TrpWordType word = new TrpWordType(line);
                word.setId("word_" + (wc++));
                word.setCoordinates(defaultCoords, null);
                word.setUnicodeText(wordText, null);
                line.getWord().add(word);
            }
        }
    }
    return pcGtsType;
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PageXmlUtils, method createEmptyPcGtsType.
/**
 * Builds a PAGE root element that holds only metadata and an empty page.
 * <p>
 * The metadata names "TRP" as creator and uses one and the same timestamp
 * (taken when this method runs) for both the creation and the last-change date.
 * The page is a {@code TrpPageType} carrying the given file name and size.
 *
 * @param imgFileName the image file name to store on the page
 * @param xDim the image width
 * @param yDim the image height
 * @return the new root element with metadata and page set
 */
public static PcGtsType createEmptyPcGtsType(final String imgFileName, final int xDim, final int yDim) {
    // metadata: a single timestamp serves as created and last-change date
    final MetadataType metadata = new MetadataType();
    metadata.setCreator("TRP");
    final XMLGregorianCalendar now = JaxbUtils.getXmlCalendar(new Date());
    metadata.setCreated(now);
    metadata.setLastChange(now);

    // the page must be the TRP-specific page type, not the plain one
    final TrpPageType trpPage = new TrpPageType();
    trpPage.setImageFilename(imgFileName);
    trpPage.setImageHeight(yDim);
    trpPage.setImageWidth(xDim);

    // assemble the root element
    final PcGtsType root = new PcGtsType();
    root.setMetadata(metadata);
    root.setPage(trpPage);
    return root;
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PageXmlUtilsTest, method testSth.
/**
 * Manual smoke test: fetches image metadata from a fixed file-store URL,
 * creates an empty PAGE document from that URL and the reported image
 * dimension, and prints the marshalled document to standard output.
 * <p>
 * I/O and JAXB failures are not propagated; their stack trace is printed.
 *
 * @throws Exception declared by the signature for any other failure
 */
public static void testSth() throws Exception {
    // Only referenced by the disabled format-detection loop at the end of this method.
    File[] files = {
        new File("/mnt/dea_scratch/TRP/test/page_xsl_test/ocr/Mittheilungen_Perthes_1855_0009.xml"),
        new File("/mnt/dea_scratch/TRP/test/ImagesOldPageXml/page/2010-03-19_backup/035_320_001.xml"),
        new File("/mnt/dea_scratch/TRP/test/page_xsl_test/Mittheilungen_Perthes_1855_0009.xml")
    };

    try {
        URL metadataUrl = new URL("https://dbis-thure.uibk.ac.at/fimagestore/Get?fileType=metadata&id=YSUGXUUGAHYCUQVMEUJAYQGO");
        FimgStoreImgMd imgMd = FimgStoreReadConnection.getImgMd(metadataUrl);
        PcGtsType emptyDoc = PageXmlUtils.createEmptyPcGtsType(metadataUrl, imgMd.getDimension());
        JaxbUtils.marshalToSysOut(emptyDoc);
    } catch (IOException | JAXBException ex) {
        ex.printStackTrace();
    }

    // Disabled: print the detected XML format of each file in 'files'.
    // for(File f : files){
    // try {
    // System.out.println(XmlUtils.getXmlFormat(f).toString());
    // } catch (IOException e) {
    //
    // e.printStackTrace();
    // }
    // }
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PageXmlUtilsTest, method testGetTextRegions.
/**
 * Manual smoke test: unmarshals a transcript from a fixed URL (named in the
 * original code as one containing tables), collects its text regions via
 * {@code PageXmlUtils.getTextRegions} and prints, for each region, its runtime
 * class name, its id and the number of text lines it holds.
 *
 * @throws Exception if the URL is malformed or loading the transcript fails
 */
public static void testGetTextRegions() throws Exception {
    final URL transcriptUrl = new URL("https://dbis-thure.uibk.ac.at/f/Get?id=VCLTRLDSWETCXIHQNHKOPRLS");
    final PcGtsType transcript = PageXmlUtils.unmarshal(transcriptUrl);
    final List<TextRegionType> regions = PageXmlUtils.getTextRegions(transcript);

    for (TextRegionType region : regions) {
        final String typeName = region.getClass().getSimpleName();
        final StringBuilder report = new StringBuilder();
        report.append("tr: ").append(typeName);
        report.append(" id: ").append(region.getId());
        report.append(" n-lines: ").append(region.getTextLine().size());
        System.out.println(report.toString());
    }
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class XslTransformTest, method main.
/**
 * Manual driver: transforms a fixed ABBYY XML file into a {@code PcGtsType}
 * using the ABBY_TO_PAGE_XSLT stylesheet with the "preserveTextStyles"
 * parameter set to false, sets the image file name, writes the result to a
 * fixed output file and finally unmarshals that file again.
 * <p>
 * Any failure along the way is reported by printing its stack trace.
 *
 * @param args not used
 */
public static void main(String[] args) {
    final File abbyyInput = new File("C:/tmp/Alto2PageTest/ocr/177907.xml");
    final File pageOutput = new File("C:/tmp/Alto2PageTest/abbyyToPageTest.xml");

    // stylesheet parameters: text styles are not preserved
    final Map<String, Object> xsltParams = new HashMap<>();
    xsltParams.put("preserveTextStyles", Boolean.FALSE);

    try {
        final PcGtsType converted = JaxbUtils.transformToObject(abbyyInput, ABBY_TO_PAGE_XSLT, xsltParams, PcGtsType.class);
        converted.getPage().setImageFilename("177907.JPG");
        // write the document, then read it back; the results are not inspected
        JaxbUtils.marshalToFile(converted, pageOutput);
        PageXmlUtils.unmarshal(pageOutput);
    } catch (TransformerException | SAXException | IOException | ParserConfigurationException | JAXBException ex) {
        ex.printStackTrace();
    }
}
Aggregations