use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method createPageFromAlto2.
/**
 * Creates a PAGE XML file from the given ALTO v2 file.
 * <p>
 * The format of {@code altoXml} is detected with {@code XmlUtils.getXmlFormat}. Only files
 * reported as {@code XmlFormat.ALTO_2} are converted; any other format is rejected.
 *
 * @param imgFile the image file of the page; only its name is handed to the conversion
 * @param altoXml the ALTO XML file to convert
 * @param pageOutFile the file the resulting PAGE document is marshalled to
 * @param preserveOcrTxtStyles forwarded unchanged to {@code PageXmlUtils.createPcGtsTypeFromAlto}
 * @param preserveOcrFontFamily forwarded unchanged to {@code PageXmlUtils.createPcGtsTypeFromAlto}
 * @param replaceBadChars forwarded unchanged to {@code PageXmlUtils.createPcGtsTypeFromAlto}
 * @return the value returned by {@code JaxbUtils.marshalToFile} for {@code pageOutFile}
 * @throws IOException if {@code altoXml} is not detected as ALTO v2
 * @throws TransformerException declared for the underlying conversion
 * @throws SAXException declared for the underlying XML handling
 * @throws ParserConfigurationException declared for the underlying XML handling
 * @throws JAXBException declared for the underlying marshalling
 */
public File createPageFromAlto2(File imgFile, File altoXml, File pageOutFile, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException, TransformerException, SAXException, ParserConfigurationException, JAXBException {
    final XmlFormat detectedFormat = XmlUtils.getXmlFormat(altoXml);

    // Guard clause: anything that is not ALTO v2 is refused up front.
    if (!detectedFormat.equals(XmlFormat.ALTO_2)) {
        throw new IOException("Could not determine xml file as valid alto2: " + altoXml.getAbsolutePath());
    }

    logger.info(altoXml.getAbsolutePath() + ": Transforming ALTO v2 XMLs to PAGE XML.");
    final PcGtsType pageDocument = PageXmlUtils.createPcGtsTypeFromAlto(
            altoXml,
            imgFile.getName(),
            preserveOcrTxtStyles,
            preserveOcrFontFamily,
            replaceBadChars);
    return JaxbUtils.marshalToFile(pageDocument, pageOutFile);
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method setContent.
/**
 * Writes the TEI facsimile and text sections for the given pages into {@code sbTotal}.
 * <p>
 * Pages whose index is not contained in {@code pageIndices} are skipped when that collection
 * is non-null. For every processed page a page break is written to the text section and,
 * if {@code pars.hasZones()} is true, a facsimile element with zones for its regions is
 * written to the facsimile section. The facsimile section is appended to {@code sbTotal}
 * before the text section.
 *
 * @param pages the pages to export
 * @throws JAXBException declared for obtaining the PAGE content of a page
 * @throws InterruptedException if the progress monitor reports cancellation
 */
@Override
protected void setContent(List<TrpPage> pages) throws JAXBException, InterruptedException {
    final SebisStringBuilder facsimileOut = new SebisStringBuilder();
    final SebisStringBuilder textOut = new SebisStringBuilder();

    // Open the <text><body> wrapper; it is closed again after the page loop.
    textOut.incIndent();
    textOut.addLine("<text>");
    textOut.incIndent();
    textOut.addLine("<body>");

    final int totalPages;
    if (pageIndices == null) {
        totalPages = pages.size();
    } else {
        totalPages = pageIndices.size();
    }
    if (monitor != null) {
        monitor.beginTask("Creating TEI", totalPages);
    }

    int processed = 0;
    for (int pageIdx = 0; pageIdx < pages.size(); ++pageIdx) {
        final boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }

        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (processed + 1));
        }

        final TrpPage page = pages.get(pageIdx);
        logger.debug("1Processing page " + page.getPageNr() + ": " + page.getUrl() + " - XML=" + page.getCurrentTranscript().getUrl());

        // Obtain the PAGE content for this page (buffered or freshly unmarshalled by the helper).
        final PcGtsType pageContent = this.getPcGtsTypeForPage(page);

        if (pars.hasZones()) {
            // One facsimile element per page, collected separately from the text body.
            openFacsimileElement(facsimileOut, page, pageContent);
        }

        // One page-break element per page inside the body.
        writePageBreak(textOut, page, pageContent);

        // Regions are sorted in place by reading order before being written.
        final List<TrpRegionType> regions = pageContent.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));

        for (TrpRegionType region : regions) {
            if (!(region instanceof TextRegionType)) {
                // Non-text regions only contribute a zone to the facsimile.
                if (pars.hasZones()) {
                    writeZoneForShape(facsimileOut, region, FACS_ID_PREFIX + page.getPageNr(), true);
                }
                continue;
            }

            if (pars.hasZones()) {
                writeZonesForTextRegion(facsimileOut, (TrpTextRegionType) region, page.getPageNr());
            }
            writeTextForTextRegion(textOut, (TrpTextRegionType) region, page.getPageNr());
        }

        if (pars.hasZones()) {
            closeFacsimilieElement(facsimileOut);
        }

        processed++;
        if (monitor != null) {
            // NOTE(review): this reports the running total, not a per-page increment -
            // confirm this matches what the monitor implementation expects.
            monitor.worked(processed);
        }
    }

    // Close the <body></text> wrapper.
    textOut.addLine("</body>");
    textOut.decIndent();
    textOut.addLine("</text>");
    textOut.decIndent();

    sbTotal.sb.append(facsimileOut.toString());
    sbTotal.sb.append(textOut.toString());
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class AltoExporter method exportAltoFile.
/**
 * Exports the current transcript of a page as an ALTO file.
 * <p>
 * The PAGE XML of the page's current transcript is unmarshalled, marshalled to bytes and
 * transformed with an XSLT stylesheet loaded from the class path. The result is written to
 * {@code fileName} inside {@code altoOutputDir}.
 *
 * @param p the page whose current transcript is exported
 * @param fileName name of the ALTO file to create inside {@code altoOutputDir}
 * @param altoOutputDir directory the ALTO file is written to
 * @param splitIntoWords if true the word-level stylesheet is used, otherwise the default one
 * @return the written ALTO file
 * @throws IllegalArgumentException if {@code p}, {@code fileName} or {@code altoOutputDir} is null
 * @throws JAXBException if reading or marshalling the PAGE XML fails
 * @throws FileNotFoundException if the output file cannot be opened for writing
 * @throws TransformerException if the stylesheet is missing, the transformation fails, or an
 *         I/O error occurs while closing the streams
 */
public File exportAltoFile(TrpPage p, final String fileName, File altoOutputDir, boolean splitIntoWords) throws JAXBException, FileNotFoundException, TransformerException {
    if (p == null || fileName == null || altoOutputDir == null) {
        throw new IllegalArgumentException("An argument is null!");
    }

    TrpTranscriptMetadata t = p.getCurrentTranscript();
    PcGtsType pc = PageXmlUtils.unmarshal(t.getUrl());
    StreamSource pageSource = new StreamSource();
    pageSource.setInputStream(new ByteArrayInputStream(PageXmlUtils.marshalToBytes(pc)));

    final String xsltResource = splitIntoWords ? PAGE_TO_ALTO_WORD_LEVEL_XSLT : PAGE_TO_ALTO_XSLT;
    InputStream is = XslTransformer.class.getClassLoader().getResourceAsStream(xsltResource);
    if (is == null) {
        // Fail with a clear message instead of an obscure error from inside the XSLT processor.
        throw new TransformerException("XSLT resource not found on class path: " + xsltResource);
    }

    final File altoFile = new File(altoOutputDir.getAbsoluteFile(), fileName);

    // The transformer does not close streams handed to it by the caller, so both the
    // stylesheet stream and the output stream are closed here.
    try (InputStream xslIS = new BufferedInputStream(is)) {
        StreamSource xslSource = new StreamSource(xslIS);
        // the factory pattern supports different XSLT processors
        TransformerFactory transFact = TransformerFactory.newInstance();
        Transformer trans = transFact.newTransformer(xslSource);
        try (FileOutputStream out = new FileOutputStream(altoFile)) {
            trans.transform(pageSource, new StreamResult(out));
        }
    } catch (FileNotFoundException e) {
        // Part of the declared contract: propagate unchanged.
        throw e;
    } catch (java.io.IOException e) {
        // Only close() can raise this; wrap it so the method signature stays unchanged.
        throw new TransformerException("I/O error while exporting ALTO file " + altoFile.getAbsolutePath(), e);
    }
    return altoFile;
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class KlosterTeiToPageParser method parsePage.
/**
 * Builds a PAGE document from a TEI {@code pb} (page break) node and the nodes that follow it.
 * <p>
 * All following siblings up to the next node named {@code pb} are scanned. Every {@code lb}
 * element becomes one text line whose bounding box is read from its {@code xmlns:x},
 * {@code xmlns:y}, {@code xmlns:w} and {@code xmlns:h} attributes and whose text is the text
 * content of the node directly following the {@code lb}. All lines are collected in a single
 * text region that spans the union of the line boxes, or the whole page if there are no lines.
 *
 * @param pbNode the {@code pb} element; must carry the attributes {@code facs}, {@code n},
 *        {@code xmlns:w} and {@code xmlns:h}
 * @param save if true and at least one line was found, the PAGE XML is written to
 *        {@code PAGE_DIR} and the image is copied from {@code DIR} to {@code DST_DIR}
 * @throws IOException if copying the image fails
 * @throws JAXBException if marshalling the PAGE XML fails
 * @throws NumberFormatException if one of the numeric attributes cannot be parsed
 */
static void parsePage(Node pbNode, boolean save) throws IOException, JAXBException {
    Element pb = (Element) pbNode;
    String imgFn = pb.getAttribute("facs");
    int pageN = Integer.parseInt(pb.getAttribute("n"));
    int pageHeight = Integer.parseInt(pb.getAttribute("xmlns:h"));
    int pageWidth = Integer.parseInt(pb.getAttribute("xmlns:w"));

    // Use the actual image file name; the literal "imgfn" was passed here before.
    PcGtsType page = PageXmlUtils.createEmptyPcGtsType(imgFn, pageWidth, pageHeight);
    TrpTextRegionType region = new TrpTextRegionType();
    region.setId("region_1");
    System.out.println("page data: imgFn = " + imgFn + " n = " + pageN + " pageWidth = " + pageWidth + " pageHeight = " + pageHeight);

    // Running bounding box over all lines; only read when at least one line exists.
    int minX = Integer.MAX_VALUE, minY = Integer.MAX_VALUE;
    int maxX = Integer.MIN_VALUE, maxY = Integer.MIN_VALUE;
    int lineCount = 0;

    for (Node sibling = pbNode.getNextSibling(); sibling != null; sibling = sibling.getNextSibling()) {
        if (sibling.getNodeName().equals("pb")) {
            // The next page starts here.
            break;
        }
        if (sibling.getNodeType() != Node.ELEMENT_NODE || !sibling.getNodeName().equals("lb")) {
            continue;
        }

        Element lb = (Element) sibling;
        int n = Integer.parseInt(lb.getAttribute("n"));
        int x = Integer.parseInt(lb.getAttribute("xmlns:x"));
        int y = Integer.parseInt(lb.getAttribute("xmlns:y"));
        int w = Integer.parseInt(lb.getAttribute("xmlns:w"));
        int h = Integer.parseInt(lb.getAttribute("xmlns:h"));

        minX = Math.min(minX, x);
        minY = Math.min(minY, y);
        maxX = Math.max(maxX, x + w);
        maxY = Math.max(maxY, y + h);

        // The line text is the node right after the <lb/>; an <lb/> at the very end has none.
        Node textNode = sibling.getNextSibling();
        String txt = textNode == null ? "" : textNode.getTextContent();
        txt = StringUtils.stripEnd(txt, " \r\n");
        System.out.format("line: n = %d, txt = %s, coords = [%d,%d,%d,%d]\n", n, txt, x, y, w, h);

        TrpTextLineType line = new TrpTextLineType();
        line.setCoords(bbToCoords(x, y, w, h));
        TextEquivType te = new TextEquivType();
        te.setUnicode(txt);
        line.setTextEquiv(te);
        line.setId("line_" + (++lineCount));

        // Horizontal baseline at 70% of the line height, spanning the full line width.
        TrpBaselineType bl = new TrpBaselineType();
        int yBl = (int) (y + 0.7 * h);
        bl.setPoints(x + "," + yBl + " " + (x + w) + "," + yBl);
        line.setBaseline(bl);

        region.getTextLine().add(line);
    }

    if (!region.getTextLine().isEmpty()) {
        region.setCoords(bbToCoords(minX, minY, maxX - minX, maxY - minY));
    } else {
        // No lines: let the region cover the whole page.
        region.setCoords(bbToCoords(0, 0, pageWidth, pageHeight));
    }
    page.getPage().getTextRegionOrImageRegionOrLineDrawingRegion().add(region);

    if (save && !region.getTextLine().isEmpty()) {
        File xmlFile = new File(PAGE_DIR + FilenameUtils.getBaseName(imgFn) + ".xml");
        PageXmlUtils.marshalToFile(page, xmlFile);
        FileUtils.copyFile(new File(DIR + imgFn), new File(DST_DIR + imgFn));
        System.out.println("written page to: " + xmlFile.getAbsolutePath());
    }
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class PageXmlUtils method unmarshal.
/**
 * Unmarshals the PAGE XML document at the given URL.
 * <p>
 * After unmarshalling, {@code onPostConstruct} is invoked on the resulting object before it
 * is returned.
 *
 * @param url location of the PAGE XML document
 * @return the unmarshalled PAGE content
 * @throws JAXBException if unmarshalling fails
 */
public static PcGtsType unmarshal(URL url) throws JAXBException {
    final Unmarshaller unmarshaller = createUnmarshaller();

    // The unmarshaller returns the root wrapped in a JAXBElement; the cast cannot be checked at runtime.
    @SuppressWarnings("unchecked")
    final JAXBElement<PcGtsType> root = (JAXBElement<PcGtsType>) unmarshaller.unmarshal(url);

    final PcGtsType result = root.getValue();
    onPostConstruct(result);
    return result;
}
Aggregations