Use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPrintSpaceType in the project TranskribusCore by Transkribus.
Shown here: the method main of the class Pdf2TrpDoc.
/**
 * Builds one PAGE XML file per page image from the text extracted out of a PDF.
 *
 * <p>The single command-line argument is the path of the PDF. The page images are
 * read from a hard-coded directory (the image-extraction step is currently
 * commented out) and the XML files are written to a {@code page} sub-directory of
 * that image directory. For every page the content rectangle reported by the PDF
 * extractor becomes the print space, and the extracted regions, lines and strings
 * are added as text regions, text lines and words.
 *
 * <p>Nothing is written when the number of image files differs from the number of
 * extracted PDF pages. Any exception is logged and ends the run.
 *
 * @param args exactly one element: the path of the PDF file to process
 */
public static void main(String[] args) {
    if (args.length != 1) {
        // Tell the user why nothing happened instead of exiting silently.
        logger.error("Usage: Pdf2TrpDoc <path-to-pdf-file>");
        return;
    }
    File in = new File(args[0]);
    // Only needed by the (currently disabled) image-extraction step below.
    File outDir = new File("/tmp/");
    outDir.mkdirs();
    try {
        // Image extraction is disabled; a previously extracted directory is used instead.
        // PageImageWriter imgWriter = new PageImageWriter();
        // String imgDirPath = imgWriter.extractImages(in.getAbsolutePath(), outDir.getAbsolutePath());
        // NOTE(review): the directory name below looks mis-encoded ("KurzgefaĆte") - confirm the intended path.
        String imgDirPath = "/tmp/KurzgefaĆte_Geschichte_Statistik_und_Topographie_von_Tirol";
        File pageDir = new File(imgDirPath + File.separator + "page");
        if (!pageDir.mkdirs() && !pageDir.isDirectory()) {
            logger.error("Could not create output directory: " + pageDir.getAbsolutePath());
            return;
        }

        TreeMap<String, File> imgs = LocalDocReader.findImgFiles(new File(imgDirPath));
        ArrayList<PDFPage> pages = PDFTextExtractor.processPDF(in.getAbsolutePath());
        if (imgs.size() != pages.size()) {
            logger.error("Nr. of image files does not match nr. of text pages!");
            return;
        }

        // Images are visited in the TreeMap's key order and paired with the PDF pages by position.
        // NOTE(review): presumably the sorted image keys follow the PDF page order - verify.
        int i = 0;
        for (Entry<String, File> img : imgs.entrySet()) {
            PDFPage pdfPage = pages.get(i++);
            Dimension dim = ImgUtils.readImageDimensions(img.getValue());
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img.getValue(), dim);
            final File xmlOut = new File(pageDir.getAbsolutePath() + File.separator + img.getKey() + ".xml");

            // Pages without a content rectangle are written as empty PAGE XML.
            Rectangle printspace = pdfPage.getContentRect();
            if (printspace != null) {
                TrpPrintSpaceType psType = new TrpPrintSpaceType();
                psType.setCoords(rect2Coords(printspace));
                TrpPageType pageType = (TrpPageType) pc.getPage();
                // ((ITrpShapeType) pageType).getObservable().setActive(false);
                pageType.setPrintSpace(psType);

                // Mirror the PDF hierarchy: region -> line -> string becomes region -> line -> word.
                for (PDFRegion r : pdfPage.regions) {
                    TrpTextRegionType rType = new TrpTextRegionType(pageType);
                    rType.setCoords(rect2Coords(r.getRect()));
                    rType.setUnicodeText(r.getText(), null);
                    for (PDFLine l : r.lines) {
                        TrpTextLineType lType = new TrpTextLineType(rType);
                        lType.setCoords(rect2Coords(l.getRect()));
                        lType.setUnicodeText(l.getText(), null);
                        for (PDFString s : l.strings) {
                            TrpWordType wType = new TrpWordType(lType);
                            wType.setCoords(rect2Coords(s.getRect()));
                            wType.setUnicodeText(s.value, null);
                            lType.getWord().add(wType);
                        }
                        rType.getTextLine().add(lType);
                    }
                    pageType.getRegions().add(rType);
                }
            }
            PageXmlUtils.marshalToFile(pc, xmlOut);
        }
    } catch (Exception e) {
        // Top-level boundary of the tool: report the failure with its cause through the logger.
        logger.error("Could not convert PDF to PAGE XML: " + in.getAbsolutePath(), e);
    }
}
Aggregations