Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PcGtsTypeMessageBodyReader, method readFrom.
/**
 * Reads a {@link PcGtsType} from the given entity stream by delegating to
 * {@code PageXmlUtils.unmarshal(InputStream)}.
 * <p>
 * The unmarshalling is bracketed by {@code sw.start()} / {@code sw.stop(...)}, which reports
 * the elapsed time through the logger. Any exception raised along the way is logged and
 * rethrown wrapped in a {@link WebApplicationException}.
 *
 * @param type the requested Java type (only used for the debug message)
 * @param genericType the requested generic type (only used for the debug message)
 * @param annotations not used by this implementation
 * @param mediaType the media type of the entity (only used for the debug message)
 * @param httpHeaders not used by this implementation
 * @param entityStream the stream the page content is unmarshalled from
 * @return the unmarshalled page content
 * @throws WebApplicationException wrapping any exception thrown while unmarshalling
 */
@Override
public PcGtsType readFrom(Class<PcGtsType> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, String> httpHeaders, InputStream entityStream) throws IOException, WebApplicationException {
    try {
        final String debugMessage = "unmarshalling PcGtsType from input stream, type = " + type + " genericType = " + genericType + " mediaType = " + mediaType;
        logger.debug(debugMessage);

        sw.start();
        final PcGtsType unmarshalled = PageXmlUtils.unmarshal(entityStream);
        sw.stop(true, "time to unmarshal: ", logger);

        return unmarshalled;
    } catch (Exception failure) {
        // Log with stack trace first, then hand the cause over to the JAX-RS runtime.
        logger.error(failure.getMessage(), failure);
        throw new WebApplicationException(failure);
    }
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class JAXBPageTranscript, method build.
// public TrpPrintSpaceType getPrintSpace() {
// if (pageData != null)
// return (TrpPrintSpaceType) pageData.getPage().getPrintSpace();
// else
// return null;
// }
// public void build() throws JAXBException, IOException {
// if (md != null) {
// JAXBPageTranscript tr = TrpPageTranscriptBuilder.build(md);
// setPageData(tr.getPageData());
// }
// }
/**
 * Unmarshals the page content via {@code PageXmlUtils.unmarshal(md, true)} and stores the
 * result with {@code setPageData}. {@code md} is a member declared outside this method.
 *
 * @throws IOException if unmarshalling fails; a {@link JAXBException} is wrapped as the cause
 */
public void build() throws IOException {
    try {
        // Unmarshal and store in one step; both calls stay inside the try so that a
        // JAXBException from either one is translated the same way.
        setPageData(PageXmlUtils.unmarshal(md, true));
    } catch (JAXBException jaxbFailure) {
        throw new IOException(jaxbFailure);
    }
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class Pdf2TrpDoc, method main.
/**
 * Command-line entry point: pairs the text pages extracted from a PDF with already existing
 * page images and writes one PAGE XML file per image into a {@code page} sub-directory of
 * the image directory.
 * <p>
 * For every page that has a content rectangle, a print space plus text regions, lines and
 * words are created from the extracted PDF text. Pages without a content rectangle are
 * written as empty PAGE XML.
 *
 * @param args exactly one element: the path to the PDF file
 */
public static void main(String[] args) {
    if (args.length != 1) {
        // Tell the caller what went wrong instead of returning silently.
        logger.error("Usage: Pdf2TrpDoc <path to PDF file>");
        return;
    }
    File in = new File(args[0]);
    File outDir = new File("/tmp/");
    outDir.mkdirs();
    try {
        // Image extraction is currently disabled; the image directory is hard-coded instead.
        // PageImageWriter imgWriter = new PageImageWriter();
        // String imgDirPath = imgWriter.extractImages(in.getAbsolutePath(), outDir.getAbsolutePath());
        // NOTE(review): the directory name below looks mis-encoded; left unchanged - confirm the real path.
        String imgDirPath = "/tmp/KurzgefaĆte_Geschichte_Statistik_und_Topographie_von_Tirol";
        File pageDir = new File(imgDirPath + File.separator + "page");
        pageDir.mkdirs();

        TreeMap<String, File> imgs = LocalDocReader.findImgFiles(new File(imgDirPath));
        ArrayList<PDFPage> pages = PDFTextExtractor.processPDF(in.getAbsolutePath());
        if (imgs.size() != pages.size()) {
            logger.error("Nr. of image files does not match nr. of text pages!");
            return;
        }

        // Images and text pages are matched by position: i-th image <-> i-th PDF page.
        int i = 0;
        for (Entry<String, File> img : imgs.entrySet()) {
            PDFPage pdfPage = pages.get(i++);
            Dimension dim = ImgUtils.readImageDimensions(img.getValue());
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img.getValue(), dim);
            final File xmlOut = new File(pageDir.getAbsolutePath() + File.separator + img.getKey() + ".xml");

            Rectangle printspace = pdfPage.getContentRect();
            if (printspace != null) {
                TrpPrintSpaceType psType = new TrpPrintSpaceType();
                psType.setCoords(rect2Coords(printspace));
                TrpPageType pageType = (TrpPageType) pc.getPage();
                // ((ITrpShapeType) pageType).getObservable().setActive(false);
                pageType.setPrintSpace(psType);

                // Mirror the PDF hierarchy region -> line -> string as region -> line -> word.
                for (PDFRegion r : pdfPage.regions) {
                    TrpTextRegionType rType = new TrpTextRegionType(pageType);
                    rType.setCoords(rect2Coords(r.getRect()));
                    rType.setUnicodeText(r.getText(), null);
                    for (PDFLine l : r.lines) {
                        TrpTextLineType lType = new TrpTextLineType(rType);
                        lType.setCoords(rect2Coords(l.getRect()));
                        lType.setUnicodeText(l.getText(), null);
                        for (PDFString s : l.strings) {
                            TrpWordType wType = new TrpWordType(lType);
                            wType.setCoords(rect2Coords(s.getRect()));
                            wType.setUnicodeText(s.value, null);
                            lType.getWord().add(wType);
                        }
                        rType.getTextLine().add(lType);
                    }
                    pageType.getRegions().add(rType);
                }
            }
            PageXmlUtils.marshalToFile(pc, xmlOut);
        }
    } catch (Exception e) {
        // Report through the logger (with stack trace) rather than printStackTrace().
        logger.error("Could not create PAGE XML for " + in.getAbsolutePath() + ": " + e.getMessage(), e);
    }
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class ATeiBuilder, method getPcGtsTypeForPage.
// protected abstract void setTextRegion(TextRegionType r, int pageNr);
/**
 * Returns the page content for the given page, using {@code transcrBuffer} as a cache keyed
 * by page number.
 * <p>
 * On a cache miss the page's current transcript is built via {@link JAXBPageTranscript},
 * its page data is stored in the buffer and then returned.
 *
 * @param p the page whose content is requested
 * @return the buffered or freshly unmarshalled page content
 * @throws JAXBException if building the transcript fails with an {@link IOException}
 */
protected PcGtsType getPcGtsTypeForPage(TrpPage p) throws JAXBException {
    // Fast path: already unmarshalled earlier.
    if (transcrBuffer.containsKey(p.getPageNr())) {
        return transcrBuffer.get(p.getPageNr());
    }

    final TrpTranscriptMetadata currentMd = p.getCurrentTranscript();
    final PcGtsType pageContent;
    try {
        final JAXBPageTranscript transcript = new JAXBPageTranscript(currentMd);
        transcript.build();
        pageContent = transcript.getPageData();
    } catch (IOException ioFailure) {
        throw new JAXBException("Could not unmarshal page " + p.getPageNr(), ioFailure);
    }

    // Remember the result so later requests for this page hit the fast path.
    transcrBuffer.put(p.getPageNr(), pageContent);
    return pageContent;
}
Use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
Class PdfExporter, method export.
/**
 * Writes the selected pages of {@code doc} into a PDF file at {@code path}.
 * <p>
 * Observers are notified with a start message, with the 1-based number of every processed
 * page and with a final message containing the output path. If the {@code cancel} flag is
 * set after a page has been added, the PDF is closed, the incomplete file is deleted and an
 * {@link InterruptedException} is thrown.
 *
 * @param doc the document to export; must not be {@code null}
 * @param path the target path of the PDF file; must not be {@code null}
 * @param pageIndices 0-based indices of the pages to export, or {@code null} for all pages
 * @param useWordLevel passed on to the PDF document and to the tag output
 * @param addTextPages passed on to every {@code addPage} call
 * @param imagesOnly passed on to every {@code addPage} call
 * @param highlightTags passed on to the PDF document; if {@code true}, tags are appended after the pages
 * @param doBlackening passed on to the PDF document and to every {@code addPage} call
 * @param createTitle passed on to the PDF document
 * @param cache export cache consulted for page transcripts; a new one is created if {@code null}
 * @return the written PDF file
 * @throws IllegalArgumentException if {@code doc} or {@code path} is {@code null}
 * @throws IOException if the incomplete PDF file cannot be deleted on cancel
 * @throws InterruptedException if the export was canceled
 */
public File export(final TrpDoc doc, final String path, Set<Integer> pageIndices, final boolean useWordLevel, final boolean addTextPages, final boolean imagesOnly, final boolean highlightTags, final boolean doBlackening, boolean createTitle, ExportCache cache) throws DocumentException, MalformedURLException, IOException, JAXBException, URISyntaxException, InterruptedException {
    if (doc == null) {
        throw new IllegalArgumentException("TrpDoc is null!");
    }
    if (path == null) {
        throw new IllegalArgumentException("path is null!");
    }
    if (cache == null) {
        cache = new ExportCache();
    }

    final File pdfFile = new File(path);
    final TrpPdfDocument pdf = new TrpPdfDocument(pdfFile, useWordLevel, highlightTags, doBlackening, createTitle);
    setChanged();
    notifyObservers("Creating PDF document...");

    // Only the first exported page is handed the document itself (see below).
    boolean firstPagePending = true;
    for (int pageIdx = 0; pageIdx < doc.getPages().size(); pageIdx++) {
        final boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        logger.info("Processing page " + (pageIdx + 1));

        final TrpPage page = doc.getPages().get(pageIdx);
        final URL imgUrl = page.getUrl();

        /*
         * The image metadata is only needed for the resolution, which may be missing in the
         * image itself. For a local doc it stays null and has to be taken from the image.
         */
        FimgStoreImgMd md = null;
        if (doc.isRemoteDoc()) {
            final FimgStoreGetClient mdClient = new FimgStoreGetClient(page.getUrl());
            md = (FimgStoreImgMd) mdClient.getFileMd(page.getKey());
        }

        final URL xmlUrl = page.getCurrentTranscript().getUrl();
        logger.debug("output with tags " + highlightTags);

        // Prefer the transcript held by the cache; otherwise unmarshal it from its URL.
        final JAXBPageTranscript cachedTranscript = cache.getPageTranscriptAtIndex(pageIdx);
        final PcGtsType pc = (cachedTranscript != null) ? cachedTranscript.getPageData() : PageXmlUtils.unmarshal(xmlUrl);

        // The first page gets the doc so that a title page with doc metadata and editorial
        // declarations can be put in front of it (if this option is set); later pages get null.
        final TrpDoc titleSource = firstPagePending ? doc : null;
        pdf.addPage(imgUrl, titleSource, pc, addTextPages, imagesOnly, md, doBlackening, cache);
        firstPagePending = false;

        setChanged();
        notifyObservers(Integer.valueOf(pageIdx + 1));

        if (cancel) {
            pdf.close();
            if (!pdfFile.delete()) {
                throw new IOException("Could not delete the incomplete PDF file during export cancel");
            }
            throw new InterruptedException("Export canceled by the user");
        }
    }

    if (highlightTags) {
        pdf.addTags(doc, pageIndices, useWordLevel, cache);
    }
    pdf.close();

    setChanged();
    notifyObservers("PDF written at: " + path);
    logger.info("PDF written at: " + path);
    return pdfFile;
}
Aggregations