Search in sources :

Example 16 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class PcGtsTypeMessageBodyReader method readFrom.

/**
 * Reads a {@link PcGtsType} from the entity stream by delegating to
 * {@code PageXmlUtils.unmarshal(InputStream)}.
 *
 * <p>The time spent unmarshalling is measured with the {@code sw} field and reported
 * through the logger. Any exception raised while logging, timing or unmarshalling is
 * logged and rethrown wrapped in a {@link WebApplicationException}.
 *
 * @param type the requested entity class (only used for the debug message)
 * @param genericType the requested generic type (only used for the debug message)
 * @param annotations not used by this implementation
 * @param mediaType the media type of the entity (only used for the debug message)
 * @param httpHeaders not used by this implementation
 * @param entityStream the stream the page content is unmarshalled from
 * @return the unmarshalled page content
 * @throws WebApplicationException wrapping whatever exception occurred
 */
@Override
public PcGtsType readFrom(Class<PcGtsType> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, String> httpHeaders, InputStream entityStream) throws IOException, WebApplicationException {
    try {
        final String debugMessage = "unmarshalling PcGtsType from input stream, type = " + type + " genericType = " + genericType + " mediaType = " + mediaType;
        logger.debug(debugMessage);
        // The stopwatch brackets exactly the unmarshal call.
        sw.start();
        final PcGtsType unmarshalled = PageXmlUtils.unmarshal(entityStream);
        sw.stop(true, "time to unmarshal: ", logger);
        return unmarshalled;
    } catch (Exception failure) {
        // Log first, then hand the cause on to the JAX-RS runtime.
        logger.error(failure.getMessage(), failure);
        throw new WebApplicationException(failure);
    }
}
Also used : WebApplicationException(javax.ws.rs.WebApplicationException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) IOException(java.io.IOException)

Example 17 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class JAXBPageTranscript method build.

// public TrpPrintSpaceType getPrintSpace() {
// if (pageData != null)
// return (TrpPrintSpaceType) pageData.getPage().getPrintSpace();
// else
// return null;
// }
// public void build() throws JAXBException, IOException {
// if (md != null) {
// JAXBPageTranscript tr = TrpPageTranscriptBuilder.build(md);
// setPageData(tr.getPageData());
// }
// }
/**
 * Unmarshals the page content referenced by {@code md} via
 * {@code PageXmlUtils.unmarshal(md, true)} and stores it with {@code setPageData}.
 *
 * @throws IOException wrapping any {@link JAXBException} raised while unmarshalling
 *         or storing the page data
 */
public void build() throws IOException {
    try {
        // NOTE(review): the meaning of the boolean flag is defined by PageXmlUtils - confirm there.
        final PcGtsType unmarshalled = PageXmlUtils.unmarshal(md, true);
        setPageData(unmarshalled);
    } catch (JAXBException jaxbFailure) {
        // Callers only deal with IOException; the JAXB failure is kept as the cause.
        throw new IOException(jaxbFailure);
    }
}
Also used : JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType)

Example 18 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class Pdf2TrpDoc method main.

/**
 * Development entry point: creates one PAGE XML file per page image for the PDF given as
 * the single command line argument.
 *
 * <p>The text layout is taken from {@code PDFTextExtractor.processPDF}; the page images are
 * looked up in a hard-coded directory (image extraction itself is commented out), and the
 * XML files are written to a {@code page} sub-directory of that image directory. Images
 * and PDF pages are paired by position, so the run is aborted when their counts differ.
 * Regions, lines and words are only added for pages that report a content rectangle.
 *
 * @param args exactly one element: the path to the PDF file
 */
public static void main(String[] args) {
    if (args.length != 1) {
        // Previously this exited silently, which made a wrong invocation hard to diagnose.
        logger.error("Expected exactly one argument (path to the PDF file) but got " + args.length);
        return;
    }
    File in = new File(args[0]);
    File outDir = new File("/tmp/");
    outDir.mkdirs();
    try {
        // PageImageWriter imgWriter = new PageImageWriter();
        // String imgDirPath = imgWriter.extractImages(in.getAbsolutePath(), outDir.getAbsolutePath());
        // NOTE(review): hard-coded image directory; it only fits the one document it was written for.
        String imgDirPath = "/tmp/KurzgefaƟte_Geschichte_Statistik_und_Topographie_von_Tirol";
        File pageDir = new File(imgDirPath + File.separator + "page");
        pageDir.mkdirs();
        TreeMap<String, File> imgs = LocalDocReader.findImgFiles(new File(imgDirPath));
        ArrayList<PDFPage> pages = PDFTextExtractor.processPDF(in.getAbsolutePath());
        if (imgs.size() != pages.size()) {
            logger.error("Nr. of image files does not match nr. of text pages!");
            return;
        }
        // The TreeMap iterates in key order; the i-th image is paired with the i-th PDF page.
        int i = 0;
        for (Entry<String, File> img : imgs.entrySet()) {
            PDFPage pdfPage = pages.get(i++);
            Dimension dim = ImgUtils.readImageDimensions(img.getValue());
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img.getValue(), dim);
            final File xmlOut = new File(pageDir.getAbsolutePath() + File.separator + img.getKey() + ".xml");
            Rectangle printspace = pdfPage.getContentRect();
            if (printspace != null) {
                TrpPrintSpaceType psType = new TrpPrintSpaceType();
                psType.setCoords(rect2Coords(printspace));
                TrpPageType pageType = (TrpPageType) pc.getPage();
                // ((ITrpShapeType) pageType).getObservable().setActive(false);
                pageType.setPrintSpace(psType);
                // Mirror the PDF hierarchy region -> line -> string as region -> line -> word.
                for (PDFRegion r : pdfPage.regions) {
                    TrpTextRegionType rType = new TrpTextRegionType(pageType);
                    rType.setCoords(rect2Coords(r.getRect()));
                    rType.setUnicodeText(r.getText(), null);
                    for (PDFLine l : r.lines) {
                        TrpTextLineType lType = new TrpTextLineType(rType);
                        lType.setCoords(rect2Coords(l.getRect()));
                        lType.setUnicodeText(l.getText(), null);
                        for (PDFString s : l.strings) {
                            TrpWordType wType = new TrpWordType(lType);
                            wType.setCoords(rect2Coords(s.getRect()));
                            wType.setUnicodeText(s.value, null);
                            lType.getWord().add(wType);
                        }
                        rType.getTextLine().add(lType);
                    }
                    pageType.getRegions().add(rType);
                }
            }
            // An XML file is produced for every image, even when no content rectangle was found.
            PageXmlUtils.marshalToFile(pc, xmlOut);
        }
    } catch (Exception e) {
        // Top-level boundary of a command line tool: report through the logger with the cause attached.
        logger.error("Could not create PAGE XML files for " + in.getAbsolutePath(), e);
    }
}
Also used : PDFString(org.dea.util.pdf.beans.PDFString) Rectangle(java.awt.Rectangle) Dimension(java.awt.Dimension) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) PDFRegion(org.dea.util.pdf.beans.PDFRegion) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) PDFLine(org.dea.util.pdf.beans.PDFLine) File(java.io.File) PDFPage(org.dea.util.pdf.beans.PDFPage) TrpPrintSpaceType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPrintSpaceType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 19 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class ATeiBuilder method getPcGtsTypeForPage.

// protected abstract void setTextRegion(TextRegionType r, int pageNr);
/**
 * Returns the page content for the given page, using {@code transcrBuffer} as a cache
 * keyed by page number.
 *
 * <p>When the buffer has an entry for the page number, that entry is returned as is.
 * Otherwise the page's current transcript is built through {@link JAXBPageTranscript},
 * the result is stored in the buffer and returned.
 *
 * @param p the page whose content is requested
 * @return the buffered or freshly built page content
 * @throws JAXBException if building the transcript fails with an {@link IOException}
 */
protected PcGtsType getPcGtsTypeForPage(TrpPage p) throws JAXBException {
    // Buffer hit: hand back whatever is stored under this page number.
    if (transcrBuffer.containsKey(p.getPageNr())) {
        return transcrBuffer.get(p.getPageNr());
    }
    // Buffer miss: build the page data from the current transcript.
    final TrpTranscriptMetadata currentMd = p.getCurrentTranscript();
    final PcGtsType loaded;
    try {
        final JAXBPageTranscript transcript = new JAXBPageTranscript(currentMd);
        transcript.build();
        loaded = transcript.getPageData();
    } catch (IOException ioFailure) {
        throw new JAXBException("Could not unmarshal page " + p.getPageNr(), ioFailure);
    }
    transcrBuffer.put(p.getPageNr(), loaded);
    return loaded;
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) JAXBException(javax.xml.bind.JAXBException) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType)

Example 20 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class PdfExporter method export.

/**
 * Writes the selected pages of a document into a PDF file at {@code path}.
 *
 * <p>Pages are processed in document order. For each selected page the page content is
 * taken from the export cache when it holds a transcript for that index, and is otherwise
 * unmarshalled from the URL of the page's current transcript. Observers are notified with
 * a status text at start and end and with the 1-based page number after each page. When
 * the {@code cancel} flag is set after a page, the PDF is closed, the incomplete file is
 * deleted and an {@link InterruptedException} is thrown.
 *
 * @param doc the document to export; must not be null
 * @param path the target file path; must not be null
 * @param pageIndices 0-based indices of the pages to export, or null for all pages
 * @param useWordLevel passed on to the PDF document and to the tag output
 * @param addTextPages passed on to {@code addPage}
 * @param imagesOnly passed on to {@code addPage}
 * @param highlightTags passed on to the PDF document; when true, tags are added after the pages
 * @param doBlackening passed on to the PDF document and to {@code addPage}
 * @param createTitle passed on to the PDF document
 * @param cache export cache to consult; a new empty one is created when null
 * @return the written PDF file
 * @throws IllegalArgumentException if {@code doc} or {@code path} is null
 * @throws IOException if the incomplete file cannot be deleted after a cancel
 * @throws InterruptedException if the export was canceled
 */
public File export(final TrpDoc doc, final String path, Set<Integer> pageIndices, final boolean useWordLevel, final boolean addTextPages, final boolean imagesOnly, final boolean highlightTags, final boolean doBlackening, boolean createTitle, ExportCache cache) throws DocumentException, MalformedURLException, IOException, JAXBException, URISyntaxException, InterruptedException {
    if (doc == null) {
        throw new IllegalArgumentException("TrpDoc is null!");
    }
    if (path == null) {
        throw new IllegalArgumentException("path is null!");
    }
    if (cache == null) {
        cache = new ExportCache();
    }
    final File pdfFile = new File(path);
    final TrpPdfDocument pdf = new TrpPdfDocument(pdfFile, useWordLevel, highlightTags, doBlackening, createTitle);
    setChanged();
    notifyObservers("Creating PDF document...");
    // Only the first exported page is handed the document itself (see addPage calls below).
    boolean firstPagePending = true;
    for (int pageIdx = 0; pageIdx < doc.getPages().size(); ++pageIdx) {
        final boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        final int pageNr = pageIdx + 1;
        logger.info("Processing page " + pageNr);
        final TrpPage page = doc.getPages().get(pageIdx);
        final URL imgUrl = page.getUrl();
        /*
         * md is only needed for getting the resolution because it may be missing in the image.
         * For a local doc md stays null, so the resolution has to come from the image.
         */
        FimgStoreImgMd md = null;
        if (doc.isRemoteDoc()) {
            final FimgStoreGetClient getter = new FimgStoreGetClient(page.getUrl());
            md = (FimgStoreImgMd) getter.getFileMd(page.getKey());
        }
        final URL xmlUrl = page.getCurrentTranscript().getUrl();
        logger.debug("output with tags " + highlightTags);
        // cache is never null here: it was replaced by a fresh ExportCache above if necessary.
        final JAXBPageTranscript cachedTranscript = cache.getPageTranscriptAtIndex(pageIdx);
        final PcGtsType pc = (cachedTranscript != null) ? cachedTranscript.getPageData() : PageXmlUtils.unmarshal(xmlUrl);
        if (firstPagePending) {
            // add first page and previously add a title page with doc metadata and editorial declarations (if this option is set)
            pdf.addPage(imgUrl, doc, pc, addTextPages, imagesOnly, md, doBlackening, cache);
            firstPagePending = false;
        } else {
            pdf.addPage(imgUrl, null, pc, addTextPages, imagesOnly, md, doBlackening, cache);
        }
        setChanged();
        notifyObservers(Integer.valueOf(pageNr));
        if (cancel) {
            // Close first, then remove the partially written file.
            pdf.close();
            if (!pdfFile.delete()) {
                throw new IOException("Could not delete the incomplete PDF file during export cancel");
            }
            throw new InterruptedException("Export canceled by the user");
        }
    }
    if (highlightTags) {
        pdf.addTags(doc, pageIndices, useWordLevel, cache);
    }
    pdf.close();
    final String doneMessage = "PDF written at: " + path;
    setChanged();
    notifyObservers(doneMessage);
    logger.info(doneMessage);
    return pdfFile;
}
Also used : FimgStoreImgMd(org.dea.fimgstoreclient.beans.FimgStoreImgMd) JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpPage(eu.transkribus.core.model.beans.TrpPage) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) URL(java.net.URL) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) ExportCache(eu.transkribus.core.model.builder.ExportCache) File(java.io.File)

Aggregations

PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)36 File (java.io.File)16 IOException (java.io.IOException)16 JAXBException (javax.xml.bind.JAXBException)11 TrpPage (eu.transkribus.core.model.beans.TrpPage)8 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)6 URL (java.net.URL)6 JAXBElement (javax.xml.bind.JAXBElement)6 Unmarshaller (javax.xml.bind.Unmarshaller)6 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)5 FileNotFoundException (java.io.FileNotFoundException)5 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)5 TransformerException (javax.xml.transform.TransformerException)5 SAXException (org.xml.sax.SAXException)5 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)4 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)4 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)4 Dimension (java.awt.Dimension)4 FimgStoreImgMd (org.dea.fimgstoreclient.beans.FimgStoreImgMd)4 XmlFormat (eu.transkribus.core.io.formats.XmlFormat)3