Search in sources :

Example 11 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

The method createPcGtsTypeFromText of the class PageXmlUtils.

/**
 * Builds a PAGE document for an image out of plain text.
 * <p>
 * One text region with id "region_1" is created whose coordinates are the rectangle
 * (0, 0, imageWidth, imageHeight) of the new page. Depending on {@code level} the text is
 * set on the region itself (REGION_BASED), on one text line per row of the text
 * (LINE_BASED), or on one word per space-separated token of every row (WORD_BASED).
 * Every generated line and word reuses the region's rectangle as its coordinates.
 *
 * @param imgFileName image file name, handed to createEmptyPcGtsType
 * @param dim image dimension, handed to createEmptyPcGtsType
 * @param text the text to distribute; for LINE_BASED and WORD_BASED a {@code null} value is
 *        treated like the empty string, for REGION_BASED it is passed on unchanged
 * @param level target granularity; {@code null} defaults to LINE_BASED
 * @param skipEmptyLines if true, runs of line breaks are treated as a single separator and
 *        no line element is created for an empty row (including a leading one)
 * @return the newly created PcGtsType
 * @throws IOException if {@code level} is none of REGION_BASED, LINE_BASED or WORD_BASED
 *         (the helper creating the empty page may declare further causes not visible here)
 */
public static PcGtsType createPcGtsTypeFromText(final String imgFileName, Dimension dim, String text, TranscriptionLevel level, boolean skipEmptyLines) throws IOException {
    // resolve and validate the level first so that nothing is built for an unsupported level
    if (level == null) {
        level = TranscriptionLevel.LINE_BASED;
    }
    if (level != TranscriptionLevel.REGION_BASED && level != TranscriptionLevel.LINE_BASED && level != TranscriptionLevel.WORD_BASED) {
        throw new IOException("Invalide TranscriptionLevel: " + level);
    }
    // create empty page
    PcGtsType pcGtsType = createEmptyPcGtsType(imgFileName, dim);
    TrpPageType page = (TrpPageType) pcGtsType.getPage();
    // create and add text region with size of image
    Rectangle r = new Rectangle(0, 0, page.getImageWidth(), page.getImageHeight());
    String defaultCoords = PointStrUtils.pointsToString(r);
    TrpTextRegionType region = new TrpTextRegionType(page);
    region.setId("region_1");
    region.setCoordinates(defaultCoords, null);
    page.getTextRegionOrImageRegionOrLineDrawingRegion().add(region);
    if (level == TranscriptionLevel.REGION_BASED) {
        region.setUnicodeText(text, null);
        return pcGtsType;
    }
    String splitRegex = skipEmptyLines ? "[\\r\\n]+" : "\\r?\\n";
    // null text would otherwise fail with a NullPointerException on split
    String[] lines = (text == null ? "" : text).split(splitRegex);
    logger.debug("nr of lines = " + lines.length);
    int lc = 1;
    for (String lineText : lines) {
        // String.split keeps a leading empty token (text starting with a line break, or
        // empty text); without this check skipEmptyLines would still emit an empty line
        if (skipEmptyLines && lineText.isEmpty()) {
            continue;
        }
        TrpTextLineType line = new TrpTextLineType(region);
        line.setId("line_" + (lc++));
        line.setCoordinates(defaultCoords, null);
        region.getTextLine().add(line);
        if (level == TranscriptionLevel.LINE_BASED) {
            line.setUnicodeText(lineText, null);
        } else {
            // WORD_BASED: the line carries no text of its own, only its words do
            int wc = 1;
            for (String wordText : lineText.split(" ")) {
                // TODO: better word splitting??
                TrpWordType word = new TrpWordType(line);
                word.setId("word_" + (wc++));
                word.setCoordinates(defaultCoords, null);
                word.setUnicodeText(wordText, null);
                line.getWord().add(word);
            }
        }
    }
    return pcGtsType;
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) Rectangle(java.awt.Rectangle) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 12 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

The method createEmptyPcGtsType of the class PageXmlUtils.

/**
 * Creates a PcGtsType that consists only of metadata and an empty page.
 * <p>
 * The metadata gets the creator "TRP" and the current time as both creation and
 * last-change timestamp. The page is a TrpPageType carrying the given file name and
 * dimensions.
 *
 * @param imgFileName value stored as the page's image file name
 * @param xDim value stored as the page's image width
 * @param yDim value stored as the page's image height
 * @return the new root element with metadata and page set
 */
public static PcGtsType createEmptyPcGtsType(final String imgFileName, final int xDim, final int yDim) {
    // a single timestamp is shared by "created" and "last change"
    final XMLGregorianCalendar now = JaxbUtils.getXmlCalendar(new Date());

    final MetadataType metadata = new MetadataType();
    metadata.setCreator("TRP");
    metadata.setCreated(now);
    metadata.setLastChange(now);

    // the page is deliberately a TrpPageType instance
    final TrpPageType trpPage = new TrpPageType();
    trpPage.setImageFilename(imgFileName);
    trpPage.setImageHeight(yDim);
    trpPage.setImageWidth(xDim);

    // assemble the root element
    final PcGtsType root = new PcGtsType();
    root.setMetadata(metadata);
    root.setPage(trpPage);
    return root;
}
Also used : XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) MetadataType(eu.transkribus.core.model.beans.pagecontent.MetadataType) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Date(java.util.Date) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 13 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

The method testSth of the class PageXmlUtilsTest.

/**
 * Manual check against a fixed image store URL: reads the image metadata via
 * FimgStoreReadConnection.getImgMd, creates an empty PcGtsType from the URL and the
 * reported dimension, and hands it to JaxbUtils.marshalToSysOut.
 * <p>
 * An IOException or JAXBException is not rethrown; only its stack trace is printed.
 *
 * @throws Exception declared by the signature; which calls may actually throw other
 *         exception types is not visible here
 */
public static void testSth() throws Exception {
    try {
        URL url = new URL("https://dbis-thure.uibk.ac.at/fimagestore/Get?fileType=metadata&id=YSUGXUUGAHYCUQVMEUJAYQGO");
        FimgStoreImgMd md = FimgStoreReadConnection.getImgMd(url);
        PcGtsType t = PageXmlUtils.createEmptyPcGtsType(url, md.getDimension());
        JaxbUtils.marshalToSysOut(t);
    } catch (IOException | JAXBException e) {
        // failures are reported on stderr only; the method still returns normally
        e.printStackTrace();
    }
}
Also used : FimgStoreImgMd(org.dea.fimgstoreclient.beans.FimgStoreImgMd) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) File(java.io.File) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) URL(java.net.URL)

Example 14 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

The method testGetTextRegions of the class PageXmlUtilsTest.

/**
 * Unmarshals a transcript from a fixed URL and prints one line per text region returned
 * by PageXmlUtils.getTextRegions: the region's simple class name, its id and the number
 * of its text lines.
 *
 * @throws Exception declared by the signature; raised by the URL construction or the
 *         called utilities
 */
public static void testGetTextRegions() throws Exception {
    final URL tablesTranscriptUrl = new URL("https://dbis-thure.uibk.ac.at/f/Get?id=VCLTRLDSWETCXIHQNHKOPRLS");
    final PcGtsType transcript = PageXmlUtils.unmarshal(tablesTranscriptUrl);
    final List<TextRegionType> regions = PageXmlUtils.getTextRegions(transcript);
    for (TextRegionType region : regions) {
        // build the report line piece by piece: type, id, line count
        final StringBuilder report = new StringBuilder("tr: ");
        report.append(region.getClass().getSimpleName());
        report.append(" id: ").append(region.getId());
        report.append(" n-lines: ").append(region.getTextLine().size());
        System.out.println(report.toString());
    }
}
Also used : TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) URL(java.net.URL)

Example 15 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

The method main of the class XslTransformTest.

/**
 * Manual round trip on fixed local files: transforms an input XML file into a PcGtsType
 * using the ABBY_TO_PAGE_XSLT stylesheet with the parameter "preserveTextStyles" set to
 * false, sets the page's image file name, marshals the object to an output file and
 * unmarshals that file again.
 * <p>
 * Any of the caught exceptions only results in a printed stack trace.
 *
 * @param args not used
 */
public static void main(String[] args) {
    final File source = new File("C:/tmp/Alto2PageTest/ocr/177907.xml");
    final File target = new File("C:/tmp/Alto2PageTest/abbyyToPageTest.xml");

    // stylesheet parameters
    final Map<String, Object> xsltParams = new HashMap<>();
    xsltParams.put("preserveTextStyles", Boolean.FALSE);

    try {
        final PcGtsType converted = JaxbUtils.transformToObject(source, ABBY_TO_PAGE_XSLT, xsltParams, PcGtsType.class);
        converted.getPage().setImageFilename("177907.JPG");
        // write the result, then read it back; the returned values are not inspected
        JaxbUtils.marshalToFile(converted, target);
        PageXmlUtils.unmarshal(target);
    } catch (TransformerException | SAXException | IOException | ParserConfigurationException | JAXBException e) {
        // report the failure on stderr and return normally
        e.printStackTrace();
    }
}
Also used : HashMap(java.util.HashMap) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) File(java.io.File) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TransformerException(javax.xml.transform.TransformerException) SAXException(org.xml.sax.SAXException)

Aggregations

PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)36 File (java.io.File)16 IOException (java.io.IOException)16 JAXBException (javax.xml.bind.JAXBException)11 TrpPage (eu.transkribus.core.model.beans.TrpPage)8 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)6 URL (java.net.URL)6 JAXBElement (javax.xml.bind.JAXBElement)6 Unmarshaller (javax.xml.bind.Unmarshaller)6 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)5 FileNotFoundException (java.io.FileNotFoundException)5 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)5 TransformerException (javax.xml.transform.TransformerException)5 SAXException (org.xml.sax.SAXException)5 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)4 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)4 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)4 Dimension (java.awt.Dimension)4 FimgStoreImgMd (org.dea.fimgstoreclient.beans.FimgStoreImgMd)4 XmlFormat (eu.transkribus.core.io.formats.XmlFormat)3