Search in sources :

Example 1 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Builds a {@link TrpDoc} from the files belonging to an upload.
 * <p>
 * The upload's metadata object is reused as the document metadata, with its local folder set
 * to the upload's temporary directory. For each page descriptor of the upload:
 * <ul>
 * <li>the image file must exist in the upload's temporary directory;</li>
 * <li>the image dimension is read right away so that corrupt images are detected immediately;
 * a corrupt image does not abort loading, it is logged and a remark is handed to the page;</li>
 * <li>if the descriptor names a PAGE XML, that file must exist in the directory returned by
 * {@code upload.getUploadPageTmpDir()};</li>
 * <li>if the descriptor names no PAGE XML and the image could be read, an empty PAGE XML is
 * written to that directory;</li>
 * <li>if the descriptor names no PAGE XML and the image is corrupt, the page is built with a
 * {@code null} PAGE XML file.</li>
 * </ul>
 *
 * @param upload the upload to transform; must not be null, must have an upload ID of at least 1
 *               and readable directories
 * @return the document containing one page per page descriptor, in descriptor order
 * @throws IllegalArgumentException if the upload is null, has an invalid ID or its directories
 *                                  are not readable
 * @throws FileNotFoundException if an image or a PAGE XML named by a descriptor does not exist
 * @throws IOException if an empty PAGE XML could not be written
 */
public static TrpDoc load(TrpUpload upload) throws IOException {
    // validate most necessary things
    if (upload == null) {
        throw new IllegalArgumentException("Upload is null.");
    }
    if (upload.getUploadId() < 1) {
        throw new IllegalArgumentException("Invalid upload ID: " + upload.getUploadId());
    }
    if (!upload.canReadDirectories()) {
        throw new IllegalArgumentException("Directories are not readable: " + upload.getUploadTmpDir().getAbsolutePath());
    }
    // transform the upload object into a TRP document
    TrpDoc doc = new TrpDoc();
    TrpDocMetadata md = upload.getMd();
    md.setLocalFolder(upload.getUploadTmpDir());
    doc.setMd(md);
    File baseDir = upload.getUploadTmpDir();
    File xmlDir = upload.getUploadPageTmpDir();
    File thumbDir = new File(baseDir.getAbsolutePath() + File.separatorChar + LocalDocConst.THUMBS_FILE_SUB_FOLDER);
    for (PageUploadDescriptor p : upload.getPages()) {
        final int pageNr = p.getPageNr();
        File img = new File(baseDir.getAbsolutePath() + File.separator + p.getFileName());
        if (!img.isFile()) {
            throw new FileNotFoundException("Image for page " + pageNr + " does not exist: " + img.getAbsolutePath());
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(img);
        } catch (CorruptImageException cie) {
            // keep going: the page is still created, carrying the remark instead of a dimension
            logger.error("Image is corrupt: " + img.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(img.getName());
        }
        final String imgBaseName = FilenameUtils.getBaseName(img.getName());
        File thumb = getThumbFile(thumbDir, imgBaseName);
        File pageXml = null;
        if (!StringUtils.isEmpty(p.getPageXmlName())) {
            pageXml = new File(xmlDir.getAbsolutePath() + File.separator + p.getPageXmlName());
            if (!pageXml.isFile()) {
                // report the PAGE XML path that was checked, not the image path
                throw new FileNotFoundException("PAGE XML for page " + pageNr + " does not exist: " + pageXml.getAbsolutePath());
            }
        } else if (StringUtils.isEmpty(imageRemark)) {
            // no PAGE XML was uploaded and no remark was set, i.e. the image could be read:
            // create an empty PAGE XML named after the image
            File pageOutFile = new File(xmlDir.getAbsolutePath() + File.separatorChar + imgBaseName + ".xml");
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img, dim);
            try {
                pageXml = JaxbUtils.marshalToFile(pc, pageOutFile);
            } catch (JAXBException je) {
                logger.error(je.getMessage(), je);
                throw new IOException("Could not create empty PageXml on disk!", je);
            }
        }
        // pageXml stays null here if the image is corrupt and no PAGE XML was uploaded
        TrpPage page = buildPage(baseDir, pageNr, img, pageXml, thumb, dim, imageRemark);
        doc.getPages().add(page);
    }
    return doc;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) TrpPage(eu.transkribus.core.model.beans.TrpPage) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) Dimension(java.awt.Dimension) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File) PageUploadDescriptor(eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)

Example 2 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class LocalDocReader method createPageXml.

/**
 * Writes a PAGE XML for one page to {@code pageOutFile}.
 * <p>
 * The content is taken from the first source that yields a result, tried in this order:
 * Abbyy XML, ALTO XML, plain text file. If none of them is given or yields a result,
 * an empty PAGE XML is created from the image filename and dimension.
 * <br/><br/>
 * This method must NEVER return null. Many mechanisms in Transkribus
 * depend on this method reliably creating a file.
 *
 * @param pageOutFile the file to write; must not be null
 * @param doOverwrite if false, an already existing {@code pageOutFile} is an error
 * @param abbyyXml Abbyy XML source, or null if there is none
 * @param altoXml ALTO XML source, or null if there is none
 * @param txtFile plain text source, or null if there is none
 * @param preserveOcrFontFamily passed on to the Abbyy/ALTO conversion
 * @param preserveOcrTxtStyles passed on to the Abbyy/ALTO conversion
 * @param replaceBadChars passed on to the Abbyy/ALTO conversion
 * @param imgFileName name of the image file the page belongs to; must not be empty
 * @param dim image dimension; if null, a default-constructed {@link Dimension} is used for
 *            the text-file and empty-page cases
 * @return {@code pageOutFile}
 * @throws IllegalArgumentException if {@code pageOutFile} is null or {@code imgFileName} is empty
 * @throws FileNotFoundException declared for callers; not thrown directly by this method
 * @throws IOException if the file exists and must not be overwritten, or if it could not be written
 */
protected static File createPageXml(File pageOutFile, boolean doOverwrite, File abbyyXml, File altoXml, File txtFile, boolean preserveOcrFontFamily, boolean preserveOcrTxtStyles, boolean replaceBadChars, final String imgFileName, Dimension dim) throws FileNotFoundException, IOException {
    if (pageOutFile == null) {
        throw new IllegalArgumentException("PAGE XML output File is null.");
    }
    final boolean targetExists = pageOutFile.exists();
    if (targetExists && !doOverwrite) {
        throw new IOException("PAGE XML already exists at: " + pageOutFile.getAbsolutePath());
    }
    if (StringUtils.isEmpty(imgFileName)) {
        throw new IllegalArgumentException("Image filename must not be empty");
    }

    // first choice: Abbyy XML
    PcGtsType page = (abbyyXml == null)
            ? null
            : createPageFromAbbyy(imgFileName, abbyyXml, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);

    // second choice: ALTO XML
    if (page == null && altoXml != null) {
        page = createPageFromAlto2(imgFileName, altoXml, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    }

    // the remaining sources need an image dimension; fall back to (0,0) if none was given
    final Dimension pageDim = (dim != null) ? dim : new Dimension();

    // third choice: plain text file
    if (page == null && txtFile != null) {
        page = createPageFromTxt(imgFileName, pageDim, txtFile);
    }

    // last resort: no usable source, so start from an empty page
    if (page == null) {
        logger.warn("No Transcript XML found for img: " + FilenameUtils.getBaseName(imgFileName));
        logger.info("Creating empty PageXml.");
        page = PageXmlUtils.createEmptyPcGtsType(imgFileName, pageDim);
    }

    // write the result to disk
    try {
        JaxbUtils.marshalToFile(page, pageOutFile);
    } catch (JAXBException marshalFailure) {
        throw new IOException("Could not create PageXml on disk!", marshalFailure);
    }
    return pageOutFile;
}
Also used : JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) Dimension(java.awt.Dimension) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType)

Example 3 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class LocalDocReader method createPageFromAbbyy.

/**
 * Converts an Abbyy Finereader 10/11 XML file into a PAGE XML object.
 * <p>
 * Every failure is logged and rethrown as an {@link IOException} that carries the original
 * exception as its cause. This includes the case that the file is not detected as
 * {@code XmlFormat.ABBYY_10}.
 *
 * @param imgFileName name of the image file, passed on to the conversion
 * @param abbyyXml the Abbyy XML file to convert
 * @param preserveOcrTxtStyles passed on to the conversion
 * @param preserveOcrFontFamily passed on to the conversion
 * @param replaceBadChars passed on to the conversion
 * @return the converted page
 * @throws IOException if the format check or the conversion fails
 */
private static PcGtsType createPageFromAbbyy(final String imgFileName, File abbyyXml, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException {
    try {
        final XmlFormat detectedFormat = XmlUtils.getXmlFormat(abbyyXml);
        if (!detectedFormat.equals(XmlFormat.ABBYY_10)) {
            // thrown inside the try on purpose: it is caught and wrapped by the first handler below
            throw new IOException("Not a valid Finereader10/11 XML file.");
        }
        logger.info(abbyyXml.getAbsolutePath() + ": Transforming Finereader10/11 XML to PAGE XML.");
        return PageXmlUtils.createPcGtsTypeFromAbbyy(abbyyXml, imgFileName, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    } catch (IOException | TransformerException migrationFailure) {
        logger.error(migrationFailure.getMessage(), migrationFailure);
        throw new IOException("Could not migrate file: " + abbyyXml.getAbsolutePath(), migrationFailure);
    } catch (ParserConfigurationException | SAXException xmlFailure) {
        logger.error(xmlFailure.getMessage(), xmlFailure);
        throw new IOException("Could not transform XML file!", xmlFailure);
    } catch (JAXBException jaxbFailure) {
        /* TODO This exception is only thrown when the pageXML is unmarshalled
         * for inserting the image filename which is not included in the abbyy xml! */
        logger.error(jaxbFailure.getMessage(), jaxbFailure);
        throw new IOException("Transformation output is not a valid page XML!", jaxbFailure);
    }
}
Also used : XmlFormat(eu.transkribus.core.io.formats.XmlFormat) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TransformerException(javax.xml.transform.TransformerException) SAXException(org.xml.sax.SAXException)

Example 4 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class PageXmlUtils method findLinesByBaseline.

/**
 * Collects all text lines of a page for which {@code doesIntersect(line, baseline)} holds.
 * <p>
 * The text regions of the page are visited in the order returned by {@code getTextRegions(pc)}.
 * If more than one line matches, the result is sorted with a
 * {@link TrpElementCoordinatesComparator}.
 *
 * @param pc the page whose text regions are searched
 * @param baseline the baseline to test each line against
 * @return the matching lines; empty if there is no match
 */
public static List<TextLineType> findLinesByBaseline(PcGtsType pc, String baseline) {
    final List<TextLineType> hits = new LinkedList<>();
    for (TextRegionType region : getTextRegions(pc)) {
        for (TextLineType line : region.getTextLine()) {
            if (doesIntersect(line, baseline)) {
                hits.add(line);
            }
        }
    }
    // nothing to order with zero or one match
    if (hits.size() > 1) {
        hits.sort(new TrpElementCoordinatesComparator<TextLineType>(true));
    }
    return hits;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) FimgStoreReadConnection(eu.transkribus.core.io.FimgStoreReadConnection) TranscriptionLevel(eu.transkribus.core.model.beans.enums.TranscriptionLevel) URL(java.net.URL) Date(java.util.Date) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) Rectangle2D(java.awt.geom.Rectangle2D) TrpPageUnmarshalListener(eu.transkribus.core.model.builder.TrpPageUnmarshalListener) LoggerFactory(org.slf4j.LoggerFactory) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) MarshalException(javax.xml.bind.MarshalException) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) FimgStoreImgMd(org.dea.fimgstoreclient.beans.FimgStoreImgMd) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) Dimension(java.awt.Dimension) List(java.util.List) TrpElementCoordinatesComparator(eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) SAXException(org.xml.sax.SAXException) TrpObjectFactory(eu.transkribus.core.model.beans.pagecontent_trp.TrpObjectFactory) CustomTagUtil(eu.transkribus.core.model.beans.customtags.CustomTagUtil) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Polygon(java.awt.Polygon) Rectangle(java.awt.Rectangle) TextEquivType(eu.transkribus.core.model.beans.pagecontent.TextEquivType) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TransformerException(javax.xml.transform.TransformerException) CoordsType(eu.transkribus.core.model.beans.pagecontent.CoordsType) TrpPageMarshalListener(eu.transkribus.core.model.builder.TrpPageMarshalListener) Marshaller(javax.xml.bind.Marshaller) HashMap(java.util.HashMap) 
ArrayList(java.util.ArrayList) ValidationEventCollector(javax.xml.bind.util.ValidationEventCollector) TrpTranscriptStatistics(eu.transkribus.core.model.beans.TrpTranscriptStatistics) LinkedList(java.util.LinkedList) TrpPage(eu.transkribus.core.model.beans.TrpPage) JAXBContext(javax.xml.bind.JAXBContext) Unmarshaller(javax.xml.bind.Unmarshaller) Logger(org.slf4j.Logger) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) JAXBElement(javax.xml.bind.JAXBElement) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) FileInputStream(java.io.FileInputStream) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) File(java.io.File) MetadataType(eu.transkribus.core.model.beans.pagecontent.MetadataType) StringReader(java.io.StringReader) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) PrintSpaceType(eu.transkribus.core.model.beans.pagecontent.PrintSpaceType) Collections(java.util.Collections) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) ObjectFactory(eu.transkribus.core.model.beans.pagecontent.ObjectFactory) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) InputStream(java.io.InputStream) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpElementCoordinatesComparator(eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator) LinkedList(java.util.LinkedList)

Example 5 with PcGtsType

use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.

the class PageXmlUtils method marshalToBytes.

/**
 * Marshals a PAGE XML object into a byte array.
 * <p>
 * Validation events raised while marshalling are collected; the message built from them is
 * logged at info level unless it starts with {@code NO_EVENTS_MSG}.
 *
 * @param page the page to marshal
 * @return the marshalled bytes
 * @throws JAXBException if the marshaller cannot be created; any exception raised while
 *                       marshalling is wrapped in a {@link MarshalException}
 */
public static byte[] marshalToBytes(PcGtsType page) throws JAXBException {
    final ValidationEventCollector events = new ValidationEventCollector();
    final Marshaller marshaller = createMarshaller(events);
    final JAXBElement<PcGtsType> root = new ObjectFactory().createPcGts(page);

    byte[] bytes;
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        marshaller.marshal(root, buffer);
        bytes = buffer.toByteArray();
    } catch (Exception failure) {
        // deliberately broad: every failure in here is reported as a MarshalException
        throw new MarshalException(failure);
    }

    final String report = buildMsg(events, page);
    if (!report.startsWith(NO_EVENTS_MSG)) {
        logger.info(report);
    }
    return bytes;
}
Also used : Marshaller(javax.xml.bind.Marshaller) MarshalException(javax.xml.bind.MarshalException) TrpObjectFactory(eu.transkribus.core.model.beans.pagecontent_trp.TrpObjectFactory) ObjectFactory(eu.transkribus.core.model.beans.pagecontent.ObjectFactory) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ValidationEventCollector(javax.xml.bind.util.ValidationEventCollector) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) MarshalException(javax.xml.bind.MarshalException) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) SAXException(org.xml.sax.SAXException) TransformerException(javax.xml.transform.TransformerException) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException)

Aggregations

PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)36 File (java.io.File)16 IOException (java.io.IOException)16 JAXBException (javax.xml.bind.JAXBException)11 TrpPage (eu.transkribus.core.model.beans.TrpPage)8 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)6 URL (java.net.URL)6 JAXBElement (javax.xml.bind.JAXBElement)6 Unmarshaller (javax.xml.bind.Unmarshaller)6 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)5 FileNotFoundException (java.io.FileNotFoundException)5 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)5 TransformerException (javax.xml.transform.TransformerException)5 SAXException (org.xml.sax.SAXException)5 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)4 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)4 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)4 Dimension (java.awt.Dimension)4 FimgStoreImgMd (org.dea.fimgstoreclient.beans.FimgStoreImgMd)4 XmlFormat (eu.transkribus.core.io.formats.XmlFormat)3