Search in sources :

Example 1 with XmlFormat

use of eu.transkribus.core.io.formats.XmlFormat in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Loads a document from path.<br>
 *
 * Document metadata has to be in an XML called "metadata.xml".<br>
 *
 * Image files and corresponding XML/txt files have to have the same name. <br>
 * Lexicographic order of image names will imply order of pages.<br>
 * Types of transcript source files are searched in this order:
 * <ol>
 * <li>./page: PAGE XMLs according to schema 2010/2013</li>
 * <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
 * <li>./alto: ALTO v2 XMLs</li>
 * <li>./txt: txt files with transcription fulltext only</li>
 * </ol>
 * If a valid doc XML is already stored in the directory, the document structure is
 * taken from it and the directory is not scanned for transcripts again.<br>
 * If no image is found and {@code config.isEnableSyncWithoutImages()} is set, page entries
 * are derived from the XML files via {@code createDummyImgFilesForXmls} instead of failing
 * immediately.<br>
 * Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
 * No versioning of files for local use!<br>
 *
 * @param path the path where the document is stored
 * @param config {@link DocLoadConfig}
 * @param monitor progress monitor; currently not used by this method (see todo)
 * @return the constructed document
 * @throws FileNotFoundException if no page images (and, in sync mode, no substitute entries) are found
 * @throws IOException if the path can't be read or is malformed, or if the PAGE XML folder
 *                     contains an XML file that is neither PAGE 2010 nor PAGE 2013
 *
 * @todo implement monitor feedback!
 * @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
 */
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
    // create the document
    TrpDoc doc = new TrpDoc();
    // check OS and adjust URL protocol
    final String os = System.getProperty("os.name");
    /*
     * FIXME use SysUtils.isWin() here?
     */
    // Locale.ROOT keeps the lower-casing independent of the user's default locale
    if (os.toLowerCase(java.util.Locale.ROOT).contains("win")) {
        LocalDocConst.URL_PROT_CONST = "file:///";
    }
    // else: keep default
    final File inputDir = new File(path);
    final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
    // validate input path ======================================================
    checkInputDir(inputDir);
    // search for IMG files
    TreeMap<String, File> pageMap = findImgFiles(inputDir);
    logger.info("Found " + pageMap.size() + " page images.");
    // In sync mode a directory without images is allowed; the page map is filled further below.
    // Previously this check threw unconditionally, which made the sync branch unreachable.
    final boolean syncWithoutImages = config != null && config.isEnableSyncWithoutImages();
    if (pageMap.isEmpty() && !syncWithoutImages) {
        throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
    }
    TrpDocMetadata docMd = null;
    boolean doRefresh = true;
    // try to read doc structure from disk
    if (docXml.isFile()) {
        doc = loadDocXml(docXml);
        if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
            logger.info("Loaded document structure from disk.");
            docMd = doc.getMd();
            // no refresh is necessary as doc structure matches the input dir content
            doRefresh = false;
        } else {
            if (doc != null && doc.getMd() != null) {
                // keep any existing metadata if invalid doc structure was found
                docMd = doc.getMd();
            }
            logger.info("Removing faulty doc XML from disk and doing reload.");
            if (!docXml.delete()) {
                // not fatal: the file is written again at the end of this method
                logger.warn("Could not delete faulty doc XML: " + docXml.getAbsolutePath());
            }
            doc = new TrpDoc();
        }
    }
    logger.info("Reading document at " + inputDir.getAbsolutePath());
    // find metadata file if not extracted from doc.xml =============================================
    if (docMd == null) {
        try {
            docMd = loadDocMd(inputDir);
        } catch (IOException ioe) {
            // best effort: without a readable metadata file, start from empty metadata
            logger.debug("No usable metadata file in " + inputDir.getAbsolutePath() + ": " + ioe.getMessage());
            docMd = new TrpDocMetadata();
        }
    }
    initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
    // Set the docMd
    doc.setMd(docMd);
    if (!doRefresh) {
        // Stop now and reuse doc structure from file
        return doc;
    }
    // Construct the input dir with pageXml Files.
    File pageInputDir = getPageXmlInputDir(inputDir);
    if (config.isForceCreatePageXml() && !pageInputDir.isDirectory() && !pageInputDir.mkdir()) {
        logger.warn("Could not create PAGE XML directory: " + pageInputDir.getAbsolutePath());
    }
    // abbyy XML search path
    File ocrInputDir = getOcrXmlInputDir(inputDir);
    // alto XML search path
    File altoInputDir = getAltoXmlInputDir(inputDir);
    // txt file search path
    File txtInputDir = getTxtInputDir(inputDir);
    // backupfolder for outdated page format files, if any
    final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase(java.util.Locale.ROOT) + "_backup";
    final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
    // sync mode only: no images were found, so derive the page entries from the XML files
    if (pageMap.isEmpty() && syncWithoutImages) {
        pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
        if (pageMap == null || pageMap.isEmpty()) {
            // nothing to build pages from: fail the same way as outside of sync mode
            throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
        }
    }
    // iterate imgList, search for corresponding XML files and build TrpPages
    int pageNr = 1;
    List<TrpPage> pages = new ArrayList<>(pageMap.size());
    for (Entry<String, File> e : pageMap.entrySet()) {
        File imgFile = e.getValue();
        // the img file name without extension
        final String imgFileName = e.getKey();
        // check for a page XML of this name
        File pageXml = findXml(imgFileName, pageInputDir);
        // TODO thumbURL dir + imgFile.getName())+".jpg"
        File thumbFile = getThumbFile(inputDir, imgFileName);
        if (pageXml != null) {
            XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
            switch(xmlFormat) {
                case PAGE_2010:
                    // outdated format: convert in place, the old file goes to the backup folder
                    Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
                    break;
                case PAGE_2013:
                    break;
                default:
                    throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
            }
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        // FIXME this is taking too long and is only necessary on initial loading
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            // a corrupt image does not abort the load; the page carries a remark instead
            logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(imgFile.getName());
        }
        if (pageXml == null && config.isForceCreatePageXml()) {
            // if no page XML, then create one at this path
            File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
            File abbyyXml = findXml(imgFileName, ocrInputDir);
            File altoXml = findXml(imgFileName, altoInputDir);
            File txtFile = findFile(imgFileName, txtInputDir, "txt");
            pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
        }
        TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
        pages.add(page);
    }
    doc.setPages(pages);
    doc.getMd().setNrOfPages(doc.getPages().size());
    // set editorial declaration:
    List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
    doc.setEdDeclList(features);
    logger.debug(doc.toString());
    // store doc on disk to save time on next load
    LocalDocWriter.writeDocXml(doc, docXml);
    return doc;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) TrpPage(eu.transkribus.core.model.beans.TrpPage) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Dimension(java.awt.Dimension) EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File)

Example 2 with XmlFormat

use of eu.transkribus.core.io.formats.XmlFormat in project TranskribusCore by Transkribus.

the class LocalDocReader method createPageFromAbbyy.

/**
 * Builds a PAGE content object from an Abbyy Finereader XML file.
 * <p>
 * The file's format is detected first; anything other than {@code XmlFormat.ABBYY_10}
 * is rejected. Every failure is logged and reported to the caller as an
 * {@link IOException} carrying the original exception as its cause.
 *
 * @param imgFileName name of the image file, handed on to the conversion
 * @param abbyyXml the Finereader XML file to convert
 * @param preserveOcrTxtStyles handed on to the conversion
 * @param preserveOcrFontFamily handed on to the conversion
 * @param replaceBadChars handed on to the conversion
 * @return the converted PAGE content
 * @throws IOException if the file is not a Finereader 10/11 XML or the conversion fails
 */
private static PcGtsType createPageFromAbbyy(final String imgFileName, File abbyyXml, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException {
    try {
        final XmlFormat detectedFormat = XmlUtils.getXmlFormat(abbyyXml);
        if (!detectedFormat.equals(XmlFormat.ABBYY_10)) {
            // thrown inside the try on purpose: the first handler below logs and re-wraps it
            throw new IOException("Not a valid Finereader10/11 XML file.");
        }
        logger.info(abbyyXml.getAbsolutePath() + ": Transforming Finereader10/11 XML to PAGE XML.");
        return PageXmlUtils.createPcGtsTypeFromAbbyy(abbyyXml, imgFileName, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    } catch (IOException | TransformerException migrationFailure) {
        logger.error(migrationFailure.getMessage(), migrationFailure);
        throw new IOException("Could not migrate file: " + abbyyXml.getAbsolutePath(), migrationFailure);
    } catch (ParserConfigurationException | SAXException parseFailure) {
        logger.error(parseFailure.getMessage(), parseFailure);
        throw new IOException("Could not transform XML file!", parseFailure);
    } catch (JAXBException bindingFailure) {
        /* TODO This exception is only thrown when the pageXML is unmarshalled
         * for inserting the image filename which is not included in the abbyy xml! */
        logger.error(bindingFailure.getMessage(), bindingFailure);
        throw new IOException("Transformation output is not a valid page XML!", bindingFailure);
    }
}
Also used : XmlFormat(eu.transkribus.core.io.formats.XmlFormat) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TransformerException(javax.xml.transform.TransformerException) SAXException(org.xml.sax.SAXException)

Example 3 with XmlFormat

use of eu.transkribus.core.io.formats.XmlFormat in project TranskribusCore by Transkribus.

the class JaxbUtils method createXmlMarshaller.

/**
 * Creates a JAXB marshaller for the runtime class of the given object.
 * <p>
 * The marshaller writes UTF-8. If {@code XmlFormat.resolveFromClazz} yields a format other
 * than {@code null} or {@code XmlFormat.UNKNOWN}, that format's schema location is set too.
 *
 * @param object the object to be marshalled; its runtime class is the primary context class
 * @param doFormatting value for {@link Marshaller#JAXB_FORMATTED_OUTPUT}
 * @param nestedClasses further classes to include in the JAXB context
 * @return the configured marshaller
 * @throws JAXBException if the context or the marshaller cannot be created or configured
 */
private static <T> Marshaller createXmlMarshaller(T object, boolean doFormatting, Class<?>... nestedClasses) throws JAXBException {
    // context covers the object's own class plus all additionally requested ones
    final Class<?>[] contextClasses = merge(object.getClass(), nestedClasses);
    final Marshaller result = createJAXBContext(contextClasses).createMarshaller();

    result.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, doFormatting);
    result.setProperty(Marshaller.JAXB_ENCODING, "UTF-8");

    // only known formats carry a schema location worth writing
    final XmlFormat knownFormat = XmlFormat.resolveFromClazz(object.getClass());
    final boolean hasSchemaLocation = knownFormat != null && !knownFormat.equals(XmlFormat.UNKNOWN);
    if (hasSchemaLocation) {
        result.setProperty(Marshaller.JAXB_SCHEMA_LOCATION, knownFormat.xsiSchemaLocation);
    }

    logger.debug(result.getClass().getCanonicalName());
    return result;
}
Also used : Marshaller(javax.xml.bind.Marshaller) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) JAXBContext(javax.xml.bind.JAXBContext)

Example 4 with XmlFormat

use of eu.transkribus.core.io.formats.XmlFormat in project TranskribusCore by Transkribus.

the class XmlUtils method getXmlFormat.

/**
 * Matches the first default namespace declaration ({@code xmlns="..."} or {@code xmlns='...'},
 * optional whitespace around {@code =}); group 2 holds the namespace URI.
 * Compiled once, as the format detection runs for every XML file that is inspected.
 */
private static final Pattern DEFAULT_XMLNS_PATTERN = Pattern.compile("\\sxmlns\\s*=\\s*([\"'])([a-zA-Z0-9:/_&?~#%=\\.\\-]*)\\1");

/**
 * Determines the XML format of a file from the first default namespace declaration
 * found at the beginning of the file.
 * <p>
 * Only the first 2048 characters are inspected. If the root element is so long that its
 * namespace declaration lies beyond that limit, the format is not detected.
 * The file is not parsed, so no well-formedness check takes place; reading the whole file
 * as DOM would provide that, at a higher cost.
 *
 * @param xmlFile the XML file to inspect
 * @return the format resolved from the namespace, or {@code XmlFormat.UNKNOWN} if no default
 *         namespace declaration is found
 * @throws IOException if reading the file fails
 */
public static XmlFormat getXmlFormat(File xmlFile) throws IOException {
    // read first 2048 characters of this file.
    // if the root element is too long and the namespace is not included here, this won't work
    final String content = DeaFileUtils.readFileAsString(xmlFile, 2048);
    // find() without a leading ".*" yields the FIRST declaration, i.e. the one closest to the root
    // element, even if the whole document sits on a single line and children redeclare xmlns
    final Matcher m = DEFAULT_XMLNS_PATTERN.matcher(content);
    if (!m.find()) {
        final String msg = "No namespace found in file: " + xmlFile.getAbsolutePath();
        logger.error(msg);
        return XmlFormat.UNKNOWN;
    }
    final String namespace = m.group(2);
    XmlFormat format = XmlFormat.resolveFromNs(namespace);
    if (format == null) {
        // defensive: callers switch on / call equals() on the result
        format = XmlFormat.UNKNOWN;
    }
    logger.debug(xmlFile.getName() + ": " + format.toString() + " - namespace: " + namespace);
    return format;
}
Also used : Pattern(java.util.regex.Pattern) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) Matcher(java.util.regex.Matcher)

Example 5 with XmlFormat

use of eu.transkribus.core.io.formats.XmlFormat in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method createPageFromAlto2.

/**
 * Converts an ALTO v2 file to PAGE XML and writes the result to the given output file.
 * <p>
 * The input is accepted only if its detected format is {@code XmlFormat.ALTO_2}.
 *
 * @param imgFile the image belonging to the page; only its name is handed on to the conversion
 * @param altoXml the ALTO file to convert
 * @param pageOutFile the file the PAGE XML is marshalled to
 * @param preserveOcrTxtStyles handed on to the conversion
 * @param preserveOcrFontFamily handed on to the conversion
 * @param replaceBadChars handed on to the conversion
 * @return the result of marshalling the PAGE content to {@code pageOutFile}
 * @throws IOException if the file is not detected as ALTO v2 or reading it fails
 * @throws TransformerException declared for the conversion step
 * @throws SAXException declared for the conversion step
 * @throws ParserConfigurationException declared for the conversion step
 * @throws JAXBException declared for the conversion/marshalling step
 */
public File createPageFromAlto2(File imgFile, File altoXml, File pageOutFile, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException, TransformerException, SAXException, ParserConfigurationException, JAXBException {
    final XmlFormat detectedFormat = XmlUtils.getXmlFormat(altoXml);
    // reject everything that is not ALTO v2 before doing any work
    if (!detectedFormat.equals(XmlFormat.ALTO_2)) {
        throw new IOException("Could not determine xml file as valid alto2: " + altoXml.getAbsolutePath());
    }
    logger.info(altoXml.getAbsolutePath() + ": Transforming ALTO v2 XMLs to PAGE XML.");
    final PcGtsType pageContent = PageXmlUtils.createPcGtsTypeFromAlto(altoXml, imgFile.getName(), preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    return JaxbUtils.marshalToFile(pageContent, pageOutFile);
}
Also used : XmlFormat(eu.transkribus.core.io.formats.XmlFormat) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType)

Aggregations

XmlFormat (eu.transkribus.core.io.formats.XmlFormat)5 IOException (java.io.IOException)3 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)2 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)1 EdFeature (eu.transkribus.core.model.beans.EdFeature)1 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)1 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)1 TrpPage (eu.transkribus.core.model.beans.TrpPage)1 Dimension (java.awt.Dimension)1 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 ArrayList (java.util.ArrayList)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 JAXBContext (javax.xml.bind.JAXBContext)1 JAXBException (javax.xml.bind.JAXBException)1 Marshaller (javax.xml.bind.Marshaller)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 TransformerException (javax.xml.transform.TransformerException)1 SAXException (org.xml.sax.SAXException)1