/**
 * Loads a document from path.<br>
 * Document metadata has to be in an XML called "metadata.xml".<br>
 * Image files and corresponding XML/txt files have to have the same name. <br>
 * Lexicographic order of image names will imply order of pages.<br>
 * Types of transcript source files are searched in this order:
 * <ol>
 * <li>./page: PAGE XMLs according to schema 2010/2013</li>
 * <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
 * <li>./alto: ALTO v2 XMLs</li>
 * <li>./txt: txt files with transcription fulltext only</li>
 * </ol>
 * Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
 * No versioning of files for local use!<br>
 * @param path the path where the document is stored
 * @param config {@link DocLoadConfig}
 * @param monitor progress monitor; feedback is not implemented yet (see todo below)
 * @return the constructed document
 * @throws IOException if the path can't be read or is malformed
 * @todo implement monitor feedback!
 * @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
 */
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
    // Overview: scans the input directory for page images, reuses a stored doc XML when
    // isValid(...) accepts it, and otherwise builds the pages anew and writes a fresh doc XML.
    // NOTE(review): several closing braces and what look like logging calls are absent from
    // this listing; the notes below mark the visible gaps.
    // NOTE(review): the monitor parameter is not referenced anywhere in the visible body.
    // create the document
    TrpDoc doc = new TrpDoc();
    // check OS and adjust URL protocol
    // NOTE(review): the property key is empty as shown; System.getProperty rejects an empty key
    // with IllegalArgumentException - presumably "os.name" was intended, confirm.
    final String os = System.getProperty("");
		 * FIXME use SysUtils.isWin() here?
    if (os.toLowerCase().contains("win")) {
        // overwrites LocalDocConst.URL_PROT_CONST when the OS string contains "win"
        LocalDocConst.URL_PROT_CONST = "file:///";
    // else: keep default
    final File inputDir = new File(path);
    final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
    // validate input path ======================================================
    // search for IMG files
    // NOTE(review): the string expression after the first statement looks like the argument
    // of a lost logging call.
    TreeMap<String, File> pageMap = findImgFiles(inputDir);"Found " + pageMap.entrySet().size() + " page images.");
    if (pageMap.isEmpty()) {
        throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
    TrpDocMetadata docMd = null;
    // stays true unless a valid doc structure is read from docXml below
    boolean doRefresh = true;
    // try to read doc structure from disk
    if (docXml.isFile()) {
        doc = loadDocXml(docXml);
        // the stored structure is checked against the number of images found above
        if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
  "Loaded document structure from disk.");
            docMd = doc.getMd();
            // no refresh is necessary as doc structure matches the input dir content
            doRefresh = false;
        } else {
            if (doc != null && doc.getMd() != null) {
                // keep any existing metadata if invalid doc structure was found
                docMd = doc.getMd();
  "Removing faulty doc XML from disk and doing reload.");
            // start over with an empty document; metadata kept above is still held in docMd
            doc = new TrpDoc();
    }"Reading document at " + inputDir.getAbsolutePath());
    // find metadata file if not extracted from doc.xml =============================================
    if (docMd == null) {
        try {
            docMd = loadDocMd(inputDir);
        } catch (IOException ioe) {
            // fall back to empty metadata if loading the metadata fails
            docMd = new TrpDocMetadata();
    initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
    // Set the docMd
    // NOTE(review): no statement attaching docMd to doc is visible after this comment.
    if (!doRefresh) {
        // Stop now and reuse doc structure from file
        return doc;
    // Construct the input dir with pageXml Files.
    File pageInputDir = getPageXmlInputDir(inputDir);
    // NOTE(review): the body of this condition is not visible; presumably it creates the
    // missing page XML directory - confirm.
    if (config.isForceCreatePageXml() && !pageInputDir.isDirectory()) {
    // abbyy XML search path
    File ocrInputDir = getOcrXmlInputDir(inputDir);
    // alto XML search path
    File altoInputDir = getAltoXmlInputDir(inputDir);
    // txt search path
    File txtInputDir = getTxtInputDir(inputDir);
    // backupfolder for outdated page format files, if any
    final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase() + "_backup";
    final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
    // iterate imgList, search for corresponding XML files and build TrpPages
    int pageNr = 1;
    List<TrpPage> pages = new ArrayList<TrpPage>(pageMap.entrySet().size());
    // need a special variable to test whether we are in sync mode (only then do the following!!!!)
    // NOTE(review): an empty pageMap already leads to the FileNotFoundException above, so as
    // shown this branch looks unreachable - confirm.
    if (pageMap.entrySet().size() == 0 && config.isEnableSyncWithoutImages()) {
        pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
    for (Entry<String, File> e : pageMap.entrySet()) {
        File imgFile = e.getValue();
        // the img file name without extension
        final String imgFileName = e.getKey();
        // check for a page XML of this name
        File pageXml = findXml(imgFileName, pageInputDir);
        // TODO thumbURL dir + imgFile.getName())+".jpg"
        File thumbFile = getThumbFile(inputDir, imgFileName);
        if (pageXml != null) {
            XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
            // NOTE(review): no break/default is visible between the cases; as shown, PAGE_2010
            // would fall through into the throw under PAGE_2013 - presumably lost lines, confirm.
            switch(xmlFormat) {
                case PAGE_2010:
                    // outdated PAGE format: convert the file, passing the backup path along
                    Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
                case PAGE_2013:
                    throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
        // try to read image dimension in any case to detect corrupt files immediately!
        // FIXME this is taking too long and is only necessary on initial loading
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            // dim stays null; the remark carries the corrupt-image message for this page
            logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(imgFile.getName());
        if (pageXml == null && config.isForceCreatePageXml()) {
            // if no page XML, then create one at this path
            File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
            // candidate transcript sources for this image: abbyy XML, alto XML, txt file
            File abbyyXml = findXml(imgFileName, ocrInputDir);
            File altoXml = findXml(imgFileName, altoInputDir);
            File txtFile = findFile(imgFileName, txtInputDir, "txt");
            pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
        // NOTE(review): the built page is not added to 'pages' in the visible code.
        TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
    // set editorial declaration:
    // NOTE(review): 'features' is read here but not used afterwards in the visible code.
    List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
    // store doc on disk to save time on next load
    LocalDocWriter.writeDocXml(doc, docXml);
    return doc;
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) XmlFormat TrpPage(eu.transkribus.core.model.beans.TrpPage) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Dimension(java.awt.Dimension) EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File)

Example 2 with XmlFormat

Use of XmlFormat in project TranskribusCore by Transkribus.

the class LocalDocReader method createPageFromAbbyy.

private static PcGtsType createPageFromAbbyy(final String imgFileName, File abbyyXml, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException {
    try {
        XmlFormat xmlFormat = XmlUtils.getXmlFormat(abbyyXml);
        if (xmlFormat.equals(XmlFormat.ABBYY_10)) {
   + ": Transforming Finereader10/11 XML to PAGE XML.");
            PcGtsType pc = PageXmlUtils.createPcGtsTypeFromAbbyy(abbyyXml, imgFileName, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
            return pc;
        throw new IOException("Not a valid Finereader10/11 XML file.");
    } catch (IOException | TransformerException ioe) {
        logger.error(ioe.getMessage(), ioe);
        throw new IOException("Could not migrate file: " + abbyyXml.getAbsolutePath(), ioe);
    } catch (ParserConfigurationException | SAXException xmle) {
        logger.error(xmle.getMessage(), xmle);
        throw new IOException("Could not transform XML file!", xmle);
    } catch (JAXBException je) {
        /* TODO This exception is only thrown when the pageXML is unmarshalled 
			 * for inserting the image filename which is not included in the abbyy xml! */
        logger.error(je.getMessage(), je);
        throw new IOException("Transformation output is not a valid page XML!", je);
Also used : XmlFormat JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TransformerException(javax.xml.transform.TransformerException) SAXException(org.xml.sax.SAXException)

Example 3 with XmlFormat

Use of XmlFormat in project TranskribusCore by Transkribus.

the class JaxbUtils method createXmlMarshaller.

private static <T> Marshaller createXmlMarshaller(T object, boolean doFormatting, Class<?>... nestedClasses) throws JAXBException {
    Class<?>[] targetClasses = merge(object.getClass(), nestedClasses);
    JAXBContext jc = createJAXBContext(targetClasses);
    Marshaller marshaller = jc.createMarshaller();
    marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, doFormatting);
    marshaller.setProperty(Marshaller.JAXB_ENCODING, "UTF-8");
    XmlFormat format = XmlFormat.resolveFromClazz(object.getClass());
    if (format != null && !format.equals(XmlFormat.UNKNOWN)) {
        marshaller.setProperty(Marshaller.JAXB_SCHEMA_LOCATION, format.xsiSchemaLocation);
    return marshaller;
Also used : Marshaller(javax.xml.bind.Marshaller) XmlFormat JAXBContext(javax.xml.bind.JAXBContext)

Example 4 with XmlFormat

Use of XmlFormat in project TranskribusCore by Transkribus.

the class XmlUtils method getXmlFormat.

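/**
 * Returns supported XML format or throws Exception.
 * @param xmlFile the XML file to inspect
 * @return the detected format; XmlFormat.UNKNOWN if no namespace is found
 * @throws IOException on read errors
 */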
public static XmlFormat getXmlFormat(File xmlFile) throws IOException {
    String namespace = null;
    // build pattern to extract "xmlns="(http://my.namespa.ce)""
    final String URL_CHARSET = "[a-zA-Z0-9:/_&?~#%=\\.\\-]*";
    final String xmlnsRegex = ".*\\s*xmlns=\"(" + URL_CHARSET + ")\".*";
    Pattern pattern = Pattern.compile(xmlnsRegex);
    // read first 2048 characters of this file.
    // if the root element is too long and the namespace is not included here, this won't work
    final String content = DeaFileUtils.readFileAsString(xmlFile, 2048);
    Matcher m = pattern.matcher(content);
    XmlFormat format;
    if (m.find()) {
        namespace =;
        format = XmlFormat.resolveFromNs(namespace);
        logger.debug(xmlFile.getName() + ": " + format.toString() + " - namespace: " + namespace);
    } else {
        final String msg = "No namespace found in file: " + xmlFile.getAbsolutePath();
        format = XmlFormat.UNKNOWN;
    return format;
/* alternative:
 * read the whole file as DOM. This is less performant but does a well-formedness check.
 */
// try{
// DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
// DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
// Document doc = dBuilder.parse(xmlFile);
// namespace = doc.getDocumentElement().getAttribute("xmlns");
// } catch (SAXException | ParserConfigurationException e){
// throw new IOException(e);
// }
Also used : Pattern(java.util.regex.Pattern) XmlFormat Matcher(java.util.regex.Matcher)

Example 5 with XmlFormat

Use of XmlFormat in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method createPageFromAlto2.

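/**
 * Create a page file from the given Alto file.
 * @param imgFile the image file; its name is passed on to the converter
 * @param altoXml the ALTO XML file to convert
 * @param pageOutFile the file the PAGE XML is marshalled to
 * @param preserveOcrTxtStyles passed through to the converter
 * @param preserveOcrFontFamily passed through to the converter
 * @param replaceBadChars passed through to the converter
 * @return the file returned by JaxbUtils.marshalToFile
 * @throws IOException if altoXml is not detected as ALTO v2
 * @throws TransformerException
 * @throws SAXException
 * @throws ParserConfigurationException
 * @throws JAXBException
 */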
public File createPageFromAlto2(File imgFile, File altoXml, File pageOutFile, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException, TransformerException, SAXException, ParserConfigurationException, JAXBException {
    XmlFormat xmlFormat = XmlUtils.getXmlFormat(altoXml);
    if (xmlFormat.equals(XmlFormat.ALTO_2)) { + ": Transforming ALTO v2 XMLs to PAGE XML.");
        PcGtsType pc = PageXmlUtils.createPcGtsTypeFromAlto(altoXml, imgFile.getName(), preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
        return JaxbUtils.marshalToFile(pc, pageOutFile);
    throw new IOException("Could not determine xml file as valid alto2: " + altoXml.getAbsolutePath());
Also used : XmlFormat IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType)


