Search in sources :

Example 1 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFiles.

/**
 * Fetches the files referenced by a Goobi METS and builds one {@link TrpPage} per page div.
 * <p>
 * Images are taken from the file group with USE="MAX"; if there is none, the group with
 * USE="DEFAULT" is used instead. The group with USE="XML" is optional. The page divs are
 * read from the first struct map with TYPE="PHYSICAL" whose root div has TYPE="physSequence"
 * and are processed in ascending ORDER.
 *
 * @param dir the local directory that is handed to fetchFilesFromUrl as storage location
 * @param mets the unmarshalled Goobi Mets file
 * @return the pages built from the page divs, in ascending ORDER of the divs
 * @throws IOException if the METS has no image file list or no valid physical struct map,
 *         or if fetching the files of a page fails
 */
public List<TrpPage> fetchFiles(String dir, Mets mets) throws IOException {
    // without a file section there cannot be any image file list
    if (mets.getFileSec() == null) {
        throw new IOException("METS file has no image file list!");
    }
    List<FileGrp> fileGrps = mets.getFileSec().getFileGrp();
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    List<FileType> defaultImgGrp = null;
    for (FileGrpType type : fileGrps) {
        final String use = type.getUSE();
        if (use == null) {
            // a switch on a null String throws a NullPointerException, so skip groups without USE
            continue;
        }
        switch(use) {
            case "MAX":
                imgGrp = type.getFile();
                break;
            /*
             * could also be that USE='Content' and ID="AltoFiles" or ID="AbbyyXmlFiles" is necessary to get the transcriptions
             */
            case "DEFAULT":
                defaultImgGrp = type.getFile();
                break;
            case "XML":
                // possibility to load also an existent Alto or Abbyy XML and convert it to Page later on
                // TODO: clarify
                xmlGrp = type.getFile();
                break;
            default:
                break;
        }
    }
    // take default images if no MAX images are available
    if (imgGrp == null && defaultImgGrp != null) {
        imgGrp = defaultImgGrp;
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // a missing XML file list is tolerated; pages are then built without Abbyy/ALTO input
        logger.debug("no xml file list");
    }
    List<DivType> pageDivs = null;
    for (StructMapType sMap : mets.getStructMap()) {
        // constant-first comparisons so that a missing TYPE attribute or root div does not cause an NPE;
        // a check on the root div ID "PHYS_0000" is currently disabled
        if ("PHYSICAL".equals(sMap.getTYPE()) && sMap.getDiv() != null
                && "physSequence".equals(sMap.getDiv().getTYPE())) {
            pageDivs = sMap.getDiv().getDiv();
            break;
        }
    }
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    // sort the page divs in ascending ORDER (natural order of the ORDER attribute)
    Comparator<DivType> comp = (a, b) -> a.getORDER().compareTo(b.getORDER());
    pageDivs.sort(comp);
    for (DivType div : pageDivs) {
        // fetch all files and store them locally
        TrpPage p = fetchFilesFromUrl(div, imgGrp, xmlGrp, dir);
        pages.add(p);
    }
    return pages;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType)

Example 2 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFilesFromUrl.

/**
 * Downloads the files referenced by one page div and builds the {@link TrpPage} for it.
 * <p>
 * Each file pointer of the div is resolved to its first FLocat, which must have LOCTYPE "URL".
 * Files contained in imgGrp are stored in the "img" sub folder of dir; files contained in
 * xmlGrp are stored in the OCR or ALTO sub folder depending on whether their ID contains
 * "AbbyyXml" or "Alto". A failed XML download is logged and ignored, a failed or missing
 * image is recorded as problem message on the page.
 *
 * @param div the page div from the struct map
 * @param imgGrp the files that are considered page images
 * @param xmlGrp the files that are considered Abbyy/ALTO XMLs, may be null
 * @param dir the local directory in which the files are stored
 * @return the page built by LocalDocReader.buildPage
 * @throws IOException if the page dir can not be created, a file pointer does not resolve to
 *         an FLocat with LOCTYPE "URL", or a called helper fails
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * handle cases where no image can be retrieved/stored for this page:
     * -image URL is broken
     * -the image dimension can not be read from the downloaded file
     * -no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        FileType type = (FileType) ptr.getFILEID();
        // fail with a checked, descriptive exception instead of an NPE/IndexOutOfBoundsException
        if (type == null || type.getFLocat().isEmpty()) {
            throw new IOException("File pointer of page " + pageNr + " does not reference a file with an FLocat element!");
        }
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Deriving the filename from the URL brought problems with file/img links
         * without the filename + ext at the end of the URL.
         *
         * Preferred filename is the name in the getHeaderField("Content-Disposition");
         * as fallback we use the fileID and mimetype extension
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is controlled by the remote server: keep only the last path
                // segment so that it can not point outside of the target folders.
                final String safeFn = FilenameUtils.getName(tmpFn);
                if (!safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returns a response with code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                // TODO: implement
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                // TODO: implement
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        // the file only exists if the download actually stored something
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML gets the same base name as the image
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    TrpPage page = LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
    return page;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) FileType(eu.transkribus.core.model.beans.mets.FileType) TrpPage(eu.transkribus.core.model.beans.TrpPage) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) IOException(java.io.IOException) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) Dimension(java.awt.Dimension) File(java.io.File) URL(java.net.URL)

Example 3 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Read a local doc based on its mets.xml file that has to contain file
 * paths relative to parentDir.
 * <p>
 * Only METS with the PROFILE {@code TrpMetsBuilder.TRP_METS_PROFILE} are supported.
 *
 * @param mets the mets object
 * @param parentDir the directory of the local document
 * @return the constructed Document
 * @throws IOException if the METS PROFILE is missing or unsupported, or if the path can't be read
 */
public static TrpDoc load(Mets mets, File parentDir) throws IOException {
    // FIXME set TRP_METS_VERSION to PROFILE, not TYPE
    // constant-first comparison: a METS without PROFILE is reported as unsupported instead of causing an NPE
    if (!TrpMetsBuilder.TRP_METS_PROFILE.equals(mets.getPROFILE())) {
        // NOTE: a branch for EnmapMetsBuilder.ENMAP_METS_PROFILE existed here but is disabled
        throw new IOException("Unsupported METS PROFILE: " + mets.getPROFILE());
    }
    // unmarshal TrpDocMetadata
    final TrpDocMetadata md = MetsUtil.getTrpDocMd(mets);
    // collect files
    final List<TrpPage> pages = MetsUtil.getTrpPages(mets, parentDir);
    md.setLocalFolder(parentDir);
    final TrpDoc doc = new TrpDoc();
    doc.setMd(md);
    doc.setPages(pages);
    return doc;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) IOException(java.io.IOException)

Example 4 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Transforms an upload object into a {@link TrpDoc}.
 * <p>
 * For each page descriptor the image is expected in the upload's tmp dir. If the descriptor
 * names a PAGE XML, that file has to exist in the upload's page tmp dir. Otherwise an empty
 * PAGE XML is written for the page, unless the image turned out to be corrupt; in that case
 * the page is built without PAGE XML and with a remark on the corrupt image.
 *
 * @param upload the upload to transform
 * @return the constructed document
 * @throws IllegalArgumentException if upload is null, has an ID smaller than 1, has no
 *         metadata or its directories are not readable
 * @throws FileNotFoundException if an image or a declared PAGE XML does not exist
 * @throws IOException if an empty PAGE XML can not be written or a called helper fails
 */
public static TrpDoc load(TrpUpload upload) throws IOException {
    // validate most necessary things
    if (upload == null) {
        throw new IllegalArgumentException("Upload is null.");
    }
    if (upload.getUploadId() < 1) {
        throw new IllegalArgumentException("Invalid upload ID: " + upload.getUploadId());
    }
    if (!upload.canReadDirectories()) {
        throw new IllegalArgumentException("Directories are not readable: " + upload.getUploadTmpDir().getAbsolutePath());
    }
    TrpDocMetadata md = upload.getMd();
    if (md == null) {
        // the metadata is dereferenced right below; report its absence like the other invalid inputs
        throw new IllegalArgumentException("Upload has no metadata.");
    }
    // transform the upload object into a TRP document
    TrpDoc doc = new TrpDoc();
    md.setLocalFolder(upload.getUploadTmpDir());
    doc.setMd(md);
    File baseDir = upload.getUploadTmpDir();
    File xmlDir = upload.getUploadPageTmpDir();
    File thumbDir = new File(baseDir.getAbsolutePath() + File.separatorChar + LocalDocConst.THUMBS_FILE_SUB_FOLDER);
    for (PageUploadDescriptor p : upload.getPages()) {
        final int pageNr = p.getPageNr();
        File img = new File(baseDir.getAbsolutePath() + File.separator + p.getFileName());
        if (!img.isFile()) {
            throw new FileNotFoundException("Image for page " + pageNr + " does not exist: " + img.getAbsolutePath());
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(img);
        } catch (CorruptImageException cie) {
            logger.error("Image is corrupt: " + img.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(img.getName());
        }
        final String imgBaseName = FilenameUtils.getBaseName(img.getName());
        File thumb = getThumbFile(thumbDir, imgBaseName);
        File pageXml = null;
        if (!StringUtils.isEmpty(p.getPageXmlName())) {
            pageXml = new File(xmlDir.getAbsolutePath() + File.separator + p.getPageXmlName());
            if (!pageXml.isFile()) {
                // report the path of the missing PAGE XML, not the path of the image
                throw new FileNotFoundException("PAGE XML for page " + pageNr + " does not exist: " + pageXml.getAbsolutePath());
            }
        } else if (StringUtils.isEmpty(imageRemark)) {
            // no PAGE XML was uploaded and the image could be read: create an empty PAGE XML.
            // If reading the image failed, no PAGE XML is created and pageXml stays null.
            File pageOutFile = new File(xmlDir.getAbsolutePath() + File.separatorChar + imgBaseName + ".xml");
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img, dim);
            try {
                pageXml = JaxbUtils.marshalToFile(pc, pageOutFile);
            } catch (JAXBException je) {
                logger.error(je.getMessage(), je);
                throw new IOException("Could not create empty PageXml on disk!", je);
            }
        }
        TrpPage page = buildPage(baseDir, pageNr, img, pageXml, thumb, dim, imageRemark);
        doc.getPages().add(page);
    }
    return doc;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) TrpPage(eu.transkribus.core.model.beans.TrpPage) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) Dimension(java.awt.Dimension) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File) PageUploadDescriptor(eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)

Example 5 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Loads a document from path.<br>
 *
 * Document metadata has to be in an XML called "metadata.xml".<br>
 *
 * Image files and corresponding XML/txt files have to have the same name. <br>
 * Lexicographic order of image names will imply order of pages.<br>
 * Types of transcript source files are searched in this order:
 * <ol>
 * <li>./page: PAGE XMLs according to schema 2010/2013</li>
 * <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
 * <li>./alto: ALTO v2 XMls
 * <li>./txt: txt files with transcription fulltext only
 * </ol>
 * Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
 * No versioning of files for local use!<br>
 * <p>
 * If a doc XML exists in the directory and matches its content, the document structure is
 * reused from that file. Otherwise the structure is rebuilt from the files found and the
 * doc XML is (re)written.
 *
 * @param path the path where the document is stored
 * @param config {@link DocLoadConfig}
 * @param monitor currently not used by this method
 * @return the constructed document
 * @throws IOException if the path can't be read or is malformed, if no page images are found,
 *         or if the PAGE XML dir contains an incompatible XML or can not be created
 *
 * @todo implement monitor feedback!
 * @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
 */
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
    // create the document
    TrpDoc doc = new TrpDoc();
    // check OS and adjust URL protocol
    final String os = System.getProperty("os.name");
    /*
     * FIXME use SysUtils.isWin() here?
     */
    if (os.toLowerCase().contains("win")) {
        LocalDocConst.URL_PROT_CONST = "file:///";
    }
    // else: keep default
    final File inputDir = new File(path);
    final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
    // validate input path ======================================================
    checkInputDir(inputDir);
    // the input dir with pageXml files
    final File pageInputDir = getPageXmlInputDir(inputDir);
    // search for IMG files
    TreeMap<String, File> pageMap = findImgFiles(inputDir);
    logger.info("Found " + pageMap.size() + " page images.");
    // Sync mode without images: derive dummy image files from the PAGE XMLs. This has to happen
    // before the check for an empty page map, otherwise the sync mode can never take effect.
    if (pageMap.isEmpty() && config.isEnableSyncWithoutImages() && pageInputDir.isDirectory()) {
        pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
    }
    if (pageMap.isEmpty()) {
        throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
    }
    TrpDocMetadata docMd = null;
    boolean doRefresh = true;
    // try to read doc structure from disk
    if (docXml.isFile()) {
        doc = loadDocXml(docXml);
        if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
            logger.info("Loaded document structure from disk.");
            docMd = doc.getMd();
            // no refresh is necessary as doc structure matches the input dir content
            doRefresh = false;
        } else {
            if (doc != null && doc.getMd() != null) {
                // keep any existing metadata if invalid doc structure was found
                docMd = doc.getMd();
            }
            logger.info("Removing faulty doc XML from disk and doing reload.");
            if (!docXml.delete()) {
                // not fatal: the file is rewritten at the end of this method
                logger.warn("Could not remove faulty doc XML: " + docXml.getAbsolutePath());
            }
            doc = new TrpDoc();
        }
    }
    logger.info("Reading document at " + inputDir.getAbsolutePath());
    // find metadata file if not extracted from doc.xml =============================================
    if (docMd == null) {
        try {
            docMd = loadDocMd(inputDir);
        } catch (IOException ioe) {
            // missing/unreadable metadata is tolerated; continue with empty metadata
            logger.debug("Could not load document metadata from " + inputDir.getAbsolutePath() + ", using empty metadata.", ioe);
            docMd = new TrpDocMetadata();
        }
    }
    initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
    // Set the docMd
    doc.setMd(docMd);
    if (!doRefresh) {
        // Stop now and reuse doc structure from file
        return doc;
    }
    // make sure the PAGE XML dir exists if PAGE XMLs are to be created
    if (config.isForceCreatePageXml() && !pageInputDir.isDirectory() && !pageInputDir.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageInputDir.getAbsolutePath());
    }
    // abbyy XML search path
    File ocrInputDir = getOcrXmlInputDir(inputDir);
    // alto XML search path
    File altoInputDir = getAltoXmlInputDir(inputDir);
    // txt file search path
    File txtInputDir = getTxtInputDir(inputDir);
    // backupfolder for outdated page format files, if any
    final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase() + "_backup";
    final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
    // iterate imgList, search for corresponding XML files and build TrpPages
    int pageNr = 1;
    List<TrpPage> pages = new ArrayList<TrpPage>(pageMap.size());
    for (Entry<String, File> e : pageMap.entrySet()) {
        File imgFile = e.getValue();
        // the img file name without extension
        final String imgFileName = e.getKey();
        // check for a page XML of this name
        File pageXml = findXml(imgFileName, pageInputDir);
        // TODO thumbURL dir + imgFile.getName())+".jpg"
        File thumbFile = getThumbFile(inputDir, imgFileName);
        if (pageXml != null) {
            XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
            switch(xmlFormat) {
                case PAGE_2010:
                    // outdated format: convert the file, backupPath is handed to the converter
                    Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
                    break;
                case PAGE_2013:
                    break;
                default:
                    throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
            }
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        // FIXME this is taking too long and is only necessary on initial loading
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(imgFile.getName());
        }
        if (pageXml == null && config.isForceCreatePageXml()) {
            // if no page XML, then create one at this path
            File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
            File abbyyXml = findXml(imgFileName, ocrInputDir);
            File altoXml = findXml(imgFileName, altoInputDir);
            File txtFile = findFile(imgFileName, txtInputDir, "txt");
            pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
        }
        TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
        pages.add(page);
    }
    doc.setPages(pages);
    doc.getMd().setNrOfPages(doc.getPages().size());
    // set editorial declaration:
    List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
    doc.setEdDeclList(features);
    logger.debug(doc.toString());
    // store doc on disk to save time on next load
    LocalDocWriter.writeDocXml(doc, docXml);
    return doc;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) TrpPage(eu.transkribus.core.model.beans.TrpPage) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Dimension(java.awt.Dimension) EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File)

Aggregations

TrpPage (eu.transkribus.core.model.beans.TrpPage)32 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)14 File (java.io.File)14 IOException (java.io.IOException)14 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)10 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)7 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)7 URL (java.net.URL)7 ArrayList (java.util.ArrayList)7 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)6 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)5 FileType (eu.transkribus.core.model.beans.mets.FileType)5 JAXBException (javax.xml.bind.JAXBException)5 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)4 Dimension (java.awt.Dimension)4 FileNotFoundException (java.io.FileNotFoundException)4 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)3 DivType (eu.transkribus.core.model.beans.mets.DivType)3 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)3 FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)3