Search in sources :

Example 1 with FileType

use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.

the class FEPLocalDocReader method parsePhysicalStructure.

/**
 * Walks the child divs of the physical structMap in ascending ORDER and resolves the files
 * each of them references.
 * <p>
 * Note that the div list of the structMap's root div is sorted in place.
 *
 * @param inputDir directory handed on to {@code findFile} for resolving the files
 * @param mets the METS document to read the structMap from
 * @return one map per div, in div order; each map holds the resolved files keyed by the ID
 *         of the file group they were found in
 * @throws IOException if a div does not have exactly one fptr element or that fptr has no
 *         par element
 */
static List<HashMap<String, File>> parsePhysicalStructure(File inputDir, Mets mets) throws IOException {
    final DivType physRoot = findStructMap(mets, PHYSICAL_STRUCT_MAP_LABEL).getDiv();
    final List<DivType> orderedDivs = physRoot.getDiv();
    // bring the divs into ascending ORDER before walking them
    Collections.sort(orderedDivs, (left, right) -> left.getORDER().compareTo(right.getORDER()));

    final List<HashMap<String, File>> result = new ArrayList<>();
    for (DivType current : orderedDivs) {
        final int fptrCount = current.getFptr().size();
        if (fptrCount != 1) {
            throw new IOException("Error parsing physical structure: nr of fptr elements is not 1 in div: " + fptrCount + ", id: " + current.getID());
        }
        final ParType parallel = current.getFptr().get(0).getPar();
        if (parallel == null) {
            throw new IOException("Error parsing physical structure: could not parse par element in fptr of div: " + current.getID());
        }

        final HashMap<String, File> filesByGroupId = new HashMap<>();
        for (Serializable entry : parallel.getAreaOrSeq()) {
            // only area entries are evaluated; anything else in the par element is ignored
            if (!(entry instanceof AreaType)) {
                continue;
            }
            final FileType referenced = (FileType) ((AreaType) entry).getFILEID();
            final String fileId = referenced.getID();
            final Pair<FileGrp, File> located = findFile(inputDir, mets, fileId);
            logger.debug("found file with id: " + fileId + ", path: " + located.getRight().getAbsolutePath());
            filesByGroupId.put(located.getLeft().getID(), located.getRight());
        }
        result.add(filesByGroupId);
    }
    return result;
}
Also used : Serializable(java.io.Serializable) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) IOException(java.io.IOException) AreaType(eu.transkribus.core.model.beans.mets.AreaType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) ParType(eu.transkribus.core.model.beans.mets.ParType) File(java.io.File)

Example 2 with FileType

use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFiles.

/**
 * Fetches the files of all pages listed in the physical structMap of a Goobi METS document
 * and builds one page per div.
 * <p>
 * Images are taken from the file group with USE="MAX"; if there is none, the group with
 * USE="DEFAULT" is used instead. The file group with USE="XML" is optional. File groups
 * without a USE attribute are ignored.
 * <p>
 * Note that the div list of the matching structMap is sorted in place.
 *
 * @param dir  directory path handed on to {@code fetchFilesFromUrl} as the local storage root
 * @param mets the unmarshalled Goobi METS file
 * @return the pages, in ascending ORDER of their divs
 * @throws IOException if the METS has no image file group, if no structMap with
 *         TYPE="PHYSICAL" and a root div of TYPE="physSequence" exists, or if fetching the
 *         files of a page fails
 */
public List<TrpPage> fetchFiles(String dir, Mets mets) throws IOException {
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    List<FileType> defaultImgGrp = null;
    // a METS without fileSec simply has no file groups; the check below reports that
    if (mets.getFileSec() != null) {
        List<FileGrp> fileGrps = mets.getFileSec().getFileGrp();
        for (FileGrpType type : fileGrps) {
            final String use = type.getUSE();
            if (use == null) {
                // switching on a null String would throw a NullPointerException
                continue;
            }
            switch (use) {
                case "MAX":
                    imgGrp = type.getFile();
                    break;
                /*
                 * could also be that USE='Content' and ID="AltoFiles" or ID="AbbyyXmlFiles" is necessary to get the transcriptions
                 */
                case "DEFAULT":
                    defaultImgGrp = type.getFile();
                    break;
                case "XML":
                    // possibility to load also an existent Alto or Abbyy XML and convert it to Page later on
                    // TODO: clarify
                    xmlGrp = type.getFile();
                    break;
                default:
                    break;
            }
        }
    }
    // take default images if no MAX images are available
    if (imgGrp == null && defaultImgGrp != null) {
        imgGrp = defaultImgGrp;
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // XML files are optional, so a missing list is not treated as an error
        logger.debug("no xml file list");
    }
    List<DivType> pageDivs = null;
    for (StructMapType sMap : mets.getStructMap()) {
        final DivType rootDiv = sMap.getDiv();
        // constant-first equals: a structMap or div without TYPE must not break the search
        if ("PHYSICAL".equals(sMap.getTYPE()) && rootDiv != null && "physSequence".equals(rootDiv.getTYPE())) {
            pageDivs = rootDiv.getDiv();
            break;
        }
    }
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    // sort the page divs in ascending ORDER
    pageDivs.sort((DivType a, DivType b) -> a.getORDER().compareTo(b.getORDER()));
    for (DivType div : pageDivs) {
        // fetch all files and store them locally
        TrpPage p = fetchFilesFromUrl(div, imgGrp, xmlGrp, dir);
        pages.add(p);
    }
    return pages;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType)

Example 3 with FileType

use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFilesFromUrl.

/**
 * Downloads the files referenced by the fptr elements of one page div and builds the page
 * from them.
 * <p>
 * A file contained in {@code imgGrp} is stored in the "img" sub folder of {@code dir}. A file
 * contained in {@code xmlGrp} is stored as Abbyy or ALTO XML, depending on whether its ID
 * contains "AbbyyXml" or "Alto". Failed XML downloads are logged and ignored; problems with
 * the image are recorded in the problem message handed to {@code LocalDocReader.buildPage}.
 *
 * @param div    the page div; its ORDER is used as page number
 * @param imgGrp the files that are treated as images
 * @param xmlGrp the files that are treated as OCR XML; may be {@code null}
 * @param dir    path of the local directory the files are stored in
 * @return the page built by {@code LocalDocReader.buildPage}
 * @throws IOException if the page directory can not be created, if an fptr references no
 *         file, if a file has no FLocat element or a LOCTYPE other than "URL", or if the
 *         location is not a valid URL
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * handle cases where no image can be retrieved/stored for this page:
     * -image URL is broken
     * -the image dimension can not be read from the downloaded file
     * -no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        FileType type = (FileType) ptr.getFILEID();
        if (type == null) {
            // fail with a descriptive error instead of a NullPointerException below
            throw new IOException("An fptr element of page " + pageNr + " does not reference a file!");
        }
        if (type.getFLocat().isEmpty()) {
            // fail with a descriptive error instead of an IndexOutOfBoundsException below
            throw new IOException("No FLocat element in file: " + type.getID());
        }
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Deriving the name from the URL brought problems with file/img links without the
         * filename + ext at the end of the URL.
         *
         * Preferred filename is the name in the getHeaderField("Content-Disposition");
         * as fallback we use the fileID and mimetype extension
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is chosen by the remote server. Keep only the last path
                // segment so that it can not point outside of the target directory, and stay
                // with the fallback name if nothing usable remains.
                String safeFn = new File(tmpFn).getName();
                if (!safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returns a response with code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                // TODO: implement
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                // TODO: implement
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML is named after the image file
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    TrpPage page = LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
    return page;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) FileType(eu.transkribus.core.model.beans.mets.FileType) TrpPage(eu.transkribus.core.model.beans.TrpPage) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) IOException(java.io.IOException) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) Dimension(java.awt.Dimension) File(java.io.File) URL(java.net.URL)

Example 4 with FileType

use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.

the class TrpDocPacker method getFiles.

/**
 * Collects the hrefs of all file locations in the given file group whose LOCTYPE is "OTHER"
 * and whose OTHERLOCTYPE is "FILE". Each collected href is also logged and reported via
 * {@code updateStatus}.
 *
 * @param type the file group to scan
 * @return the matching hrefs in encounter order; empty if none match
 */
private List<String> getFiles(FileGrpType type) {
    final List<String> hrefs = new LinkedList<>();
    for (FileType file : type.getFile()) {
        for (FLocat location : file.getFLocat()) {
            final boolean matches = location.getLOCTYPE().equals("OTHER") && location.getOTHERLOCTYPE().equals("FILE");
            if (!matches) {
                continue;
            }
            final String href = location.getHref();
            logger.debug("Adding File: " + href);
            hrefs.add(href);
            updateStatus("Adding File: " + href);
        }
    }
    return hrefs;
}
Also used : FileType(eu.transkribus.core.model.beans.mets.FileType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) LinkedList(java.util.LinkedList)

Example 5 with FileType

use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.

the class MetsUtil method buildUploadImage.

/**
 * Builds the upload descriptor for one page div.
 * <p>
 * The descriptor's page number is set to the div's ORDER minus one. Every file referenced by
 * the div's fptr elements is looked up in {@code imgGrp} first and in {@code xmlGrp} second;
 * its name and checksum are stored as image or XML data accordingly. Files found in neither
 * list are ignored.
 *
 * @param div    the page div
 * @param imgGrp the files that are treated as images
 * @param xmlGrp the files that are treated as page XML; may be {@code null}
 * @return the descriptor; image name and checksum stay {@code null} if the div maps no image
 * @throws IllegalArgumentException if the image file name is rejected by
 *         {@code IMG_NAME_FILTER}
 */
private static PageUploadDescriptor buildUploadImage(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp) {
    final PageUploadDescriptor descriptor = new PageUploadDescriptor();
    final int pageIndex = div.getORDER().intValue() - 1;
    descriptor.setPageNr(pageIndex);

    final String pageDirPrefix = LocalDocConst.PAGE_FILE_SUB_FOLDER + "/";
    String imgFileName = null;
    String imgChecksum = null;
    String xmlFileName = null;
    String xmlChecksum = null;
    for (Fptr pointer : div.getFptr()) {
        final FileType file = (FileType) pointer.getArea().getFILEID();
        final Pair<String, String> nameAndChecksum = MetsUtil.getFileNameAndChecksum(file);

        if (imgGrp.contains(file)) {
            imgFileName = nameAndChecksum.getLeft();
            if (!IMG_NAME_FILTER.accept(null, imgFileName)) {
                throw new IllegalArgumentException("Image type is not supported: " + imgFileName);
            }
            imgChecksum = nameAndChecksum.getRight();
            continue;
        }
        if (xmlGrp == null || !xmlGrp.contains(file)) {
            continue;
        }
        xmlFileName = nameAndChecksum.getLeft();
        xmlChecksum = nameAndChecksum.getRight();
        // remove the "page/" prefix in XML filename if existent
        if (!StringUtils.isEmpty(xmlFileName) && xmlFileName.startsWith(pageDirPrefix)) {
            xmlFileName = xmlFileName.replaceFirst(pageDirPrefix, "");
        }
    }

    if (!StringUtils.isEmpty(imgFileName)) {
        logger.info("Page " + descriptor.getPageNr() + " image: " + imgFileName);
    } else {
        logger.error("No master image mapped for page index = " + pageIndex + " in the structmap!");
    }
    descriptor.setFileName(imgFileName);
    descriptor.setImgChecksum(imgChecksum);
    descriptor.setPageXmlName(xmlFileName);
    descriptor.setPageXmlChecksum(xmlChecksum);
    return descriptor;
}
Also used : FileType(eu.transkribus.core.model.beans.mets.FileType) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) PageUploadDescriptor(eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)

Aggregations

FileType (eu.transkribus.core.model.beans.mets.FileType)10 TrpPage (eu.transkribus.core.model.beans.TrpPage)5 DivType (eu.transkribus.core.model.beans.mets.DivType)5 File (java.io.File)5 IOException (java.io.IOException)5 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)4 FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)4 FLocat (eu.transkribus.core.model.beans.mets.FileType.FLocat)4 ArrayList (java.util.ArrayList)4 FileGrp (eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp)3 StructMapType (eu.transkribus.core.model.beans.mets.StructMapType)3 URL (java.net.URL)3 Date (java.util.Date)3 PageUploadDescriptor (eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)2 ITrpFile (eu.transkribus.core.model.beans.ITrpFile)2 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 XMLGregorianCalendar (javax.xml.datatype.XMLGregorianCalendar)2 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)1 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)1 AmdSecType (eu.transkribus.core.model.beans.mets.AmdSecType)1