Search in sources :

Example 1 with FLocat

use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFilesFromUrl.

/**
 * Downloads the files referenced by the file pointers of one page div and builds the
 * corresponding {@link TrpPage}.
 * <p>
 * Files contained in {@code imgGrp} are stored in {@code dir/img}. Files contained in
 * {@code xmlGrp} are stored in the OCR or ALTO sub folder of {@code dir}, depending on
 * whether their ID contains "AbbyyXml" or "Alto". A failed XML download is logged and
 * ignored; a failed or unreadable image is reported via the problem message passed to
 * {@code LocalDocReader.buildPage}.
 *
 * @param div the structmap div of the page; its ORDER is used as page number
 * @param imgGrp the files that are treated as page images; must not be null
 * @param xmlGrp the files that are checked for Abbyy/ALTO XML; may be null
 * @param dir path of the directory the document is stored in
 * @return the page built from the downloaded files
 * @throws IOException if the PAGE sub folder can not be created or an FLocat has a
 *         LOCTYPE other than "URL"; the called helpers may presumably throw it as well
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * handle cases where no image can be retrieved/stored for this page:
     * -image URL is broken
     * -the image dimension can not be read from the downloaded file
     * -no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        FileType type = (FileType) ptr.getFILEID();
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Deriving the name from the URL caused problems with file/img links that do not
         * end in filename + extension, so the preferred filename is the one announced in
         * the "Content-Disposition" header field. As fallback the file ID and the
         * extension looked up for the mimetype are used.
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is chosen by the remote server. Keep only its last path
                // segment so that a name like "../../x" can not point outside of the target
                // folder, and stick to the fallback if nothing usable remains.
                final String safeFn = FilenameUtils.getName(tmpFn);
                if (!safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returned a response with code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                // TODO: implement
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                // TODO: implement
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        // the file is missing here if the download failed; dim stays null in that case
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML is named after the image file
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    TrpPage page = LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
    return page;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) FileType(eu.transkribus.core.model.beans.mets.FileType) TrpPage(eu.transkribus.core.model.beans.TrpPage) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) IOException(java.io.IOException) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) Dimension(java.awt.Dimension) File(java.io.File) URL(java.net.URL)

Example 2 with FLocat

use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.

the class TrpDocPacker method getFiles.

/**
 * Collects the hrefs of all FLocat elements in a file group that have LOCTYPE "OTHER"
 * and OTHERLOCTYPE "FILE". Each added href is also logged and reported via
 * {@code updateStatus}.
 *
 * @param type the file group whose files are inspected
 * @return the matching hrefs in the order they are encountered; empty if none match
 */
private List<String> getFiles(FileGrpType type) {
    List<String> fileList = new LinkedList<>();
    for (FileType ft : type.getFile()) {
        for (FLocat fl : ft.getFLocat()) {
            // constant-first equals: an FLocat without LOCTYPE/OTHERLOCTYPE is skipped
            // instead of causing a NullPointerException
            if ("OTHER".equals(fl.getLOCTYPE()) && "FILE".equals(fl.getOTHERLOCTYPE())) {
                final String href = fl.getHref();
                logger.debug("Adding File: " + href);
                fileList.add(href);
                updateStatus("Adding File: " + href);
            }
        }
    }
    return fileList;
}
Also used : FileType(eu.transkribus.core.model.beans.mets.FileType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) LinkedList(java.util.LinkedList)

Example 3 with FLocat

use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.

the class MetsUtil method getFile.

/**
 * Resolves the file referenced by the first FLocat of a METS file entry relative to
 * {@code parentDir} and verifies its MD5 checksum if the entry declares one.
 * <p>
 * A missing checksum or a checksum algorithm other than MD5 is only logged and does not
 * make the call fail.
 *
 * @param type the METS file entry
 * @param parentDir the directory the href of the FLocat is resolved against
 * @return the existing file the entry refers to
 * @throws IOException if the entry has no FLocat, the FLocat does not have OTHERLOCTYPE
 *         "FILE", the file does not exist, or the MD5 checksum does not match
 */
public static File getFile(FileType type, File parentDir) throws IOException {
    if (type.getFLocat().isEmpty()) {
        throw new IOException("No FLocat element found for METS file: " + type.getID());
    }
    final FLocat fLocat = type.getFLocat().get(0);
    if (!"FILE".equals(fLocat.getOTHERLOCTYPE())) {
        // TODO implement for URL type
        throw new IOException("METS file does not belong to a local document!");
    }
    // localdoc
    final File file = new File(parentDir.getAbsolutePath() + File.separator + fLocat.getHref());
    if (!file.exists()) {
        throw new IOException("File does not exist: " + file.getAbsolutePath());
    }
    if (!type.isSetCHECKSUMTYPE()) {
        logger.error("No checksum set!");
    } else if (!type.getCHECKSUMTYPE().equals(ChecksumUtils.ChkSumAlg.MD5.toString())) {
        logger.error("Unknown checksum algorithm: " + type.getCHECKSUMTYPE());
    } else {
        final String metsChkSum = type.getCHECKSUM();
        if (metsChkSum == null || metsChkSum.trim().isEmpty()) {
            // an algorithm is declared but no usable value is given: handle this like a
            // missing checksum instead of failing with a NullPointerException or a
            // mismatch that can never be resolved
            logger.error("No checksum set!");
        } else {
            final String chkSum = ChecksumUtils.getMd5SumHex(file);
            // hex digests are case-insensitive
            if (!metsChkSum.equalsIgnoreCase(chkSum)) {
                throw new IOException("Checksum error: METS=" + metsChkSum + " <-> FILE=" + chkSum + " | " + file.getAbsolutePath());
            }
            logger.debug("Checksum is correct: " + file.getAbsolutePath());
        }
    }
    return file;
}
Also used : FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) IOException(java.io.IOException) File(java.io.File)

Example 4 with FLocat

use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images, if exportImages is set</li>
 * <li>the current transcript (PAGE XML) of each page, if exportPage is set</li>
 * <li>a reference to the ALTO XML of each page, if exportAlto is set</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe
 * @param exportPage if true, the PAGE XML file group is filled and added
 * @param exportAlto if true, the ALTO file group is filled and added
 * @param exportImages if true, the image file group is filled and added
 * @param pageIndices zero-based indices into the page list of doc that select the pages to include; null includes all pages
 * @return the METS object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // the client is derived from the first page; a document without pages needs none
    // because the loop below will not run
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /*
         * only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // The ALTO file is named after the image file. An image name without extension
            // is used as a whole instead of failing in substring().
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            // hrefs always use "/" (as the "page/" hrefs do), independent of the platform
            String relAltoPath = "alto/".concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // NOTE(review): FileUtils.toFile() yields null for non-file URLs, so this
            // presumably only works for local documents - confirm
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) /*|| absAltoPath.startsWith("/")*/
            {
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                // CREATED stays unset if the ALTO file is not there
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested groups end up in the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Example 5 with FLocat

use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildFileType.

/**
 * Builds the METS file entry for a single file of a document.
 * <p>
 * If {@code localFolder} is not null the document is treated as local: the FLocat gets
 * LOCTYPE "OTHER"/OTHERLOCTYPE "FILE", the href is the plain file name (prefixed with
 * "page/" if the ID starts with PAGE_GROUP_ID) and mimetype and date are read from the
 * file. Otherwise mimetype and upload date are requested from the image store and the
 * href is the full URL of the file.
 *
 * @param localFolder the local folder of the document; null if the document is NOT local
 * @param id the ID to set on the file entry
 * @param o the file to describe; its URL, key and MD5 sum are read
 * @param seq the SEQ value to set on the file entry
 * @param client the image store client; only used if localFolder is null
 * @return the file entry including one FLocat
 * @throws IOException if a local document refers to a non-file URL or the file metadata
 *         can not be retrieved from the image store
 */
private static FileType buildFileType(File localFolder, String id, ITrpFile o, final int seq, FimgStoreGetClient client) throws IOException {
    FileType fType = new FileType();
    fType.setID(id);
    String mime = null;
    Date date = null;
    FLocat fLocat = new FLocat();
    String loc = null;
    if (localFolder != null) {
        URL url = o.getUrl();
        if (!url.getProtocol().contains("file")) {
            throw new IOException("Doc contains local folder reference but an URL refers to a non-local file! " + url.toString());
        }
        final String path = FileUtils.toFile(url).getAbsolutePath();
        File f = new File(path);
        mime = MimeTypes.getMimeType(FilenameUtils.getExtension(f.getName()));
        date = new Date(f.lastModified());
        fLocat.setLOCTYPE("OTHER");
        fLocat.setOTHERLOCTYPE("FILE");
        // Only the file name is used as relative path. Cutting off the local folder from
        // the absolute path does not work as the file need not be located directly in it.
        loc = FilenameUtils.getName(path);
        if (id.startsWith(PAGE_GROUP_ID)) {
            // append relative folder for PAGE XML files
            loc = "page/" + loc;
        }
        logger.debug("loc = " + loc);
        if (o.getMd5Sum() != null) {
            fType.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            fType.setCHECKSUM(o.getMd5Sum());
        }
    } else {
        try {
            FimgStoreFileMd fMd = client.getFileMd(o.getKey());
            date = fMd.getUploadDate();
            mime = fMd.getMimetype();
            fLocat.setLOCTYPE("URL");
            // full URL in case of remote file
            loc = o.getUrl().toString();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            // keep the original exception as cause so that the root problem is not lost
            throw new IOException("FileMetadata could not be retrieved from imagestore for key: " + o.getKey(), e);
        }
    }
    fType.setMIMETYPE(mime);
    XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
    fType.setCREATED(cal);
    fType.setSEQ(seq);
    fLocat.setHref(loc);
    fType.getFLocat().add(fLocat);
    return fType;
}
Also used : XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) FileType(eu.transkribus.core.model.beans.mets.FileType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) IOException(java.io.IOException) FimgStoreFileMd(org.dea.fimgstoreclient.beans.FimgStoreFileMd) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File) Date(java.util.Date) URL(java.net.URL)

Aggregations

FLocat (eu.transkribus.core.model.beans.mets.FileType.FLocat)6 FileType (eu.transkribus.core.model.beans.mets.FileType)4 File (java.io.File)4 IOException (java.io.IOException)3 URL (java.net.URL)3 ITrpFile (eu.transkribus.core.model.beans.ITrpFile)2 TrpPage (eu.transkribus.core.model.beans.TrpPage)2 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)2 Date (java.util.Date)2 XMLGregorianCalendar (javax.xml.datatype.XMLGregorianCalendar)2 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)1 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)1 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)1 AmdSecType (eu.transkribus.core.model.beans.mets.AmdSecType)1 DivType (eu.transkribus.core.model.beans.mets.DivType)1 FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)1 MdSecType (eu.transkribus.core.model.beans.mets.MdSecType)1 Mets (eu.transkribus.core.model.beans.mets.Mets)1 FileSec (eu.transkribus.core.model.beans.mets.MetsType.FileSec)1 FileGrp (eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp)1