Search in sources :

Example 1 with FileGrpType

use of eu.transkribus.core.model.beans.mets.FileGrpType in project TranskribusCore by Transkribus.

the class GoobiMetsImporter method fetchFiles.

/**
 * Collects the file lists of a Goobi METS and fetches the files of every physical page.
 * <p>
 * The image list is taken from the file group with USE="MAX"; if there is none, the group
 * with USE="DEFAULT" is used instead. The file group with USE="XML" is optional. The page
 * divs are read from the first struct map with TYPE "PHYSICAL" whose root div has TYPE
 * "physSequence" and are processed in ascending ORDER.
 *
 * @param dir handed to {@code fetchFilesFromUrl} for every page; presumably the local
 *            target directory - TODO confirm against that method
 * @param mets the unmarshalled Goobi METS file
 * @return one TrpPage per page div, in ascending ORDER (divs without ORDER come last)
 * @throws IOException if the METS has no image file list or no valid struct map
 */
public List<TrpPage> fetchFiles(String dir, Mets mets) throws IOException {
    if (mets.getFileSec() == null) {
        // without a file section there cannot be an image file list
        throw new IOException("METS file has no image file list!");
    }
    List<FileGrp> fileGrps = mets.getFileSec().getFileGrp();
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    List<FileType> defaultImgGrp = null;
    for (FileGrpType type : fileGrps) {
        final String use = type.getUSE();
        if (use == null) {
            // a switch on a null String throws a NullPointerException; skip groups without USE
            continue;
        }
        switch (use) {
            case "MAX":
                imgGrp = type.getFile();
                break;
            /*
             * could also be that USE='Content' and ID="AltoFiles" or ID="AbbyyXmlFiles" is
             * necessary to get the transcriptions
             */
            case "DEFAULT":
                defaultImgGrp = type.getFile();
                break;
            case "XML":
                // possibility to load also an existent Alto or Abbyy XML and convert it to Page later on
                // TODO: needs clarification
                xmlGrp = type.getFile();
                break;
            default:
                break;
        }
    }
    // take default images if no MAX images are available
    if (imgGrp == null && defaultImgGrp != null) {
        imgGrp = defaultImgGrp;
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // a missing XML list is tolerated on purpose
        logger.debug("no xml file list");
    }
    List<DivType> pageDivs = null;
    for (StructMapType sMap : mets.getStructMap()) {
        // constant-first equals keeps struct maps without TYPE or root div from raising a NullPointerException
        if ("PHYSICAL".equals(sMap.getTYPE())
                && sMap.getDiv() != null
                && "physSequence".equals(sMap.getDiv().getTYPE())) {
            pageDivs = sMap.getDiv().getDiv();
            break;
        }
    }
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    // ascending by ORDER; divs that carry no ORDER are placed at the end instead of failing the sort
    Comparator<DivType> comp = Comparator.comparing(DivType::getORDER, Comparator.nullsLast(Comparator.naturalOrder()));
    pageDivs.sort(comp);
    for (DivType div : pageDivs) {
        // fetch all files and store them locally
        TrpPage p = fetchFilesFromUrl(div, imgGrp, xmlGrp, dir);
        pages.add(p);
    }
    return pages;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType)

Example 2 with FileGrpType

use of eu.transkribus.core.model.beans.mets.FileGrpType in project TranskribusCore by Transkribus.

the class TrpDocPacker method packDocFiles.

/**
 * Zips a local TrpDoc into a file at the given zipFilePath.
 * The process involves computing MD5 sums for all files.
 * A METS file is written into the document's local folder and included in the ZIP.
 *
 * @param doc the document to pack; its metadata must have a local folder
 * @param zipFilePath target path of the ZIP file; if null or empty, a time-stamped file name
 *        below TEMP_DIR is used. A bare file name is resolved against the working directory.
 * @return the file returned by {@code ZipUtils.zip}
 * @throws IOException if the document is not local or the METS file cannot be created
 * @throws IllegalArgumentException if the directory of zipFilePath does not exist
 */
public File packDocFiles(TrpDoc doc, String zipFilePath) throws IOException {
    File localFolder = doc.getMd().getLocalFolder();
    if (localFolder == null) {
        throw new IOException("Not a local Document!");
    }
    Md5SumComputer md5Comp = new Md5SumComputer();
    md5Comp.addObserver(passthroughObserver);
    doc = md5Comp.computeAndSetMd5Sums(doc);
    if (zipFilePath == null || zipFilePath.isEmpty()) {
        logger.info("No zip file path specified.");
        zipFilePath = TEMP_DIR + File.separator + "TRP_DOC_" + System.currentTimeMillis() + ".zip";
    } else {
        // getParentFile() is null for a bare file name such as "doc.zip"; going through the
        // absolute file resolves it against the working directory instead of failing with a
        // NullPointerException
        File parentDir = new File(zipFilePath).getAbsoluteFile().getParentFile();
        if (parentDir == null || !parentDir.exists()) {
            throw new IllegalArgumentException(zipFilePath + " refers to a non-existent directory!");
        }
    }
    logger.info("Creating zip file at: " + zipFilePath);
    String metsFilePath = localFolder.getAbsoluteFile() + File.separator + TrpMetsBuilder.METS_FILE_NAME;
    File metsFile = new File(metsFilePath);
    logger.info("Creating METS file at: " + metsFilePath);
    // build a mets that points to all files we need
    // 2nd arg: export page files (add to mets filesec), 3rd arg: export alto files, 4th arg: export images
    Mets mets = TrpMetsBuilder.buildMets(doc, true, false, true, null);
    try {
        metsFile = JaxbUtils.marshalToFile(mets, metsFile, TrpDocMetadata.class);
    } catch (JAXBException e) {
        logger.error(e.getMessage(), e);
        throw new IOException("Could not create METS file.", e);
    }
    updateStatus("Built METS file");
    // prepare the list with files to be packed into the ZIP
    List<String> fileList = new LinkedList<>();
    fileList.add(TrpMetsBuilder.METS_FILE_NAME);
    // traverse the METS filesection and add all files to be zipped
    List<FileGrpType> typeGrps = MetsUtil.getMasterFileGrp(mets);
    for (FileGrpType type : typeGrps) {
        // constant-first equals: a file group without ID is skipped instead of raising a NullPointerException
        final String grpId = type.getID();
        if (TrpMetsBuilder.IMG_GROUP_ID.equals(grpId) || TrpMetsBuilder.PAGE_GROUP_ID.equals(grpId)) {
            fileList.addAll(getFiles(type));
        }
    }
    updateStatus("Creating ZIP file...");
    return ZipUtils.zip(fileList, localFolder.getAbsolutePath(), zipFilePath);
}
Also used : FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) Md5SumComputer(eu.transkribus.core.io.util.Md5SumComputer) Mets(eu.transkribus.core.model.beans.mets.Mets) JAXBException(javax.xml.bind.JAXBException) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) IOException(java.io.IOException) File(java.io.File) LinkedList(java.util.LinkedList)

Example 3 with FileGrpType

use of eu.transkribus.core.model.beans.mets.FileGrpType in project TranskribusCore by Transkribus.

the class MetsUtil method getImagesToUpload.

/**
 * Builds one PageUploadDescriptor per page div of the struct map, using the image and
 * PAGE XML file groups of the METS master file group.
 * The PAGE XML group is optional here; if it is missing, {@code null} is handed to
 * {@code buildUploadImage} as the XML list.
 *
 * @param mets the METS to read
 * @return the descriptors in the order of the page divs
 * @throws IllegalArgumentException if the METS has no image file group or no valid struct map
 */
public static List<PageUploadDescriptor> getImagesToUpload(Mets mets) {
    // check filesection. needs img group and xml group to distinguish them without going for mimetypes
    List<FileGrpType> typeGrps = getMasterFileGrp(mets);
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    for (FileGrpType type : typeGrps) {
        // constant-first equals instead of a switch: a switch on a null ID would throw a NullPointerException
        final String grpId = type.getID();
        if (TrpMetsBuilder.IMG_GROUP_ID.equals(grpId)) {
            imgGrp = type.getFile();
        } else if (TrpMetsBuilder.PAGE_GROUP_ID.equals(grpId)) {
            xmlGrp = type.getFile();
        }
    }
    if (imgGrp == null) {
        throw new IllegalArgumentException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // tolerated on purpose, only logged
        logger.debug("METS file has no xml file list!");
    }
    List<DivType> pageDivs = getPageDivsFromStructMap(mets);
    if (pageDivs == null) {
        throw new IllegalArgumentException("No valid StructMap was found!");
    }
    List<PageUploadDescriptor> images = new ArrayList<PageUploadDescriptor>(pageDivs.size());
    for (DivType div : pageDivs) {
        images.add(buildUploadImage(div, imgGrp, xmlGrp));
    }
    return images;
}
Also used : FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) ArrayList(java.util.ArrayList) PageUploadDescriptor(eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)

Example 4 with FileGrpType

use of eu.transkribus.core.model.beans.mets.FileGrpType in project TranskribusCore by Transkribus.

the class MetsUtil method getTrpPages.

/**
 * Builds the list of TrpPage objects with
 * local file references from the mets master file group and structmap.
 * The method is strict regarding PAGE XML existence: a METS without a PAGE XML file group
 * is rejected.
 *
 * @param mets the METS to read
 * @param parentDir handed to {@code buildPage} for every page; presumably the directory the
 *        file references are resolved against - TODO confirm against that method
 * @return one TrpPage per page div, in the order of the page divs
 * @throws IOException if the image file group, the PAGE XML file group or a valid struct map is missing
 */
public static List<TrpPage> getTrpPages(Mets mets, File parentDir) throws IOException {
    // check filesection. needs img group and xml group to distinguish them without going for mimetypes
    List<FileGrpType> typeGrps = getMasterFileGrp(mets);
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    for (FileGrpType type : typeGrps) {
        // constant-first equals instead of a switch: a switch on a null ID would throw a NullPointerException
        final String grpId = type.getID();
        if (TrpMetsBuilder.IMG_GROUP_ID.equals(grpId)) {
            imgGrp = type.getFile();
        } else if (TrpMetsBuilder.PAGE_GROUP_ID.equals(grpId)) {
            xmlGrp = type.getFile();
        }
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        throw new IOException("METS file has no xml file list!");
    }
    List<DivType> pageDivs = getPageDivsFromStructMap(mets);
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    for (DivType div : pageDivs) {
        pages.add(buildPage(div, imgGrp, xmlGrp, parentDir));
    }
    return pages;
}
Also used : FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileType(eu.transkribus.core.model.beans.mets.FileType) TrpPage(eu.transkribus.core.model.beans.TrpPage) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 5 with FileGrpType

use of eu.transkribus.core.model.beans.mets.FileGrpType in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>all page images (if exportImages is set)</li>
 * <li>the current PAGE XML transcript of each page (if exportPage is set)</li>
 * <li>references to ALTO files in an "alto" sub folder (if exportAlto is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe
 * @param exportPage add the PAGE XML file group and link it in the page divs
 * @param exportAlto add the ALTO file group and link it in the page divs
 * @param exportImages add the image file group and link it in the page divs
 * @param pageIndices zero-based positions in {@code doc.getPages()} of the pages to include;
 *        {@code null} includes all pages
 * @return the assembled METS
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // the client is only used inside the page loop, so a remote document without pages needs
    // none; guarding here avoids an IndexOutOfBoundsException from pages.get(0)
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    // position of the current page in the page list; matched against pageIndices
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /* only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // ALTO file name = image file name with its extension replaced by ".xml"; a name
            // without any '.' is used as a whole instead of failing in substring(0, -1)
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // NOTE(review): the page URL is turned into a local file here, so this branch
            // presumably only works for local documents - confirm for remote ones
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            // folder of the page file, then its "alto" sub folder
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) {
                // strip a single leading backslash
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                // CREATED is only set when the ALTO file is already on disk
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested groups become part of the master file group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Aggregations

FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)5 DivType (eu.transkribus.core.model.beans.mets.DivType)4 FileType (eu.transkribus.core.model.beans.mets.FileType)4 TrpPage (eu.transkribus.core.model.beans.TrpPage)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)2 Mets (eu.transkribus.core.model.beans.mets.Mets)2 FileGrp (eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp)2 StructMapType (eu.transkribus.core.model.beans.mets.StructMapType)2 File (java.io.File)2 Md5SumComputer (eu.transkribus.core.io.util.Md5SumComputer)1 PageUploadDescriptor (eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)1 ITrpFile (eu.transkribus.core.model.beans.ITrpFile)1 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)1 AmdSecType (eu.transkribus.core.model.beans.mets.AmdSecType)1 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)1 FLocat (eu.transkribus.core.model.beans.mets.FileType.FLocat)1 MdSecType (eu.transkribus.core.model.beans.mets.MdSecType)1 FileSec (eu.transkribus.core.model.beans.mets.MetsType.FileSec)1