Search in sources :

Example 6 with Mets

use of eu.transkribus.core.model.beans.mets.Mets in project TranskribusCore by Transkribus.

the class FEPLocalDocReader method unmarshalMets.

/**
 * Reads a METS file into a {@link Mets} bean using a JAXB unmarshaller.
 *
 * @param metsFile the METS XML file to read
 * @param validate if {@code true}, the schema obtained from {@code XmlFormat.METS} is set on
 *        the unmarshaller before the file is read
 * @param nestedClassed not used by this implementation
 * @return the unmarshalled METS object
 * @throws IOException declared for callers; not thrown directly by the visible code
 * @throws JAXBException if creating the unmarshaller or unmarshalling fails
 * @throws SAXException declared for the schema lookup
 */
static Mets unmarshalMets(File metsFile, boolean validate, Class<?>... nestedClassed) throws IOException, JAXBException, SAXException {
    final Unmarshaller unmarshaller = JaxbUtils.createUnmarshaller(Mets.class);
    // the timer starts after the unmarshaller exists, so it covers schema setup and parsing only
    final long start = System.currentTimeMillis();
    if (validate) {
        unmarshaller.setSchema(XmlFormat.METS.getOrCompileSchema());
    }
    final Mets mets = (Mets) unmarshaller.unmarshal(metsFile);
    final long elapsed = System.currentTimeMillis() - start;
    logger.debug("time for unmarshalling: " + elapsed + ", validated: " + validate);
    logger.debug("unmarshalled mets file");
    return mets;
}
Also used : Mets(eu.transkribus.core.model.beans.mets.Mets) Schema(javax.xml.validation.Schema) Unmarshaller(javax.xml.bind.Unmarshaller)

Example 7 with Mets

use of eu.transkribus.core.model.beans.mets.Mets in project TranskribusCore by Transkribus.

the class FEPLocalDocReader method loadFEPDoc.

/**
 * Imports a FEP export from a local directory and returns it as a {@link TrpDoc}.
 * <p>
 * The METS file found in the directory is unmarshalled and its physical structure is parsed into
 * one file map per page. For every page an image file and an ALTO file are required; the ALTO
 * file is converted to a PAGE XML that is written to the PAGE sub folder of the input directory.
 *
 * @param path path of the directory containing the FEP export
 * @param validateMets passed to {@code unmarshalMets} to enable schema validation of the METS
 * @param preserveOcrTxtStyles passed through to the ALTO-to-PAGE conversion
 * @param preserveOcrFontFamily passed through to the ALTO-to-PAGE conversion
 * @param replaceBadChars passed through to the ALTO-to-PAGE conversion
 * @param monitor progress monitor handed to {@code ProgressUtils}
 * @return the document with metadata and one {@link TrpPage} per entry of the physical structure
 * @throws IOException if the image or ALTO file of a page is missing
 * @throws Exception any other failure raised by the helpers that are called
 */
public static TrpDoc loadFEPDoc(final String path, boolean validateMets, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars, IProgressMonitor monitor) throws Exception {
    final File inputDir = new File(path);
    logger.info("importing FEP document from path: " + path);
    ProgressUtils.beginTask(monitor, "Importing a FEP document", -1);
    ProgressUtils.subTask(monitor, "Parsing mets");
    // find mets file:
    File metsFile = findMetsFile(inputDir);
    // unmarshal mets:
    Mets mets = unmarshalMets(metsFile, validateMets);
    // create trp-document and set metadata:
    TrpDoc trpDoc = new TrpDoc();
    setTitle(trpDoc, mets);
    trpDoc.getMd().setDesc("Imported from FEP export");
    trpDoc.getMd().setLocalFolder(inputDir);
    File pageDir = new File(inputDir.getAbsolutePath() + "/" + LocalDocConst.PAGE_FILE_SUB_FOLDER);
    File thumbDir = new File(inputDir.getAbsolutePath() + "/" + LocalDocConst.THUMBS_FILE_SUB_FOLDER);
    // parse physical structure:
    List<HashMap<String, File>> physStruct = parsePhysicalStructure(inputDir, mets);
    final int nPages = physStruct.size();
    // the page count is known now, so restart the task with a real total
    ProgressUtils.beginTask(monitor, "Importing a FEP document", nPages);
    // create PAGEs:
    List<TrpPage> pages = new ArrayList<TrpPage>(nPages);
    int pageNr = 0;
    for (HashMap<String, File> files : physStruct) {
        // increment first so that the progress message, the log output and the error
        // messages all refer to the same 1-based page number
        ++pageNr;
        ProgressUtils.subTask(monitor, "Importing page " + pageNr);
        logger.debug("page: " + pageNr + ", nr of files: " + files.size());
        // first, check if image file is there and set some variables;
        // a single lookup also covers a key that is present but mapped to null
        File imgFile = files.get(IMG_GRP);
        if (imgFile == null) {
            throw new IOException("Image file for page " + pageNr + " could not be found!");
        }
        String imgFileBn = FilenameUtils.getBaseName(imgFile.getName());
        File thumbFile = LocalDocReader.getThumbFile(thumbDir, imgFileBn);
        File pageOutFile = new File(pageDir.getAbsolutePath() + "/" + imgFileBn + ".xml");
        FileUtils.forceMkdir(pageOutFile.getParentFile());
        // an ALTO file is mandatory: no empty PAGE file is created as a fallback
        File altoFile = files.get(ALTO_GRP);
        if (altoFile == null) {
            throw new IOException("ALTO file for image " + pageNr + " could not be found!");
        }
        PcGtsType pc = LocalDocReader.createPageFromAlto2(imgFile.getName(), altoFile, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
        pageOutFile = JaxbUtils.marshalToFile(pc, pageOutFile);
        // TODO it is assumed that the image is not corrupt here! Try to read dimension to be sure
        TrpPage page = LocalDocReader.buildPage(inputDir, pageNr, imgFile, pageOutFile, thumbFile, null, null);
        // extract logical structs for this page from mets and apply them to the page:
        applyLogicalStructFromMetsToPageFile(mets, pageNr, pageOutFile);
        pages.add(page);
        ProgressUtils.worked(monitor, pageNr);
    }
    trpDoc.setPages(pages);
    return trpDoc;
}
Also used : HashMap(java.util.HashMap) TrpPage(eu.transkribus.core.model.beans.TrpPage) ArrayList(java.util.ArrayList) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Mets(eu.transkribus.core.model.beans.mets.Mets) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) File(java.io.File)

Example 8 with Mets

use of eu.transkribus.core.model.beans.mets.Mets in project TranskribusCore by Transkribus.

the class TrpDocPacker method unpackDoc.

/**
 * Unzips a packed document and loads it from the METS file contained in the archive.
 *
 * @param zipFile the archive to unpack
 * @param path target directory; if {@code null} or empty, a directory named after the zip
 *        file's base name below {@code TEMP_DIR} is used
 * @return the document loaded from the unpacked directory
 * @throws IOException if the archive contains no METS file at the expected location or the
 *         METS file cannot be unmarshalled
 */
public TrpDoc unpackDoc(File zipFile, String path) throws IOException {
    final boolean hasTargetPath = path != null && !path.isEmpty();
    final String targetPath = hasTargetPath
            ? path
            : TEMP_DIR + File.separator + FilenameUtils.getBaseName(zipFile.getName());
    final File unpackedDir = ZipUtils.unzip(zipFile, targetPath);

    // the METS file is expected directly inside the target path
    final File metsFile = new File(targetPath + File.separator + TrpMetsBuilder.METS_FILE_NAME);
    if (!metsFile.exists()) {
        throw new IOException("No METS file included in zip!");
    }

    final Mets mets;
    try {
        mets = JaxbUtils.unmarshal(metsFile, Mets.class, TrpDocMetadata.class);
    } catch (JAXBException e) {
        throw new IOException("Could not unmarshal METS file!", e);
    }
    return LocalDocReader.load(mets, unpackedDir);
}
Also used : Mets(eu.transkribus.core.model.beans.mets.Mets) JAXBException(javax.xml.bind.JAXBException) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) IOException(java.io.IOException) File(java.io.File)

Example 9 with Mets

use of eu.transkribus.core.model.beans.mets.Mets in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>all page images (if {@code exportImages} is set)</li>
 * <li>the current PAGE XML transcript of each page (if {@code exportPage} is set)</li>
 * <li>references to ALTO files in an "alto" sub folder (if {@code exportAlto} is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe
 * @param exportPage if true, a PAGE file group with the current transcript of each page is added
 * @param exportAlto if true, an ALTO file group is added
 * @param exportImages if true, an image file group is added
 * @param pageIndices zero-based indices of the pages to include; {@code null} includes all pages
 * @return the METS object that was built
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // the client is only needed inside the page loop, so a remote document without
    // pages must not fail on pages.get(0)
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    // i is the zero-based position in the page list; pageIndices is matched against it
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /*
         * only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // the ALTO file carries the image's base name; an image name without any
            // extension is used as a whole instead of failing in substring(0, -1)
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String imgBaseName = extIdx >= 0 ? imgFileName.substring(0, extIdx) : imgFileName;
            final String altoFileName = imgBaseName.concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // NOTE(review): FileUtils.toFile presumably returns null for a non-file URL, which
            // would make the next line throw for remote documents - confirm against callers
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            // the absolute ALTO path is the "alto" folder next to the file the page URL points to
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) /*|| absAltoPath.startsWith("/")*/
            {
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                // CREATED is only set when the ALTO file is already on disk
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested file groups end up below the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Example 10 with Mets

use of eu.transkribus.core.model.beans.mets.Mets in project TranskribusCore by Transkribus.

the class MetsMessageBodyReader method readFrom.

/**
 * Unmarshals a {@link Mets} object from the entity stream of a request or response.
 * Any failure is logged and rethrown wrapped in a {@link WebApplicationException}.
 */
@Override
public Mets readFrom(Class<Mets> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, String> httpHeaders, InputStream entityStream) throws IOException, WebApplicationException {
    final Mets result;
    try {
        final String msg = "unmarshalling Mets from input stream, type = " + type + " genericType = " + genericType + " mediaType = " + mediaType;
        logger.debug(msg);
        sw.start();
        result = JaxbUtils.unmarshal(entityStream, Mets.class, TrpDocMetadata.class);
        sw.stop(true, "time to unmarshal: ", logger);
    } catch (Exception cause) {
        logger.error(cause.getMessage(), cause);
        throw new WebApplicationException(cause);
    }
    return result;
}
Also used : Mets(eu.transkribus.core.model.beans.mets.Mets) WebApplicationException(javax.ws.rs.WebApplicationException) IOException(java.io.IOException) WebApplicationException(javax.ws.rs.WebApplicationException)

Aggregations

Mets (eu.transkribus.core.model.beans.mets.Mets)10 File (java.io.File)7 IOException (java.io.IOException)6 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)5 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)5 TrpPage (eu.transkribus.core.model.beans.TrpPage)3 JAXBException (javax.xml.bind.JAXBException)3 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)2 URL (java.net.URL)2 Unmarshaller (javax.xml.bind.Unmarshaller)2 Schema (javax.xml.validation.Schema)2 FimgStoreGetClient (org.dea.fimgstoreclient.FimgStoreGetClient)2 Md5SumComputer (eu.transkribus.core.io.util.Md5SumComputer)1 ITrpFile (eu.transkribus.core.model.beans.ITrpFile)1 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)1 AmdSecType (eu.transkribus.core.model.beans.mets.AmdSecType)1 DivType (eu.transkribus.core.model.beans.mets.DivType)1 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)1 FileType (eu.transkribus.core.model.beans.mets.FileType)1