Search in sources :

Example 11 with TrpDocMetadata

use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.

the class LocalDocReader method loadDocMd.

/**
 * Searches the inputDir for files accepted by {@link MdFileFilter} (e.g.
 * "metadata.xml") and parses the first match into a TrpDocMetadata object.
 * <p>
 * If several files match, only the first one returned by the directory
 * listing is used; the others are ignored. The doc ID of the returned
 * object is always reset to -1.
 *
 * @param inputDir
 *            where the document is stored
 * @return the parsed TrpDocMetadata object; this method never returns null
 * @throws FileNotFoundException
 *             if inputDir cannot be listed or contains no metadata file
 * @throws IOException
 *             if the metadata file cannot be unmarshalled; the
 *             JAXBException is attached as cause
 */
public static TrpDocMetadata loadDocMd(File inputDir) throws IOException {
    final File[] mdFiles = inputDir.listFiles(new MdFileFilter());
    // listFiles() yields null if inputDir is not a readable directory
    if (mdFiles == null || mdFiles.length == 0) {
        throw new FileNotFoundException("No metadata XML was found on path: " + inputDir.getAbsolutePath());
    }
    final File mdFile = mdFiles[0];
    logger.info("Found md File " + mdFile.getAbsolutePath());
    try {
        TrpDocMetadata docMd = JaxbUtils.unmarshal(mdFile, TrpDocMetadata.class);
        // discard whatever ID is stored in the file
        // NOTE(review): presumably -1 marks a local document without a server-side ID - confirm
        docMd.setDocId(-1);
        return docMd;
    } catch (JAXBException je) {
        // keep the JAXBException as cause so the actual parse error is not lost
        throw new IOException("The md File " + mdFile.getName() + " did not obey the correct format. " + "A doc without metadata will be provided.", je);
    }
}
Also used : MdFileFilter(eu.transkribus.core.io.util.MdFileFilter) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) IOException(java.io.IOException) File(java.io.File)

Example 12 with TrpDocMetadata

use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images (only if exportImages is set)</li>
 * <li>the current transcript of each page as PAGE XML (only if exportPage is set)</li>
 * <li>references to the ALTO XML files in the "alto" sub directory (only if exportAlto is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to build the METS for
 * @param exportPage if true, a PAGE XML file group is added
 * @param exportAlto if true, an ALTO file group is added
 * @param exportImages if true, an image file group is added
 * @param pageIndices zero-based indices into doc.getPages() of the pages to include; null includes all pages
 * @return the METS object; file groups are only attached for the enabled export options
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // the client is built from the first page's URL, so a remote doc without pages
    // must not touch pages.get(0); the page loop below is a no-op in that case anyway
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    // i is the zero-based position of the page in the list; it is what pageIndices refers to
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /* only the current transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // the ALTO file is named like the image file with the extension replaced by ".xml";
            // an image name without any extension is used as a whole instead of failing in substring()
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // NOTE(review): this assumes p.getUrl() points to a local file - confirm that ALTO export is
            // only requested for local documents
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            // the ALTO file is looked up in the "alto" directory next to the file the page URL points to
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) {
                // drop a single leading backslash
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                // creation date is taken from the last modification time of the ALTO file
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the groups that were actually requested are attached to the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Example 13 with TrpDocMetadata

use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.

the class FatBuilder method writeFatXml.

/**
 * Writes a FAT XML file for the local document in outputDir.
 * <p>
 * Image files and document metadata are read separately via LocalDocReader.
 * For each image the Exiftool metadata (mime type, resolution, width,
 * height) and the MD5 checksum are recorded. If that check fails with a
 * TimeoutException, InterruptedException or NumberFormatException the file
 * is listed with status "Error" and processing continues with the next file.
 *
 * @param outputDir
 *            directory of the local document; it must contain a sub
 *            directory named LocalDocConst.OCR_MASTER_DIR
 * @param languages
 *            languages to set in the OCR metadata; if null or empty, the
 *            language from the document metadata is used
 * @param typeFace
 *            text type to set in the OCR metadata; if null or empty, the
 *            script type from the document metadata is used, falling back
 *            to ScriptType.NORMAL
 * @return the FAT XML file as returned by JaxbUtils.marshalToFile
 * @throws IllegalArgumentException
 *             if outputDir does not contain the LocalDocConst.OCR_MASTER_DIR directory
 * @throws UnsupportedFormatException
 *             not thrown directly here; presumably propagated from a called helper
 * @throws IOException
 *             if the document metadata can't be loaded or the FAT XML can't be marshalled
 */
public static File writeFatXml(File outputDir, final String languages, final String typeFace) throws UnsupportedFormatException, IOException {
    if (!new File(outputDir.getAbsolutePath() + File.separator + LocalDocConst.OCR_MASTER_DIR).isDirectory()) {
        throw new IllegalArgumentException("No directory '" + LocalDocConst.OCR_MASTER_DIR + "' in directory: " + outputDir.getAbsolutePath());
    }
    // needs a local doc! Read files separately because we don't want to create Page XMLs
    Map<String, File> imgFiles = LocalDocReader.findImgFiles(outputDir);
    TrpDocMetadata docMd = LocalDocReader.loadDocMd(outputDir);
    RootFolder rootFolder = new RootFolder();
    // "HH" = hour of day (0-23). The 12-hour "hh" without an am/pm marker would make
    // morning and afternoon timestamps indistinguishable.
    SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm");
    final String dateStr = df.format(new Date());
    rootFolder.setDate(dateStr);
    final BigInteger nFiles = getBigIntValue(imgFiles.size());
    rootFolder.setNFiles(nFiles);
    rootFolder.setNDocuments(BigInteger.valueOf(1));
    rootFolder.setNFileWarnings(BigInteger.valueOf(0));
    rootFolder.setNFolders(BigInteger.valueOf(1));
    DocumentFolder docFolder = new DocumentFolder();
    docFolder.setName(outputDir.getName());
    docFolder.setPath(LocalDocConst.OCR_MASTER_DIR);
    docFolder.setNFilesPerFolder(nFiles);
    // TODO throw exception if missingMetadata is true?
    boolean missingMetadata = false;
    Order order = new Order();
    order.setHasViewingFiles("false");
    order.setServices("(OCR)");
    OcrMetadata ocrM = new OcrMetadata();
    // explicit arguments take precedence over the values stored in the document metadata
    if (languages != null && !languages.isEmpty()) {
        ocrM.setLanguages(languages);
    } else if (docMd.getLanguage() != null && !docMd.getLanguage().isEmpty()) {
        ocrM.setLanguages(docMd.getLanguage());
    } else {
        missingMetadata = true;
        ocrM.setLanguages("");
    }
    if (typeFace != null && !typeFace.isEmpty()) {
        ocrM.setTexttype(typeFace);
    } else if (docMd.getScriptType() != null) {
        ocrM.setTexttype(docMd.getScriptType().toString());
    } else {
        ocrM.setTexttype(ScriptType.NORMAL.toString());
        missingMetadata = true;
    }
    ocrM.setOutput("(ABBYY-XML)");
    // check the following!
    int nDocsMissingMetadata = missingMetadata ? 1 : 0;
    rootFolder.setNDocumentsMissingMetadata(BigInteger.valueOf(nDocsMissingMetadata));
    FepMetadata fep = new FepMetadata();
    fep.setWorkflow("None");
    order.setOcrMetadata(ocrM);
    order.setFepMetadata(fep);
    docFolder.setOrder(order);
    FileFolder fileFolder = new FileFolder();
    fileFolder.setType("img");
    // existence of OCR_MASTER_DIR is checked at the beginning
    fileFolder.setName(LocalDocConst.OCR_MASTER_DIR);
    fileFolder.setPath(LocalDocConst.OCR_MASTER_DIR);
    int checkedFiles = 0;
    int uncheckedFiles = 0;
    int nFileErrors = 0;
    // remembers an interruption seen during the file checks; the flag is restored at the very end
    // so that the remaining files and the final write are processed exactly as before
    boolean interrupted = false;
    for (Entry<String, File> imgE : imgFiles.entrySet()) {
        final File img = imgE.getValue();
        eu.transkribus.core.model.beans.fat.File file = new eu.transkribus.core.model.beans.fat.File();
        file.setName(img.getName());
        String errorType;
        String message;
        try {
            final Map<String, String> exif = ExiftoolUtil.extractImgMd(img.getAbsolutePath());
            final String mimetype = exif.get("MIMEType");
            final String xRes = exif.get("XResolution");
            final String yRes = exif.get("YResolution");
            final String width = exif.get("ImageWidth");
            final String height = exif.get("ImageHeight");
            Metadata md = new Metadata();
            md.setMimetype(mimetype);
            md.setXRes(getBigIntValue(xRes));
            md.setYRes(getBigIntValue(yRes));
            md.setWidth(getBigIntValue(width));
            md.setHeight(getBigIntValue(height));
            final String md5 = formatChecksum(ChecksumUtils.getMd5SumHex(img));
            md.setChecksum(md5);
            file.setMetadata(md);
            checkedFiles++;
            file.setStatus("Checked");
            errorType = "None";
            message = "";
        } catch (TimeoutException | InterruptedException | NumberFormatException e) {
            if (e instanceof InterruptedException) {
                interrupted = true;
            }
            // a failed check is recorded in the FAT entry of the file instead of aborting the whole run
            uncheckedFiles++;
            nFileErrors++;
            errorType = e.getClass().getName();
            message = e.getMessage();
            file.setStatus("Error");
            logger.error("Could not run file checks for file: " + img.getAbsolutePath(), e);
        }
        file.setErrorType(errorType);
        file.setMessage(message);
        fileFolder.getFile().add(file);
    }
    rootFolder.setNCheckedFiles(getBigIntValue(checkedFiles));
    rootFolder.setNUncheckedFiles(getBigIntValue(uncheckedFiles));
    rootFolder.setNFileErrors(getBigIntValue(nFileErrors));
    docFolder.getFileFolder().add(fileFolder);
    rootFolder.getDocumentFolder().add(docFolder);
    File fatFile = new File(outputDir.getAbsolutePath() + File.separator + FatBuilder.FAT_FILE_NAME);
    try {
        fatFile = JaxbUtils.marshalToFile(rootFolder, fatFile);
    } catch (JAXBException e) {
        throw new IOException("Could not marshal FAT XML to file!", e);
    } finally {
        if (interrupted) {
            // catching InterruptedException cleared the flag; restore it so callers can observe the interruption
            Thread.currentThread().interrupt();
        }
    }
    return fatFile;
}
Also used : DocumentFolder(eu.transkribus.core.model.beans.fat.DocumentFolder) Metadata(eu.transkribus.core.model.beans.fat.File.Metadata) FepMetadata(eu.transkribus.core.model.beans.fat.FepMetadata) OcrMetadata(eu.transkribus.core.model.beans.fat.OcrMetadata) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) FileFolder(eu.transkribus.core.model.beans.fat.FileFolder) TimeoutException(java.util.concurrent.TimeoutException) Order(eu.transkribus.core.model.beans.fat.Order) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) Date(java.util.Date) OcrMetadata(eu.transkribus.core.model.beans.fat.OcrMetadata) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) RootFolder(eu.transkribus.core.model.beans.fat.RootFolder) BigInteger(java.math.BigInteger) FepMetadata(eu.transkribus.core.model.beans.fat.FepMetadata) File(java.io.File) SimpleDateFormat(java.text.SimpleDateFormat)

Example 14 with TrpDocMetadata

use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.

the class DocxBuilder method addTitlePage.

/**
 * Adds a title page to the given main document part. It consists of three
 * sections: the document metadata, a one-line summary of the export
 * settings (taken from the static export flags of this class) and the
 * editorial declaration of the document.
 *
 * @param doc the document whose metadata and editorial declaration are written
 * @param mdp the main document part the paragraphs are added to
 */
public static void addTitlePage(TrpDoc doc, MainDocumentPart mdp) {
    final String headingStyle = "Title";
    final String entryStyle = "Subtitle";

    mdp.getPropertyResolver().activateStyle("Light Shading");
    mdp.getPropertyResolver().activateStyle("Medium List 1");

    // --- metadata section ---
    addParagraph("", "Title Page", mdp, headingStyle);
    final TrpDocMetadata meta = doc.getMd();
    addParagraph("Title: ", meta.getTitle(), mdp, entryStyle);
    addParagraph("Author: ", meta.getAuthor(), mdp, entryStyle);
    addParagraph("Description: ", meta.getDesc(), mdp, entryStyle);
    addParagraph("Genre: ", meta.getGenre(), mdp, entryStyle);
    addParagraph("Writer: ", meta.getWriter(), mdp, entryStyle);
    // optional values are only listed if they are set
    if (meta.getScriptType() != null) {
        addParagraph("Sripttype: ", meta.getScriptType().toString(), mdp, entryStyle);
    }
    addParagraph("Language: ", meta.getLanguage(), mdp, entryStyle);
    addParagraph("Number of Pages in whole Document: ", String.valueOf(meta.getNrOfPages()), mdp, entryStyle);
    if (meta.getCreatedFromDate() != null) {
        addParagraph("Created From: ", meta.getCreatedFromDate().toString(), mdp, entryStyle);
    }
    if (meta.getCreatedToDate() != null) {
        addParagraph("Created To: ", meta.getCreatedToDate().toString(), mdp, entryStyle);
    }

    // --- export settings section: one text fragment per static export flag ---
    addParagraph("", "Export Settings", mdp, headingStyle);

    final String tagsText;
    if (exportTags) {
        tagsText = "Custom tags are indexed";
    } else {
        tagsText = "Custom tags are not exported";
    }

    final String blackeningText;
    if (doBlackening) {
        blackeningText = "Sensible data is blackened";
    } else {
        blackeningText = "All data is visible";
    }

    // expanding abbreviations takes precedence over substituting them
    final String abbrevText;
    if (expandAbbrevs) {
        abbrevText = "Abbreviations are expanded (abbrev [expansion])";
    } else if (substituteAbbrevs) {
        abbrevText = "Abbreviations are subsituted by there expansion";
    } else {
        abbrevText = "Abbreviations as they are (diplomatic text)";
    }

    final String unclearText;
    if (markUnclearWords) {
        unclearText = "Unclear words are marked";
    } else {
        unclearText = "";
    }

    final String lineBreakText;
    if (preserveLineBreaks) {
        lineBreakText = "Keep the line breaks as in the original document";
    } else {
        lineBreakText = "Line breaks does not conform to the original text";
    }

    // showing supplied tags in brackets takes precedence over ignoring them
    final String suppliedText;
    if (showSuppliedWithBrackets) {
        suppliedText = "Supplied tags are shown in brackets";
    } else if (ignoreSupplied) {
        suppliedText = "Supplied tags get ignored";
    } else {
        suppliedText = "Supplied tags are not marked specifically";
    }

    final String separator = " / ";
    final StringBuilder settingsLine = new StringBuilder();
    settingsLine.append(blackeningText).append(separator);
    settingsLine.append(tagsText).append(separator);
    settingsLine.append(abbrevText).append(separator);
    settingsLine.append(unclearText).append(separator);
    settingsLine.append(lineBreakText).append(separator);
    settingsLine.append(suppliedText);
    addParagraph("", settingsLine.toString(), mdp, entryStyle);

    // --- editorial declaration section: one paragraph per feature ---
    addParagraph("", "Editorial Declaration: ", mdp, headingStyle);
    for (EdFeature feature : doc.getEdDeclList()) {
        final String featureText = feature.getTitle() + ": " + feature.getDescription() + "\n" + feature.getSelectedOption().toString();
        addParagraph("", featureText, mdp, entryStyle);
    }
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata)

Example 15 with TrpDocMetadata

use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.

the class TrpTxtBuilder method addTitlePage.

/**
 * Appends a plain text metadata section for the given document to the
 * file. The file is created if it does not exist yet. An IOException
 * during writing is not propagated; only its stack trace is printed.
 *
 * @param doc the document whose metadata and editorial declaration are written
 * @param file the text file the section is appended to
 */
public static void addTitlePage(TrpDoc doc, File file) {
    final String headerRule = "----------------------------";
    final String footerRule = "-----------------------";
    final List<String> lines = new ArrayList<>();

    // header
    lines.add(headerRule);
    lines.add("Metadata section of document");
    lines.add(headerRule);

    // document metadata; optional values are only listed if they are set
    final TrpDocMetadata meta = doc.getMd();
    lines.add("Title: " + meta.getTitle());
    lines.add("Author: " + meta.getAuthor());
    lines.add("Description: " + meta.getDesc());
    lines.add("Genre: " + meta.getGenre());
    lines.add("Writer: " + meta.getWriter());
    if (meta.getScriptType() != null) {
        lines.add("Sripttype: " + meta.getScriptType().toString());
    }
    lines.add("Language: " + meta.getLanguage());
    lines.add("Number of Pages in whole Document: " + String.valueOf(meta.getNrOfPages()));
    if (meta.getCreatedFromDate() != null) {
        lines.add("Created From: " + meta.getCreatedFromDate().toString());
    }
    if (meta.getCreatedToDate() != null) {
        lines.add("Created To: " + meta.getCreatedToDate().toString());
    }

    // editorial declaration: title and description, with the selected option on the following line
    lines.add("Editorial Declaration: ");
    for (EdFeature feature : doc.getEdDeclList()) {
        final String selected = feature.getSelectedOption().toString();
        lines.add(feature.getTitle() + ": " + feature.getDescription() + System.lineSeparator() + selected);
    }

    // footer, followed by an extra line separator entry
    lines.add(footerRule);
    lines.add("End of metadata section");
    lines.add(footerRule);
    lines.add(System.lineSeparator());

    try {
        // CREATE + APPEND: an existing file is extended, never truncated
        Files.write(Paths.get(file.getAbsolutePath()), lines, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException ioe) {
        // TODO Auto-generated catch block
        ioe.printStackTrace();
    }
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature) ArrayList(java.util.ArrayList) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) IOException(java.io.IOException)

Aggregations

TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)16 File (java.io.File)7 IOException (java.io.IOException)7 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)6 TrpPage (eu.transkribus.core.model.beans.TrpPage)5 EdFeature (eu.transkribus.core.model.beans.EdFeature)4 FileNotFoundException (java.io.FileNotFoundException)3 Date (java.util.Date)3 JAXBException (javax.xml.bind.JAXBException)3 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)2 AmdSecType (eu.transkribus.core.model.beans.mets.AmdSecType)2 MdSecType (eu.transkribus.core.model.beans.mets.MdSecType)2 Mets (eu.transkribus.core.model.beans.mets.Mets)2 Dimension (java.awt.Dimension)2 SimpleDateFormat (java.text.SimpleDateFormat)2 ArrayList (java.util.ArrayList)2 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)1 XmlFormat (eu.transkribus.core.io.formats.XmlFormat)1 MdFileFilter (eu.transkribus.core.io.util.MdFileFilter)1 PageUploadDescriptor (eu.transkribus.core.model.beans.DocumentUploadDescriptor.PageUploadDescriptor)1