Search in sources :

Example 1 with EdFeature

use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.

the class LocalDocReader method load.

/**
 * Loads a document from path.<br>
 *
 * Document metadata has to be in an XML called "metadata.xml".<br>
 *
 * Image files and corresponding XML/txt files have to have the same name. <br>
 * Lexicographic order of image names will imply order of pages.<br>
 * Types of transcript source files are searched in this order:
 * <ol>
 * <li>./page: PAGE XMLs according to schema 2010/2013</li>
 * <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
 * <li>./alto: ALTO v2 XMLs</li>
 * <li>./txt: txt files with transcription fulltext only</li>
 * </ol>
 * Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
 * No versioning of files for local use!<br>
 * <p>
 * If a stored document structure file exists in the folder and passes the
 * validity check, it is reused and the folder is not scanned page by page.
 * Otherwise that file is deleted, the structure is rebuilt from the folder
 * content and written back to disk for the next load.
 *
 * @param path the path where the document is stored
 * @param config {@link DocLoadConfig}
 * @param monitor progress monitor; not used anywhere in this method yet (see todos below)
 * @return the constructed document
 * @throws IOException if the path can't be read or is malformed, if the folder contains
 *             no images ({@link FileNotFoundException}), or if a file in the PAGE XML
 *             folder is neither PAGE 2010 nor PAGE 2013
 *
 * @todo implement monitor feedback!
 * @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
 */
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
    // create the document
    TrpDoc doc = new TrpDoc();
    // check OS and adjust URL protocol
    final String os = System.getProperty("os.name");
    /*
     * FIXME use SysUtils.isWin() here?
     * NOTE(review): this assigns to what looks like a shared static field of
     * LocalDocConst, so the change would outlive this call - confirm.
     */
    if (os.toLowerCase().contains("win")) {
        LocalDocConst.URL_PROT_CONST = "file:///";
    }
    // else: keep default
    final File inputDir = new File(path);
    final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
    // validate input path ======================================================
    checkInputDir(inputDir);
    // search for IMG files
    TreeMap<String, File> pageMap = findImgFiles(inputDir);
    logger.info("Found " + pageMap.entrySet().size() + " page images.");
    // a folder without any image is rejected here, regardless of the load config
    if (pageMap.isEmpty()) {
        throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
    }
    TrpDocMetadata docMd = null;
    boolean doRefresh = true;
    // try to read doc structure from disk
    if (docXml.isFile()) {
        doc = loadDocXml(docXml);
        if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
            logger.info("Loaded document structure from disk.");
            docMd = doc.getMd();
            // no refresh is necessary as doc structure matches the input dir content
            doRefresh = false;
        } else {
            // the null check suggests loadDocXml may return null - verify against its implementation
            if (doc != null && doc.getMd() != null) {
                // keep any existing metadata if invalid doc structure was found
                docMd = doc.getMd();
            }
            logger.info("Removing faulty doc XML from disk and doing reload.");
            // the result of delete() is not checked; the file is rewritten at the end of this method anyway
            docXml.delete();
            doc = new TrpDoc();
        }
    }
    logger.info("Reading document at " + inputDir.getAbsolutePath());
    // find metadata file if not extracted from doc.xml =============================================
    if (docMd == null) {
        try {
            docMd = loadDocMd(inputDir);
        } catch (IOException ioe) {
            // deliberate fallback: metadata that cannot be loaded is replaced by an empty metadata object
            docMd = new TrpDocMetadata();
        }
    }
    initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
    // Set the docMd
    doc.setMd(docMd);
    if (!doRefresh) {
        // Stop now and reuse doc structure from file.
        // On this path neither the editorial declaration is re-read nor the doc XML rewritten.
        return doc;
    }
    // Construct the input dir with pageXml Files.
    File pageInputDir = getPageXmlInputDir(inputDir);
    if (config.isForceCreatePageXml() && !pageInputDir.isDirectory()) {
        // the result of mkdir() is not checked
        pageInputDir.mkdir();
    }
    // abbyy XML search path
    File ocrInputDir = getOcrXmlInputDir(inputDir);
    // alto XML search path
    File altoInputDir = getAltoXmlInputDir(inputDir);
    // txt file search path
    File txtInputDir = getTxtInputDir(inputDir);
    // backupfolder for outdated page format files, if any
    final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase() + "_backup";
    final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
    // iterate imgList, search for corresponding XML files and build TrpPages
    int pageNr = 1;
    List<TrpPage> pages = new ArrayList<TrpPage>(pageMap.entrySet().size());
    // need a special variable to test whether we are in sync mode (only then do the following!!!!)
    // NOTE(review): pageMap is not reassigned since the isEmpty() check above, which throws for an
    // empty map, so this condition can never be true and the dummy-image branch is unreachable.
    if (pageMap.entrySet().size() == 0 && config.isEnableSyncWithoutImages()) {
        pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
    }
    // the TreeMap iterates in key order, which defines the page numbering
    for (Entry<String, File> e : pageMap.entrySet()) {
        File imgFile = e.getValue();
        // the img file name without extension
        final String imgFileName = e.getKey();
        // check for a page XML of this name
        File pageXml = findXml(imgFileName, pageInputDir);
        // TODO thumbURL dir + imgFile.getName())+".jpg"
        File thumbFile = getThumbFile(inputDir, imgFileName);
        if (pageXml != null) {
            XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
            switch(xmlFormat) {
                case PAGE_2010:
                    // outdated format: convert the file, passing the backup folder path computed above
                    Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
                    break;
                case PAGE_2013:
                    // current format, nothing to do
                    break;
                default:
                    throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
            }
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        // FIXME this is taking too long and is only necessary on initial loading
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            // a corrupt image does not abort the load: dim stays null and a remark is attached to the page
            logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(imgFile.getName());
        }
        if (pageXml == null && config.isForceCreatePageXml()) {
            // if no page XML, then create one at this path
            File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
            // candidate transcript sources for this page; how null results are handled is up to createPageXml
            File abbyyXml = findXml(imgFileName, ocrInputDir);
            File altoXml = findXml(imgFileName, altoInputDir);
            File txtFile = findFile(imgFileName, txtInputDir, "txt");
            // note: dim is null here if the image was found to be corrupt above
            pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
        }
        TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
        pages.add(page);
    }
    doc.setPages(pages);
    doc.getMd().setNrOfPages(doc.getPages().size());
    // set editorial declaration:
    List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
    doc.setEdDeclList(features);
    logger.debug(doc.toString());
    // store doc on disk to save time on next load
    LocalDocWriter.writeDocXml(doc, docXml);
    return doc;
}
Also used : CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) TrpPage(eu.transkribus.core.model.beans.TrpPage) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Dimension(java.awt.Dimension) EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDoc(eu.transkribus.core.model.beans.TrpDoc) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) File(java.io.File)

Example 2 with EdFeature

use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addTitlePage.

/**
 * Adds a title page to the PDF on a new page.
 * <p>
 * The page consists of three sections: the document metadata (title, author,
 * description, ...), a summary of the export settings of this builder and the
 * editorial declaration of the document.
 * <p>
 * The editorial declaration section, including its heading, is only written if
 * the document has at least one editorial declaration feature. Features without
 * a selected option are skipped, in line with the TEI export.
 *
 * @param doc the document whose metadata and editorial declaration are printed
 */
public void addTitlePage(TrpDoc doc) {
    document.newPage();
    PdfContentByte cb = writer.getDirectContentUnder();
    float lineHeight = twelfthPoints[1][0] / 3;
    float posY = twelfthPoints[1][1];
    addTitleString("Title Page", posY, 0, (float) (lineHeight * 1.5), cb, bfArialBoldItalic);
    posY += lineHeight * 2;
    // --- Document metadata section
    // posY is only advanced when writeDocMd reports true (presumably: the entry was actually written)
    TrpDocMetadata docMd = doc.getMd();
    if (writeDocMd("Title: ", docMd.getTitle(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.5;
    }
    if (writeDocMd("Author: ", docMd.getAuthor(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.5;
    }
    // remaining metadata entries use half the line height
    lineHeight = twelfthPoints[1][0] / 6;
    if (writeDocMd("Description: ", docMd.getDesc(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Genre: ", docMd.getGenre(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Writer: ", docMd.getWriter(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (docMd.getScriptType() != null) {
        if (writeDocMd("Scripttype: ", docMd.getScriptType().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.2;
        }
    }
    if (writeDocMd("Language: ", docMd.getLanguage(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Number of Pages in whole Document: ", String.valueOf(docMd.getNrOfPages()), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (docMd.getCreatedFromDate() != null) {
        if (writeDocMd("Created From: ", docMd.getCreatedFromDate().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.2;
        }
    }
    if (docMd.getCreatedToDate() != null) {
        if (writeDocMd("Created To: ", docMd.getCreatedToDate().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.5;
        }
    }
    // --- Export settings section
    lineHeight = twelfthPoints[1][0] / 3;
    addTitleString("Export Settings: ", posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    String imageSetting = (imgOnly ? "Images without text layer" : "Images with text layer");
    String extraTextSetting = (extraTextPage ? "Extra pages for transcribed text are added" : "");
    String blackeningSetting = (doBlackening ? "Sensible data is invisible" : "Sensible data is shown if existent");
    String tagSetting = (highlightTags ? "Tags are highlighted (colored lines) and added at the end" : "No tags shown in export");
    lineHeight = twelfthPoints[1][0] / 6;
    posY += lineHeight * 1.5;
    addTitleString(imageSetting + " / " + extraTextSetting + " / " + blackeningSetting + " / " + tagSetting, posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    // --- Export settings section end
    // --- Editorial declaration section
    lineHeight = twelfthPoints[1][0] / 3;
    posY += lineHeight * 1.5;
    List<EdFeature> efl = doc.getEdDeclList();
    if (efl == null || efl.isEmpty()) {
        // nothing declared: omit the section instead of printing a lone heading
        // (the former check "size() >= 0" was always true and failed on a null list)
        return;
    }
    addTitleString("Editorial Declaration: ", posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    posY += lineHeight * 1.5;
    lineHeight = twelfthPoints[1][0] / 6;
    for (EdFeature edfeat : efl) {
        if (edfeat.getSelectedOption() == null) {
            // no option chosen for this feature; dereferencing it would throw
            continue;
        }
        addTitleString(edfeat.getTitle() + ": " + edfeat.getDescription() + "\n" + edfeat.getSelectedOption().toString(), posY, twelfthPoints[1][0], lineHeight, cb, bfArial);
        posY += lineHeight * 1.5;
    }
    // --- Editorial declaration section end
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte)

Example 3 with EdFeature

use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.

the class LocalDocReader method readEditDeclFeatures.

/**
 * Reads the editorial declaration features stored in the given document folder.
 * <p>
 * The features are unmarshalled from the file named
 * {@link LocalDocConst#EDITORIAL_DECLARATION_FN} inside the folder. This method
 * never returns {@code null}: if the folder is {@code null}, the file does not
 * exist, it cannot be unmarshalled (the error is logged) or it contains no list,
 * an empty list is returned.
 *
 * @param folder the local document folder, may be {@code null}
 * @return the features read from the file, or an empty list
 */
public static List<EdFeature> readEditDeclFeatures(File folder) {
    List<EdFeature> features = new ArrayList<>();
    if (folder == null) {
        // string concatenation used to turn a null folder into the literal path "null/..."
        return features;
    }
    File editDecl = new File(folder, LocalDocConst.EDITORIAL_DECLARATION_FN);
    if (!editDecl.isFile()) {
        return features;
    }
    try {
        JaxbList<EdFeature> list = JaxbUtils.unmarshal(editDecl, JaxbList.class, EdFeature.class, EdOption.class);
        // keep the empty default if the wrapper or its list is missing, so callers can iterate safely
        if (list != null && list.getList() != null) {
            features = list.getList();
        }
    } catch (Exception e) {
        // best effort: a broken declaration file must not prevent loading the document
        logger.error(e.getMessage(), e);
    }
    return features;
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature) ArrayList(java.util.ArrayList) File(java.io.File) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) SAXException(org.xml.sax.SAXException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) CorruptImageException(eu.transkribus.core.exceptions.CorruptImageException) DocumentException(com.itextpdf.text.DocumentException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException)

Example 4 with EdFeature

use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method writeEditorialDeclaration.

/**
 * Appends the editorial declaration of the document as an
 * {@code <encodingDesc>/<editorialDecl>} element.
 * <p>
 * Writes nothing if the document has no editorial declaration features.
 * Each feature with a selected option becomes one XML-escaped {@code <p>}
 * line of the form "title (description): option text"; features without a
 * selected option are left out.
 *
 * @param sb the builder the lines are appended to
 */
void writeEditorialDeclaration(SebisStringBuilder sb) {
    final boolean hasFeatures = trpDoc.getEdDeclList() != null && !trpDoc.getEdDeclList().isEmpty();
    if (!hasFeatures) {
        return;
    }

    // opening tags, one indent level deeper each
    sb.incIndent();
    sb.addLine("<encodingDesc>");
    sb.incIndent();
    sb.addLine("<editorialDecl>");
    sb.incIndent();

    for (EdFeature feature : trpDoc.getEdDeclList()) {
        if (feature.getSelectedOption() == null) {
            // nothing was chosen for this feature
            continue;
        }
        final String paragraph = feature.getTitle() + " (" + feature.getDescription() + "): " + feature.getSelectedOption().getText();
        sb.addLine("<p>" + StringEscapeUtils.escapeXml(paragraph) + "</p>");
    }

    // closing tags, restoring the indent level found on entry
    sb.decIndent();
    sb.addLine("</editorialDecl>");
    sb.decIndent();
    sb.addLine("</encodingDesc>");
    sb.decIndent();
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature)

Example 5 with EdFeature

use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.

the class DocxBuilder method addTitlePage.

/**
 * Adds the title page content to the given main document part.
 * <p>
 * Three sections are written: the document metadata, a one-line summary of the
 * export settings held in the static flags of this builder, and the editorial
 * declaration of the document. The "Editorial Declaration" heading is always
 * written; a missing declaration list is treated as empty, and features without
 * a selected option are skipped, in line with the TEI export.
 *
 * @param doc the document whose metadata and editorial declaration are written
 * @param mdp the main document part the paragraphs are added to
 */
public static void addTitlePage(TrpDoc doc, MainDocumentPart mdp) {
    mdp.getPropertyResolver().activateStyle("Light Shading");
    mdp.getPropertyResolver().activateStyle("Medium List 1");
    addParagraph("", "Title Page", mdp, "Title");
    // --- Document metadata section
    TrpDocMetadata docMd = doc.getMd();
    addParagraph("Title: ", docMd.getTitle(), mdp, "Subtitle");
    addParagraph("Author: ", docMd.getAuthor(), mdp, "Subtitle");
    addParagraph("Description: ", docMd.getDesc(), mdp, "Subtitle");
    addParagraph("Genre: ", docMd.getGenre(), mdp, "Subtitle");
    addParagraph("Writer: ", docMd.getWriter(), mdp, "Subtitle");
    if (docMd.getScriptType() != null) {
        // label corrected from "Sripttype: " to match the PDF title page
        addParagraph("Scripttype: ", docMd.getScriptType().toString(), mdp, "Subtitle");
    }
    addParagraph("Language: ", docMd.getLanguage(), mdp, "Subtitle");
    addParagraph("Number of Pages in whole Document: ", String.valueOf(docMd.getNrOfPages()), mdp, "Subtitle");
    if (docMd.getCreatedFromDate() != null) {
        addParagraph("Created From: ", docMd.getCreatedFromDate().toString(), mdp, "Subtitle");
    }
    if (docMd.getCreatedToDate() != null) {
        addParagraph("Created To: ", docMd.getCreatedToDate().toString(), mdp, "Subtitle");
    }
    // --- Export settings section
    // summarises the static export flags of this class (exportTags, doBlackening, markUnclearWords,
    // expandAbbrevs, substituteAbbrevs, preserveLineBreaks, showSuppliedWithBrackets, ignoreSupplied)
    addParagraph("", "Export Settings", mdp, "Title");
    String tagSettings = (exportTags ? "Custom tags are indexed" : "Custom tags are not exported");
    String blackeningSetting = (doBlackening ? "Sensible data is blackened" : "All data is visible");
    String abbrevsSettings = (expandAbbrevs ? "Abbreviations are expanded (abbrev [expansion])" : (substituteAbbrevs ? "Abbreviations are subsituted by there expansion" : "Abbreviations as they are (diplomatic text)"));
    String unclearSettings = (markUnclearWords ? "Unclear words are marked" : "");
    String lineBreakSettings = (preserveLineBreaks ? "Keep the line breaks as in the original document" : "Line breaks does not conform to the original text");
    String suppliedSettings = (showSuppliedWithBrackets ? "Supplied tags are shown in brackets" : (ignoreSupplied ? "Supplied tags get ignored" : "Supplied tags are not marked specifically"));
    addParagraph("", blackeningSetting + " / " + tagSettings + " / " + abbrevsSettings + " / " + unclearSettings + " / " + lineBreakSettings + " / " + suppliedSettings, mdp, "Subtitle");
    // --- Editorial declaration section
    addParagraph("", "Editorial Declaration: ", mdp, "Title");
    if (doc.getEdDeclList() == null) {
        // no declaration available; iterating a null list would throw
        return;
    }
    for (EdFeature edfeat : doc.getEdDeclList()) {
        if (edfeat.getSelectedOption() == null) {
            // no option chosen for this feature; dereferencing it would throw
            continue;
        }
        addParagraph("", edfeat.getTitle() + ": " + edfeat.getDescription() + "\n" + edfeat.getSelectedOption().toString(), mdp, "Subtitle");
    }
}
Also used : EdFeature(eu.transkribus.core.model.beans.EdFeature) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata)

Aggregations

EdFeature (eu.transkribus.core.model.beans.EdFeature)6 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)2 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 DocumentException (com.itextpdf.text.DocumentException)1 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)1 XmlFormat (eu.transkribus.core.io.formats.XmlFormat)1 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)1 TrpPage (eu.transkribus.core.model.beans.TrpPage)1 Dimension (java.awt.Dimension)1 MalformedURLException (java.net.MalformedURLException)1 JAXBException (javax.xml.bind.JAXBException)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 TransformerException (javax.xml.transform.TransformerException)1 SAXException (org.xml.sax.SAXException)1