use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.
the class LocalDocReader method load.
/**
* Loads a document from path.<br>
*
* Document metadata has to be in an XML called "metadata.xml".<br>
*
* Image files and corresponding XML/txt files have to have the same name. <br>
* Lexicographic order of image names will imply order of pages.<br>
* Types of transcript source files are searched in this order:
* <ol>
* <li>./page: PAGE XMLs according to schema 2010/2013</li>
* <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
* <li>./alto: ALTO v2 XMLs</li>
* <li>./txt: txt files with transcription fulltext only</li>
* </ol>
* If a previously stored doc XML is found in the directory and passes the validity check,
* the document structure is taken from it and the page scan is skipped. Otherwise the
* structure is rebuilt from the directory content and written back to the doc XML.<br>
* Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
* No versioning of files for local use!<br>
*
* @param path the path where the document is stored
* @param config {@link DocLoadConfig}
* @param monitor progress monitor; currently not used anywhere in this method (see todo below)
* @return the constructed document
* @throws IOException if the path can't be read or is malformed, if the directory contains no
*         images (FileNotFoundException), or if a file in the PAGE XML path is neither
*         PAGE 2010 nor PAGE 2013
*
* @todo implement monitor feedback!
* @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
*/
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
// create the document
TrpDoc doc = new TrpDoc();
// check OS and adjust URL protocol
final String os = System.getProperty("os.name");
/*
* FIXME use SysUtils.isWin() here?
*/
if (os.toLowerCase().contains("win")) {
// NOTE: this assigns to a field on LocalDocConst, i.e. it affects more than this call
LocalDocConst.URL_PROT_CONST = "file:///";
}
// else: keep default
final File inputDir = new File(path);
final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
// validate input path ======================================================
checkInputDir(inputDir);
// search for IMG files; map key is the image file name without extension (see loop below)
TreeMap<String, File> pageMap = findImgFiles(inputDir);
logger.info("Found " + pageMap.entrySet().size() + " page images.");
if (pageMap.isEmpty()) {
throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
}
TrpDocMetadata docMd = null;
// stays true unless a valid doc XML makes rebuilding the page list unnecessary
boolean doRefresh = true;
// try to read doc structure from disk
if (docXml.isFile()) {
doc = loadDocXml(docXml);
if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
logger.info("Loaded document structure from disk.");
docMd = doc.getMd();
// no refresh is necessary as doc structure matches the input dir content
doRefresh = false;
} else {
if (doc != null && doc.getMd() != null) {
// keep any existing metadata if invalid doc structure was found
docMd = doc.getMd();
}
logger.info("Removing faulty doc XML from disk and doing reload.");
// NOTE(review): the result of delete() is ignored; a failed delete goes unnoticed here
docXml.delete();
doc = new TrpDoc();
}
}
logger.info("Reading document at " + inputDir.getAbsolutePath());
// find metadata file if not extracted from doc.xml =============================================
if (docMd == null) {
try {
docMd = loadDocMd(inputDir);
} catch (IOException ioe) {
// metadata could not be loaded: continue with an empty metadata object (exception is not logged)
docMd = new TrpDocMetadata();
}
}
initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
// Set the docMd
doc.setMd(docMd);
if (!doRefresh) {
// Stop now and reuse doc structure from file
return doc;
}
// Construct the input dir with pageXml Files.
File pageInputDir = getPageXmlInputDir(inputDir);
if (config.isForceCreatePageXml() && !pageInputDir.isDirectory()) {
// NOTE(review): the result of mkdir() is ignored
pageInputDir.mkdir();
}
// abbyy XML search path
File ocrInputDir = getOcrXmlInputDir(inputDir);
// alto XML search path
File altoInputDir = getAltoXmlInputDir(inputDir);
// txt file search path
File txtInputDir = getTxtInputDir(inputDir);
// backupfolder for outdated page format files, if any
final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase() + "_backup";
final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
// iterate imgList, search for corresponding XML files and build TrpPages
int pageNr = 1;
List<TrpPage> pages = new ArrayList<TrpPage>(pageMap.entrySet().size());
// need a special variable to test whether we are in sync mode (only then do the following!!!!)
// NOTE(review): an empty pageMap already causes the FileNotFoundException above, so this
// branch looks unreachable as written - confirm whether the sync check should run before that throw
if (pageMap.entrySet().size() == 0 && config.isEnableSyncWithoutImages()) {
pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
}
for (Entry<String, File> e : pageMap.entrySet()) {
File imgFile = e.getValue();
// the img file name without extension
final String imgFileName = e.getKey();
// check for a page XML of this name
File pageXml = findXml(imgFileName, pageInputDir);
// TODO thumbURL dir + imgFile.getName())+".jpg"
File thumbFile = getThumbFile(inputDir, imgFileName);
if (pageXml != null) {
XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
switch(xmlFormat) {
case PAGE_2010:
// outdated format: convert this file, passing the backup folder path to the converter
Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
break;
case PAGE_2013:
// current format, nothing to do
break;
default:
throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
}
}
// try to read image dimension in any case to detect corrupt files immediately!
// FIXME this is taking too long and is only necessary on initial loading
Dimension dim = null;
String imageRemark = null;
try {
dim = ImgUtils.readImageDimensions(imgFile);
} catch (CorruptImageException cie) {
// a corrupt image does not abort loading: dim stays null and a remark is passed to buildPage
logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
imageRemark = getCorruptImgMsg(imgFile.getName());
}
if (pageXml == null && config.isForceCreatePageXml()) {
// if no page XML, then create one at this path
File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
// look up same-named sources in the ocr, alto and txt directories and hand them to createPageXml
File abbyyXml = findXml(imgFileName, ocrInputDir);
File altoXml = findXml(imgFileName, altoInputDir);
File txtFile = findFile(imgFileName, txtInputDir, "txt");
pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
}
// page numbers are 1-based and follow the iteration order of pageMap
TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
pages.add(page);
}
doc.setPages(pages);
doc.getMd().setNrOfPages(doc.getPages().size());
// set editorial declaration:
List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
doc.setEdDeclList(features);
logger.debug(doc.toString());
// store doc on disk to save time on next load
LocalDocWriter.writeDocXml(doc, docXml);
return doc;
}
use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.
the class TrpPdfDocument method addTitlePage.
/**
 * Adds a title page to the PDF: a heading, the document metadata, a summary of the
 * export settings and, if present, the editorial declaration of the document.
 *
 * Metadata entries advance the vertical position only when {@code writeDocMd} reports
 * that it wrote something. The editorial declaration section is skipped entirely when
 * the document has no declaration list (null or empty), and features without a selected
 * option are left out, matching what the TEI export does for the same data.
 *
 * @param doc the document whose metadata and editorial declaration are printed
 */
public void addTitlePage(TrpDoc doc) {
    document.newPage();
    PdfContentByte cb = writer.getDirectContentUnder();
    float lineHeight = twelfthPoints[1][0] / 3;
    float posY = twelfthPoints[1][1];
    addTitleString("Title Page", posY, 0, (float) (lineHeight * 1.5), cb, bfArialBoldItalic);
    posY += lineHeight * 2;
    TrpDocMetadata docMd = doc.getMd();
    if (writeDocMd("Title: ", docMd.getTitle(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.5;
    }
    if (writeDocMd("Author: ", docMd.getAuthor(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.5;
    }
    // remaining metadata entries use a smaller line height
    lineHeight = twelfthPoints[1][0] / 6;
    if (writeDocMd("Description: ", docMd.getDesc(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Genre: ", docMd.getGenre(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Writer: ", docMd.getWriter(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (docMd.getScriptType() != null) {
        if (writeDocMd("Scripttype: ", docMd.getScriptType().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.2;
        }
    }
    if (writeDocMd("Language: ", docMd.getLanguage(), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (writeDocMd("Number of Pages in whole Document: ", String.valueOf(docMd.getNrOfPages()), posY, 0, lineHeight, cb, bfArialItalic)) {
        posY += lineHeight * 1.2;
    }
    if (docMd.getCreatedFromDate() != null) {
        if (writeDocMd("Created From: ", docMd.getCreatedFromDate().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.2;
        }
    }
    if (docMd.getCreatedToDate() != null) {
        if (writeDocMd("Created To: ", docMd.getCreatedToDate().toString(), posY, 0, lineHeight, cb, bfArialItalic)) {
            posY += lineHeight * 1.5;
        }
    }
    // --- Export settings section
    lineHeight = twelfthPoints[1][0] / 3;
    addTitleString("Export Settings: ", posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    String imageSetting = (imgOnly ? "Images without text layer" : "Images with text layer");
    String extraTextSetting = (extraTextPage ? "Extra pages for transcribed text are added" : "");
    String blackeningSetting = (doBlackening ? "Sensible data is invisible" : "Sensible data is shown if existent");
    String tagSetting = (highlightTags ? "Tags are highlighted (colored lines) and added at the end" : "No tags shown in export");
    lineHeight = twelfthPoints[1][0] / 6;
    posY += lineHeight * 1.5;
    addTitleString(imageSetting + " / " + extraTextSetting + " / " + blackeningSetting + " / " + tagSetting, posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    // --- Export settings section end
    // --- Editorial declaration section
    lineHeight = twelfthPoints[1][0] / 3;
    posY += lineHeight * 1.5;
    List<EdFeature> efl = doc.getEdDeclList();
    // The list may be absent or empty; in that case neither heading nor entries are written.
    // (The previous check "size() >= 0" was always true and failed on a null list.)
    if (efl == null || efl.isEmpty()) {
        return;
    }
    addTitleString("Editorial Declaration: ", posY, twelfthPoints[1][0], lineHeight, cb, bfArialBoldItalic);
    posY += lineHeight * 1.5;
    lineHeight = twelfthPoints[1][0] / 6;
    for (EdFeature edfeat : efl) {
        // a feature without a selected option declares nothing and would otherwise cause an NPE
        if (edfeat.getSelectedOption() == null) {
            continue;
        }
        addTitleString(edfeat.getTitle() + ": " + edfeat.getDescription() + "\n" + edfeat.getSelectedOption().toString(), posY, twelfthPoints[1][0], lineHeight, cb, bfArial);
        posY += lineHeight * 1.5;
    }
    // --- Editorial declaration section end
}
use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.
the class LocalDocReader method readEditDeclFeatures.
/**
 * Reads the editorial declaration features stored in the given folder.
 *
 * The declaration file is looked up under the name
 * {@code LocalDocConst.EDITORIAL_DECLARATION_FN} directly inside {@code folder}.
 * If that file does not exist, or unmarshalling it fails for any reason, an empty
 * list is returned; failures are logged and never propagated to the caller.
 *
 * @param folder the local document folder
 * @return the list obtained from the unmarshalled declaration file, or a new empty list
 *         if the file is missing or unreadable
 */
public static List<EdFeature> readEditDeclFeatures(File folder) {
    final File declFile = new File(folder + "/" + LocalDocConst.EDITORIAL_DECLARATION_FN);
    if (!declFile.isFile()) {
        // nothing stored for this document
        return new ArrayList<EdFeature>();
    }
    try {
        JaxbList<EdFeature> parsed = JaxbUtils.unmarshal(declFile, JaxbList.class, EdFeature.class, EdOption.class);
        return parsed.getList();
    } catch (Exception e) {
        // best effort: a broken declaration file must not prevent the document from loading
        logger.error(e.getMessage(), e);
        return new ArrayList<EdFeature>();
    }
}
use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method writeEditorialDeclaration.
/**
 * Writes the editorial declaration of the current document as an
 * {@code <encodingDesc>/<editorialDecl>} fragment.
 *
 * Nothing is written when the document has no declaration list or the list is empty.
 * Each feature that has a selected option becomes one XML-escaped {@code <p>} line of
 * the form "title (description): option text"; features without a selected option
 * are skipped.
 *
 * @param sb the builder the lines are appended to; its indentation is increased for the
 *           fragment and restored to the incoming level afterwards
 */
void writeEditorialDeclaration(SebisStringBuilder sb) {
    final boolean hasFeatures = trpDoc.getEdDeclList() != null && !trpDoc.getEdDeclList().isEmpty();
    if (!hasFeatures) {
        return;
    }

    // opening tags, one indentation level each
    sb.incIndent();
    sb.addLine("<encodingDesc>");
    sb.incIndent();
    sb.addLine("<editorialDecl>");
    sb.incIndent();

    for (EdFeature feature : trpDoc.getEdDeclList()) {
        if (feature.getSelectedOption() == null) {
            continue;
        }
        final String label = feature.getTitle() + " (" + feature.getDescription() + "): ";
        final String plainText = label + feature.getSelectedOption().getText();
        sb.addLine("<p>" + StringEscapeUtils.escapeXml(plainText) + "</p>");
    }

    // closing tags, unwinding the indentation again
    sb.decIndent();
    sb.addLine("</editorialDecl>");
    sb.decIndent();
    sb.addLine("</encodingDesc>");
    sb.decIndent();
}
use of eu.transkribus.core.model.beans.EdFeature in project TranskribusCore by Transkribus.
the class DocxBuilder method addTitlePage.
/**
 * Adds a title page to the DOCX main document part: a heading, the document metadata,
 * a one-line summary of the active export settings and the editorial declaration.
 *
 * The export settings are read from the static flags of this class (exportTags,
 * doBlackening, markUnclearWords, expandAbbrevs, substituteAbbrevs, preserveLineBreaks,
 * showSuppliedWithBrackets, ignoreSupplied).
 *
 * The "Editorial Declaration" heading is always written. Entries are written only for
 * features that have a selected option; a missing declaration list is treated like an
 * empty one.
 *
 * @param doc the document whose metadata and editorial declaration are written
 * @param mdp the main document part the paragraphs are added to
 */
public static void addTitlePage(TrpDoc doc, MainDocumentPart mdp) {
    mdp.getPropertyResolver().activateStyle("Light Shading");
    mdp.getPropertyResolver().activateStyle("Medium List 1");
    addParagraph("", "Title Page", mdp, "Title");
    TrpDocMetadata docMd = doc.getMd();
    addParagraph("Title: ", docMd.getTitle(), mdp, "Subtitle");
    addParagraph("Author: ", docMd.getAuthor(), mdp, "Subtitle");
    addParagraph("Description: ", docMd.getDesc(), mdp, "Subtitle");
    addParagraph("Genre: ", docMd.getGenre(), mdp, "Subtitle");
    addParagraph("Writer: ", docMd.getWriter(), mdp, "Subtitle");
    if (docMd.getScriptType() != null) {
        // label spelling fixed ("Sripttype" -> "Scripttype") to match the PDF export
        addParagraph("Scripttype: ", docMd.getScriptType().toString(), mdp, "Subtitle");
    }
    addParagraph("Language: ", docMd.getLanguage(), mdp, "Subtitle");
    addParagraph("Number of Pages in whole Document: ", String.valueOf(docMd.getNrOfPages()), mdp, "Subtitle");
    if (docMd.getCreatedFromDate() != null) {
        addParagraph("Created From: ", docMd.getCreatedFromDate().toString(), mdp, "Subtitle");
    }
    if (docMd.getCreatedToDate() != null) {
        addParagraph("Created To: ", docMd.getCreatedToDate().toString(), mdp, "Subtitle");
    }
    addParagraph("", "Export Settings", mdp, "Title");
    String tagSettings = (exportTags ? "Custom tags are indexed" : "Custom tags are not exported");
    String blackeningSetting = (doBlackening ? "Sensible data is blackened" : "All data is visible");
    // expandAbbrevs takes precedence over substituteAbbrevs
    String abbrevsSettings = (expandAbbrevs ? "Abbreviations are expanded (abbrev [expansion])" : (substituteAbbrevs ? "Abbreviations are substituted by their expansion" : "Abbreviations as they are (diplomatic text)"));
    String unclearSettings = (markUnclearWords ? "Unclear words are marked" : "");
    String lineBreakSettings = (preserveLineBreaks ? "Keep the line breaks as in the original document" : "Line breaks do not conform to the original text");
    // showSuppliedWithBrackets takes precedence over ignoreSupplied
    String suppliedSettings = (showSuppliedWithBrackets ? "Supplied tags are shown in brackets" : (ignoreSupplied ? "Supplied tags get ignored" : "Supplied tags are not marked specifically"));
    addParagraph("", blackeningSetting + " / " + tagSettings + " / " + abbrevsSettings + " / " + unclearSettings + " / " + lineBreakSettings + " / " + suppliedSettings, mdp, "Subtitle");
    addParagraph("", "Editorial Declaration: ", mdp, "Title");
    // the declaration list may be absent; treat that like an empty list instead of failing
    if (doc.getEdDeclList() != null) {
        for (EdFeature edfeat : doc.getEdDeclList()) {
            // a feature without a selected option declares nothing and would otherwise cause an NPE
            if (edfeat.getSelectedOption() == null) {
                continue;
            }
            addParagraph("", edfeat.getTitle() + ": " + edfeat.getDescription() + "\n" + edfeat.getSelectedOption().toString(), mdp, "Subtitle");
        }
    }
}
Aggregations