use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.
the class LocalDocReader method loadDocMd.
/**
 * Searches {@code inputDir} for metadata files accepted by {@code MdFileFilter}
 * (according to the original documentation: files ending in
 * XmlFileFilter.mdFileEnding, e.g. "metadata.xml") and unmarshals the first
 * match into a TrpDocMetadata object.
 * <p>
 * If several files match, only the first one returned by
 * {@link File#listFiles(java.io.FileFilter)} is used; the others are ignored.
 * The doc ID of the returned metadata is always overwritten with -1.
 *
 * @param inputDir
 *            where the document is stored
 * @return the parsed TrpDocMetadata object, never {@code null}
 * @throws FileNotFoundException
 *             if no metadata file is found, or {@code inputDir} cannot be
 *             listed (e.g. it is not a directory)
 * @throws IOException
 *             if the metadata file cannot be unmarshalled; the underlying
 *             JAXBException is attached as the cause
 */
public static TrpDocMetadata loadDocMd(File inputDir) throws IOException {
    final File[] mdFiles = inputDir.listFiles(new MdFileFilter());
    // listFiles() yields null when inputDir is not a directory or cannot be read
    if (mdFiles == null || mdFiles.length == 0) {
        // no file => no metadata
        throw new FileNotFoundException("No metadata XML was found on path: " + inputDir.getAbsolutePath());
    }
    final File mdFile = mdFiles[0];
    logger.info("Found md File " + mdFile.getAbsolutePath());
    try {
        TrpDocMetadata docMd = JaxbUtils.unmarshal(mdFile, TrpDocMetadata.class);
        // Any ID stored in the file is discarded.
        // NOTE(review): presumably -1 marks a document that has no server-side ID - confirm.
        docMd.setDocId(-1);
        return docMd;
    } catch (JAXBException je) {
        // keep the JAXB failure as cause so that the actual parse error is not lost
        throw new IOException("The md File " + mdFile.getName() + " did not obey the correct format. " + "A doc without metadata will be provided.", je);
    }
}
use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.
the class TrpMetsBuilder method buildMets.
/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images (only if {@code exportImages} is set)</li>
 * <li>the current PAGE XML transcript of each page (only if {@code exportPage} is set)</li>
 * <li>a file entry for the ALTO XML of each page (only if {@code exportAlto} is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe; its metadata and page list are read
 * @param exportPage if true, the current transcript of each page is added to the PAGE file group
 * @param exportAlto if true, an ALTO file entry is added for each page
 * @param exportImages if true, the image of each page is added to the image file group
 * @param pageIndices zero-based indices into {@code doc.getPages()} of the pages to include;
 *            {@code null} includes all pages
 * @return the assembled Mets object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    // a document without a local folder is treated as a remote document
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // The client is only used inside the page loop, so it is safe to leave it
    // null for an empty page list instead of failing on pages.get(0).
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    // position of the current page in the page list; matched against pageIndices
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /* only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            // The ALTO file is named like the image, with the extension replaced by ".xml".
            // An image name without any extension is used as a whole.
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // The absolute ALTO location is derived from the page URL: sibling folder "alto"
            // next to the file the URL points to.
            // NOTE(review): this presumably requires p.getUrl() to be a file URL - confirm
            // what FileUtils.toFile() returns for remote documents.
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) {
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                // creation date is taken from the file's last-modified timestamp
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                // the entry is still added, just without a CREATED value
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested file groups are attached to the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.
the class FatBuilder method writeFatXml.
/**
 * Builds the FAT XML for the local document in {@code outputDir} and writes it to
 * {@code outputDir/FAT_FILE_NAME}.
 * <p>
 * The image files and the document metadata are read via LocalDocReader. For every
 * image the EXIF data and the MD5 checksum are collected; images for which this
 * fails are listed with status "Error" and counted as unchecked.
 *
 * @param outputDir
 *            the local document directory; must contain the directory LocalDocConst.OCR_MASTER_DIR
 * @param languages
 *            OCR languages; if null or empty, the language from the document metadata is used
 * @param typeFace
 *            OCR text type; if null or empty, the script type from the document metadata is
 *            used, falling back to ScriptType.NORMAL
 * @return the written FAT XML file
 * @throws IllegalArgumentException
 *             if {@code outputDir} does not contain the OCR master directory
 * @throws UnsupportedFormatException
 *             declared by this method; presumably raised by the helpers called here
 * @throws IOException
 *             if the FAT XML cannot be marshalled to file, or a called helper fails with an
 *             IOException
 */
public static File writeFatXml(File outputDir, final String languages, final String typeFace) throws UnsupportedFormatException, IOException {
    if (!new File(outputDir.getAbsolutePath() + File.separator + LocalDocConst.OCR_MASTER_DIR).isDirectory()) {
        throw new IllegalArgumentException("No directory '" + LocalDocConst.OCR_MASTER_DIR + "' in directory: " + outputDir.getAbsolutePath());
    }
    // needs a local doc! Read files separately because we don't want to create Page XMLs
    Map<String, File> imgFiles = LocalDocReader.findImgFiles(outputDir);
    TrpDocMetadata docMd = LocalDocReader.loadDocMd(outputDir);
    // NOTE: a check restricting this to DocType.PRINT documents used to be here and is disabled
    RootFolder rootFolder = new RootFolder();
    // "HH" = hour of day (0-23); the 12-hour "hh" without an AM/PM marker is ambiguous
    SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm");
    final String dateStr = df.format(new Date());
    rootFolder.setDate(dateStr);
    final BigInteger nFiles = getBigIntValue(imgFiles.size());
    rootFolder.setNFiles(nFiles);
    rootFolder.setNDocuments(BigInteger.valueOf(1));
    rootFolder.setNFileWarnings(BigInteger.valueOf(0));
    rootFolder.setNFolders(BigInteger.valueOf(1));
    DocumentFolder docFolder = new DocumentFolder();
    docFolder.setName(outputDir.getName());
    docFolder.setPath(LocalDocConst.OCR_MASTER_DIR);
    docFolder.setNFilesPerFolder(nFiles);
    // TODO throw exception if missingMetadata is true?
    boolean missingMetadata = false;
    Order order = new Order();
    order.setHasViewingFiles("false");
    order.setServices("(OCR)");
    OcrMetadata ocrM = new OcrMetadata();
    // explicit argument wins over document metadata; nothing at all counts as missing metadata
    if (languages != null && !languages.isEmpty()) {
        ocrM.setLanguages(languages);
    } else if (docMd.getLanguage() != null && !docMd.getLanguage().isEmpty()) {
        ocrM.setLanguages(docMd.getLanguage());
    } else {
        missingMetadata = true;
        ocrM.setLanguages("");
    }
    if (typeFace != null && !typeFace.isEmpty()) {
        ocrM.setTexttype(typeFace);
    } else if (docMd.getScriptType() != null) {
        ocrM.setTexttype(docMd.getScriptType().toString());
    } else {
        ocrM.setTexttype(ScriptType.NORMAL.toString());
        missingMetadata = true;
    }
    ocrM.setOutput("(ABBYY-XML)");
    // check the following!
    int nDocsMissingMetadata = missingMetadata ? 1 : 0;
    rootFolder.setNDocumentsMissingMetadata(BigInteger.valueOf(nDocsMissingMetadata));
    FepMetadata fep = new FepMetadata();
    fep.setWorkflow("None");
    order.setOcrMetadata(ocrM);
    order.setFepMetadata(fep);
    docFolder.setOrder(order);
    FileFolder fileFolder = new FileFolder();
    fileFolder.setType("img");
    // existence of OCR_MASTER_DIR is checked at the beginning
    fileFolder.setName(LocalDocConst.OCR_MASTER_DIR);
    fileFolder.setPath(LocalDocConst.OCR_MASTER_DIR);
    int checkedFiles = 0;
    int uncheckedFiles = 0;
    int nFileErrors = 0;
    // remembers an interruption seen during the file checks; the flag is restored at the
    // end so that the remaining files and the final write are still processed
    boolean interrupted = false;
    for (Entry<String, File> imgE : imgFiles.entrySet()) {
        final File img = imgE.getValue();
        eu.transkribus.core.model.beans.fat.File file = new eu.transkribus.core.model.beans.fat.File();
        file.setName(img.getName());
        String errorType;
        String message;
        try {
            final Map<String, String> exif = ExiftoolUtil.extractImgMd(img.getAbsolutePath());
            final String mimetype = exif.get("MIMEType");
            final String xRes = exif.get("XResolution");
            final String yRes = exif.get("YResolution");
            final String width = exif.get("ImageWidth");
            final String height = exif.get("ImageHeight");
            Metadata md = new Metadata();
            md.setMimetype(mimetype);
            md.setXRes(getBigIntValue(xRes));
            md.setYRes(getBigIntValue(yRes));
            md.setWidth(getBigIntValue(width));
            md.setHeight(getBigIntValue(height));
            final String md5 = formatChecksum(ChecksumUtils.getMd5SumHex(img));
            md.setChecksum(md5);
            file.setMetadata(md);
            checkedFiles++;
            file.setStatus("Checked");
            errorType = "None";
            message = "";
        } catch (TimeoutException | InterruptedException | NumberFormatException e) {
            if (e instanceof InterruptedException) {
                interrupted = true;
            }
            // a failed check does not abort the run; the file is listed with its error
            uncheckedFiles++;
            nFileErrors++;
            errorType = e.getClass().getName();
            message = e.getMessage();
            file.setStatus("Error");
            logger.error("Could not run file checks for file: " + img.getAbsolutePath(), e);
        }
        file.setErrorType(errorType);
        file.setMessage(message);
        fileFolder.getFile().add(file);
    }
    rootFolder.setNCheckedFiles(getBigIntValue(checkedFiles));
    rootFolder.setNUncheckedFiles(getBigIntValue(uncheckedFiles));
    rootFolder.setNFileErrors(getBigIntValue(nFileErrors));
    docFolder.getFileFolder().add(fileFolder);
    rootFolder.getDocumentFolder().add(docFolder);
    File fatFile = new File(outputDir.getAbsolutePath() + File.separator + FatBuilder.FAT_FILE_NAME);
    try {
        fatFile = JaxbUtils.marshalToFile(rootFolder, fatFile);
    } catch (JAXBException e) {
        throw new IOException("Could not marshal FAT XML to file!", e);
    } finally {
        if (interrupted) {
            // do not swallow the interruption: hand it on to the caller's thread handling
            Thread.currentThread().interrupt();
        }
    }
    return fatFile;
}
use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.
the class DocxBuilder method addTitlePage.
/**
 * Appends a title page to the given document part. It consists of three sections:
 * the document metadata, a one-line summary of the export settings (taken from the
 * static export flags of this class) and the editorial declaration of the document.
 *
 * @param doc
 *            the document whose metadata and editorial declaration are printed
 * @param mdp
 *            the main document part the paragraphs are added to
 */
public static void addTitlePage(TrpDoc doc, MainDocumentPart mdp) {
    mdp.getPropertyResolver().activateStyle("Light Shading");
    mdp.getPropertyResolver().activateStyle("Medium List 1");
    addParagraph("", "Title Page", mdp, "Title");
    TrpDocMetadata docMd = doc.getMd();
    addParagraph("Title: ", docMd.getTitle(), mdp, "Subtitle");
    addParagraph("Author: ", docMd.getAuthor(), mdp, "Subtitle");
    addParagraph("Description: ", docMd.getDesc(), mdp, "Subtitle");
    addParagraph("Genre: ", docMd.getGenre(), mdp, "Subtitle");
    addParagraph("Writer: ", docMd.getWriter(), mdp, "Subtitle");
    // script type and the creation dates are optional and only printed when set
    if (docMd.getScriptType() != null) {
        // NOTE(review): the label is misspelled ("Sripttype"); left unchanged here because it is exported text
        addParagraph("Sripttype: ", docMd.getScriptType().toString(), mdp, "Subtitle");
    }
    addParagraph("Language: ", docMd.getLanguage(), mdp, "Subtitle");
    addParagraph("Number of Pages in whole Document: ", String.valueOf(docMd.getNrOfPages()), mdp, "Subtitle");
    if (docMd.getCreatedFromDate() != null) {
        addParagraph("Created From: ", docMd.getCreatedFromDate().toString(), mdp, "Subtitle");
    }
    if (docMd.getCreatedToDate() != null) {
        addParagraph("Created To: ", docMd.getCreatedToDate().toString(), mdp, "Subtitle");
    }
    addParagraph("", "Export Settings", mdp, "Title");
    String tagSettings = (exportTags ? "Custom tags are indexed" : "Custom tags are not exported");
    String blackeningSetting = (doBlackening ? "Sensible data is blackened" : "All data is visible");
    // expanding abbreviations takes precedence over substituting them
    String abbrevsSettings = (expandAbbrevs ? "Abbreviations are expanded (abbrev [expansion])" : (substituteAbbrevs ? "Abbreviations are subsituted by there expansion" : "Abbreviations as they are (diplomatic text)"));
    // NOTE(review): this is empty when unclear words are not marked, which leaves an empty " /  / " segment in the summary line
    String unclearSettings = (markUnclearWords ? "Unclear words are marked" : "");
    String lineBreakSettings = (preserveLineBreaks ? "Keep the line breaks as in the original document" : "Line breaks does not conform to the original text");
    String suppliedSettings = (showSuppliedWithBrackets ? "Supplied tags are shown in brackets" : (ignoreSupplied ? "Supplied tags get ignored" : "Supplied tags are not marked specifically"));
    addParagraph("", blackeningSetting + " / " + tagSettings + " / " + abbrevsSettings + " / " + unclearSettings + " / " + lineBreakSettings + " / " + suppliedSettings, mdp, "Subtitle");
    addParagraph("", "Editorial Declaration: ", mdp, "Title");
    for (EdFeature edfeat : doc.getEdDeclList()) {
        // a feature without a selected option is printed with an empty option text
        // instead of aborting the whole export with a NullPointerException
        final Object selectedOption = edfeat.getSelectedOption();
        final String optionText = (selectedOption != null) ? selectedOption.toString() : "";
        addParagraph("", edfeat.getTitle() + ": " + edfeat.getDescription() + "\n" + optionText, mdp, "Subtitle");
    }
}
use of eu.transkribus.core.model.beans.TrpDocMetadata in project TranskribusCore by Transkribus.
the class TrpTxtBuilder method addTitlePage.
/**
 * Appends a plain-text metadata section (document metadata followed by the editorial
 * declaration) to the given file. The file is created if it does not exist yet;
 * otherwise the section is appended to its current content.
 * <p>
 * Writing is best effort: an IOException is printed to stderr and not propagated.
 *
 * @param doc
 *            the document whose metadata and editorial declaration are written
 * @param file
 *            the text file to append to
 */
public static void addTitlePage(TrpDoc doc, File file) {
    List<String> titleContent = new ArrayList<>();
    titleContent.add("----------------------------");
    titleContent.add("Metadata section of document");
    titleContent.add("----------------------------");
    TrpDocMetadata docMd = doc.getMd();
    titleContent.add("Title: " + docMd.getTitle());
    titleContent.add("Author: " + docMd.getAuthor());
    titleContent.add("Description: " + docMd.getDesc());
    titleContent.add("Genre: " + docMd.getGenre());
    titleContent.add("Writer: " + docMd.getWriter());
    // script type and the creation dates are optional and only written when set
    if (docMd.getScriptType() != null) {
        // NOTE(review): the label is misspelled ("Sripttype"); left unchanged here because it is exported text
        titleContent.add("Sripttype: " + docMd.getScriptType().toString());
    }
    titleContent.add("Language: " + docMd.getLanguage());
    titleContent.add("Number of Pages in whole Document: " + String.valueOf(docMd.getNrOfPages()));
    if (docMd.getCreatedFromDate() != null) {
        titleContent.add("Created From: " + docMd.getCreatedFromDate().toString());
    }
    if (docMd.getCreatedToDate() != null) {
        titleContent.add("Created To: " + docMd.getCreatedToDate().toString());
    }
    titleContent.add("Editorial Declaration: ");
    for (EdFeature edfeat : doc.getEdDeclList()) {
        // a feature without a selected option is written with an empty option text
        // instead of aborting the whole export with a NullPointerException
        final Object selectedOption = edfeat.getSelectedOption();
        final String optionText = (selectedOption != null) ? selectedOption.toString() : "";
        titleContent.add(edfeat.getTitle() + ": " + edfeat.getDescription() + System.lineSeparator() + optionText);
    }
    titleContent.add("-----------------------");
    titleContent.add("End of metadata section");
    titleContent.add("-----------------------");
    titleContent.add(System.lineSeparator());
    try {
        Files.write(Paths.get(file.getAbsolutePath()), titleContent, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // NOTE(review): the failure is only printed, so callers cannot tell that the title page
        // is missing; consider logging via the class logger or propagating the error
        e.printStackTrace();
    }
}
Aggregations