use of eu.transkribus.core.model.beans.mets.StructMapType in project TranskribusCore by Transkribus.
the class FEPLocalDocReader method parsePhysicalStructure.
/**
 * Collects, for every child div of the physical struct map's root div, the files referenced
 * by that div.
 *
 * <p>The child divs are sorted in place by their ORDER attribute. Each div must carry exactly
 * one fptr whose par element lists the areas of the page. The FILEID of every area is looked
 * up via {@code findFile} and the resulting file is stored under the ID of its file group.
 * Entries of the par element that are not areas are ignored.
 *
 * @param inputDir directory handed to {@code findFile} for each lookup
 * @param mets the METS document whose physical struct map is parsed
 * @return one map per div, in ORDER sequence, mapping file group ID to the file found for it
 * @throws IOException if a div does not have exactly one fptr, if the fptr has no par element,
 *         or if an area does not reference a file element
 */
static List<HashMap<String, File>> parsePhysicalStructure(File inputDir, Mets mets) throws IOException {
    StructMapType physSm = findStructMap(mets, PHYSICAL_STRUCT_MAP_LABEL);
    DivType rootDiv = physSm.getDiv();
    // sort divs by order (in place, i.e. the list held by the METS bean itself is reordered)
    Collections.sort(rootDiv.getDiv(), Comparator.comparing(DivType::getORDER));
    List<HashMap<String, File>> fepFileGrps = new ArrayList<>();
    for (DivType div : rootDiv.getDiv()) {
        if (div.getFptr().size() != 1) {
            throw new IOException("Error parsing physical structure: nr of fptr elements is not 1 in div: " + div.getFptr().size() + ", id: " + div.getID());
        }
        ParType par = div.getFptr().get(0).getPar();
        if (par == null) {
            throw new IOException("Error parsing physical structure: could not parse par element in fptr of div: " + div.getID());
        }
        HashMap<String, File> files = new HashMap<>();
        for (Serializable o : par.getAreaOrSeq()) {
            if (!(o instanceof AreaType)) {
                continue;
            }
            // FILEID is an object reference; report an unresolved or foreign reference as a
            // parse error instead of failing with a NullPointerException/ClassCastException
            Object fileRef = ((AreaType) o).getFILEID();
            if (!(fileRef instanceof FileType)) {
                throw new IOException("Error parsing physical structure: area does not reference a file element in div: " + div.getID());
            }
            FileType fileType = (FileType) fileRef;
            Pair<FileGrp, File> filePair = findFile(inputDir, mets, fileType.getID());
            logger.debug("found file with id: " + fileType.getID() + ", path: " + filePair.getRight().getAbsolutePath());
            files.put(filePair.getLeft().getID(), filePair.getRight());
        }
        fepFileGrps.add(files);
    }
    return fepFileGrps;
}
use of eu.transkribus.core.model.beans.mets.StructMapType in project TranskribusCore by Transkribus.
the class FEPLocalDocReader method setTitle.
/**
 * Copies the LABEL of the physical struct map's root div into the document metadata as title.
 *
 * @param doc the document whose metadata receives the title
 * @param mets the METS document the label is read from
 * @throws IOException declared by the signature; nothing in this method's own body throws it
 */
static void setTitle(TrpDoc doc, Mets mets) throws IOException {
    // Resolve the label completely before touching the document
    final DivType physicalRoot = findStructMap(mets, PHYSICAL_STRUCT_MAP_LABEL).getDiv();
    final String rootLabel = physicalRoot.getLABEL();
    doc.getMd().setTitle(rootLabel);
}
use of eu.transkribus.core.model.beans.mets.StructMapType in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method fetchFiles.
/**
 * Determines the image (and, if present, XML) file lists of a Goobi METS and fetches the
 * files of every page div of the physical sequence.
 *
 * <p>Images are taken from the file group with USE="MAX"; if there is none, the group with
 * USE="DEFAULT" is used instead. A group with USE="XML" is optional. File groups without a
 * USE attribute are ignored. The page divs are the children of the first struct map with
 * TYPE="PHYSICAL" whose root div has TYPE="physSequence"; they are sorted in place by
 * ascending ORDER and passed to {@code fetchFilesFromUrl} one by one.
 *
 * @param dir passed through to {@code fetchFilesFromUrl} for every page
 * @param mets the unmarshalled Goobi METS file
 * @return the pages returned by {@code fetchFilesFromUrl}, in ascending ORDER of their divs
 * @throws IOException if the METS has no image file list or no matching struct map
 */
public List<TrpPage> fetchFiles(String dir, Mets mets) throws IOException {
    List<FileGrp> fileGrps = mets.getFileSec().getFileGrp();
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    List<FileType> defaultImgGrp = null;
    for (FileGrpType type : fileGrps) {
        final String use = type.getUSE();
        if (use == null) {
            // USE is optional on a fileGrp; switching on null would throw a NullPointerException
            continue;
        }
        switch (use) {
            case "MAX":
                imgGrp = type.getFile();
                break;
            /*
             * could also be that USE='Content' and ID="AltoFiles" or ID="AbbyyXmlFiles" is necessary to get the transcriptions
             */
            case "DEFAULT":
                defaultImgGrp = type.getFile();
                break;
            case "XML":
                // possibility to load also an existent Alto or Abbyy XML and convert it to Page later on
                // TODO: clarify
                xmlGrp = type.getFile();
                break;
            default:
                break;
        }
    }
    // take default images if no MAX images are available
    if (imgGrp == null && defaultImgGrp != null) {
        imgGrp = defaultImgGrp;
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // a missing XML file list is tolerated; only images are mandatory
        logger.debug("no xml file list");
    }
    List<DivType> pageDivs = null;
    for (StructMapType sMap : mets.getStructMap()) {
        final DivType root = sMap.getDiv();
        // constant-first equals keeps struct maps/divs without a TYPE attribute from crashing
        // (an additional check on the root div ID "PHYS_0000" was considered but is not applied)
        if ("PHYSICAL".equals(sMap.getTYPE()) && root != null && "physSequence".equals(root.getTYPE())) {
            pageDivs = root.getDiv();
            break;
        }
    }
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    // sort the page divs by ascending ORDER (in place)
    pageDivs.sort(Comparator.comparing(DivType::getORDER));
    for (DivType div : pageDivs) {
        // fetch all files and store them locally
        TrpPage p = fetchFilesFromUrl(div, imgGrp, xmlGrp, dir);
        pages.add(p);
    }
    return pages;
}
use of eu.transkribus.core.model.beans.mets.StructMapType in project TranskribusCore by Transkribus.
the class FEPLocalDocReader method getLogicalStructuresForPage.
/**
 * Collects the logical structure types of one page from the logical struct map.
 *
 * <p>Every child div of the logical root div whose ID has the form
 * {@code STRUCTURE_PAGE_<number>_BLOCK_<number>} and whose page number equals {@code pageNr}
 * contributes one entry: the {@code BLOCK_<number>} part of the ID mapped to the div's TYPE.
 * Divs whose ID is missing or does not have that form are reported with a warning and skipped.
 *
 * @param mets the METS document whose logical struct map is read
 * @param pageNr the page number to select divs for
 * @return map from block ID to the TYPE of the corresponding div; empty if nothing matches
 * @throws IOException declared by the signature; nothing in this method's own body throws it
 */
static HashMap<String, String> getLogicalStructuresForPage(Mets mets, int pageNr) throws IOException {
    HashMap<String, String> structs = new HashMap<>();
    StructMapType logicalSm = findStructMap(mets, LOGICAL_STRUCT_MAP_LABEL);
    DivType rootDiv = logicalSm.getDiv();
    String regex = "^STRUCTURE_PAGE_(\\d+)_(BLOCK_\\d+)$";
    Pattern p = Pattern.compile(regex);
    for (DivType d : rootDiv.getDiv()) {
        String id = d.getID();
        logger.trace("found div with id = " + id);
        // a div without ID cannot be matched (Pattern.matcher(null) would throw); treat it
        // like any other unparsable ID
        Matcher m = (id == null) ? null : p.matcher(id);
        if (m == null || !m.matches()) {
            logger.warn("div with ID = " + id + " cannot be parsed as a logical structure!");
            continue;
        }
        logger.trace("id matches regex: " + id);
        logger.trace("g1 = " + m.group(1));
        logger.trace("g2 = " + m.group(2));
        logger.trace("TYPE = " + d.getTYPE());
        if (Integer.parseInt(m.group(1)) == pageNr) {
            structs.put(m.group(2), d.getTYPE());
        }
    }
    return structs;
}
use of eu.transkribus.core.model.beans.mets.StructMapType in project TranskribusCore by Transkribus.
the class TrpMetsBuilder method buildMets.
/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images (if {@code exportImages} is set)</li>
 * <li>the current PAGE XML transcript of each page, as returned by
 * {@code getCurrentTranscript()} (if {@code exportPage} is set)</li>
 * <li>file entries pointing to {@code alto/<image base name>.xml} (if {@code exportAlto} is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe; its metadata and page list are read
 * @param exportPage whether to add a file group with the PAGE XML of each page
 * @param exportAlto whether to add a file group with the ALTO file of each page
 * @param exportImages whether to add a file group with the page images
 * @param pageIndices zero-based positions in {@code doc.getPages()} of the pages to include;
 *        {@code null} includes all pages
 * @return the assembled METS object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);

    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);

    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);

    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);

    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // The client is built from the first page's URL; a document without pages has none,
    // and no file entries are built for it anyway.
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }

    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);

    // position of the page in doc.getPages(); this is what pageIndices refers to
    int pageIndex = -1;
    for (TrpPage p : pages) {
        pageIndex++;
        if (pageIndices != null && !pageIndices.contains(pageIndex)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();

        /* only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }

        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }

        // create ALTO fileGrp entry
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());

            // ALTO file name = image file name with its extension replaced by ".xml";
            // an image name without any dot is used as a whole instead of failing in substring
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");

            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);

            // the absolute path is only used to read the file's modification date below
            // NOTE(review): presumably commons-io FileUtils, whose toFile returns null for
            // non-file URLs - confirm how this behaves for non-local documents
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            if (absAltoPath.startsWith("\\")) {
                // drop a single leading backslash
                absAltoPath = absAltoPath.substring(1);
            }

            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                // CREATED stays unset when the ALTO file is not there (yet)
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }

    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested file groups become part of the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Aggregations