use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.
the class FEPLocalDocReader method parsePhysicalStructure.
/**
 * Resolves the files referenced by every div of the physical struct map.
 * <p>
 * The child divs of the struct map's root div are sorted in place by their ORDER
 * attribute. Each div must contain exactly one fptr holding a par element; every
 * area inside that par is resolved to a file via {@code findFile}.
 *
 * @param inputDir directory handed to {@code findFile} when resolving file IDs
 * @param mets the METS document to read the physical structure from
 * @return one map per div, in ORDER sequence, mapping the ID of the file group
 *         to the file that was resolved for that group
 * @throws IOException if the struct map is missing, a div does not have exactly
 *         one fptr, the fptr has no par element, or an area does not reference
 *         a file element
 */
static List<HashMap<String, File>> parsePhysicalStructure(File inputDir, Mets mets) throws IOException {
    StructMapType physSm = findStructMap(mets, PHYSICAL_STRUCT_MAP_LABEL);
    // NOTE(review): findStructMap may already fail on its own when nothing matches;
    // this guard only makes sure a null result surfaces as an IOException.
    if (physSm == null || physSm.getDiv() == null) {
        throw new IOException("Error parsing physical structure: no struct map found for label: " + PHYSICAL_STRUCT_MAP_LABEL);
    }
    DivType rootDiv = physSm.getDiv();
    // sort divs by order (in place, i.e. the list held by the METS object is reordered):
    Collections.sort(rootDiv.getDiv(), Comparator.comparing(DivType::getORDER));
    List<HashMap<String, File>> fepFileGrps = new ArrayList<>();
    for (DivType div : rootDiv.getDiv()) {
        if (div.getFptr().size() != 1) {
            throw new IOException("Error parsing physical structure: nr of fptr elements is not 1 in div: " + div.getFptr().size() + ", id: " + div.getID());
        }
        ParType par = div.getFptr().get(0).getPar();
        if (par == null) {
            throw new IOException("Error parsing physical structure: could not parse par element in fptr of div: " + div.getID());
        }
        HashMap<String, File> files = new HashMap<>();
        for (Serializable o : par.getAreaOrSeq()) {
            // only area elements are evaluated, anything else in the par is skipped
            if (!(o instanceof AreaType)) {
                continue;
            }
            AreaType area = (AreaType) o;
            // FILEID is an untyped reference: reject it if it is unset or does not
            // point to a file element instead of failing with an NPE/ClassCastException
            Object fileRef = area.getFILEID();
            if (!(fileRef instanceof FileType)) {
                throw new IOException("Error parsing physical structure: area does not reference a file element in div: " + div.getID());
            }
            FileType fileType = (FileType) fileRef;
            Pair<FileGrp, File> filePair = findFile(inputDir, mets, fileType.getID());
            logger.debug("found file with id: " + fileType.getID() + ", path: " + filePair.getRight().getAbsolutePath());
            // a later area of the same file group replaces an earlier one in this map
            files.put(filePair.getLeft().getID(), filePair.getRight());
        }
        fepFileGrps.add(files);
    }
    return fepFileGrps;
}
use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method fetchFiles.
/**
 * Picks the image and XML file groups from the METS file section, locates the
 * physical page sequence in the struct maps and fetches the files of every page.
 * <p>
 * File groups are selected by their USE attribute: "MAX" is the preferred image
 * group, "DEFAULT" is used when no "MAX" group exists, "XML" is the optional
 * transcription group. The page divs are sorted in place, ascending by ORDER.
 *
 * @param dir directory path that is passed on to fetchFilesFromUrl for each page
 * @param mets the unmarshalled Goobi METS file
 * @return one page per div of the physical sequence, ascending by ORDER
 * @throws IOException if the METS has no image file group or no struct map of
 *         TYPE "PHYSICAL" whose root div has TYPE "physSequence", or if
 *         fetching the files of a page fails
 */
public List<TrpPage> fetchFiles(String dir, Mets mets) throws IOException {
    if (mets.getFileSec() == null) {
        // without a file section there cannot be an image file group either
        throw new IOException("METS file has no image file list!");
    }
    List<FileGrp> fileGrps = mets.getFileSec().getFileGrp();
    List<FileType> xmlGrp = null;
    List<FileType> imgGrp = null;
    List<FileType> defaultImgGrp = null;
    for (FileGrpType type : fileGrps) {
        final String use = type.getUSE();
        if (use == null) {
            // switching on a null String throws a NullPointerException; a group
            // without USE matches none of the cases below anyway
            continue;
        }
        switch (use) {
            case "MAX":
                imgGrp = type.getFile();
                break;
            /*
             * could also be that USE='Content' and ID="AltoFiles" or ID="AbbyyXmlFiles" is necessary to get the transcriptions
             */
            case "DEFAULT":
                defaultImgGrp = type.getFile();
                break;
            case "XML":
                // possibility to load also an existent Alto or Abbyy XML and convert it to Page later on
                // TODO: clarify
                xmlGrp = type.getFile();
                break;
            default:
                break;
        }
    }
    // take default images if no MAX images are available
    if (imgGrp == null && defaultImgGrp != null) {
        imgGrp = defaultImgGrp;
    }
    if (imgGrp == null) {
        throw new IOException("METS file has no image file list!");
    }
    if (xmlGrp == null) {
        // a missing XML group is tolerated; pages are then built from images only
        logger.debug("no xml file list");
    }
    List<DivType> pageDivs = null;
    for (StructMapType sMap : mets.getStructMap()) {
        // constant-first comparisons: TYPE may be unset on the struct map or its div.
        // The root div ID (e.g. "PHYS_0000") is deliberately not checked.
        if ("PHYSICAL".equals(sMap.getTYPE())
                && sMap.getDiv() != null
                && "physSequence".equals(sMap.getDiv().getTYPE())) {
            pageDivs = sMap.getDiv().getDiv();
            break;
        }
    }
    if (pageDivs == null) {
        throw new IOException("No valid StructMap was found!");
    }
    List<TrpPage> pages = new ArrayList<TrpPage>(pageDivs.size());
    // process the pages in ascending ORDER
    pageDivs.sort(Comparator.comparing(DivType::getORDER));
    for (DivType div : pageDivs) {
        // fetch all files and store them locally
        TrpPage p = fetchFilesFromUrl(div, imgGrp, xmlGrp, dir);
        pages.add(p);
    }
    return pages;
}
use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method fetchFilesFromUrl.
/**
 * Downloads the files referenced by one page div and builds the page from them.
 * <p>
 * Every fptr of the div is resolved to a file element. Files contained in
 * {@code imgGrp} are stored in the "img" sub folder of {@code dir}; files
 * contained in {@code xmlGrp} are stored as Abbyy or ALTO XML, depending on
 * their ID. A failed image download, a corrupt image or a missing image
 * mapping does not abort the import; it is recorded as the problem message
 * that is handed to the page builder.
 *
 * @param div the page div; its ORDER value is used as page number
 * @param imgGrp files of the image file group
 * @param xmlGrp files of the XML file group, may be null
 * @param dir path of the local directory the files are stored in
 * @return the page built from the downloaded files
 * @throws IOException if the page directory cannot be created, an fptr does
 *         not reference a file element with at least one FLocat, or an FLocat
 *         has a LOCTYPE other than "URL"
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * handle cases where no image can be retrieved/stored for this page:
     * -image URL is broken
     * -the image dimension can not be read from the downloaded file
     * -no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        // FILEID is an untyped reference; fail with a descriptive error instead of
        // an NPE/ClassCastException if it is unset or points to something else
        Object fileRef = ptr.getFILEID();
        if (!(fileRef instanceof FileType)) {
            throw new IOException("An fptr element does not reference a file element on page nr. " + pageNr);
        }
        FileType type = (FileType) fileRef;
        if (type.getFLocat() == null || type.getFLocat().isEmpty()) {
            throw new IOException("No FLocat element in file element: " + type.getID());
        }
        // only the first FLocat of a file element is evaluated
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Deriving the filename from the URL brought problems with file/img links
         * that do not end in filename + extension.
         *
         * Preferred filename is the name in the getHeaderField("Content-Disposition");
         * as fallback we use the fileID and mimetype extension
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is controlled by the remote server. Keep only its
                // last path segment so it can never point outside the target folder;
                // the code below relies on the plain file name anyway.
                String safeFn = new File(tmpFn).getName();
                if (!safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returned a response code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                // TODO: implement
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                // TODO: implement
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        // the dimension stays null if the download did not produce a file
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML is named after the image file's base name
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    TrpPage page = LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
    return page;
}
use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.
the class TrpDocPacker method getFiles.
/**
 * Collects the hrefs of all file locations in the given file group that are
 * declared with LOCTYPE "OTHER" and OTHERLOCTYPE "FILE".
 * <p>
 * Each collected href is logged and reported via updateStatus.
 *
 * @param type the file group to scan
 * @return the matching hrefs in document order; empty if none match
 */
private List<String> getFiles(FileGrpType type) {
    List<String> fileList = new LinkedList<>();
    for (FileType ft : type.getFile()) {
        for (FLocat fl : ft.getFLocat()) {
            // constant-first comparison: a location without LOCTYPE/OTHERLOCTYPE
            // is skipped instead of raising a NullPointerException
            if ("OTHER".equals(fl.getLOCTYPE()) && "FILE".equals(fl.getOTHERLOCTYPE())) {
                final String href = fl.getHref();
                logger.debug("Adding File: " + href);
                fileList.add(href);
                updateStatus("Adding File: " + href);
            }
        }
    }
    return fileList;
}
use of eu.transkribus.core.model.beans.mets.FileType in project TranskribusCore by Transkribus.
the class MetsUtil method buildUploadImage.
/**
 * Builds the upload descriptor for one page div.
 * <p>
 * The page number set on the descriptor is the div's ORDER value minus one.
 * Every fptr of the div is resolved via its area element to a file; a file in
 * {@code imgGrp} supplies the image name and checksum, a file in {@code xmlGrp}
 * supplies the PAGE XML name and checksum. If no image is mapped, an error is
 * logged and the descriptor is returned with a null file name.
 *
 * @param div the page div from the struct map
 * @param imgGrp files of the image file group
 * @param xmlGrp files of the XML file group, may be null
 * @return the descriptor holding the file names and checksums of the page
 * @throws IllegalArgumentException if the image file name is rejected by
 *         IMG_NAME_FILTER
 */
private static PageUploadDescriptor buildUploadImage(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp) {
    PageUploadDescriptor image = new PageUploadDescriptor();
    int pageIndex = div.getORDER().intValue() - 1;
    image.setPageNr(pageIndex);
    String imgFileName = null;
    String xmlFileName = null;
    String imgChecksum = null;
    String xmlChecksum = null;
    final String pageFolderPrefix = LocalDocConst.PAGE_FILE_SUB_FOLDER + "/";
    for (Fptr ptr : div.getFptr()) {
        if (ptr.getArea() == null) {
            // an fptr without area element cannot be resolved here; skip it rather
            // than failing with a NullPointerException. A page left without image
            // is reported below.
            logger.error("Skipping fptr without area element for page index = " + pageIndex);
            continue;
        }
        FileType type = (FileType) ptr.getArea().getFILEID();
        final Pair<String, String> fileNameAndChecksum = MetsUtil.getFileNameAndChecksum(type);
        if (imgGrp.contains(type)) {
            imgFileName = fileNameAndChecksum.getLeft();
            if (!IMG_NAME_FILTER.accept(null, imgFileName)) {
                throw new IllegalArgumentException("Image type is not supported: " + imgFileName);
            }
            imgChecksum = fileNameAndChecksum.getRight();
        } else if (xmlGrp != null && xmlGrp.contains(type)) {
            xmlFileName = fileNameAndChecksum.getLeft();
            xmlChecksum = fileNameAndChecksum.getRight();
            if (!StringUtils.isEmpty(xmlFileName) && xmlFileName.startsWith(pageFolderPrefix)) {
                // remove the "page/" prefix in XML filename if existent; done by
                // length so the folder name is never interpreted as a regex
                xmlFileName = xmlFileName.substring(pageFolderPrefix.length());
            }
        }
    }
    if (StringUtils.isEmpty(imgFileName)) {
        logger.error("No master image mapped for page index = " + pageIndex + " in the structmap!");
    } else {
        logger.info("Page " + image.getPageNr() + " image: " + imgFileName);
    }
    image.setFileName(imgFileName);
    image.setImgChecksum(imgChecksum);
    image.setPageXmlName(xmlFileName);
    image.setPageXmlChecksum(xmlChecksum);
    return image;
}
Aggregations