use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method fetchFilesFromUrl.
/**
 * Downloads the files a structMap page div points to and builds the {@link TrpPage} for that page.
 * <p>
 * Files contained in imgGrp are stored in the "img" subfolder of dir. Files contained in xmlGrp are
 * stored in the OCR or ALTO subfolder, depending on whether their file ID contains "AbbyyXml" or "Alto".
 * A failed XML download is logged and ignored. A failed image download, an unreadable image or a missing
 * image pointer does not abort the import either; it is reported through the problem message that is
 * passed to LocalDocReader.buildPage.
 *
 * @param div the page div from the structMap; its ORDER value is used as page number
 * @param imgGrp the files to be treated as page images
 * @param xmlGrp the files to be treated as Abbyy/ALTO XML; may be null
 * @param dir path of the local directory the document is stored in
 * @return the page built from the downloaded files
 * @throws IOException among others if the page dir can not be created, a file pointer does not resolve to
 *         a file entry with an FLocat, or an FLocat has a LOCTYPE other than "URL"
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * handle cases where no image can be retrieved/stored for this page:
     * -image URL is broken
     * -the image dimension can not be read from the downloaded file
     * -no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        FileType type = (FileType) ptr.getFILEID();
        // fail with a descriptive, checked exception instead of a NullPointerException or
        // IndexOutOfBoundsException when the METS is incomplete
        if (type == null || type.getFLocat().isEmpty()) {
            throw new IOException("File pointer of page " + pageNr + " does not resolve to a file with an FLocat element!");
        }
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Preferred filename is the name in the getHeaderField("Content-Disposition");
         * as fallback we use the fileID and mimetype extension.
         * Deriving the name from the URL itself caused problems with links that do not
         * end with filename + extension.
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is controlled by the remote server. Keep only the last path
                // element so that a name like "../../x" can not point outside the target folder.
                final String safeFn = new File(tmpFn).getName();
                if (!safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                } else {
                    logger.warn("Ignoring unusable filename from header: " + tmpFn);
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returned a response with code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML is named after the image file, with the extension replaced by ".xml"
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    TrpPage page = LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
    return page;
}
use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.
the class TrpDocPacker method getFiles.
/**
 * Collects the hrefs of all file locations in the given file group that are marked as local files,
 * i.e. whose FLocat has LOCTYPE "OTHER" and OTHERLOCTYPE "FILE". Each added href is logged and
 * reported via updateStatus.
 *
 * @param type the file group to scan
 * @return the hrefs of the matching FLocat elements in document order; empty if there are none
 */
private List<String> getFiles(FileGrpType type) {
    List<String> fileList = new LinkedList<>();
    for (FileType ft : type.getFile()) {
        for (FLocat fl : ft.getFLocat()) {
            // constant-first equals: an FLocat without LOCTYPE/OTHERLOCTYPE is skipped
            // instead of causing a NullPointerException
            if ("OTHER".equals(fl.getLOCTYPE()) && "FILE".equals(fl.getOTHERLOCTYPE())) {
                final String href = fl.getHref();
                logger.debug("Adding File: " + href);
                fileList.add(href);
                updateStatus("Adding File: " + href);
            }
        }
    }
    return fileList;
}
use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.
the class MetsUtil method getFile.
/**
 * Resolves a METS file entry of a local document to the file on disk and verifies its MD5 checksum.
 * <p>
 * Only the first FLocat of the entry is considered and it has to have OTHERLOCTYPE "FILE". Its href is
 * resolved relative to parentDir. If no checksum type is set, or the type is not MD5, this is logged and
 * the file is returned unverified.
 *
 * @param type the METS file entry
 * @param parentDir the directory the FLocat href is relative to
 * @return the existing file the entry refers to
 * @throws IOException if the entry has no FLocat, does not refer to a local file, the file does not
 *         exist, or the MD5 checksum is missing or does not match
 */
public static File getFile(FileType type, File parentDir) throws IOException {
    File file = null;
    if (type.getFLocat().isEmpty()) {
        // report incomplete METS data as IOException rather than an IndexOutOfBoundsException
        throw new IOException("METS file entry has no FLocat element: " + type.getID());
    }
    FLocat fLocat = type.getFLocat().get(0);
    if ("FILE".equals(fLocat.getOTHERLOCTYPE())) {
        // localdoc
        file = new File(parentDir.getAbsolutePath() + File.separator + fLocat.getHref());
        if (!file.exists()) {
            throw new IOException("File does not exist: " + file.getAbsolutePath());
        }
        if (!type.isSetCHECKSUMTYPE()) {
            logger.error("No checksum set!");
        } else if (!type.getCHECKSUMTYPE().equals(ChecksumUtils.ChkSumAlg.MD5.toString())) {
            logger.error("Unknown checksum algorithm: " + type.getCHECKSUMTYPE());
        } else {
            final String metsChkSum = type.getCHECKSUM();
            final String chkSum = ChecksumUtils.getMd5SumHex(file);
            // A checksum type without a checksum value is treated as a mismatch (was a
            // NullPointerException). Hex digests are compared case-insensitively as upper- and
            // lowercase notation denote the same value.
            if (metsChkSum == null || !metsChkSum.equalsIgnoreCase(chkSum)) {
                throw new IOException("Checksum error: METS=" + metsChkSum + " <-> FILE=" + chkSum + " | " + file.getAbsolutePath());
            }
            logger.debug("Checksum is correct: " + file.getAbsolutePath());
        }
    } else {
        // TODO implement for URL type
        throw new IOException("METS file does not belong to a local document!");
    }
    return file;
}
use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.
the class TrpMetsBuilder method buildMets.
/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images, if exportImages is set</li>
 * <li>the PAGE XML of each page's current transcript, if exportPage is set</li>
 * <li>references to the ALTO XML files in the "alto" folder, if exportAlto is set</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe
 * @param exportPage if true, the PAGE XML file group is included
 * @param exportAlto if true, the ALTO file group is included
 * @param exportImages if true, the image file group is included
 * @param pageIndices zero-based indices (position in the document's page list) of the pages to include;
 *        null includes all pages
 * @return the METS object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // the client is only needed inside the page loop, so a remote document without pages
    // must not fail on pages.get(0)
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /* only the current transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(md.getLocalFolder(), xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // the ALTO file is named after the image file; an image name without an extension
            // is used as is instead of failing in substring()
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            // the ALTO file is looked up in the "alto" folder next to the page's file
            final File pageLocation = FileUtils.toFile(p.getUrl());
            if (pageLocation == null) {
                // NOTE(review): assumes toFile yields null for non-file URLs - confirm
                logger.info("alto file can not be located as page URL is not a local file: " + p.getUrl());
            } else {
                final String path = pageLocation.getAbsolutePath();
                String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
                absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
                if (absAltoPath.startsWith("\\")) {
                    absAltoPath = absAltoPath.substring(1);
                }
                File altoTmp = new File(absAltoPath);
                if (altoTmp.exists()) {
                    Date date = new Date(altoTmp.lastModified());
                    XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                    altoFt.setCREATED(cal);
                    if (altoTmp.isFile()) {
                        // Only declare a checksum if it can actually be computed. An MD5 type
                        // with an empty value can never match the file's checksum.
                        altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
                        altoFt.setCHECKSUM(ChecksumUtils.getMd5SumHex(altoTmp));
                    }
                } else {
                    logger.info("alto file does not exist at " + absAltoPath);
                }
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
use of eu.transkribus.core.model.beans.mets.FileType.FLocat in project TranskribusCore by Transkribus.
the class TrpMetsBuilder method buildFileType.
/**
 * Builds the METS file entry for an image or transcript file.
 * <p>
 * For a local document (localFolder is not null) the entry gets a relative href with
 * LOCTYPE "OTHER"/OTHERLOCTYPE "FILE"; mimetype and creation date are read from the file on disk and the
 * MD5 checksum is taken from the object if it has one. Otherwise mimetype and date are requested from the
 * image store and the href is the object's full URL with LOCTYPE "URL".
 *
 * @param localFolder the local folder of the document; null if the document is not a local one
 * @param id the ID of the file entry; entries whose ID starts with the PAGE group ID get a "page/" href prefix
 * @param o the file object to describe
 * @param seq the value for the SEQ attribute of the entry
 * @param client the image store client used for remote files; not used for local documents
 * @return the file entry including its FLocat
 * @throws IOException if a local document refers to a non-local URL or the file metadata can not be
 *         retrieved from the image store
 */
private static FileType buildFileType(File localFolder, String id, ITrpFile o, final int seq, FimgStoreGetClient client) throws IOException {
    FileType fType = new FileType();
    fType.setID(id);
    String mime = null;
    Date date = null;
    FLocat fLocat = new FLocat();
    String loc = null;
    if (localFolder != null) {
        URL url = o.getUrl();
        if (!url.getProtocol().contains("file")) {
            throw new IOException("Doc contains local folder reference but an URL refers to a non-local file! " + url.toString());
        }
        final String path = FileUtils.toFile(url).getAbsolutePath();
        File f = new File(path);
        mime = MimeTypes.getMimeType(FilenameUtils.getExtension(f.getName()));
        date = new Date(f.lastModified());
        fLocat.setLOCTYPE("OTHER");
        fLocat.setOTHERLOCTYPE("FILE");
        // Only the file name is used as href. The file's path does not necessarily start with
        // localFolder, so cutting off the localFolder prefix is not reliable.
        loc = FilenameUtils.getName(path);
        if (id.startsWith(PAGE_GROUP_ID)) {
            // append relative folder for PAGE XML files
            loc = "page/" + loc;
        }
        logger.debug("loc = " + loc);
        if (o.getMd5Sum() != null) {
            fType.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            fType.setCHECKSUM(o.getMd5Sum());
        }
    } else {
        try {
            FimgStoreFileMd fMd = client.getFileMd(o.getKey());
            date = fMd.getUploadDate();
            mime = fMd.getMimetype();
            fLocat.setLOCTYPE("URL");
            // full URL in case of remote file
            loc = o.getUrl().toString();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            // keep the original exception as cause so the root problem is not lost for the caller
            throw new IOException("FileMetadata could not be retrieved from imagestore for key: " + o.getKey(), e);
        }
    }
    fType.setMIMETYPE(mime);
    XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
    fType.setCREATED(cal);
    fType.setSEQ(seq);
    fLocat.setHref(loc);
    fType.getFLocat().add(fLocat);
    return fType;
}
Aggregations