use of eu.transkribus.core.exceptions.CorruptImageException in project TranskribusCore by Transkribus.
the class GoobiMetsImporter method fetchFilesFromUrl.
/**
 * Downloads the files referenced by the file pointers of one METS structMap div (one page)
 * and builds the {@link TrpPage} for it.
 * <p>
 * Files contained in {@code imgGrp} are stored in {@code <dir>/img}. Files contained in
 * {@code xmlGrp} are stored in the Abbyy or ALTO sub folder, depending on whether their file
 * ID contains "AbbyyXml" or "Alto". A failed image download, an unreadable image or a missing
 * image pointer does not abort the import; a problem message is handed to the page instead.
 *
 * @param div the structMap div describing the page; its ORDER is used as page number
 * @param imgGrp the files that are to be treated as page images
 * @param xmlGrp the files that are to be treated as OCR XML; may be {@code null}
 * @param dir path of the local target directory of the document
 * @return the page built from the downloaded files
 * @throws IOException if the page dir cannot be created, if a referenced file has no FLocat
 *         or an FLocat whose LOCTYPE is not "URL", or if one of the called helpers throws it
 */
private TrpPage fetchFilesFromUrl(DivType div, List<FileType> imgGrp, List<FileType> xmlGrp, String dir) throws IOException {
    final int pageNr = div.getORDER().intValue();
    updateStatus("Downloading file for page nr. " + pageNr);
    File imgFile = null;
    File abbyyFile = null;
    File altoFile = null;
    String imgDirPath = dir + File.separator + "img";
    String abbyyDirPath = dir + File.separator + LocalDocConst.OCR_FILE_SUB_FOLDER;
    String altoDirPath = dir + File.separator + LocalDocConst.ALTO_FILE_SUB_FOLDER;
    String pageDirPath = dir + File.separator + LocalDocConst.PAGE_FILE_SUB_FOLDER;
    File pageDirFile = new File(pageDirPath);
    if (!pageDirFile.isDirectory() && !pageDirFile.mkdir()) {
        throw new IOException("Could not create page dir at: " + pageDirPath);
    }
    /*
     * Handle cases where no image can be retrieved/stored for this page:
     * - the image URL is broken
     * - the image dimension can not be read from the downloaded file
     * - no image file is mapped in the structmap for this page
     *
     * problemMsg is used to store info on that.
     */
    String problemMsg = null;
    for (Fptr ptr : div.getFptr()) {
        FileType type = (FileType) ptr.getFILEID();
        // fail with a descriptive IOException instead of an IndexOutOfBoundsException
        if (type.getFLocat().isEmpty()) {
            throw new IOException("No FLocat element found for file: " + type.getID());
        }
        FLocat fLocat = type.getFLocat().get(0);
        // FIXME at the moment only remote files are supported here!
        final String locType = fLocat.getLOCTYPE();
        if (!"URL".equals(locType)) {
            throw new IOException("Bad or no LOCTYPE in an FLocat element: " + locType);
        }
        // e.g. MIMETYPE="image/jpeg"
        final String mimetype = type.getMIMETYPE();
        final URL url = new URL(fLocat.getHref());
        String ext = MimeTypes.lookupExtension(mimetype);
        /*
         * Deriving the name from the URL path brought problems with file/img links that do
         * not end with filename + extension. Therefore the preferred filename is the one
         * announced in the "Content-Disposition" header field; as fallback the file ID plus
         * the extension looked up for the mimetype is used.
         */
        String filename = type.getID() + "." + ext;
        logger.debug("url.getProtocol() " + url.getProtocol());
        if (url.getProtocol().startsWith("http")) {
            String tmpFn = UrlUtils.getFilenameFromHeaderField(url);
            if (tmpFn != null) {
                // The header value is controlled by the remote server: keep only the last
                // path segment so that the file can not be written outside the target dir.
                final String safeFn = FilenameUtils.getName(tmpFn);
                if (safeFn != null && !safeFn.isEmpty() && !".".equals(safeFn) && !"..".equals(safeFn)) {
                    filename = safeFn;
                }
            }
        }
        logger.debug("imported filename " + filename);
        if (imgGrp.contains(type)) {
            imgFile = new File(imgDirPath + File.separator + filename);
            logger.debug("Downloading: " + url);
            // fetch file from this URL and store locally
            int imgDownloadStatus = UrlUtils.copyUrlToFile(url, imgFile);
            if (imgDownloadStatus >= 400) {
                // the image URL connection attempt returned a response with code >= 400
                problemMsg = getBrokenUrlMsg(url, imgDownloadStatus);
            }
        }
        if (xmlGrp != null && xmlGrp.contains(type)) {
            // check for ALTO or Abbyy XML
            String xmlId = type.getID();
            // FIXME check on ID string might not be reliable
            if (xmlId.contains("AbbyyXml")) {
                logger.debug("Found potential Abbyy XML: " + type.getID());
                abbyyFile = new File(abbyyDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, abbyyFile) >= 400) {
                    logger.error("Could not download Abbyy XML and it will be ignored!");
                    // don't fail if Abbyy XML could not be retrieved
                    abbyyFile = null;
                }
            } else if (xmlId.contains("Alto")) {
                logger.debug("Found potential ALTO XML: " + type.getID());
                altoFile = new File(altoDirPath + File.separator + filename);
                if (UrlUtils.copyUrlToFile(url, altoFile) >= 400) {
                    logger.error("Could not download ALTO XML and it will be ignored!");
                    // don't fail if ALTO XML could not be retrieved
                    altoFile = null;
                }
            }
        }
    }
    File pageXml = null;
    File thumb = null;
    File imgDir = new File(imgDirPath);
    Dimension dim = null;
    if (imgFile == null) {
        // the divType did not include an image pointer
        logger.error("No image mapped for page " + pageNr + " in the structmap!");
        problemMsg = getMissingImgMsg(pageNr);
    } else {
        logger.info("Page " + pageNr + " image: " + imgFile.getAbsolutePath());
        if (imgFile.isFile()) {
            try {
                dim = ImgUtils.readImageDimensions(imgFile);
            } catch (CorruptImageException cie) {
                logger.error("Image is corrupted!", cie);
                // the image dimension can not be read from the downloaded file
                problemMsg = LocalDocReader.getCorruptImgMsg(imgFile.getName());
            }
        }
        // the PAGE XML is created even if the dimension is unknown (dim == null)
        File pageOutFile = new File(pageDirPath + File.separatorChar + FilenameUtils.getBaseName(imgFile.getName()) + ".xml");
        pageXml = LocalDocReader.createPageXml(pageOutFile, true, abbyyFile, altoFile, null, true, true, false, imgFile.getName(), dim);
        thumb = LocalDocReader.getThumbFile(imgDir, imgFile.getName());
    }
    return LocalDocReader.buildPage(new File(dir), pageNr, imgFile, pageXml, thumb, dim, problemMsg);
}
use of eu.transkribus.core.exceptions.CorruptImageException in project TranskribusCore by Transkribus.
the class LocalDocReader method load.
/**
 * Builds a {@link TrpDoc} from the files of an upload.
 * <p>
 * For every page descriptor of the upload the image file must exist in the upload's temp dir.
 * If the descriptor names a PAGE XML, that file must exist in the upload's page temp dir.
 * If no PAGE XML is named and the image dimension could be read, an empty PAGE XML is
 * written to disk. A corrupt image does not abort loading; the page gets a remark instead
 * and no PAGE XML is generated for it.
 *
 * @param upload the upload to transform; must have an ID &gt;= 1, metadata and readable directories
 * @return the document built from the upload
 * @throws IllegalArgumentException if the upload is null or fails the basic validation
 * @throws FileNotFoundException if an image or a declared PAGE XML file does not exist
 * @throws IOException if an empty PAGE XML can not be written to disk
 */
public static TrpDoc load(TrpUpload upload) throws IOException {
    // validate most necessary things
    if (upload == null) {
        throw new IllegalArgumentException("Upload is null.");
    }
    if (upload.getUploadId() < 1) {
        throw new IllegalArgumentException("Invalid upload ID: " + upload.getUploadId());
    }
    if (!upload.canReadDirectories()) {
        throw new IllegalArgumentException("Directories are not readable: " + upload.getUploadTmpDir().getAbsolutePath());
    }
    TrpDocMetadata md = upload.getMd();
    if (md == null) {
        // would otherwise surface as a bare NullPointerException below
        throw new IllegalArgumentException("Upload has no metadata.");
    }
    // transform the upload object into a TRP document
    TrpDoc doc = new TrpDoc();
    md.setLocalFolder(upload.getUploadTmpDir());
    doc.setMd(md);
    File baseDir = upload.getUploadTmpDir();
    File xmlDir = upload.getUploadPageTmpDir();
    File thumbDir = new File(baseDir.getAbsolutePath() + File.separatorChar + LocalDocConst.THUMBS_FILE_SUB_FOLDER);
    for (PageUploadDescriptor p : upload.getPages()) {
        final int pageNr = p.getPageNr();
        File img = new File(baseDir.getAbsolutePath() + File.separator + p.getFileName());
        if (!img.isFile()) {
            throw new FileNotFoundException("Image for page " + pageNr + " does not exist: " + img.getAbsolutePath());
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(img);
        } catch (CorruptImageException cie) {
            logger.error("Image is corrupt: " + img.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(img.getName());
        }
        final String imgBaseName = FilenameUtils.getBaseName(img.getName());
        File thumb = getThumbFile(thumbDir, imgBaseName);
        File pageXml = null;
        if (!StringUtils.isEmpty(p.getPageXmlName())) {
            pageXml = new File(xmlDir.getAbsolutePath() + File.separator + p.getPageXmlName());
            if (!pageXml.isFile()) {
                // report the path of the missing XML, not the path of the image
                throw new FileNotFoundException("PAGE XML for page " + pageNr + " does not exist: " + pageXml.getAbsolutePath());
            }
        } else if (StringUtils.isEmpty(imageRemark)) {
            // no PAGE XML was uploaded and the image could be read: create an empty PAGE XML.
            // If reading the image failed, pageXml deliberately stays null.
            File pageOutFile = new File(xmlDir.getAbsolutePath() + File.separatorChar + imgBaseName + ".xml");
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img, dim);
            try {
                pageXml = JaxbUtils.marshalToFile(pc, pageOutFile);
            } catch (JAXBException je) {
                logger.error(je.getMessage(), je);
                throw new IOException("Could not create empty PageXml on disk!", je);
            }
        }
        TrpPage page = buildPage(baseDir, pageNr, img, pageXml, thumb, dim, imageRemark);
        doc.getPages().add(page);
    }
    return doc;
}
use of eu.transkribus.core.exceptions.CorruptImageException in project TranskribusCore by Transkribus.
the class LocalDocReader method load.
/**
 * Loads a document from path.<br>
 *
 * Document metadata has to be in an XML called "metadata.xml".<br>
 *
 * Image files and corresponding XML/txt files have to have the same name. <br>
 * Lexicographic order of image names will imply order of pages.<br>
 * Types of transcript source files are searched in this order:
 * <ol>
 * <li>./page: PAGE XMLs according to schema 2010/2013</li>
 * <li>./ocr: Abbyy Finereader XMLs schema version 10</li>
 * <li>./alto: ALTO v2 XMLs</li>
 * <li>./txt: txt files with transcription fulltext only</li>
 * </ol>
 * If a valid doc XML is found in the directory, the document structure is taken from it and
 * the directory is not scanned page by page. Otherwise the structure is rebuilt and written
 * to the doc XML to save time on the next load.<br>
 * Testdoc is in $dea_scratch/TRP/TrpTestDoc <br>
 * No versioning of files for local use!<br>
 *
 * @param path the path where the document is stored
 * @param config {@link DocLoadConfig}
 * @param monitor progress monitor; currently not used by this method
 * @return the constructed document
 * @throws FileNotFoundException if the directory contains no images and syncing without
 *         images is not enabled or yields no pages
 * @throws IOException if the path can't be read or is malformed
 *
 * @todo implement monitor feedback!
 * @todo Respect Storage.uploadDocument where the monitor will be used by the upload itself later.
 */
public static TrpDoc load(final String path, DocLoadConfig config, IProgressMonitor monitor) throws IOException {
    // create the document
    TrpDoc doc = new TrpDoc();
    // check OS and adjust URL protocol
    final String os = System.getProperty("os.name");
    /*
     * FIXME use SysUtils.isWin() here?
     * NOTE: this writes to a static field of LocalDocConst.
     * Locale.ROOT is required: with a Turkish default locale "Windows" would be lower-cased
     * to "wındows" and the check would fail.
     */
    if (os.toLowerCase(java.util.Locale.ROOT).contains("win")) {
        LocalDocConst.URL_PROT_CONST = "file:///";
    }
    // else: keep default
    final File inputDir = new File(path);
    final File docXml = new File(inputDir.getAbsolutePath() + File.separator + LocalDocConst.DOC_XML_FILENAME);
    // validate input path ======================================================
    checkInputDir(inputDir);
    // search for IMG files
    TreeMap<String, File> pageMap = findImgFiles(inputDir);
    logger.info("Found " + pageMap.size() + " page images.");
    // In sync-without-images mode an empty image list is allowed here; dummy image files are
    // created from the PAGE XMLs further below. Without this exception that branch could
    // never be reached.
    if (pageMap.isEmpty() && !config.isEnableSyncWithoutImages()) {
        throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
    }
    TrpDocMetadata docMd = null;
    boolean doRefresh = true;
    // try to read doc structure from disk
    if (docXml.isFile()) {
        doc = loadDocXml(docXml);
        if (isValid(doc, pageMap.size(), config.isForceCreatePageXml())) {
            logger.info("Loaded document structure from disk.");
            docMd = doc.getMd();
            // no refresh is necessary as doc structure matches the input dir content
            doRefresh = false;
        } else {
            if (doc != null && doc.getMd() != null) {
                // keep any existing metadata if invalid doc structure was found
                docMd = doc.getMd();
            }
            logger.info("Removing faulty doc XML from disk and doing reload.");
            if (!docXml.delete()) {
                logger.warn("Could not delete faulty doc XML: " + docXml.getAbsolutePath());
            }
            doc = new TrpDoc();
        }
    }
    logger.info("Reading document at " + inputDir.getAbsolutePath());
    // find metadata file if not extracted from doc.xml =============================================
    if (docMd == null) {
        try {
            docMd = loadDocMd(inputDir);
        } catch (IOException ioe) {
            // best effort: a document without readable metadata gets empty metadata
            logger.debug("Could not load metadata from " + inputDir.getAbsolutePath() + ". Using empty metadata.", ioe);
            docMd = new TrpDocMetadata();
        }
    }
    initDocMd(docMd, inputDir, config.isStripServerRelatedMetadata());
    // Set the docMd
    doc.setMd(docMd);
    if (!doRefresh) {
        // Stop now and reuse doc structure from file
        return doc;
    }
    // Construct the input dir with pageXml Files.
    File pageInputDir = getPageXmlInputDir(inputDir);
    if (config.isForceCreatePageXml() && !pageInputDir.isDirectory()) {
        if (!pageInputDir.mkdir()) {
            logger.warn("Could not create PAGE XML dir: " + pageInputDir.getAbsolutePath());
        }
    }
    // abbyy XML search path
    File ocrInputDir = getOcrXmlInputDir(inputDir);
    // alto XML search path
    File altoInputDir = getAltoXmlInputDir(inputDir);
    // txt file search path
    File txtInputDir = getTxtInputDir(inputDir);
    // backupfolder for outdated page format files, if any
    final String backupFolderName = XmlFormat.PAGE_2010.toString().toLowerCase(java.util.Locale.ROOT) + "_backup";
    final String backupPath = pageInputDir.getAbsolutePath() + File.separator + backupFolderName;
    // sync mode only: no images were found, so derive the page list from the PAGE XMLs
    if (pageMap.isEmpty() && config.isEnableSyncWithoutImages()) {
        pageMap = createDummyImgFilesForXmls(inputDir, pageInputDir);
        if (pageMap == null || pageMap.isEmpty()) {
            // neither images nor PAGE XMLs to build pages from
            throw new FileNotFoundException("The directory does not contain any images: " + inputDir.getAbsolutePath());
        }
    }
    // iterate imgList, search for corresponding XML files and build TrpPages
    int pageNr = 1;
    List<TrpPage> pages = new ArrayList<TrpPage>(pageMap.size());
    for (Entry<String, File> e : pageMap.entrySet()) {
        File imgFile = e.getValue();
        // the img file name without extension
        final String imgFileName = e.getKey();
        // check for a page XML of this name
        File pageXml = findXml(imgFileName, pageInputDir);
        // TODO thumbURL dir + imgFile.getName())+".jpg"
        File thumbFile = getThumbFile(inputDir, imgFileName);
        if (pageXml != null) {
            XmlFormat xmlFormat = XmlUtils.getXmlFormat(pageXml);
            switch (xmlFormat) {
                case PAGE_2010:
                    // outdated format: convert in place, the original goes to the backup folder
                    Page2010Converter.updatePageFormatSingleFile(pageXml, backupPath);
                    break;
                case PAGE_2013:
                    break;
                default:
                    throw new IOException("Incompatible XML file in PAGE XML path! " + pageXml.getAbsolutePath());
            }
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        // FIXME this is taking too long and is only necessary on initial loading
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            logger.error("Image is corrupt: " + imgFile.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(imgFile.getName());
        }
        if (pageXml == null && config.isForceCreatePageXml()) {
            // if no page XML, then create one at this path
            File pageOutFile = new File(pageInputDir.getAbsolutePath() + File.separatorChar + imgFileName + ".xml");
            File abbyyXml = findXml(imgFileName, ocrInputDir);
            File altoXml = findXml(imgFileName, altoInputDir);
            File txtFile = findFile(imgFileName, txtInputDir, "txt");
            pageXml = createPageXml(pageOutFile, false, abbyyXml, altoXml, txtFile, config.isPreserveOcrFontFamily(), config.isPreserveOcrTxtStyles(), config.isReplaceBadChars(), imgFile.getName(), dim);
        }
        TrpPage page = buildPage(inputDir, pageNr++, imgFile, pageXml, thumbFile, dim, imageRemark);
        pages.add(page);
    }
    doc.setPages(pages);
    doc.getMd().setNrOfPages(doc.getPages().size());
    // set editorial declaration:
    List<EdFeature> features = readEditDeclFeatures(doc.getMd().getLocalFolder());
    doc.setEdDeclList(features);
    logger.debug(doc.toString());
    // store doc on disk to save time on next load
    LocalDocWriter.writeDocXml(doc, docXml);
    return doc;
}
use of eu.transkribus.core.exceptions.CorruptImageException in project TranskribusCore by Transkribus.
the class LocalDocReaderTest method main.
/**
 * Manual check of {@link LocalDocReader#createPageXml}: creates a PAGE XML ("pageTest.xml")
 * from one ALTO file of a document on the developer's local disk.
 * <p>
 * The paths are hard-coded; adjust them before running. Earlier experiments that lived here
 * as commented-out code were removed; they remain available in version control.
 *
 * @param args not used
 * @throws FileNotFoundException if thrown while creating the PAGE XML
 * @throws IOException if thrown while creating the PAGE XML
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    final String docDirPath = "C:/Neuer Ordner/Briefe_aus_allen_Jahrhunderten_der_christlichen_Zeitrechnung_1/Briefe_aus_allen_Jahrhunderten_der_christlichen_Zeitrechnung_1";
    final String altoDirPath = docDirPath + "/alto";
    final String pageBaseName = "0014_ubr16515_0014";

    File altoFile = new File(altoDirPath + "/" + pageBaseName + ".xml");
    File pageOutFile = new File(altoDirPath + File.separatorChar + "pageTest.xml");
    File imgFile = new File(docDirPath + "/" + pageBaseName + ".jpg");

    Dimension dim = null;
    if (imgFile.isFile()) {
        try {
            dim = ImgUtils.readImageDimensions(imgFile);
        } catch (CorruptImageException cie) {
            // the image dimension can not be read; continue with dim == null
            logger.error("Image is corrupted!", cie);
        }
    }
    File pageXml = LocalDocReader.createPageXml(pageOutFile, true, null, altoFile, null, true, true, false, imgFile.getName(), dim);
    logger.info("Created PAGE XML: " + pageXml);
}
Aggregations