use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class LocalDocReader method load.
/**
 * Builds a {@link TrpDoc} from the files of an upload.
 * <p>
 * For every page descriptor of the upload the image file must exist in the upload's
 * temp directory. The image dimension is always read so that corrupt images are detected
 * right away; a corrupt image does not abort loading but is recorded as a remark on the page.
 * If the descriptor names a PAGE XML, that file must exist in the upload's page temp directory.
 * If it names none and the image could be read, an empty PAGE XML is written next to the
 * other PAGE XMLs. If it names none and the image is corrupt, the page is built without a
 * PAGE XML file.
 *
 * @param upload the upload to transform; must not be null, must have an ID &gt;= 1 and readable directories
 * @return the document containing the upload's metadata and one page per page descriptor
 * @throws IllegalArgumentException if the upload is null, has an invalid ID or unreadable directories
 * @throws FileNotFoundException if an image or a referenced PAGE XML file does not exist
 * @throws IOException if an empty PAGE XML could not be written
 */
public static TrpDoc load(TrpUpload upload) throws IOException {
    // validate most necessary things
    if (upload == null) {
        throw new IllegalArgumentException("Upload is null.");
    }
    if (upload.getUploadId() < 1) {
        throw new IllegalArgumentException("Invalid upload ID: " + upload.getUploadId());
    }
    if (!upload.canReadDirectories()) {
        throw new IllegalArgumentException("Directories are not readable: " + upload.getUploadTmpDir().getAbsolutePath());
    }
    // transform the upload object into a TRP document
    TrpDoc doc = new TrpDoc();
    TrpDocMetadata md = upload.getMd();
    md.setLocalFolder(upload.getUploadTmpDir());
    doc.setMd(md);
    File baseDir = upload.getUploadTmpDir();
    File xmlDir = upload.getUploadPageTmpDir();
    File thumbDir = new File(baseDir.getAbsolutePath() + File.separatorChar + LocalDocConst.THUMBS_FILE_SUB_FOLDER);
    for (PageUploadDescriptor p : upload.getPages()) {
        final int pageNr = p.getPageNr();
        File img = new File(baseDir.getAbsolutePath() + File.separator + p.getFileName());
        if (!img.isFile()) {
            throw new FileNotFoundException("Image for page " + pageNr + " does not exist: " + img.getAbsolutePath());
        }
        // try to read image dimension in any case to detect corrupt files immediately!
        Dimension dim = null;
        String imageRemark = null;
        try {
            dim = ImgUtils.readImageDimensions(img);
        } catch (CorruptImageException cie) {
            // do not abort the whole document: keep the page and attach a remark instead
            logger.error("Image is corrupt: " + img.getAbsolutePath(), cie);
            imageRemark = getCorruptImgMsg(img.getName());
        }
        final String imgBaseName = FilenameUtils.getBaseName(img.getName());
        File thumb = getThumbFile(thumbDir, imgBaseName);
        File pageXml = null;
        if (!StringUtils.isEmpty(p.getPageXmlName())) {
            pageXml = new File(xmlDir.getAbsolutePath() + File.separator + p.getPageXmlName());
            if (!pageXml.isFile()) {
                // report the path of the missing PAGE XML, not the path of the image
                throw new FileNotFoundException("PAGE XML for page " + pageNr + " does not exist: " + pageXml.getAbsolutePath());
            }
        } else if (StringUtils.isEmpty(imageRemark)) {
            // no PAGE XML was uploaded and the image could be read: create an empty PAGE XML
            File pageOutFile = new File(xmlDir.getAbsolutePath() + File.separatorChar + imgBaseName + ".xml");
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img, dim);
            try {
                pageXml = JaxbUtils.marshalToFile(pc, pageOutFile);
            } catch (JAXBException je) {
                logger.error(je.getMessage(), je);
                throw new IOException("Could not create empty PageXml on disk!", je);
            }
        }
        // pageXml stays null here if the image is corrupt and no PAGE XML was uploaded
        TrpPage page = buildPage(baseDir, pageNr, img, pageXml, thumb, dim, imageRemark);
        doc.getPages().add(page);
    }
    return doc;
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class LocalDocReader method createPageXml.
/**
 * Creates a PAGE XML at {@code pageOutFile} from the given source files.
 * The sources are tried in the order Abbyy XML, ALTO XML, text file. If none of them
 * yields a page, an empty PAGE XML skeleton is created instead.
 * <br/><br/>
 * This method must NEVER return null. Many mechanisms in Transkribus
 * depend on this method reliably creating a file.
 *
 * @param pageOutFile the file the PAGE XML is written to; must not be null
 * @param doOverwrite if false, an already existing {@code pageOutFile} causes an IOException
 * @param abbyyXml Abbyy XML source file, may be null
 * @param altoXml ALTO XML source file, may be null
 * @param txtFile text source file, may be null
 * @param preserveOcrFontFamily passed through to the Abbyy/ALTO conversion
 * @param preserveOcrTxtStyles passed through to the Abbyy/ALTO conversion
 * @param replaceBadChars passed through to the Abbyy/ALTO conversion
 * @param imgFileName name of the image file this page belongs to; must not be empty
 * @param dim dimension of the image; if null, a default {@code new Dimension()} (0,0) is used
 * @return {@code pageOutFile} after the PAGE XML has been written to it
 * @throws IllegalArgumentException if {@code pageOutFile} is null or {@code imgFileName} is empty
 * @throws FileNotFoundException
 * @throws IOException if the file exists and must not be overwritten, or if writing fails
 */
protected static File createPageXml(File pageOutFile, boolean doOverwrite, File abbyyXml, File altoXml, File txtFile, boolean preserveOcrFontFamily, boolean preserveOcrTxtStyles, boolean replaceBadChars, final String imgFileName, Dimension dim) throws FileNotFoundException, IOException {
    if (pageOutFile == null) {
        throw new IllegalArgumentException("PAGE XML output File is null.");
    }
    if (pageOutFile.exists() && !doOverwrite) {
        throw new IOException("PAGE XML already exists at: " + pageOutFile.getAbsolutePath());
    }
    if (StringUtils.isEmpty(imgFileName)) {
        throw new IllegalArgumentException("Image filename must not be empty");
    }

    // first choice: Abbyy XML
    PcGtsType page = null;
    if (abbyyXml != null) {
        page = createPageFromAbbyy(imgFileName, abbyyXml, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    }
    // second choice: ALTO XML
    if (page == null && altoXml != null) {
        page = createPageFromAlto2(imgFileName, altoXml, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    }

    // the remaining sources need an image dimension; fall back to (0,0) if none was given
    final Dimension pageDim = (dim != null) ? dim : new Dimension();

    // third choice: plain text file
    if (page == null && txtFile != null) {
        page = createPageFromTxt(imgFileName, pageDim, txtFile);
    }
    // last resort: no suitable source for this page => create an empty skeleton
    if (page == null) {
        logger.warn("No Transcript XML found for img: " + FilenameUtils.getBaseName(imgFileName));
        logger.info("Creating empty PageXml.");
        page = PageXmlUtils.createEmptyPcGtsType(imgFileName, pageDim);
    }

    // write the result to disk
    try {
        JaxbUtils.marshalToFile(page, pageOutFile);
    } catch (JAXBException marshalFailure) {
        throw new IOException("Could not create PageXml on disk!", marshalFailure);
    }
    return pageOutFile;
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class LocalDocReader method createPageFromAbbyy.
/**
 * Converts an Abbyy Finereader 10/11 XML file into a PAGE XML object.
 * Every failure is logged and rethrown as an IOException. This includes the case where
 * the file is not detected as {@code XmlFormat.ABBYY_10}: the IOException thrown for it
 * is caught by the first catch clause below and wrapped again.
 *
 * @param imgFileName name of the image file the page belongs to
 * @param abbyyXml the Abbyy XML file to convert
 * @param preserveOcrTxtStyles passed through to the conversion
 * @param preserveOcrFontFamily passed through to the conversion
 * @param replaceBadChars passed through to the conversion
 * @return the converted page
 * @throws IOException if the file is not a Finereader10/11 XML or the conversion fails
 */
private static PcGtsType createPageFromAbbyy(final String imgFileName, File abbyyXml, boolean preserveOcrTxtStyles, boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException {
    try {
        final XmlFormat detectedFormat = XmlUtils.getXmlFormat(abbyyXml);
        if (!detectedFormat.equals(XmlFormat.ABBYY_10)) {
            throw new IOException("Not a valid Finereader10/11 XML file.");
        }
        logger.info(abbyyXml.getAbsolutePath() + ": Transforming Finereader10/11 XML to PAGE XML.");
        return PageXmlUtils.createPcGtsTypeFromAbbyy(abbyyXml, imgFileName, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
    } catch (IOException | TransformerException migrationFailure) {
        logger.error(migrationFailure.getMessage(), migrationFailure);
        throw new IOException("Could not migrate file: " + abbyyXml.getAbsolutePath(), migrationFailure);
    } catch (ParserConfigurationException | SAXException xmlFailure) {
        logger.error(xmlFailure.getMessage(), xmlFailure);
        throw new IOException("Could not transform XML file!", xmlFailure);
    } catch (JAXBException jaxbFailure) {
        /* TODO This exception is only thrown when the pageXML is unmarshalled
         * for inserting the image filename which is not included in the abbyy xml! */
        logger.error(jaxbFailure.getMessage(), jaxbFailure);
        throw new IOException("Transformation output is not a valid page XML!", jaxbFailure);
    }
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class PageXmlUtils method findLinesByBaseline.
/**
 * Collects all text lines of the page's text regions for which
 * {@code doesIntersect(line, baseline)} holds.
 * If more than one line matches, the result is sorted with a
 * {@code TrpElementCoordinatesComparator} constructed with {@code true}.
 *
 * @param pc the page whose text regions are searched
 * @param baseline the baseline to test the lines against
 * @return the matching lines; empty if none matches
 */
public static List<TextLineType> findLinesByBaseline(PcGtsType pc, String baseline) {
    List<TextLineType> hits = new LinkedList<>();
    for (TextRegionType region : getTextRegions(pc)) {
        for (TextLineType line : region.getTextLine()) {
            if (doesIntersect(line, baseline)) {
                hits.add(line);
            }
        }
    }
    // sorting is only needed when there is something to order
    if (hits.size() > 1) {
        Collections.sort(hits, new TrpElementCoordinatesComparator<TextLineType>(true));
    }
    return hits;
}
use of eu.transkribus.core.model.beans.pagecontent.PcGtsType in project TranskribusCore by Transkribus.
the class PageXmlUtils method marshalToBytes.
/**
 * Marshals the given page into a byte array.
 * Validation events raised during marshalling are collected; the message built from them
 * is logged at info level unless it starts with {@code NO_EVENTS_MSG}.
 *
 * @param page the page to marshal
 * @return the marshalled page as bytes
 * @throws JAXBException if the marshaller cannot be created; any exception raised while
 *         marshalling is wrapped in a {@link MarshalException}
 */
public static byte[] marshalToBytes(PcGtsType page) throws JAXBException {
    ValidationEventCollector eventCollector = new ValidationEventCollector();
    Marshaller marshaller = createMarshaller(eventCollector);
    JAXBElement<PcGtsType> rootElement = new ObjectFactory().createPcGts(page);

    byte[] data;
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
        marshaller.marshal(rootElement, buffer);
        data = buffer.toByteArray();
    } catch (Exception e) {
        throw new MarshalException(e);
    }

    String validationMsg = buildMsg(eventCollector, page);
    if (!validationMsg.startsWith(NO_EVENTS_MSG)) {
        logger.info(validationMsg);
    }
    return data;
}
Aggregations