use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.
the class TrpMetsBuilder method buildMets.
/**
 * Generate a METS for the given document. It always contains the METS header, the TrpDocMetadata embedded in
 * sourceMd and a structMap with one div per exported page. Depending on the flags it additionally contains
 * <ul>
 * <li>the page images (exportImages)</li>
 * <li>the current transcript (PAGE XML) of each page (exportPage)</li>
 * <li>a reference to the ALTO file of each page at alto/&lt;image base name&gt;.xml (exportAlto)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 * <p>
 * ALTO files are only referenced, never read: the CREATED date is set if the file is found in the "alto"
 * folder next to the file the page URL points to; the checksum is left empty.
 *
 * @param doc the document to describe; its metadata and its page list are read
 * @param exportPage if true, the current transcript of each exported page is added to the PAGE file group
 * @param exportAlto if true, an ALTO file entry is added for each exported page
 * @param exportImages if true, the image of each exported page is added to the image file group
 * @param pageIndices zero-based indices into doc.getPages() of the pages to export, or null to export all pages
 * @return the assembled METS object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);

    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);

    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);

    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);

    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // A remote document without pages has no URL to derive the client from; the client is only
    // used inside the page loop, so it may stay null in that case.
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }

    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);

    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();

        /* only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }

        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }

        // create ALTO fileGrp entry
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());

            // ALTO file name = image file name with its extension (if any) replaced by ".xml"
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf(".");
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");

            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);

            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);

            // NOTE(review): if FileUtils is Commons IO, toFile returns null for non-"file" URLs - confirm.
            final File pageFile = FileUtils.toFile(p.getUrl());
            if (pageFile == null) {
                logger.info("page URL is not a local file, cannot look up alto file for page " + p.getPageNr());
            } else {
                final String path = pageFile.getAbsolutePath();
                String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
                absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
                if (absAltoPath.startsWith("\\")) {
                    absAltoPath = absAltoPath.substring(1);
                }
                File altoTmp = new File(absAltoPath);
                if (altoTmp.exists()) {
                    Date date = new Date(altoTmp.lastModified());
                    XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                    altoFt.setCREATED(cal);
                } else {
                    logger.info("alto file does not exist at " + absAltoPath);
                }
            }

            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }

    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only file groups that were requested are attached to the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.
the class TrpXlsxBuilder method writeXlsxForDoc.
/**
 * Writes the custom tags of the selected pages of a document into an Excel (xlsx) file.
 * <p>
 * For every selected page the transcript is taken from the cache or, if the cache has none for that
 * index, built from the page's current transcript. Tags are then written either per word or per line
 * via writeTagsForShapeElement. The workbook is kept in the class-level field {@code wb}, so this
 * method presumably must not be run concurrently - confirm against the field declaration.
 *
 * @param doc the document whose pages are exported
 * @param wordBased if true, tags are written for each word of a line, otherwise for the line itself
 * @param exportFile the xlsx file to write
 * @param pageIndices zero-based indices into doc.getPages() of the pages to export, or null for all pages
 * @param monitor optional progress monitor, may be null
 * @param cache export cache providing the tag map, the selected tag names and preloaded transcripts; must not be null
 * @throws IllegalArgumentException if cache is null
 * @throws NoTagsException if the cache's tag map for the document is empty
 * @throws InterruptedException if the monitor reports cancellation
 * @throws IOException if no sheet was created (no tags written) or the file cannot be written
 * @throws Exception declared by the original signature; covers failures while building transcripts
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript (cache is guaranteed non-null here)
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
        for (int j = 0; j < textRegions.size(); ++j) {
            TrpTextRegionType r = textRegions.get(j);
            List<TextLineType> lines = r.getTextLine();
            for (int k = 0; k < lines.size(); ++k) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(k);
                List<WordType> words = trpL.getWord();
                if (wordBased) {
                    for (int l = 0; l < words.size(); ++l) {
                        TrpWordType w = (TrpWordType) words.get(l);
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }
        ++c;
        if (monitor != null) {
            // NOTE(review): the running total is passed; if this is Eclipse's IProgressMonitor,
            // worked() expects an increment - confirm.
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns: the first row of each sheet determines how many columns are resized
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            // get the number of cells in the header row
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }
    // means no tags at all - checked before the output file is created
    if (wb.getNumberOfSheets() == 0) {
        throw new IOException("Sorry - No tags available for export");
    }
    // try-with-resources closes the stream even if writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method setContent.
/**
 * Renders the selected pages into two buffers - the facsimile sections and the {@code <text><body>}
 * part - and appends both, facsimile first, to the total output buffer.
 * <p>
 * Pages whose index is not contained in pageIndices are skipped (all pages are used if pageIndices
 * is null). Facsimile/zone output is only produced when the export parameters report zones.
 *
 * @param pages the pages of the document in document order
 * @throws JAXBException declared for the page XML lookup/unmarshalling done per page
 * @throws InterruptedException if the progress monitor reports cancellation
 */
@Override
protected void setContent(List<TrpPage> pages) throws JAXBException, InterruptedException {
    final SebisStringBuilder facsimileBuf = new SebisStringBuilder();
    final SebisStringBuilder textBuf = new SebisStringBuilder();

    // open <text> and <body>, each one indent level deeper
    textBuf.incIndent();
    textBuf.addLine("<text>");
    textBuf.incIndent();
    textBuf.addLine("<body>");

    final int taskSize = (pageIndices != null) ? pageIndices.size() : pages.size();
    if (monitor != null) {
        monitor.beginTask("Creating TEI", taskSize);
    }

    int processed = 0;
    for (int pageIdx = 0; pageIdx < pages.size(); ++pageIdx) {
        final boolean selected = (pageIndices == null) || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }

        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (processed + 1));
        }

        final TrpPage p = pages.get(pageIdx);
        logger.debug("1Processing page " + p.getPageNr() + ": " + p.getUrl() + " - XML=" + p.getCurrentTranscript().getUrl());

        // page content comes from the buffer if present, otherwise the page XML is unmarshalled
        final PcGtsType pc = this.getPcGtsTypeForPage(p);

        // one facsimile element per page, only if zones are requested
        if (pars.hasZones()) {
            openFacsimileElement(facsimileBuf, p, pc);
        }

        // every page starts with a page-break element inside the body
        writePageBreak(textBuf, p, pc);

        // regions are written in reading order; note that the page's own region list is sorted in place
        final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));
        for (TrpRegionType region : pageRegions) {
            if (region instanceof TextRegionType) {
                // text regions contribute zones (optional) and text
                if (pars.hasZones()) {
                    writeZonesForTextRegion(facsimileBuf, (TrpTextRegionType) region, p.getPageNr());
                }
                writeTextForTextRegion(textBuf, (TrpTextRegionType) region, p.getPageNr());
            } else if (pars.hasZones()) {
                // all other regions only contribute a zone
                writeZoneForShape(facsimileBuf, region, FACS_ID_PREFIX + p.getPageNr(), true);
            }
        }

        if (pars.hasZones()) {
            closeFacsimilieElement(facsimileBuf);
        }

        ++processed;
        if (monitor != null) {
            monitor.worked(processed);
        }
    }

    // close <body> and <text>, undoing the indentation
    textBuf.addLine("</body>");
    textBuf.decIndent();
    textBuf.addLine("</text>");
    textBuf.decIndent();

    sbTotal.sb.append(facsimileBuf.toString());
    sbTotal.sb.append(textBuf.toString());
}
use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.
the class DocxBuilder method writeDocxForDoc.
/**
 * Exports the selected pages of a document into a docx file.
 * <p>
 * The export options are copied into class-level fields before the pages are processed, so this
 * method presumably must not be run concurrently - confirm against the field declarations. For every
 * selected page the transcript is taken from the cache or, if the cache has none for that index,
 * built from the page's current transcript. After the last page an INDEX field is appended and all
 * fields are updated before the package is saved.
 *
 * @param doc the document whose pages are exported
 * @param wordBased passed on to the per-page writer
 * @param writeTags stored as the "export tags" setting
 * @param doBlackeningSensibleData stored as the "blackening" setting
 * @param file the docx file to write
 * @param pageIndices zero-based indices into doc.getPages() of the pages to export, or null for all pages
 * @param monitor optional progress monitor, may be null
 * @param createTitle if true, a title page followed by a page break is added before the first exported page
 * @param markUnclear stored as the "mark unclear words" setting
 * @param expandAbbreviations stored as the "expand abbreviations" setting
 * @param replaceAbbrevs stored as the "substitute abbreviations" setting
 * @param keepLineBreaks stored as the "preserve line breaks" setting and passed on to the per-page writer
 * @param showSuppliedInBrackets stored as the "show supplied with brackets" setting
 * @param ignoreSuppliedTags stored as the "ignore supplied" setting
 * @param cache export cache providing the selected tag names and preloaded transcripts; must not be null
 * @throws IllegalArgumentException if cache is null
 * @throws JAXBException declared by the original signature
 * @throws IOException declared by the original signature
 * @throws Docx4JException if docx4j fails, e.g. while updating fields or saving
 * @throws InterruptedException if the monitor reports cancellation
 */
public static void writeDocxForDoc(TrpDoc doc, boolean wordBased, boolean writeTags, boolean doBlackeningSensibleData, File file, Set<Integer> pageIndices, IProgressMonitor monitor, boolean createTitle, boolean markUnclear, boolean expandAbbreviations, boolean replaceAbbrevs, boolean keepLineBreaks, boolean showSuppliedInBrackets, boolean ignoreSuppliedTags, ExportCache cache) throws JAXBException, IOException, Docx4JException, InterruptedException {
    // The cache is needed to resolve the selected tag names; fail with a clear message
    // instead of a NullPointerException further down.
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    // NOTE(review): forces DEBUG level on every export and assumes the logger is backed by logback
    // (ClassCastException otherwise) - looks like a debugging leftover, confirm before removing.
    ((ch.qos.logback.classic.Logger) logger).setLevel(ch.qos.logback.classic.Level.DEBUG);

    // copy the export options into the class-level settings
    exportTags = writeTags;
    doBlackening = doBlackeningSensibleData;
    tagnames = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    markUnclearWords = markUnclear;
    expandAbbrevs = expandAbbreviations;
    preserveLineBreaks = keepLineBreaks;
    substituteAbbrevs = replaceAbbrevs;
    showSuppliedWithBrackets = showSuppliedInBrackets;
    ignoreSupplied = ignoreSuppliedTags;

    // main document part
    wordMLPackage = WordprocessingMLPackage.createPackage();
    MainDocumentPart mdp = wordMLPackage.getMainDocumentPart();
    org.docx4j.wml.ObjectFactory factory = Context.getWmlObjectFactory();

    List<TrpPage> pages = doc.getPages();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to docx", totalPages);
    }
    int c = 0;
    boolean atLeastOnePageWritten = false;

    // paragraph holding a page break, used after the title page
    Br objBr = new Br();
    objBr.setType(STBrType.PAGE);
    P pageBreakP = factory.createP();
    pageBreakP.getContent().add(objBr);

    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // the title page goes in front of the first page that is actually exported
        if (!atLeastOnePageWritten && createTitle) {
            addTitlePage(doc, mdp);
            // add page break
            mdp.addObject(pageBreakP);
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export canceled by the user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        // try to get previously loaded JAXB transcript (cache is guaranteed non-null here)
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpPage page = pages.get(i);
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing docx for the page " + (i + 1) + "/" + doc.getNPages());
        writeDocxForTranscriptWithTables(mdp, trpPage, wordBased, preserveLineBreaks);
        atLeastOnePageWritten = true;
        ++c;
        if (monitor != null) {
            // NOTE(review): the running total is passed; if this is Eclipse's IProgressMonitor,
            // worked() expects an increment - confirm.
            monitor.worked(c);
        }
    }

    // append an INDEX field in its own paragraph and let docx4j update all fields
    P p = factory.createP();
    mdp.getContent().add(p);
    addComplexField(p, " INDEX \\e \"", "\" \\c \"1\" \\z \"1031\"");
    FieldUpdater updater = new FieldUpdater(wordMLPackage);
    updater.update(true);

    // The per-tagname listing that used to follow here was disabled behind "if (false)" and has
    // been removed as unreachable code; restore it from version control if it is needed again.

    // finally save the file
    wordMLPackage.save(file);
    logger.info("Saved " + file.getAbsolutePath());
}
use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.
the class TrpXlsxTableBuilder method writeXlsxForTables.
/**
 * Exports all table regions of the selected pages of a document into an Excel (xlsx) file.
 * <p>
 * For every selected page the transcript is taken from the cache (if a cache is given) or built from
 * the page's current transcript. Each table region is converted into a list of row maps
 * (column index to cell) and handed to createTable together with a running table id. The workbook
 * is kept in the class-level field {@code wb}, so this method presumably must not be run
 * concurrently - confirm against the field declaration.
 *
 * @param doc the document whose pages are exported
 * @param exportFile the xlsx file to write
 * @param pageIndices zero-based indices into doc.getPages() of the pages to export, or null for all pages
 * @param monitor optional progress monitor, may be null
 * @param cache optional export cache with preloaded transcripts, may be null
 * @throws NoTablesException if no sheet was created, i.e. no table was found on the selected pages
 * @throws IOException if the transcript cannot be built or the file cannot be written
 * @throws InterruptedException if the monitor reports cancellation
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    // TrpTableRegionType is contained in the regions too
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        List<TrpRegionType> regions = trpPage.getRegions();
        for (int j = 0; j < regions.size(); ++j) {
            TrpRegionType r = regions.get(j);
            if (!(r instanceof TrpTableRegionType)) {
                continue;
            }
            tableId++;
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();

            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }

            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * fill up all cells which are not set in TRP (needed for vertical cell merge)
                 * the nextRowMap contains already all cells which span vertically with the cells above - means they got merged
                 * in the table but have to be considered here
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    // only one row or col span is considered -> FIXME: do it for all spans, but may happens never?
                    if (cell.getRowSpan() > 1) {
                        // placeholder (null) for the column in the following row
                        nextRowMap.put(cell.getCol(), null);
                    }
                    if (cell.getColSpan() > 1) {
                        // placeholder (null) for the next column in this row
                        currRowMap.put(cell.getCol() + 1, null);
                    }
                }
                allRows.add(currRowMap);
            }
            createTable(rows, cols, allRows, tableId);
        }
        // Progress is counted once per page (not once per region), matching the task size
        // announced in beginTask and the "Processing page" sub task label.
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        ++c;
        if (monitor != null) {
            // NOTE(review): the running total is passed; if this is Eclipse's IProgressMonitor,
            // worked() expects an increment - confirm.
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns: the first row of each sheet determines how many columns are resized
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            // get the number of cells in the header row
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }
    // means no tables at all - checked before the output file is created, not logged as an error
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }
    // try-with-resources closes the stream even if writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Aggregations