use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class DocxBuilder method writeDocxForDoc.
/**
 * Exports the selected pages of a document into one docx file.
 *
 * <p>The export options are first copied into the static export settings of this class. Then
 * every selected page is written with {@code writeDocxForTranscriptWithTables}, an INDEX field
 * is appended, all fields are updated and the package is saved to {@code file}.
 *
 * @param doc the document whose pages are exported
 * @param wordBased forwarded unchanged to the page writer
 * @param writeTags stored in the static {@code exportTags} setting
 * @param doBlackeningSensibleData stored in the static {@code doBlackening} setting
 * @param file target file the docx package is saved to
 * @param pageIndices zero-based indices of the pages to export; {@code null} exports all pages
 * @param monitor progress monitor, may be {@code null}
 * @param createTitle if {@code true}, a title page followed by a page break is written before
 *        the first exported page (no title page is written when no page is exported)
 * @param markUnclear stored in the static {@code markUnclearWords} setting
 * @param expandAbbreviations stored in the static {@code expandAbbrevs} setting
 * @param replaceAbbrevs stored in the static {@code substituteAbbrevs} setting
 * @param keepLineBreaks stored in the static {@code preserveLineBreaks} setting and forwarded
 *        to the page writer
 * @param showSuppliedInBrackets stored in the static {@code showSuppliedWithBrackets} setting
 * @param ignoreSuppliedTags stored in the static {@code ignoreSupplied} setting
 * @param cache source of the selected tag names and of already loaded page transcripts
 * @throws JAXBException declared for the transcript loading and page writing steps
 * @throws IOException declared for the transcript loading and page writing steps
 * @throws Docx4JException if building, updating or saving the docx package fails
 * @throws InterruptedException if the monitor reports that the export was canceled
 */
public static void writeDocxForDoc(TrpDoc doc, boolean wordBased, boolean writeTags, boolean doBlackeningSensibleData, File file, Set<Integer> pageIndices, IProgressMonitor monitor, boolean createTitle, boolean markUnclear, boolean expandAbbreviations, boolean replaceAbbrevs, boolean keepLineBreaks, boolean showSuppliedInBrackets, boolean ignoreSuppliedTags, ExportCache cache) throws JAXBException, IOException, Docx4JException, InterruptedException {
    // Copy the export options into the static export settings.
    exportTags = writeTags;
    doBlackening = doBlackeningSensibleData;
    // NOTE(review): cache is dereferenced unconditionally here although it is null-checked
    // when transcripts are fetched below - callers currently have to pass a non-null cache.
    tagnames = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    markUnclearWords = markUnclear;
    expandAbbrevs = expandAbbreviations;
    preserveLineBreaks = keepLineBreaks;
    substituteAbbrevs = replaceAbbrevs;
    showSuppliedWithBrackets = showSuppliedInBrackets;
    ignoreSupplied = ignoreSuppliedTags;

    // main document part
    wordMLPackage = WordprocessingMLPackage.createPackage();
    MainDocumentPart mdp = wordMLPackage.getMainDocumentPart();
    org.docx4j.wml.ObjectFactory factory = Context.getWmlObjectFactory();

    List<TrpPage> pages = doc.getPages();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to docx", totalPages);
    }

    // paragraph holding a page break; only needed after the title page
    Br objBr = new Br();
    objBr.setType(STBrType.PAGE);
    P pageBreakP = factory.createP();
    pageBreakP.getContent().add(objBr);

    int c = 0;
    boolean atLeastOnePageWritten = false;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }

        // the title page is emitted lazily, right before the first exported page
        if (!atLeastOnePageWritten && createTitle) {
            addTitlePage(doc, mdp);
            mdp.addObject(pageBreakP);
        }

        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export canceled by the user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }

        // prefer a transcript that was already loaded into the cache
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpPage page = pages.get(i);
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }

        TrpPageType trpPage = tr.getPage();
        logger.debug("writing docx for the page " + (i + 1) + "/" + doc.getNPages());
        writeDocxForTranscriptWithTables(mdp, trpPage, wordBased, preserveLineBreaks);
        atLeastOnePageWritten = true;

        ++c;
        if (monitor != null) {
            monitor.worked(c);
        }
    }

    // append the INDEX field in its own paragraph and let docx4j update all fields
    P p = factory.createP();
    mdp.getContent().add(p);
    addComplexField(p, " INDEX \\e \"", "\" \\c \"1\" \\z \"1031\"");
    FieldUpdater updater = new FieldUpdater(wordMLPackage);
    updater.update(true);

    // finally save the file
    wordMLPackage.save(file);
    logger.info("Saved " + file.getAbsolutePath());
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class TrpXlsxTableBuilder method writeXlsxForTables.
/**
 * Exports all table regions of the selected pages into an xlsx workbook.
 *
 * <p>Every {@code TrpTableRegionType} found among the regions of a selected page is handed to
 * {@code createTable} together with a per-row map from column index to cell. Afterwards the
 * columns of every sheet are auto-sized and the workbook is written to {@code exportFile}.
 *
 * @param doc the document whose pages are scanned for tables
 * @param exportFile target file of the workbook
 * @param pageIndices zero-based indices of the pages to export; {@code null} exports all pages
 * @param monitor progress monitor, may be {@code null}
 * @param cache source of already loaded page transcripts, may be {@code null}
 * @throws NoTablesException if no sheet was created, i.e. no table was found
 * @throws IOException if loading a transcript or writing the workbook fails
 * @throws InterruptedException if the monitor reports that the export was canceled
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    // TrpTableRegionType is contained in the regions too
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }

    wb = new XSSFWorkbook();
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }

        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }

        TrpPageType trpPage = tr.getPage();
        for (TrpRegionType r : trpPage.getRegions()) {
            if (r instanceof TrpTableRegionType) {
                tableId++;
                logger.debug("is table");
                TrpTableRegionType table = (TrpTableRegionType) r;
                int cols = table.getNCols();
                int rows = table.getNRows();
                createTable(rows, cols, collectRowMaps(table, rows), tableId);
            }
        }

        // Progress is accounted once per page (not once per region), so that it matches the
        // page-based total announced in beginTask and the "Processing page" sub task label.
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        ++c;
        if (monitor != null) {
            // NOTE(review): the running total is passed, as in the sibling exporters; if the
            // monitor expects increments this over-reports - confirm against the monitor used.
            monitor.worked(c);
        }
    }

    // means no tables at all - fail before touching the file system
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }

    /*
     * auto size the columns; the number of columns is taken from the first row of each sheet
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }

    // try-with-resources closes the stream even if writing the workbook fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}

/**
 * Builds, for every row of the table, a map from column index to the cell at that position.
 *
 * <p>Positions covered by a merged cell are reserved with a {@code null} value: a cell with a
 * row span greater than one reserves its column in the following row, a cell with a column
 * span greater than one reserves the next column of the current row. Only one additional
 * row/column per span is reserved.
 */
private static List<HashMap<Integer, TrpTableCellType>> collectRowMaps(TrpTableRegionType table, int rows) {
    List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
    // placeholders carried over from vertically merged cells of the previous row
    HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
    for (int k = 0; k < rows; k++) {
        HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
        currRowMap.putAll(nextRowMap);
        nextRowMap.clear();
        for (TrpTableCellType cell : table.getRowCells(k)) {
            currRowMap.put(cell.getCol(), cell);
            if (cell.getRowSpan() > 1) {
                nextRowMap.put(cell.getCol(), null);
            }
            if (cell.getColSpan() > 1) {
                currRowMap.put(cell.getCol() + 1, null);
            }
        }
        allRows.add(currRowMap);
    }
    return allRows;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class TrpTxtBuilder method writeTxtForDoc.
/**
 * Writes the selected pages of a document into a plain text file.
 *
 * <p>An existing file at the target location is deleted first. If requested, a title page is
 * written before the pages; each selected page is then written via
 * {@code writeTxtForSinglePage}.
 *
 * @param doc the document to export
 * @param addTitle whether {@code addTitlePage} is called before the pages are written
 * @param wordBased forwarded unchanged to the page writer
 * @param preserveLineBreaks forwarded unchanged to the page writer
 * @param file target text file
 * @param pageIndices zero-based indices of the pages to export; {@code null} exports all pages
 * @param monitor progress monitor, may be {@code null}
 * @param cache source of already loaded page transcripts, may be {@code null}
 * @throws InterruptedException if the monitor reports that the export was canceled
 */
public static void writeTxtForDoc(TrpDoc doc, boolean addTitle, boolean wordBased, boolean preserveLineBreaks, File file, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws JAXBException, IOException, Docx4JException, InterruptedException {
    // start from scratch: a leftover file from an earlier export is removed
    Files.deleteIfExists(Paths.get(file.getAbsolutePath()));
    if (addTitle) {
        addTitlePage(doc, file);
    }

    final List<TrpPage> allPages = doc.getPages();
    final boolean exportEverything = (pageIndices == null);
    final boolean reportProgress = (monitor != null);

    final int pagesToExport;
    if (exportEverything) {
        pagesToExport = allPages.size();
    } else {
        pagesToExport = pageIndices.size();
    }
    if (reportProgress) {
        monitor.beginTask("Exporting to text file", pagesToExport);
    }

    int exported = 0;
    for (int pageIdx = 0; pageIdx < allPages.size(); pageIdx++) {
        final boolean selected = exportEverything || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }

        if (reportProgress) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export canceled by the user");
            }
            monitor.subTask("Processing page " + (exported + 1));
        }

        // use the cached transcript when there is one, otherwise load and build it now
        JAXBPageTranscript transcript = (cache != null) ? cache.getPageTranscriptAtIndex(pageIdx) : null;
        if (transcript == null) {
            transcript = new JAXBPageTranscript(allPages.get(pageIdx).getCurrentTranscript());
            transcript.build();
        }

        final TrpPageType pageContent = transcript.getPage();
        logger.debug("writing text file for the page " + (pageIdx + 1) + "/" + allPages.size());
        writeTxtForSinglePage(file, pageContent, wordBased, preserveLineBreaks);

        exported++;
        if (reportProgress) {
            monitor.worked(exported);
        }
    }
    logger.debug("Saved " + file.getAbsolutePath());
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class PageXmlUtils method removeAllIndexedTags.
/**
 * Calls {@code removeIndexedTags()} on the custom tag list of every text region returned by
 * {@code getTextRegions(true)} for the page, of every text line in those regions and of every
 * word in those lines.
 *
 * @param pc the page content whose page is expected to be a {@code TrpPageType}
 */
public static void removeAllIndexedTags(PcGtsType pc) {
    final TrpPageType page = (TrpPageType) pc.getPage();
    for (TrpTextRegionType region : page.getTextRegions(true)) {
        region.getCustomTagList().removeIndexedTags();

        for (TextLineType rawLine : region.getTextLine()) {
            // lines and words are expected to be the Trp subclasses, hence the casts
            final TrpTextLineType line = (TrpTextLineType) rawLine;
            line.getCustomTagList().removeIndexedTags();

            for (WordType rawWord : line.getWord()) {
                ((TrpWordType) rawWord).getCustomTagList().removeIndexedTags();
            }
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class ExportCache method storeCustomTagMapForDoc.
/**
 * Clears the stored tags and re-collects them from the selected pages of the document.
 *
 * <p>For every selected page the transcript is taken from {@code pageTranscripts} if present
 * (otherwise created from the page's current transcript metadata) and built. Every text line
 * of every text region is then passed to {@code getTagsForShapeElement}; if {@code wordBased}
 * is set, every word of the line is passed as well.
 *
 * @param doc the document whose pages are scanned
 * @param wordBased whether the words of each line are scanned in addition to the line itself
 * @param pageIndices zero-based indices of the pages to scan; {@code null} scans all pages
 * @param monitor progress monitor, may be {@code null}
 * @param blackening stored in the {@code doBlackening} field
 * @throws JAXBException declared for the transcript building step
 * @throws IOException declared for the transcript building step
 * @throws InterruptedException if the monitor reports that the export was canceled
 */
public void storeCustomTagMapForDoc(TrpDoc doc, boolean wordBased, Set<Integer> pageIndices, IProgressMonitor monitor, boolean blackening) throws JAXBException, IOException, InterruptedException {
    doBlackening = blackening;
    tags.clear();

    final List<TrpPage> allPages = doc.getPages();
    final int pageCount = allPages.size();
    int processed = 0;

    for (int pageIdx = 0; pageIdx < pageCount; pageIdx++) {
        final boolean selected = (pageIndices == null) || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null && monitor.isCanceled()) {
            throw new InterruptedException("User canceled the export");
        }

        // transcripts are normally fetched before the tag map is stored, so the lookup
        // usually succeeds; otherwise fall back to the page's current transcript
        JAXBPageTranscript transcript = (pageTranscripts != null) ? pageTranscripts.get(pageIdx) : null;
        if (transcript == null) {
            transcript = new JAXBPageTranscript(allPages.get(pageIdx).getCurrentTranscript());
        } else {
            transcript.getPageData();
        }
        transcript.build();

        final TrpPageType pageContent = transcript.getPage();
        logger.debug("get tags for page " + (pageIdx + 1) + "/" + doc.getNPages());

        for (TrpTextRegionType region : pageContent.getTextRegions(true)) {
            for (TextLineType rawLine : region.getTextLine()) {
                final TrpTextLineType line = (TrpTextLineType) rawLine;
                final List<WordType> lineWords = line.getWord();
                // the line itself is always scanned; its words only in word-based mode
                getTagsForShapeElement(line);
                if (!wordBased) {
                    continue;
                }
                for (WordType rawWord : lineWords) {
                    getTagsForShapeElement((TrpWordType) rawWord);
                }
            }
        }

        if (monitor != null) {
            monitor.setTaskName("Loaded tags for page " + (pageIdx + 1));
            processed++;
            monitor.worked(processed);
        }
    }
}
Aggregations