use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType in project TranskribusCore by Transkribus.
the class TrpXlsxBuilder method writeXlsxForDoc.
/**
 * Exports the tags of the selected pages of a document into an xlsx file.
 * <p>
 * A fresh {@code XSSFWorkbook} is assigned to the {@code wb} field, every selected page is
 * visited and {@code writeTagsForShapeElement} is called once per word (word based) or once
 * per line. Afterwards the columns of every sheet are auto-sized and the workbook is written
 * to {@code exportFile}.
 * NOTE(review): presumably {@code writeTagsForShapeElement} is what creates the sheets in
 * {@code wb} - confirm against its implementation.
 *
 * @param doc         document whose pages are exported
 * @param wordBased   if {@code true} tags are collected per word, otherwise per line
 * @param exportFile  target file of the workbook
 * @param pageIndices zero-based indices of the pages to export; {@code null} exports all pages
 * @param monitor     optional progress monitor, may be {@code null}
 * @param cache       export cache providing the tag map and pre-loaded transcripts, must not be {@code null}
 * @throws IllegalArgumentException if {@code cache} is {@code null}
 * @throws NoTagsException          if the cache holds no custom tags for the document
 * @throws InterruptedException     if the monitor reports cancellation
 * @throws IOException              if the workbook has no sheets or cannot be written
 * @throws Exception                anything raised while building a page transcript
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    // c counts the pages actually processed (i also runs over skipped pages)
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // prefer the previously loaded JAXB transcript; cache is known to be non-null here
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        String docId = String.valueOf(doc.getId());
        String pageNr = String.valueOf(page.getPageNr());
        List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
        for (TrpTextRegionType r : textRegions) {
            for (TextLineType line : r.getTextLine()) {
                TrpTextLineType trpL = (TrpTextLineType) line;
                if (wordBased) {
                    for (WordType word : trpL.getWord()) {
                        TrpWordType w = (TrpWordType) word;
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), docId, pageNr, r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), docId, pageNr, r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }
        ++c;
        if (monitor != null) {
            // NOTE(review): the cumulative count is passed here; if the monitor expects an
            // increment per call this over-reports progress - confirm against the monitor contract
            monitor.worked(c);
        }
    }
    // auto size the columns of every sheet, using the first row to determine the column count
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }
    try {
        // no sheet means no tags at all; checked before the file is created so no empty file is left behind
        if (wb.getNumberOfSheets() == 0) {
            throw new IOException("Sorry - No tags available for export");
        }
        // try-with-resources closes the stream even when wb.write fails
        try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
            wb.write(fOut);
        }
    } catch (IOException e) {
        logger.error("Could not write xlsx to: " + exportPath, e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method setContent.
/**
 * Builds the facsimile and text parts of the TEI output for the given pages and appends
 * them (facsimile first, then text) to {@code sbTotal}.
 * <p>
 * Only pages whose index is contained in {@code pageIndices} are processed; a {@code null}
 * {@code pageIndices} selects every page. For each page a page break is written into the
 * text part; text regions contribute their text and, when {@code pars.hasZones()} is set,
 * every region additionally contributes a zone to the facsimile part.
 *
 * @param pages the pages of the document in document order
 * @throws JAXBException        declared for the page XML lookup done per page
 * @throws InterruptedException if the progress monitor reports cancellation
 */
@Override
protected void setContent(List<TrpPage> pages) throws JAXBException, InterruptedException {
    SebisStringBuilder facsimileBuf = new SebisStringBuilder();
    SebisStringBuilder textBuf = new SebisStringBuilder();

    // open <text><body>
    textBuf.incIndent();
    textBuf.addLine("<text>");
    textBuf.incIndent();
    textBuf.addLine("<body>");

    int totalPages;
    if (pageIndices == null) {
        totalPages = pages.size();
    } else {
        totalPages = pageIndices.size();
    }
    if (monitor != null) {
        monitor.beginTask("Creating TEI", totalPages);
    }

    // number of pages handled so far (skipped pages are not counted)
    int processed = 0;
    for (int pageIdx = 0; pageIdx < pages.size(); pageIdx++) {
        boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (processed + 1));
        }

        TrpPage page = pages.get(pageIdx);
        logger.debug("1Processing page " + page.getPageNr() + ": " + page.getUrl() + " - XML=" + page.getCurrentTranscript().getUrl());

        // buffered transcript or freshly unmarshalled page XML
        PcGtsType pageXml = this.getPcGtsTypeForPage(page);

        // one facsimile element per page, later placed in front of the text part
        if (pars.hasZones()) {
            openFacsimileElement(facsimileBuf, page, pageXml);
        }
        // one page-break element per page inside the body
        writePageBreak(textBuf, page, pageXml);

        // all regions of the page in reading order
        List<TrpRegionType> pageRegions = pageXml.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));
        for (TrpRegionType region : pageRegions) {
            if (!(region instanceof TextRegionType)) {
                // non-text regions only show up as zones
                if (pars.hasZones()) {
                    writeZoneForShape(facsimileBuf, region, FACS_ID_PREFIX + page.getPageNr(), true);
                }
                continue;
            }
            TrpTextRegionType textRegion = (TrpTextRegionType) region;
            if (pars.hasZones()) {
                writeZonesForTextRegion(facsimileBuf, textRegion, page.getPageNr());
            }
            writeTextForTextRegion(textBuf, textRegion, page.getPageNr());
        }

        if (pars.hasZones()) {
            closeFacsimilieElement(facsimileBuf);
        }

        processed++;
        if (monitor != null) {
            monitor.worked(processed);
        }
    }

    // close </body></text>
    textBuf.addLine("</body>");
    textBuf.decIndent();
    textBuf.addLine("</text>");
    textBuf.decIndent();

    sbTotal.sb.append(facsimileBuf.toString());
    sbTotal.sb.append(textBuf.toString());
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType in project TranskribusCore by Transkribus.
the class DocxBuilder method writeDocxForTranscriptWithTables.
/**
 * Writes all regions of a page, in reading order, into the main document part of a docx.
 * <p>
 * Table regions are converted with {@code getDocxTable} and followed by a paragraph holding
 * a single break; text regions are exported with {@code exportTextRegion}, but only when
 * they contain text other than line feeds. Other region types are ignored.
 * Note that the region list returned by {@code trpPage.getRegions()} is sorted in place.
 *
 * @param mdp                main document part receiving the content
 * @param trpPage            page whose regions are exported
 * @param wordBased          passed on to the table and text region export
 * @param preserveLineBreaks currently not evaluated by this method
 */
private static void writeDocxForTranscriptWithTables(MainDocumentPart mdp, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    for (TrpRegionType r : regions) {
        if (r instanceof TrpTableRegionType) {
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();
            // table width taken from the horizontal extent of the bounding box
            double maxX = table.getBoundingBox().getMaxX();
            double minX = table.getBoundingBox().getMinX();
            int tablesize = (int) (maxX - minX);
            List<HashMap<Integer, TrpTableCellType>> allRows = buildRowMaps(table, rows);
            try {
                Tbl thisTable = getDocxTable(wordMLPackage, wordBased, rows, cols, allRows, tablesize, mdp);
                mdp.addObject(thisTable);
            } catch (Exception e) {
                // best effort: a table that cannot be converted is skipped, the export goes on
                logger.error("Could not create docx table - table is skipped", e);
            }
            // the Br element breaks the current line so following content starts on the next one
            Br br = factory.createBr();
            org.docx4j.wml.P p = factory.createP();
            mdp.addObject(p);
            p.getContent().add(br);
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            // one paragraph per text region, but only if there is some text besides line feeds
            String regionText = tr.getUnicodeText();
            if (regionText != null && !regionText.replace("\n", "").isEmpty()) {
                exportTextRegion(tr, wordBased, null, mdp);
            }
        }
    }
}

/**
 * Collects, for each of the first {@code rows} rows of the table, a map from column index to cell.
 * <p>
 * Needed for vertical cell merge: the column of a cell with a row span greater than one is
 * pre-registered with a {@code null} value in the map of the following row, unless that row
 * provides a cell for the column itself.
 * NOTE(review): the placeholder is only carried into the directly following row; whether
 * spans larger than two are covered depends on what {@code getRowCells} returns - confirm.
 */
private static List<HashMap<Integer, TrpTableCellType>> buildRowMaps(TrpTableRegionType table, int rows) {
    List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
    HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
    for (int k = 0; k < rows; k++) {
        List<TrpTableCellType> rowCells = table.getRowCells(k);
        HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
        // start with the columns that are vertically merged with cells of the row above
        currRowMap.putAll(nextRowMap);
        nextRowMap.clear();
        for (TrpTableCellType cell : rowCells) {
            currRowMap.put(cell.getCol(), cell);
            if (cell.getRowSpan() > 1) {
                nextRowMap.put(cell.getCol(), null);
            }
        }
        allRows.add(currRowMap);
    }
    return allRows;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType in project TranskribusCore by Transkribus.
the class TrpRtfBuilder method getRtfParagraphsForTranscript.
// public static void writeRtfForElement(Rtf rtf, ITrpShapeType element, boolean wordBased, File file, boolean append) throws IOException, JAXBException {
// element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
//
// RtfText text = RtfText.text(element.getUnicodeText());
// text = formatRtfText(text, element.getTextStyle());
//
//
//
// if (element instanceof TextLineType || element instanceof TextRegionType) {// TODO words vs lines and regions
// rtf.p(text);
// } else if (element instanceof TrpWordType) {
// // rtf.p(texts);
// }
//
//
// // cl.getCustomTagAndContinuations(tag)
//
//
//
// }
/**
 * Creates one RTF paragraph per text region of the given page.
 * <p>
 * Each line of a region becomes one {@code RtfText} followed by a line feed. If
 * {@code wordBased} is set and the line contains words, the text is built from the words
 * via {@code getRtfTextForLineFromWords}; otherwise from the line itself via
 * {@code getRtfTextForShapeElement}.
 * The regions come from {@code trpPage.getTextRegions(true)}.
 * NOTE(review): text regions nested in table regions may need
 * {@code getTextRegionsAndTextRegionsFromTableRegions(true)} instead - confirm.
 *
 * @param trpPage   page to convert
 * @param wordBased whether the text of a line is assembled from its words
 * @return one paragraph per text region, in the order the regions are returned by the page
 * @throws IOException   declared for the text conversion helpers
 * @throws JAXBException declared for the text conversion helpers
 */
public static RtfPara[] getRtfParagraphsForTranscript(TrpPageType trpPage, boolean wordBased) throws IOException, JAXBException {
    List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
    RtfPara[] paras = new RtfPara[textRegions.size()];
    for (int j = 0; j < textRegions.size(); ++j) {
        List<TextLineType> lines = textRegions.get(j).getTextLine();
        RtfText[] linesTexts = new RtfText[lines.size()];
        for (int i = 0; i < lines.size(); ++i) {
            TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
            RtfText lineText = (wordBased && !trpL.getWord().isEmpty())
                    ? getRtfTextForLineFromWords(trpL)
                    : getRtfTextForShapeElement(trpL);
            // terminate every line with a line feed
            linesTexts[i] = RtfText.text(lineText, "\n");
        }
        // right-to-left alignment was never implemented; every paragraph uses the default alignment
        paras[j] = RtfPara.p(linesTexts);
    }
    return paras;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType in project TranskribusCore by Transkribus.
the class TrpTxtBuilder method writeTxtForSinglePage.
/**
 * Appends the text of a single page to a plain text file.
 * <p>
 * The regions of the page are sorted in place into reading order. Table regions are skipped
 * (not yet supported for txt export); for text regions either the text of each word (word
 * based, if the line has words) or the non-empty text of each line is collected, followed by
 * one line separator entry per region that has lines. The collected entries are written with
 * {@code Files.write} in create/append mode using the {@code utf8} charset of this class.
 * An {@code IOException} during writing is logged and not propagated.
 *
 * @param file               target file, created if missing and appended to otherwise
 * @param trpPage            page whose text is written
 * @param wordBased          whether the text is taken from words instead of lines
 * @param preserveLineBreaks currently not evaluated by this method
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    List<String> content = new ArrayList<String>();
    for (TrpRegionType r : regions) {
        if (r instanceof TrpTableRegionType) {
            // TODO: for simple txt export: how to handle tables
            continue;
        }
        if (!(r instanceof TrpTextRegionType)) {
            continue;
        }
        TrpTextRegionType tr = (TrpTextRegionType) r;
        List<TextLineType> lines = tr.getTextLine();
        for (TextLineType line : lines) {
            TrpTextLineType trpL = (TrpTextLineType) line;
            String textOfCurrLine = trpL.getUnicodeText();
            if (wordBased && !trpL.getWord().isEmpty()) {
                for (WordType word : trpL.getWord()) {
                    content.add(((ITrpShapeType) word).getUnicodeText());
                }
            } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                // compare by content: a reference comparison with "" does not filter empty lines
                content.add(textOfCurrLine);
            }
        }
        if (!lines.isEmpty()) {
            // separate regions from each other
            content.add(System.lineSeparator());
        }
    }
    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        logger.error("Could not write txt to " + file.getAbsolutePath(), e);
    }
}
Aggregations