use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpTextLineType method getTextFromWords.
/**
 * Builds the text of this line from its words by joining the unicode text of every word
 * returned by {@code getWord()} with a single space.
 *
 * @param fillEmptyWords if {@code true}, a word whose text is empty contributes
 *        {@link TrpWordType#EMPTY_WORD_FILL} instead of the empty string
 * @return the space-separated word texts; the empty string if there are no words
 */
public String getTextFromWords(boolean fillEmptyWords) {
    // StringBuilder instead of repeated String += : the latter copies the whole
    // accumulated text on every iteration (quadratic in the number of words).
    StringBuilder text = new StringBuilder();
    boolean first = true;
    for (WordType w : getWord()) {
        String wt = ((TrpWordType) w).getUnicodeText();
        if (fillEmptyWords && wt.isEmpty()) {
            wt = TrpWordType.EMPTY_WORD_FILL;
        }
        // Write the separator *between* words so no trailing space has to be stripped afterwards.
        if (!first) {
            text.append(' ');
        }
        text.append(wt);
        first = false;
    }
    return text.toString();
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpXlsxBuilder method writeXlsxForDoc.
/**
 * Exports the tags of a document into an Excel (xlsx) file.
 * <p>
 * For every selected page the transcript is taken from the cache or, if the cache has none
 * for that page index, built from the page's current transcript. Then
 * {@code writeTagsForShapeElement} is called for every word (word based mode) or for every
 * line of each text region. Finally the columns of all sheets are auto-sized, based on the
 * number of cells in each sheet's first row, and the workbook is written to {@code exportFile}.
 * <p>
 * NOTE(review): the workbook lives in the static field {@code wb}, which is re-assigned on
 * every call; concurrent invocations would presumably interfere with each other - confirm.
 *
 * @param doc the document whose pages are exported
 * @param wordBased if {@code true} tags are written per word, otherwise per line
 * @param exportFile the file the workbook is written to
 * @param pageIndices zero-based indices of the pages to export, or {@code null} for all pages
 * @param monitor progress monitor used for reporting and cancellation; may be {@code null}
 * @param cache export cache supplying the tag map, the selected tag names and
 *        previously loaded transcripts; must not be {@code null}
 * @throws IllegalArgumentException if {@code cache} is {@code null}
 * @throws NoTagsException if the cache's custom tag map is empty
 * @throws InterruptedException if the monitor reports that the export was canceled
 * @throws IOException if the workbook contains no sheets or cannot be written
 * @throws Exception any other failure raised while building transcripts or writing tags
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }

    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }

    wb = new XSSFWorkbook();
    // The document id does not change while iterating, so convert it only once.
    String docId = String.valueOf(doc.getId());
    // c counts the pages actually exported (i also advances over skipped pages).
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }

        TrpPage page = pages.get(i);
        // Try to reuse a previously loaded JAXB transcript (cache was null-checked above).
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }

        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());

        String pageNr = String.valueOf(page.getPageNr());
        for (TrpTextRegionType r : trpPage.getTextRegions(true)) {
            for (TextLineType line : r.getTextLine()) {
                TrpTextLineType trpL = (TrpTextLineType) line;
                if (wordBased) {
                    // Every word is written together with the text of its enclosing line.
                    for (WordType word : trpL.getWord()) {
                        TrpWordType w = (TrpWordType) word;
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), docId, pageNr, r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), docId, pageNr, r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }

        ++c;
        if (monitor != null) {
            // NOTE(review): c is the cumulative page count; if this monitor follows the usual
            // "worked(increment)" contract this over-reports progress and should be worked(1) - confirm.
            monitor.worked(c);
        }
    }

    // Auto-size the columns of every sheet; the first row determines the column count.
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            int numberOfCells = rowIterator.next().getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }

    try {
        // No sheets means no tags were written at all; fail before creating the output file.
        if (wb.getNumberOfSheets() == 0) {
            throw new IOException("Sorry - No tags available for export");
        }
        // try-with-resources closes the stream even when wb.write() fails.
        try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
            wb.write(fOut);
        }
    } catch (IOException e) {
        logger.error("Could not write xlsx to: " + exportPath, e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method writeZonesForTextRegion.
/**
 * Writes the "zone" elements for a text region and, depending on the {@code pars} flags
 * {@code regionZones}, {@code lineZones} and {@code wordZones}, for its lines and words.
 * <p>
 * The last argument of {@code writeZoneForShape} is {@code true} exactly when no deeper zone
 * level will be written for that shape; presumably it makes the helper close the element
 * right away (verify against {@code writeZoneForShape}). Zones written with {@code false}
 * are closed here via {@code closeElement(sb, "zone")} once their children are written.
 *
 * @param sb the builder the output is appended to
 * @param r the text region to write zones for
 * @param pageNr page number, appended to {@code FACS_ID_PREFIX} to form the facsimile id
 */
void writeZonesForTextRegion(SebisStringBuilder sb, TextRegionType r, int pageNr) {
    final String pageFacsId = FACS_ID_PREFIX + pageNr;

    if (pars.regionZones) {
        boolean regionHasNoChildZones = !(pars.lineZones || pars.wordZones);
        writeZoneForShape(sb, (TrpTextRegionType) r, pageFacsId, regionHasNoChildZones);
    }

    // Only descend into lines/words when at least one of the nested levels is requested.
    if (pars.lineZones || pars.wordZones) {
        for (TextLineType rawLine : r.getTextLine()) {
            TrpTextLineType line = (TrpTextLineType) rawLine;

            if (pars.lineZones) {
                writeZoneForShape(sb, line, pageFacsId, !pars.wordZones);
            }
            if (!pars.wordZones) {
                continue;
            }

            for (WordType rawWord : line.getWord()) {
                writeZoneForShape(sb, (TrpWordType) rawWord, pageFacsId, true);
            }
            // The line zone was left open for its words, so close it now.
            if (pars.lineZones) {
                closeElement(sb, "zone");
            }
        }

        // The region zone was left open for the nested zones, so close it now.
        if (pars.regionZones) {
            closeElement(sb, "zone");
        }
    }
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpTeiStringBuilder method writeTextForTextRegion.
/**
 * Writes the text of a text region, either line by line or, if
 * {@code commonPars.isWriteTextOnWordLevel()} is set, word by word inside a line wrapper.
 * Regions without any text line are skipped with a warning.
 *
 * @param sb the builder the output is appended to
 * @param r the text region whose text is written
 * @param pageNr page number, appended to {@code FACS_ID_PREFIX} to form the facsimile id
 */
void writeTextForTextRegion(SebisStringBuilder sb, TextRegionType r, int pageNr) {
    if (r.getTextLine().isEmpty()) {
        logger.warn("skipping empty region: " + r.getId());
        return;
    }

    final String pageFacsId = FACS_ID_PREFIX + pageNr;
    writeTextRegion(sb, r, pageFacsId);

    for (TextLineType rawLine : r.getTextLine()) {
        TrpTextLineType line = (TrpTextLineType) rawLine;

        if (!commonPars.isWriteTextOnWordLevel()) {
            // Line level: the line is written as a single unit.
            writeLineOrWord(sb, line, pageFacsId);
            continue;
        }

        // Word level: line start, then each word, then line end - all one indent level deeper.
        String lineStart = getLineOrWordStart(line, pageFacsId);
        sb.incIndent();
        sb.addLine(lineStart);
        // TODO (carried over from the original): unclear whether text for words should be written here
        for (WordType rawWord : line.getWord()) {
            writeLineOrWord(sb, (TrpWordType) rawWord, pageFacsId);
        }
        sb.addLine(getLineOrWordEnd(line, pageFacsId));
        sb.decIndent();
    }

    closeTextRegion(sb);
}
use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.
the class TrpRtfBuilder method getRtfTextForLineFromWords.
// private static void getTagsForShapeElement(ITrpShapeType element) throws IOException{
//
// String textStr = element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
// if (textStr == null || cl == null)
// throw new IOException("Element has no text or custom tag list: "+element+", class: "+element.getClass().getName());
//
// for (CustomTag nonIndexedTag : cl.getNonIndexedTags()) {
//
// logger.debug("nonindexed tag found ");
// storeCustomTag(nonIndexedTag, textStr);
//
// }
// for (CustomTag indexedTag : cl.getIndexedTags()) {
//
// logger.debug("indexed tag found ");
// storeCustomTag(indexedTag, textStr);
//
// }
//
// }
//
// private static void storeCustomTag(CustomTag currTag, String textStr) {
// if (!currTag.getTagName().equals("textStyle")){
//
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// tags.put(currTag, textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// else{
// tags.put(currTag, textStr);
// }
// logger.debug("++tag name is " + currTag.getTagName());
// logger.debug("text " + tags.get(currTag));
// }
//
// if (currTag.getTagName().equals("Person")){
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// persons.add(textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// else{
// logger.debug("with index is something wrong: offset " + currTag.getOffset() + " length " + currTag.getLength()) ;
// //throw new Exception("Something wrong with indexed tag for text: " + textStr);
// }
// }
// else if (currTag.getTagName().equals("Place")){
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// places.add(textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// }
//
// }
/**
 * Creates the RTF text of a line from its words: every word is converted with
 * {@code getRtfTextForShapeElement} and the results are combined, in word order, through
 * {@code RtfText.text(true, ...)}.
 *
 * @param line the line whose words are converted
 * @return the combined RTF text of all words of the line
 * @throws IOException declared by this method; presumably propagated from
 *         {@code getRtfTextForShapeElement} - verify against that helper
 */
private static RtfText getRtfTextForLineFromWords(TrpTextLineType line) throws IOException {
    final List<WordType> lineWords = line.getWord();
    final RtfText[] fragments = new RtfText[lineWords.size()];

    int next = 0;
    for (WordType rawWord : lineWords) {
        fragments[next++] = getRtfTextForShapeElement((TrpWordType) rawWord);
    }

    return RtfText.text(true, fragments);
}
Aggregations