use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class PageXmlUtils method createPcGtsTypeFromText.
/**
 * Creates a page document for an image and fills it with the given plain text.
 * <p>
 * One text region with id {@code "region_1"} spanning the rectangle
 * {@code (0, 0, imageWidth, imageHeight)} is always added to the page. Depending on
 * {@code level} the text is then stored
 * <ul>
 * <li>{@code REGION_BASED}: unsplit on the region,</li>
 * <li>{@code LINE_BASED}: one line element ({@code "line_N"}) per text line,</li>
 * <li>{@code WORD_BASED}: one line element per text line, each holding one word element
 * ({@code "word_N"}, numbering restarts per line) per space-separated token.</li>
 * </ul>
 * Lines and words all reuse the region's full-image coordinates.
 *
 * @param imgFileName    image file name, forwarded to {@code createEmptyPcGtsType}
 * @param dim            image dimension, forwarded to {@code createEmptyPcGtsType}
 * @param text           the text to insert; for line and word level {@code null} is treated
 *                       like the empty string, for region level it is passed on unchanged
 * @param level          granularity of the created structure; {@code null} defaults to
 *                       {@code LINE_BASED}
 * @param skipEmptyLines if {@code true}, runs of line breaks are collapsed and no element is
 *                       created for an empty line; if {@code false}, every {@code \n} or
 *                       {@code \r\n} starts a new line element
 * @return the populated document
 * @throws IOException if {@code level} is none of the three supported levels
 */
public static PcGtsType createPcGtsTypeFromText(final String imgFileName, Dimension dim, String text, TranscriptionLevel level, boolean skipEmptyLines) throws IOException {
    if (level == null) {
        level = TranscriptionLevel.LINE_BASED;
    }
    // validate before building anything so that an unsupported level fails fast
    if (level != TranscriptionLevel.REGION_BASED && level != TranscriptionLevel.LINE_BASED && level != TranscriptionLevel.WORD_BASED) {
        throw new IOException("Invalide TranscriptionLevel: " + level);
    }

    // create empty page
    PcGtsType pcGtsType = createEmptyPcGtsType(imgFileName, dim);
    TrpPageType page = (TrpPageType) pcGtsType.getPage();

    // create and add text region with size of image
    Rectangle r = new Rectangle(0, 0, page.getImageWidth(), page.getImageHeight());
    String defaultCoords = PointStrUtils.pointsToString(r);
    TrpTextRegionType region = new TrpTextRegionType(page);
    region.setId("region_1");
    region.setCoordinates(defaultCoords, null);
    page.getTextRegionOrImageRegionOrLineDrawingRegion().add(region);

    if (level == TranscriptionLevel.REGION_BASED) {
        region.setUnicodeText(text, null);
        return pcGtsType;
    }

    String splitRegex = skipEmptyLines ? "[\\r\\n]+" : "\\r?\\n";
    String[] lines = (text == null ? "" : text).split(splitRegex);
    int lc = 0;
    for (String lineText : lines) {
        // String.split keeps a leading empty piece when the text starts with a line break
        // (and yields [""] for empty text), so empty pieces must be dropped explicitly.
        if (skipEmptyLines && lineText.isEmpty()) {
            continue;
        }
        TrpTextLineType line = new TrpTextLineType(region);
        line.setId("line_" + (++lc));
        line.setCoordinates(defaultCoords, null);
        region.getTextLine().add(line);
        if (level == TranscriptionLevel.LINE_BASED) {
            line.setUnicodeText(lineText, null);
        } else {
            // WORD_BASED is the only level left after the validation above
            int wc = 1;
            // TODO: better word splitting??
            for (String wordText : lineText.split(" ")) {
                TrpWordType word = new TrpWordType(line);
                word.setId("word_" + (wc++));
                word.setCoordinates(defaultCoords, null);
                word.setUnicodeText(wordText, null);
                line.getWord().add(word);
            }
        }
    }
    logger.debug("nr of lines = " + lc);
    return pcGtsType;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class PageXmlUtils method createEmptyPcGtsType.
/**
 * Builds a document that contains only metadata and an empty page for the given image.
 * <p>
 * The metadata carries the creator {@code "TRP"} and uses one and the same timestamp (taken
 * when this method runs) for both the creation and the last-change date.
 *
 * @param imgFileName image file name stored on the page
 * @param xDim        image width stored on the page
 * @param yDim        image height stored on the page
 * @return a new root element holding the metadata and the page
 */
public static PcGtsType createEmptyPcGtsType(final String imgFileName, final int xDim, final int yDim) {
    // metadata: creator plus a single shared timestamp
    final MetadataType metadata = new MetadataType();
    metadata.setCreator("TRP");
    final XMLGregorianCalendar now = JaxbUtils.getXmlCalendar(new Date());
    metadata.setCreated(now);
    metadata.setLastChange(now);

    // the page has to be the TRP-specific subclass, not the plain page type
    final TrpPageType trpPage = new TrpPageType();
    trpPage.setImageFilename(imgFileName);
    trpPage.setImageHeight(yDim);
    trpPage.setImageWidth(xDim);

    // root element tying both together
    final PcGtsType root = new PcGtsType();
    root.setMetadata(metadata);
    root.setPage(trpPage);
    return root;
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class CustomTagListTest method testSimpleAddOrMergeTagWithTextStyles.
/**
 * Adds three overlapping text style tags to a line with the text "Hello world!" via
 * {@code addOrMergeTag} and checks the resulting tag list: 3 tags after the second tag has
 * been added, 4 after the third, the first tag starting at offset 0 and the last tag ending
 * at position 10.
 * <p>
 * NOTE(review): the two constructor arguments of {@code TextStyleTag} are presumably
 * (offset, length) - confirm against the class.
 */
// @Ignore
@Test
public void testSimpleAddOrMergeTagWithTextStyles() {
    TrpTextLineType line = new TrpTextLineType(new TrpTextRegionType(new TrpPageType()));
    line.setUnicodeText("Hello world!", null);
    CustomTagList tl = new CustomTagList(line);

    TextStyleTag tst = new TextStyleTag(0, 10);
    tst.setFontFamily("testFont");
    tl.addOrMergeTag(tst, null);

    TextStyleTag ts1 = new TextStyleTag(2, 5);
    ts1.setBold(true);
    tl.addOrMergeTag(ts1, null);
    logger.trace(tl.toString());
    Assert.assertEquals("Nr of text styles must be 3!", 3, tl.getTags().size());

    TextStyleTag ts2 = new TextStyleTag(3, 4);
    ts2.setItalic(true);
    tl.addOrMergeTag(ts2, null);
    Assert.assertEquals("Nr of text styles must be 4!", 4, tl.getTags().size());
    logger.trace(tl.toString());

    // Compare through int locals with assertEquals so that a failure reports the actual
    // value instead of a bare "assertion failed".
    int firstOffset = tl.getTags().get(0).getOffset();
    Assert.assertEquals("offset = 0", 0, firstOffset);

    CustomTag last = tl.getTags().get(tl.getTags().size() - 1);
    int lastEnd = last.getOffset() + last.getLength();
    Assert.assertEquals("offset+length = 10", 10, lastEnd);
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class Pdf2TrpDoc method main.
/**
 * Command line entry point: reads the text layout of a PDF and writes one page XML file per
 * page image into the {@code page} sub-directory of the image directory.
 * <p>
 * The image directory is currently hard-coded; the image extraction step that would produce
 * it is commented out. The number of images found there has to match the number of pages
 * extracted from the PDF, otherwise nothing is written. Regions, lines and words are only
 * added to a page if the PDF page reports a content rectangle.
 *
 * @param args exactly one element: the path of the PDF file
 */
public static void main(String[] args) {
    if (args.length != 1) {
        logger.error("Expected exactly one argument: <path to PDF file>");
        return;
    }
    File in = new File(args[0]);
    File outDir = new File("/tmp/");
    outDir.mkdirs();
    try {
        // PageImageWriter imgWriter = new PageImageWriter();
        // String imgDirPath = imgWriter.extractImages(in.getAbsolutePath(), outDir.getAbsolutePath());
        // NOTE(review): hard-coded stand-in for the extraction result above; the directory name
        // looks mis-encoded - verify against the directory that actually exists on disk.
        String imgDirPath = "/tmp/KurzgefaĆte_Geschichte_Statistik_und_Topographie_von_Tirol";
        File pageDir = new File(imgDirPath + File.separator + "page");
        pageDir.mkdirs();

        TreeMap<String, File> imgs = LocalDocReader.findImgFiles(new File(imgDirPath));
        ArrayList<PDFPage> pages = PDFTextExtractor.processPDF(in.getAbsolutePath());
        if (imgs.size() != pages.size()) {
            logger.error("Nr. of image files does not match nr. of text pages!");
            return;
        }

        // images and PDF pages are paired by position: i-th image entry <-> i-th PDF page
        int i = 0;
        for (Entry<String, File> img : imgs.entrySet()) {
            PDFPage pdfPage = pages.get(i++);
            Dimension dim = ImgUtils.readImageDimensions(img.getValue());
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img.getValue(), dim);
            final File xmlOut = new File(pageDir.getAbsolutePath() + File.separator + img.getKey() + ".xml");

            Rectangle printspace = pdfPage.getContentRect();
            if (printspace != null) {
                TrpPrintSpaceType psType = new TrpPrintSpaceType();
                psType.setCoords(rect2Coords(printspace));
                TrpPageType pageType = (TrpPageType) pc.getPage();
                // ((ITrpShapeType) pageType).getObservable().setActive(false);
                pageType.setPrintSpace(psType);

                // mirror the PDF hierarchy region -> line -> string as region -> line -> word
                for (PDFRegion r : pdfPage.regions) {
                    TrpTextRegionType rType = new TrpTextRegionType(pageType);
                    rType.setCoords(rect2Coords(r.getRect()));
                    rType.setUnicodeText(r.getText(), null);
                    for (PDFLine l : r.lines) {
                        TrpTextLineType lType = new TrpTextLineType(rType);
                        lType.setCoords(rect2Coords(l.getRect()));
                        lType.setUnicodeText(l.getText(), null);
                        for (PDFString s : l.strings) {
                            TrpWordType wType = new TrpWordType(lType);
                            wType.setCoords(rect2Coords(s.getRect()));
                            wType.setUnicodeText(s.value, null);
                            lType.getWord().add(wType);
                        }
                        rType.getTextLine().add(lType);
                    }
                    pageType.getRegions().add(rType);
                }
            }
            // the page XML is written even when no content rectangle was found
            PageXmlUtils.marshalToFile(pc, xmlOut);
        }
    } catch (Exception e) {
        // report through the logger (with stack trace) instead of printing to stderr
        logger.error("Could not create page XML files for " + in.getAbsolutePath(), e);
    }
}
use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.
the class TrpRtfBuilder method writeRtfForDoc.
/**
 * Exports the current transcripts of a document's pages into a single RTF file.
 * <p>
 * Each exported page becomes one RTF section. If {@code writeTags} is set, a final section is
 * appended that lists, for every tag name selected in {@code cache}, the number of tags found
 * and their values. The method stores {@code writeTags}, the selected tag names and
 * {@code doBlackening} in static fields of this class before exporting.
 * <p>
 * If the monitor reports cancellation, the method returns without writing the file.
 *
 * @param doc          the document whose pages are exported
 * @param wordBased    forwarded to {@code getRtfParagraphsForTranscript}
 * @param writeTags    whether to append the tag overview section
 * @param doBlackening stored in the static {@code doBlackening} field
 * @param file         the RTF file to write
 * @param pageIndices  0-based indices of the pages to export, or {@code null} for all pages
 * @param monitor      progress monitor, may be {@code null}
 * @param cache        supplies the selected tag names and the tags per tag name
 * @throws JAXBException declared by the signature; presumably raised while building a page
 *                       transcript - confirm against {@code JAXBPageTranscript.build()}
 * @throws IOException   if the output file cannot be opened or closed
 */
public static void writeRtfForDoc(TrpDoc doc, boolean wordBased, boolean writeTags, boolean doBlackening, File file, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws JAXBException, IOException {
    exportTags = writeTags;
    // tagnames = all tags the user chose in the export dialog
    tagnames = cache.getSelectedTags();
    TrpRtfBuilder.doBlackening = doBlackening;

    Rtf rtf = Rtf.rtf();
    List<TrpPage> pages = doc.getPages();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to RTF", totalPages);
    }

    // c counts the pages actually exported, i is the index within the document
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                logger.debug("RTF export cancelled!");
                return;
            }
            monitor.subTask("Processing page " + (c + 1));
        }

        TrpPage page = pages.get(i);
        TrpTranscriptMetadata md = page.getCurrentTranscript();
        JAXBPageTranscript tr = new JAXBPageTranscript(md);
        tr.build();
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing rtf for page " + (i + 1) + "/" + doc.getNPages());

        rtf.header(color(204, 0, 0).at(0), color(0, 0xff, 0).at(1)).section(getRtfParagraphsForTranscript(trpPage, wordBased));
        ++c;
        if (monitor != null) {
            // NOTE(review): c is the cumulative number of exported pages; if worked() expects
            // an increment (as Eclipse's IProgressMonitor does) this over-reports - confirm.
            monitor.worked(c);
        }
    }

    // write tags at end of last page
    if (exportTags) {
        ArrayList<RtfPara> tagParas = new ArrayList<RtfPara>();
        for (String currTagname : tagnames) {
            // all custom tags with this name, mapped to the text listed for them
            HashMap<CustomTag, String> allTagsOfThisTagname = cache.getTags(currTagname);
            if (allTagsOfThisTagname.size() > 0) {
                // underlined headline with the number of tags, followed by one entry per tag
                tagParas.add(RtfPara.p(RtfText.text(RtfText.underline(currTagname + " tags in this document: " + allTagsOfThisTagname.size()))));
                Collection<String> valueSet = allTagsOfThisTagname.values();
                RtfText[] tagTexts = new RtfText[valueSet.size()];
                int l = 0;
                for (String currEntry : valueSet) {
                    tagTexts[l++] = RtfText.text(currEntry.concat("\n"));
                }
                tagParas.add(RtfPara.p(tagTexts));
            }
        }
        rtf.header(color(204, 0, 0).at(0)).section(tagParas);
    }

    // Close the writer in all cases so the file handle is released even if writing fails;
    // closing a writer that has already been closed has no effect.
    FileWriter writer = new FileWriter(file);
    try {
        rtf.out(writer);
    } finally {
        writer.close();
    }
    logger.info("wrote rtf to: " + file.getAbsolutePath());
}
Aggregations