Search in sources :

Example 1 with TrpPageType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.

the class PageXmlUtils method createPcGtsTypeFromText.

/**
 * Creates a page document for an image and fills it with the given plain text.
 * <p>
 * A single text region covering the whole image is added to the page. Depending on
 * {@code level} the text is stored on the region only (REGION_BASED), on one line
 * element per text line (LINE_BASED), or on one word element per space-separated
 * token of each line (WORD_BASED). Every created region, line and word receives the
 * coordinates of the full image, since no layout information is available.
 *
 * @param imgFileName    image file name, passed on to {@code createEmptyPcGtsType}
 * @param dim            image dimension, passed on to {@code createEmptyPcGtsType}
 * @param text           the text to insert; dereferenced for the line and word levels
 * @param level          granularity of the created elements; {@code null} is treated as LINE_BASED
 * @param skipEmptyLines if {@code true}, consecutive line breaks are collapsed and no
 *                       line element is created for an empty line
 * @return the populated page document
 * @throws IOException if {@code level} is none of REGION_BASED, LINE_BASED or WORD_BASED
 */
public static PcGtsType createPcGtsTypeFromText(final String imgFileName, Dimension dim, String text, TranscriptionLevel level, boolean skipEmptyLines) throws IOException {
    // create empty page
    PcGtsType pcGtsType = createEmptyPcGtsType(imgFileName, dim);
    TrpPageType page = (TrpPageType) pcGtsType.getPage();
    // create and add text region with size of image
    Rectangle r = new Rectangle(0, 0, page.getImageWidth(), page.getImageHeight());
    String defaultCoords = PointStrUtils.pointsToString(r);
    TrpTextRegionType region = new TrpTextRegionType(page);
    region.setId("region_1");
    region.setCoordinates(defaultCoords, null);
    page.getTextRegionOrImageRegionOrLineDrawingRegion().add(region);
    if (level == null) {
        level = TranscriptionLevel.LINE_BASED;
    }
    if (level != TranscriptionLevel.REGION_BASED && level != TranscriptionLevel.LINE_BASED && level != TranscriptionLevel.WORD_BASED) {
        throw new IOException("Invalide TranscriptionLevel: " + level);
    }
    if (level == TranscriptionLevel.REGION_BASED) {
        region.setUnicodeText(text, null);
    } else {
        String splitRegex = skipEmptyLines ? "[\\r\\n]+" : "\\r?\\n";
        String[] lines = text.split(splitRegex);
        // raw split count; may include an empty entry that is skipped below
        logger.debug("nr of lines = " + lines.length);
        int lc = 1;
        for (String lineText : lines) {
            // String.split keeps a leading empty string when the text starts with a
            // line break and returns {""} for empty text, so collapsing the breaks in
            // the regex alone does not remove every empty line.
            if (skipEmptyLines && lineText.isEmpty()) {
                continue;
            }
            TrpTextLineType line = new TrpTextLineType(region);
            line.setId("line_" + (lc++));
            line.setCoordinates(defaultCoords, null);
            region.getTextLine().add(line);
            if (level == TranscriptionLevel.LINE_BASED) {
                line.setUnicodeText(lineText, null);
            } else if (level == TranscriptionLevel.WORD_BASED) {
                int wc = 1;
                for (String wordText : lineText.split(" ")) {
                    // TODO: better word splitting?? (consecutive spaces currently yield empty words)
                    TrpWordType word = new TrpWordType(line);
                    word.setId("word_" + (wc++));
                    word.setCoordinates(defaultCoords, null);
                    word.setUnicodeText(wordText, null);
                    line.getWord().add(word);
                }
            }
        }
    }
    return pcGtsType;
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) Rectangle(java.awt.Rectangle) IOException(java.io.IOException) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 2 with TrpPageType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.

the class PageXmlUtils method createEmptyPcGtsType.

/**
 * Builds a page document without any content for an image of the given size.
 * <p>
 * The metadata names "TRP" as creator and uses the current time for both the
 * creation and the last-change timestamp. The page element is a {@link TrpPageType}
 * carrying the image file name and dimensions.
 *
 * @param imgFileName file name stored on the page element
 * @param xDim        image width stored on the page element
 * @param yDim        image height stored on the page element
 * @return the new root element holding the metadata and the empty page
 */
public static PcGtsType createEmptyPcGtsType(final String imgFileName, final int xDim, final int yDim) {
    // metadata: one shared timestamp for "created" and "last change"
    final XMLGregorianCalendar now = JaxbUtils.getXmlCalendar(new Date());
    final MetadataType metadata = new MetadataType();
    metadata.setCreator("TRP");
    metadata.setCreated(now);
    metadata.setLastChange(now);

    // the page must be the TRP subtype, not the plain page type
    final TrpPageType trpPage = new TrpPageType();
    trpPage.setImageFilename(imgFileName);
    trpPage.setImageHeight(yDim);
    trpPage.setImageWidth(xDim);

    // assemble the root element
    final PcGtsType root = new PcGtsType();
    root.setMetadata(metadata);
    root.setPage(trpPage);
    return root;
}
Also used : XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) MetadataType(eu.transkribus.core.model.beans.pagecontent.MetadataType) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Date(java.util.Date) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 3 with TrpPageType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.

the class CustomTagListTest method testSimpleAddOrMergeTagWithTextStyles.

// @Ignore
/**
 * Adds three overlapping text style tags to a line one after another and checks
 * the number of tags in the list after each merge as well as the range covered
 * by the first and the last tag.
 */
@Test
public void testSimpleAddOrMergeTagWithTextStyles() {
    // a line with text, attached to a fresh region and page
    TrpTextRegionType parentRegion = new TrpTextRegionType(new TrpPageType());
    TrpTextLineType textLine = new TrpTextLineType(parentRegion);
    textLine.setUnicodeText("Hello world!", null);
    CustomTagList tagList = new CustomTagList(textLine);

    // base style: font family on offset 0, length 10
    TextStyleTag fontTag = new TextStyleTag(0, 10);
    fontTag.setFontFamily("testFont");
    tagList.addOrMergeTag(fontTag, null);

    // bold on offset 2, length 5
    TextStyleTag boldTag = new TextStyleTag(2, 5);
    boldTag.setBold(true);
    tagList.addOrMergeTag(boldTag, null);
    logger.trace(tagList.toString());
    Assert.assertEquals("Nr of text styles must be 3!", 3, tagList.getTags().size());

    // italic on offset 3, length 4
    TextStyleTag italicTag = new TextStyleTag(3, 4);
    italicTag.setItalic(true);
    tagList.addOrMergeTag(italicTag, null);
    Assert.assertEquals("Nr of text styles must be 4!", 4, tagList.getTags().size());
    logger.trace(tagList.toString());
    // Assert.assertEquals("Nr of text styles must be 5!", 5, tl.getTags().size());

    // the tags must still start at 0 and end at 10
    CustomTag first = tagList.getTags().get(0);
    Assert.assertTrue("offset = 0", first.getOffset() == 0);
    int lastIndex = tagList.getTags().size() - 1;
    CustomTag last = tagList.getTags().get(lastIndex);
    Assert.assertTrue("offset+length = 10", (last.getOffset() + last.getLength()) == 10);
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextStyleTag(eu.transkribus.core.model.beans.customtags.TextStyleTag) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) CustomTag(eu.transkribus.core.model.beans.customtags.CustomTag) CustomTagList(eu.transkribus.core.model.beans.customtags.CustomTagList) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) Test(org.junit.Test)

Example 4 with TrpPageType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.

the class Pdf2TrpDoc method main.

/**
 * Development entry point: creates one page XML file per page image from the
 * text regions, lines and strings extracted from a PDF.
 * <p>
 * The image directory is currently hard-coded; the XML files are written to its
 * "page" subdirectory, named after the image key. Nothing is written if the number
 * of images differs from the number of extracted PDF pages.
 *
 * @param args exactly one element: the path of the PDF file to process
 */
public static void main(String[] args) {
    if (args.length != 1) {
        logger.error("Expected exactly one argument: the path of the PDF file to process");
        return;
    }
    File in = new File(args[0]);
    File outDir = new File("/tmp/");
    outDir.mkdirs();
    try {
        // Image extraction is disabled; the images are expected to exist already.
        // PageImageWriter imgWriter = new PageImageWriter();
        // String imgDirPath = imgWriter.extractImages(in.getAbsolutePath(), outDir.getAbsolutePath());
        // NOTE(review): the directory name below contains what looks like a mis-encoded
        // character - confirm the intended path before relying on it.
        String imgDirPath = "/tmp/KurzgefaƟte_Geschichte_Statistik_und_Topographie_von_Tirol";
        File pageDir = new File(imgDirPath + File.separator + "page");
        pageDir.mkdirs();
        TreeMap<String, File> imgs = LocalDocReader.findImgFiles(new File(imgDirPath));
        ArrayList<PDFPage> pages = PDFTextExtractor.processPDF(in.getAbsolutePath());
        // images and PDF pages are paired by position, so the counts have to agree
        if (imgs.size() != pages.size()) {
            logger.error("Nr. of image files does not match nr. of text pages!");
            return;
        }
        int i = 0;
        for (Entry<String, File> img : imgs.entrySet()) {
            PDFPage pdfPage = pages.get(i++);
            Dimension dim = ImgUtils.readImageDimensions(img.getValue());
            PcGtsType pc = PageXmlUtils.createEmptyPcGtsType(img.getValue(), dim);
            final File xmlOut = new File(pageDir.getAbsolutePath() + File.separator + img.getKey() + ".xml");
            Rectangle printspace = pdfPage.getContentRect();
            // NOTE(review): regions are only transferred when the page has a content
            // rectangle; otherwise an empty page is written - confirm this is intended.
            if (printspace != null) {
                TrpPrintSpaceType psType = new TrpPrintSpaceType();
                psType.setCoords(rect2Coords(printspace));
                TrpPageType pageType = (TrpPageType) pc.getPage();
                // ((ITrpShapeType) pageType).getObservable().setActive(false);
                pageType.setPrintSpace(psType);
                // mirror the PDF hierarchy region -> line -> string as region -> line -> word
                for (PDFRegion r : pdfPage.regions) {
                    TrpTextRegionType rType = new TrpTextRegionType(pageType);
                    rType.setCoords(rect2Coords(r.getRect()));
                    rType.setUnicodeText(r.getText(), null);
                    for (PDFLine l : r.lines) {
                        TrpTextLineType lType = new TrpTextLineType(rType);
                        lType.setCoords(rect2Coords(l.getRect()));
                        lType.setUnicodeText(l.getText(), null);
                        for (PDFString s : l.strings) {
                            TrpWordType wType = new TrpWordType(lType);
                            wType.setCoords(rect2Coords(s.getRect()));
                            wType.setUnicodeText(s.value, null);
                            lType.getWord().add(wType);
                        }
                        rType.getTextLine().add(lType);
                    }
                    pageType.getRegions().add(rType);
                }
            }
            PageXmlUtils.marshalToFile(pc, xmlOut);
        }
    } catch (Exception e) {
        // report through the class logger, with the cause, instead of dumping to stderr
        logger.error("Could not create page XML files for " + in.getAbsolutePath(), e);
    }
}
Also used : PDFString(org.dea.util.pdf.beans.PDFString) Rectangle(java.awt.Rectangle) Dimension(java.awt.Dimension) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) PDFRegion(org.dea.util.pdf.beans.PDFRegion) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) PDFLine(org.dea.util.pdf.beans.PDFLine) File(java.io.File) PDFPage(org.dea.util.pdf.beans.PDFPage) TrpPrintSpaceType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPrintSpaceType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 5 with TrpPageType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType in project TranskribusCore by Transkribus.

the class TrpRtfBuilder method writeRtfForDoc.

/**
 * Exports the current transcripts of a document to an RTF file.
 * <p>
 * One RTF section is appended per exported page. If {@code writeTags} is set, a
 * final section lists, for every tag name selected in {@code cache}, the number of
 * tags found and their values. The export options are stored in the static fields
 * {@code exportTags}, {@code tagnames} and {@code doBlackening} of this class
 * before any page is processed.
 * <p>
 * If the monitor reports cancellation, the method returns immediately and no file
 * is written.
 *
 * @param doc          the document whose pages are exported
 * @param wordBased    passed on to {@code getRtfParagraphsForTranscript}
 * @param writeTags    whether to append the tag overview section
 * @param doBlackening stored in the static {@code doBlackening} field
 * @param file         the RTF file to write
 * @param pageIndices  0-based indices of the pages to export; {@code null} exports all pages
 * @param monitor      progress monitor, may be {@code null}
 * @param cache        supplies the selected tag names and the tags per name; must not be {@code null}
 * @throws JAXBException declared by this method; presumably raised while building a page transcript
 * @throws IOException   if the output file cannot be opened or closed
 */
public static void writeRtfForDoc(TrpDoc doc, boolean wordBased, boolean writeTags, boolean doBlackening, File file, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws JAXBException, IOException {
    exportTags = writeTags;
    // only the tags chosen by the user, not all registered tag names
    tagnames = cache.getSelectedTags();
    TrpRtfBuilder.doBlackening = doBlackening;
    Rtf rtf = Rtf.rtf();
    List<TrpPage> pages = doc.getPages();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to RTF", totalPages);
    }
    // c counts the pages actually exported, i is the index within the document
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i))
            continue;
        if (monitor != null) {
            if (monitor.isCanceled()) {
                logger.debug("RTF export cancelled!");
                return;
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        TrpTranscriptMetadata md = page.getCurrentTranscript();
        JAXBPageTranscript tr = new JAXBPageTranscript(md);
        tr.build();
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing rtf for page " + (i + 1) + "/" + doc.getNPages());
        // colour table: index 0 = (204, 0, 0), index 1 = (0, 255, 0)
        rtf.header(color(204, 0, 0).at(0), color(0, 0xff, 0).at(1)).section(getRtfParagraphsForTranscript(trpPage, wordBased));
        ++c;
        if (monitor != null) {
            // NOTE(review): passes the running total, not an increment - confirm
            // this matches the contract of the IProgressMonitor in use.
            monitor.worked(c);
        }
    }
    // write tags at end of last page
    if (exportTags) {
        ArrayList<RtfPara> tagParas = new ArrayList<RtfPara>();
        // tagnames = all user chosen tags via export dialog
        for (String currTagname : tagnames) {
            // get all custom tags with currTagname and text
            HashMap<CustomTag, String> allTagsOfThisTagname = cache.getTags(currTagname);
            if (allTagsOfThisTagname.size() > 0) {
                // underlined headline with the tag name and the number of tags found
                tagParas.add(RtfPara.p(RtfText.text(RtfText.underline(currTagname + " tags in this document: " + allTagsOfThisTagname.size()))));
                // one text element per tag value, each terminated by a line break
                Collection<String> valueSet = allTagsOfThisTagname.values();
                RtfText[] tagTexts = new RtfText[valueSet.size()];
                int l = 0;
                for (String currEntry : valueSet) {
                    tagTexts[l++] = RtfText.text(currEntry.concat("\n"));
                }
                tagParas.add(RtfPara.p(tagTexts));
            }
        }
        rtf.header(color(204, 0, 0).at(0)).section(tagParas);
    }
    // Close the writer here so that the file handle is released even if writing
    // fails part-way; a second close() by the library, if any, is harmless.
    FileWriter writer = new FileWriter(file);
    try {
        rtf.out(writer);
    } finally {
        writer.close();
    }
    logger.info("wrote rtf to: " + file.getAbsolutePath());
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) Rtf(com.tutego.jrtf.Rtf) TrpPage(eu.transkribus.core.model.beans.TrpPage) RtfText(com.tutego.jrtf.RtfText) FileWriter(java.io.FileWriter) ArrayList(java.util.ArrayList) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) CustomTag(eu.transkribus.core.model.beans.customtags.CustomTag) RtfPara(com.tutego.jrtf.RtfPara) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Aggregations

TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)15 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)9 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)9 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)7 TrpPage (eu.transkribus.core.model.beans.TrpPage)7 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)7 TrpWordType (eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)5 CustomTag (eu.transkribus.core.model.beans.customtags.CustomTag)4 CustomTagList (eu.transkribus.core.model.beans.customtags.CustomTagList)4 TextStyleTag (eu.transkribus.core.model.beans.customtags.TextStyleTag)4 Test (org.junit.Test)4 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)3 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)3 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)3 IOException (java.io.IOException)3 Rtf (com.tutego.jrtf.Rtf)2 Rectangle (java.awt.Rectangle)2 FileOutputStream (java.io.FileOutputStream)2 FileWriter (java.io.FileWriter)2 ArrayList (java.util.ArrayList)2