Search in sources :

Example 16 with TrpTextLineType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.

the class DocxBuilder method exportTextRegion.

/**
 * Writes the text of every line of a text region into a docx paragraph.
 *
 * <p>Each line is rendered either from its words (when {@code wordBased} is set and the
 * line has words) or from the line shape itself. A line whose custom tag list contains
 * a paragraph tag closes the current paragraph and opens a new one; otherwise, when
 * {@code preserveLineBreaks} is set, a line break is inserted after every line except
 * the last one of the region.
 *
 * @param tr        the text region whose lines are exported
 * @param wordBased whether to render lines from their words when words are present
 * @param p         the paragraph to append to; a new one is created and added to
 *                  {@code mdp} when this is {@code null}
 * @param mdp       the main document part that receives newly created paragraphs
 */
private static void exportTextRegion(TrpTextRegionType tr, boolean wordBased, P p, MainDocumentPart mdp) {
    // No paragraph handed in: open one and attach it to the document.
    if (p == null) {
        p = factory.createP();
        mdp.addObject(p);
    }
    List<TextLineType> textLines = tr.getTextLine();
    for (int lineIdx = 0; lineIdx < textLines.size(); lineIdx++) {
        TrpTextLineType line = (TrpTextLineType) textLines.get(lineIdx);
        try {
            // The word check stays inside the try so that any failure while
            // rendering a line is reported and the remaining lines are still exported.
            boolean renderFromWords = wordBased && line.getWord().size() > 0;
            if (renderFromWords) {
                getFormattedTextForLineElement(line.getWord(), p, mdp);
            } else {
                getFormattedTextForShapeElement(line, p, mdp);
            }
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        // With the pilcrow sign (unicode \u00B6) the user can mark a new paragraph
        // inside a text region; subsequent lines then go into a fresh paragraph.
        if (line.getCustomTagList().containsParagraphTag()) {
            p = factory.createP();
            mdp.addObject(p);
            continue;
        }

        // Add a line break after each text line, unless dense lines are explicitly
        // wished; never add one at the end of the paragraph.
        if (preserveLineBreaks && lineIdx + 1 != textLines.size()) {
            // This Br element ends the current line so the next one starts on a new line.
            Br lineBreak = factory.createBr();
            p.getContent().add(lineBreak);
        }
    }
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) Br(org.docx4j.wml.Br) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) JAXBException(javax.xml.bind.JAXBException) IOException(java.io.IOException) Docx4JException(org.docx4j.openpackaging.exceptions.Docx4JException)

Example 17 with TrpTextLineType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.

the class TrpRtfBuilder method getRtfParagraphsForTranscript.

// public static void writeRtfForElement(Rtf rtf, ITrpShapeType element, boolean wordBased, File file, boolean append) throws IOException, JAXBException {
// element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
// 
// RtfText text = RtfText.text(element.getUnicodeText());
// text = formatRtfText(text, element.getTextStyle());
// 
// 
// 
// if (element instanceof TextLineType || element instanceof TextRegionType) {// TODO words vs lines and regions
// rtf.p(text);
// } else if (element instanceof TrpWordType) {
// //			rtf.p(texts);
// }
// 
// 
// //		cl.getCustomTagAndContinuations(tag)
// 
// 
// 
// }
/**
 * Builds one RTF paragraph per text region of the given page.
 *
 * <p>For every region returned by {@code trpPage.getTextRegions(true)}, each text line is
 * converted to an {@link RtfText} (from its words when {@code wordBased} is set and the
 * line has words, otherwise from the line itself) and terminated with {@code "\n"}. The
 * resulting texts of a region form one {@link RtfPara}.
 *
 * <p>NOTE(review): regions nested in table regions are not included here; the original
 * code carried a commented-out alternative
 * ({@code getTextRegionsAndTextRegionsFromTableRegions(true)}) for that.
 *
 * @param trpPage   the page whose text regions are exported
 * @param wordBased whether to build line text from words when a line has words
 * @return an array with one paragraph per text region, in the order the regions are returned
 * @throws IOException   declared for callers; presumably raised by the text helpers
 * @throws JAXBException declared for callers; presumably raised by the text helpers
 */
public static RtfPara[] getRtfParagraphsForTranscript(TrpPageType trpPage, boolean wordBased) throws IOException, JAXBException {
    List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
    RtfPara[] paras = new RtfPara[textRegions.size()];
    for (int j = 0; j < textRegions.size(); ++j) {
        List<TextLineType> lines = textRegions.get(j).getTextLine();
        RtfText[] linesTexts = new RtfText[lines.size()];
        for (int i = 0; i < lines.size(); ++i) {
            TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
            RtfText lineText = (wordBased && trpL.getWord().size() > 0)
                    ? getRtfTextForLineFromWords(trpL)
                    : getRtfTextForShapeElement(trpL);
            // Terminate every line with a newline inside the paragraph.
            linesTexts[i] = RtfText.text(lineText, "\n");
        }
        // Right-to-left alignment was never implemented (the flag was constantly false
        // and its branch empty), so every region becomes a plain paragraph.
        paras[j] = RtfPara.p(linesTexts);
    }
    return paras;
}
Also used : RtfPara(com.tutego.jrtf.RtfPara) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) RtfText(com.tutego.jrtf.RtfText)

Example 18 with TrpTextLineType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.

the class TrpTxtBuilder method writeTxtForSinglePage.

/**
 * Appends the plain text of one page to a text file.
 *
 * <p>The page's regions are sorted with {@code TrpElementReadingOrderComparator} and
 * walked in that order. Table regions are skipped (no plain-text representation yet).
 * For each text region every line contributes either the text of each of its words
 * (when {@code wordBased} is set and the line has words) or its own text if that text is
 * non-empty. After each text region that has at least one line, a line separator entry
 * is added. The collected entries are written with {@code Files.write} in
 * {@code CREATE}/{@code APPEND} mode.
 *
 * @param file               the target file; created if missing, otherwise appended to
 * @param trpPage            the page to export
 * @param wordBased          whether to export word texts instead of line texts where words exist
 * @param preserveLineBreaks currently unused by this method; kept for interface compatibility
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    List<String> content = new ArrayList<String>();
    for (int j = 0; j < regions.size(); ++j) {
        TrpRegionType r = regions.get(j);
        if (r instanceof TrpTableRegionType) {
            // TODO: for simple txt export: how to handle tables
            continue;
        } else if (r instanceof TrpTextRegionType) {
            TrpTextRegionType tr = (TrpTextRegionType) r;
            List<TextLineType> lines = tr.getTextLine();
            for (int i = 0; i < lines.size(); ++i) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(i);
                String textOfCurrLine = trpL.getUnicodeText();
                if (wordBased && trpL.getWord().size() > 0) {
                    for (WordType word : trpL.getWord()) {
                        content.add(((ITrpShapeType) word).getUnicodeText());
                    }
                } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                    // Compare by content: the former reference comparison against ""
                    // let empty (and null) line texts slip into the output.
                    content.add(textOfCurrLine);
                }
            }
            if (lines.size() > 0) {
                // Separate regions from each other in the output.
                content.add(System.lineSeparator());
            }
        }
    }
    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // Best effort as before (the export continues), but report through the logger
        // instead of dumping the stack trace to stderr.
        logger.error("Could not write txt export to " + file.getAbsolutePath(), e);
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ITrpShapeType(eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) ArrayList(java.util.ArrayList) List(java.util.List)

Example 19 with TrpTextLineType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.

the class KlosterTeiToPageParser method parsePage.

/**
 * Converts the content following one TEI {@code pb} (page break) node into a PAGE XML page.
 *
 * <p>Page data is read from the {@code facs}, {@code n}, {@code xmlns:h} and
 * {@code xmlns:w} attributes of the {@code pb} element. All following siblings up to the
 * next node named {@code pb} are scanned; every {@code lb} element becomes a text line
 * whose bounding box comes from its {@code xmlns:x/y/w/h} attributes and whose text is
 * the text content of the node directly after the {@code lb}. All lines go into a single
 * region ({@code region_1}) whose coordinates are the bounding box of its lines, or the
 * whole page when there are no lines.
 *
 * <p>When {@code save} is set and at least one line was found, the page is marshalled to
 * {@code PAGE_DIR} and the image is copied from {@code DIR} to {@code DST_DIR}.
 *
 * <p>NOTE(review): the page is now created with the actual image file name; the original
 * passed the string literal {@code "imgfn"}. Confirm that the first argument of
 * {@code PageXmlUtils.createEmptyPcGtsType} is indeed the image file name.
 *
 * @param pbNode the {@code pb} node starting the page; must be an {@link Element}
 * @param save   whether to write the resulting page XML and copy the image
 * @throws IOException   declared for callers; file writing and copying happen here
 * @throws JAXBException declared for callers; the page is marshalled here
 * @throws NumberFormatException if one of the numeric attributes is missing or not an integer
 */
static void parsePage(Node pbNode, boolean save) throws IOException, JAXBException {
    Element pb = (Element) pbNode;
    String imgFn = pb.getAttribute("facs");
    int pageN = Integer.parseInt(pb.getAttribute("n"));
    int pageHeight = Integer.parseInt(pb.getAttribute("xmlns:h"));
    int pageWidth = Integer.parseInt(pb.getAttribute("xmlns:w"));
    PcGtsType page = PageXmlUtils.createEmptyPcGtsType(imgFn, pageWidth, pageHeight);
    TrpTextRegionType region = new TrpTextRegionType();
    region.setId("region_1");
    System.out.println("page data: imgFn = " + imgFn + " n = " + pageN + " pageWidth = " + pageWidth + " pageHeight = " + pageHeight);
    // Running bounding box over all lines of the page.
    int minX = 999999, minY = 999999, maxX = -1, maxY = -1;
    Node sibling = pbNode.getNextSibling();
    int lineCount = 0;
    while (sibling != null) {
        // The next page break ends this page.
        if (sibling.getNodeName().equals("pb")) {
            break;
        }
        if (sibling.getNodeType() == Node.ELEMENT_NODE && sibling.getNodeName().equals("lb")) {
            Element lb = (Element) sibling;
            int n = Integer.parseInt(lb.getAttribute("n"));
            int x = Integer.parseInt(lb.getAttribute("xmlns:x"));
            int y = Integer.parseInt(lb.getAttribute("xmlns:y"));
            int w = Integer.parseInt(lb.getAttribute("xmlns:w"));
            int h = Integer.parseInt(lb.getAttribute("xmlns:h"));
            if (x < minX) {
                minX = x;
            }
            if (y < minY) {
                minY = y;
            }
            if (x + w > maxX) {
                maxX = x + w;
            }
            if (y + h > maxY) {
                maxY = y + h;
            }
            // The line text is the node right after the <lb/>; an <lb/> that is the
            // last sibling has no such node and yields an empty line text.
            Node textNode = sibling.getNextSibling();
            String txt = textNode == null ? "" : textNode.getTextContent();
            txt = StringUtils.stripEnd(txt, " \r\n");
            System.out.format("line: n = %d, txt = %s, coords = [%d,%d,%d,%d]\n", n, txt, x, y, w, h);
            TrpTextLineType line = new TrpTextLineType();
            line.setCoords(bbToCoords(x, y, w, h));
            TextEquivType te = new TextEquivType();
            te.setUnicode(txt);
            line.setTextEquiv(te);
            line.setId("line_" + (++lineCount));
            // Create a horizontal baseline at 70% of the line height.
            TrpBaselineType bl = new TrpBaselineType();
            int yBl = (int) (y + 0.7 * h);
            bl.setPoints(x + "," + yBl + " " + (x + w) + "," + yBl);
            line.setBaseline(bl);
            region.getTextLine().add(line);
        }
        sibling = sibling.getNextSibling();
    }
    if (!region.getTextLine().isEmpty()) {
        region.setCoords(bbToCoords(minX, minY, maxX - minX, maxY - minY));
    } else {
        // No lines: let the region span the whole page.
        region.setCoords(bbToCoords(0, 0, pageWidth, pageHeight));
    }
    page.getPage().getTextRegionOrImageRegionOrLineDrawingRegion().add(region);
    if (save && !region.getTextLine().isEmpty()) {
        File xmlFile = new File(PAGE_DIR + FilenameUtils.getBaseName(imgFn) + ".xml");
        PageXmlUtils.marshalToFile(page, xmlFile);
        FileUtils.copyFile(new File(DIR + imgFn), new File(DST_DIR + imgFn));
        System.out.println("written page to: " + xmlFile.getAbsolutePath());
    }
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpBaselineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpBaselineType) TextEquivType(eu.transkribus.core.model.beans.pagecontent.TextEquivType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) Element(org.w3c.dom.Element) Node(org.w3c.dom.Node) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) File(java.io.File)

Example 20 with TrpTextLineType

use of eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType in project TranskribusCore by Transkribus.

the class PageXmlUtils method removeAllIndexedTags.

/**
 * Removes the indexed tags from the custom tag lists of all text regions, their text
 * lines and the words of those lines on the given page.
 *
 * <p>Regions are obtained via {@code getTextRegions(true)} on the page, which is cast to
 * {@link TrpPageType}; lines and words are cast to their {@code Trp*} counterparts.
 *
 * @param pc the page document whose tags are cleaned in place
 */
public static void removeAllIndexedTags(PcGtsType pc) {
    TrpPageType page = (TrpPageType) pc.getPage();
    for (TrpTextRegionType region : page.getTextRegions(true)) {
        // Region level first, then descend into lines and words.
        region.getCustomTagList().removeIndexedTags();
        for (TextLineType rawLine : region.getTextLine()) {
            TrpTextLineType line = (TrpTextLineType) rawLine;
            line.getCustomTagList().removeIndexedTags();
            for (WordType rawWord : line.getWord()) {
                ((TrpWordType) rawWord).getCustomTagList().removeIndexedTags();
            }
        }
    }
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Aggregations

TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)22 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)16 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)12 TrpWordType (eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)11 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)9 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)8 Rectangle (java.awt.Rectangle)7 TextStyleTag (eu.transkribus.core.model.beans.customtags.TextStyleTag)5 IOException (java.io.IOException)5 CustomTag (eu.transkribus.core.model.beans.customtags.CustomTag)4 CustomTagList (eu.transkribus.core.model.beans.customtags.CustomTagList)4 TrpBaselineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpBaselineType)4 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)3 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)3 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)3 TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)3 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)2 TrpPage (eu.transkribus.core.model.beans.TrpPage)2