Search in sources :

Example 1 with GapTag

use of eu.transkribus.core.model.beans.customtags.GapTag in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addTags.

/**
 * For every selected tag name that has at least one tag in the export cache, passes a
 * heading ({@code "<tagname> Tags:"}) and then each distinct tag value to
 * {@code addUniformTagList}, using the writer's under-content layer as target.
 *
 * NOTE(review): this listing is an excerpt. Closing braces, several else-bodies and the
 * right-hand side of the {@code currEntry} assignments are not visible, so only the
 * visible steps are described.
 *
 * @param doc not referenced in the visible lines
 * @param pageIndices not referenced in the visible lines
 * @param useWordLevel2 not referenced in the visible lines
 * @param cache supplies the selected tag names and the tag-to-text map per tag name
 * @throws DocumentException declared; the throwing call is not identifiable from this excerpt
 * @throws IOException declared; the throwing call is not identifiable from this excerpt
 */
public void addTags(TrpDoc doc, Set<Integer> pageIndices, boolean useWordLevel2, ExportCache cache) throws DocumentException, IOException {
    PdfContentByte cb = writer.getDirectContentUnder();
    // l is the running line index used to compute the vertical position of the next entry
    int l = 0;
    float posY;
    // BaseFont bf = BaseFont.createFont(BaseFont.TIMES_ROMAN, "UTF-8", BaseFont.NOT_EMBEDDED, true, null, null);
    Set<String> wantedTags = cache.getOnlySelectedTagnames(CustomTagFactory.getRegisteredTagNames());
    // logger.debug("selectedTags Size " + selectedTags.size());
    for (String currTagname : wantedTags) {
        // line metrics are divided by the vertical scale factor
        double lineHeight = 12 / scaleFactorY;
        double lineGap = 4 / scaleFactorY;
        // logger.debug("currTagname " + currTagname);
        // get all custom tags with currTagname and text
        HashMap<CustomTag, String> allTagsOfThisTagname = cache.getTags(currTagname);
        // logger.debug("all Tags Of This Tagname " + currTagname);
        if (allTagsOfThisTagname.size() > 0) {
            posY = (float) (twelfthPoints[1][1] + (lineHeight + lineGap) * l);
            // wrap back to the top grid row once the heading would pass grid row 10
            if (posY > twelfthPoints[10][1]) {
                posY = twelfthPoints[1][1];
                l = 0;
            String color = CustomTagFactory.getTagColor(currTagname);
            addUniformTagList(lineHeight, twelfthPoints[1][0], posY, "", currTagname + " Tags:", "", cb, 0, 0, bfArial, twelfthPoints[1][0], false, color, 0, false);
            // addUniformStringTest(lineMeanHeight, twelfthPoints[1][0], posY, currTagname + " Tags:", cb, 0, 0, bfArial, twelfthPoints[1][0], false, color, 0);
            Collection<String> valueSet = allTagsOfThisTagname.values();
            Collection<CustomTag> keySet = allTagsOfThisTagname.keySet();
            HashSet<String> uniqueValues = new HashSet<String>();
            Iterator<CustomTag> it = keySet.iterator();
            while (it.hasNext()) {
                // NOTE(review): the right-hand side is missing in this listing — presumably it.next(); confirm
                CustomTag currEntry =;
                String currValue = allTagsOfThisTagname.get(currEntry);
                // case for gap tag
                if (currValue == null) {
                    currValue = "";
                String expansion = "";
                // handles continued tags over several lines
                while (currEntry.isContinued() && it.hasNext()) {
                    // NOTE(review): right-hand side missing here as well — presumably it.next(); confirm
                    currEntry =;
                    if (currEntry.isContinued()) {
                        String continued = allTagsOfThisTagname.get(currEntry);
                        currValue = currValue.concat(continued);
                        // soft hyphen
                        currValue = currValue.replaceAll("\u00AD", "");
                        // minus
                        currValue = currValue.replaceAll("\u002D", "");
                        // not sign
                        currValue = currValue.replaceAll("\u00AC", "");
                    // char c = 0xFFFA; String.valueOf(c).replaceAll("\\p{C}", "?");
                // right-to-left values are reversed before they are handed on
                boolean rtl = false;
                if (!currValue.isEmpty() && textIsRTL(currValue)) {
                    rtl = true;
                    // logger.debug("rtl tag found " + currValue);
                    currValue = reverseString(currValue);
                // searchText keeps the value as it is before any tag-specific rewriting below
                String searchText = currValue;
                if (currTagname.equals(CommentTag.TAG_NAME)) {
                    CommentTag ct = (CommentTag) currEntry;
                    // NOTE(review): != on String compares references, not content; an empty comment
                    // that is not the interned "" passes, and a null comment passes too — confirm intent
                    if (ct.getComment() != "") {
                        if (!rtl)
                            expansion = ": " + ct.getComment();
                            expansion = ct.getComment() + " :";
                // currValue = currValue.concat(": " + ct.getComment());
                // logger.debug("comment " + currValue);
                } else if (currTagname.equals(AbbrevTag.TAG_NAME)) {
                    AbbrevTag at = (AbbrevTag) currEntry;
                    // NOTE(review): same reference comparison as for the comment tag above
                    if (at.getExpansion() != "")
                        if (!rtl)
                            expansion = ": " + at.getExpansion();
                            expansion = at.getExpansion() + " :";
                } else if (currTagname.equals(GapTag.TAG_NAME)) {
                    GapTag at = (GapTag) currEntry;
                    // for gaps the text of the whole shape is used and the supplied text is spliced in
                    currValue = currEntry.getTextOfShape();
                    searchText = currValue;
                    // NOTE(review): Math.max lets offset exceed currValue.length(), in which case the
                    // substring calls below throw StringIndexOutOfBoundsException; Math.min looks intended — confirm
                    int offset = Math.max(at.getOffset(), currValue.length() - 1);
                    String sub1 = currValue.substring(0, offset);
                    String sub2 = currValue.substring(offset);
                    String exp = (String) at.getAttributeValue("supplied");
                    // NOTE(review): exp != "" is a reference comparison as well
                    if (exp != null && exp != "") {
                        currValue = sub1.concat("[" + exp + "]").concat(sub2);
                    // expansion = "[" + (String) at.getAttributeValue("supplied") + "]";
                    } else // no supplied attribute - gap must not be in the tag list
                } else if (currTagname.equals(SuppliedTag.TAG_NAME)) {
                // make sure that similar tags are only exported once
                if (!uniqueValues.contains(currValue)) {
                    posY = (float) (twelfthPoints[1][1] + (lineHeight + lineGap) * l);
                    // NOTE(review): entries wrap at grid row 11 and restart at l = 1, whereas the
                    // heading above wraps at row 10 and restarts at l = 0 — confirm this is deliberate
                    if (posY > twelfthPoints[11][1]) {
                        posY = twelfthPoints[1][1];
                        l = 1;
                    addUniformTagList(lineHeight, twelfthPoints[1][0], posY, searchText, currValue, expansion, cb, 0, 0, bfArial, twelfthPoints[1][0], true, null, 0, rtl);
                    // logger.debug("tag value is " + currValue);
Also used : CustomTag(eu.transkribus.core.model.beans.customtags.CustomTag) Point(java.awt.Point) CommentTag(eu.transkribus.core.model.beans.customtags.CommentTag) GapTag(eu.transkribus.core.model.beans.customtags.GapTag) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte) AbbrevTag(eu.transkribus.core.model.beans.customtags.AbbrevTag) HashSet(java.util.HashSet)

Example 2 with GapTag

use of eu.transkribus.core.model.beans.customtags.GapTag in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method createTagStart.

String createTagStart(CustomTag t) {
    String ts = "";
    if (t instanceof TextStyleTag) {
        // TODO!!
        TextStyleTag tst = (TextStyleTag) t;
        ts = "<hi rend='" + tst.getAttributeCssStr() + "'>";
    } else if (t instanceof AbbrevTag) {
        AbbrevTag at = (AbbrevTag) t;
        ts = "<choice><expan>" + StringEscapeUtils.escapeXml(at.getExpansion()) + "</expan><abbr>";
    } else if (t instanceof PersonTag) {
        PersonTag pt = (PersonTag) t;
        ts = "<persName>";
        if (!StringUtils.isEmpty(pt.getFirstname())) {
            ts += "<forename>" + StringEscapeUtils.escapeXml(pt.getFirstname()) + "</forename>";
        if (!StringUtils.isEmpty(pt.getLastname())) {
            ts += "<surname>" + StringEscapeUtils.escapeXml(pt.getLastname()) + "</surname>";
        if (!StringUtils.isEmpty(pt.getDateOfBirth())) {
            ts += "<birth>" + StringEscapeUtils.escapeXml(pt.getDateOfBirth()) + "</birth>";
        if (!StringUtils.isEmpty(pt.getDateOfBirth())) {
            ts += "<death>" + StringEscapeUtils.escapeXml(pt.getDateOfDeath()) + "</death>";
        if (!StringUtils.isEmpty(pt.getNotice())) {
            ts += "<notice>" + StringEscapeUtils.escapeXml(pt.getNotice()) + "</notice>";
    } else if (t instanceof PlaceTag) {
        PlaceTag pt = (PlaceTag) t;
        ts = "<placeName>";
        if (!StringUtils.isEmpty(pt.getCountry())) {
            ts += "<country>" + StringEscapeUtils.escapeXml(pt.getCountry()) + "</country>";
    } else if (t instanceof OrganizationTag) {
        OrganizationTag ot = (OrganizationTag) t;
        ts = "<orgName>";
    } else if (t instanceof SpeechTag) {
        SpeechTag st = (SpeechTag) t;
        ts = "<sp>";
        if (!StringUtils.isEmpty(st.getSpeaker())) {
            ts += "<speaker>" + StringEscapeUtils.escapeXml(st.getSpeaker()) + "</speaker>";
    } else if (t instanceof GapTag) {
        ts = "<gap />";
    } else // do nothing because comment tag is added at the end of the tag entry as note in the createTagEnd method
    if (t instanceof CommentTag) {
        ts = "";
    } else {
        // general tag
        ts = "<" + t.getTagName();
        for (String an : t.getAttributeNames()) {
            if (CustomTag.isOffsetOrLengthOrContinuedProperty(an))
            Object v = t.getAttributeValue(an);
            if (v != null) {
                ts += " " + StringEscapeUtils.escapeXml(an) + "='" + StringEscapeUtils.escapeXml(v.toString()) + "'";
        ts += ">";
    return ts;
Also used : TextStyleTag(eu.transkribus.core.model.beans.customtags.TextStyleTag) CommentTag(eu.transkribus.core.model.beans.customtags.CommentTag) PlaceTag(eu.transkribus.core.model.beans.customtags.PlaceTag) OrganizationTag(eu.transkribus.core.model.beans.customtags.OrganizationTag) GapTag(eu.transkribus.core.model.beans.customtags.GapTag) PersonTag(eu.transkribus.core.model.beans.customtags.PersonTag) AbbrevTag(eu.transkribus.core.model.beans.customtags.AbbrevTag) SpeechTag(eu.transkribus.core.model.beans.customtags.SpeechTag)

Example 3 with GapTag

use of eu.transkribus.core.model.beans.customtags.GapTag in project TranskribusCore by Transkribus.

the class DocxBuilder method getFormattedTextForShapeElement.

/**
 * Records the positions of the relevant custom tags of one shape element (gaps, comments,
 * unclear spans, abbreviations, supplied text, index entries) and then walks the element's
 * text index by index, creating docx4j text and run objects for each position.
 *
 * NOTE(review): this listing is an excerpt. Closing braces and a number of statements
 * (bodies of several if-branches, the code that fills and attaches the runs) are not
 * visible, so only the visible steps are described. Lines that begin with " * " are the
 * remains of block comments whose delimiters are missing from the listing.
 *
 * @param element shape whose unicode text, custom tag list and text style tags are read
 * @param p paragraph handed on to addIndexEntry
 * @param mdp document part handed on to createFootnote
 * @throws Exception declared broadly; the visible code throws an IOException when the
 *         element has no text or no custom tag list
 */
private static void getFormattedTextForShapeElement(ITrpShapeType element, P p, MainDocumentPart mdp) throws Exception {
    ArrayList<R> listOfallRuns = new ArrayList<R>();
    String textStr = element.getUnicodeText();
    CustomTagList cl = element.getCustomTagList();
    if (textStr == null || cl == null)
        throw new IOException("Element has no text or custom tag list: " + element + ", class: " + element.getClass().getName());
    // NOTE(review): the body of this empty-text branch is not visible in this listing
    if (textStr.isEmpty()) {
    boolean rtl = false;
    // from right to left
    // NOTE(review): charAt(0) throws on an empty string — presumably the branch above returns early; confirm
    if (Character.getDirectionality(textStr.charAt(0)) == Character.DIRECTIONALITY_RIGHT_TO_LEFT || Character.getDirectionality(textStr.charAt(0)) == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC || Character.getDirectionality(textStr.charAt(0)) == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING || Character.getDirectionality(textStr.charAt(0)) == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE) {
        logger.debug("&&&&&&&& STRING IS RTL : ");
        // NOTE(review): the result of this call is discarded; String is immutable, so textStr
        // stays unchanged unless the helper works through other state — confirm
        deleteCharAtIndex(0, textStr);
        rtl = true;
    // format according to tags:CustomTagList
    for (CustomTag nonIndexedTag : cl.getNonIndexedTags()) {
        // exchange chars with * if wished to be blackened
        if (doBlackening && nonIndexedTag.getTagName().equals(RegionTypeUtil.BLACKENING_REGION.toLowerCase())) {
            // logger.debug("nonindexed tag found ");
            textStr = ExportUtils.blackenString(nonIndexedTag, textStr);
			 * for gap and comment: remember their position to find and add them to their corresponding 'run' later on 
        if (nonIndexedTag.getTagName().equals("gap")) {
            GapTag gap = (GapTag) nonIndexedTag;
            gapList.put(nonIndexedTag.getOffset(), gap);
    // unclear and comments can not be non-indexed
    // if (nonIndexedTag.getTagName().equals("comment")){
    // logger.debug("nonindexed comment tag found ");
    // CommentTag ct = (CommentTag) nonIndexedTag;
    // commentList.put(nonIndexedTag.getEnd()-1, ct.getComment());
    // }
    // if(nonIndexedTag.getTagName().equals("unclear")){
    // logger.debug("unclear tag found ");
    // unclearList.put(nonIndexedTag.getOffset(), nonIndexedTag.getOffset()+nonIndexedTag.getLength());
    // }
    // second pass: indexed tags are sorted into per-kind lookup maps keyed by text position
    for (CustomTag indexedTag : cl.getIndexedTags()) {
        if (doBlackening && indexedTag.getTagName().equals(RegionTypeUtil.BLACKENING_REGION.toLowerCase())) {
            textStr = ExportUtils.blackenString(indexedTag, textStr);
			 * find all gaps and store the offset
        if (indexedTag.getTagName().equals("gap")) {
            GapTag gap = (GapTag) indexedTag;
            gapList.put(indexedTag.getOffset(), gap);
        if (indexedTag.getTagName().equals("comment")) {
            // logger.debug("indexed comment tag found at pos " + (indexedTag.getEnd()-1));
            // comments are keyed by the index of the last character they cover
            CommentTag ct = (CommentTag) indexedTag;
            commentList.put(indexedTag.getEnd() - 1, ct.getComment());
        // if(exportTags){
        if (markUnclearWords && indexedTag.getTagName().equals("unclear")) {
            // logger.debug("unclear tag found ");
            // logger.debug("unclear start is: " + indexedTag.getOffset());
            // logger.debug("unclear end is: " + (indexedTag.getEnd()-1));
            unclearList.put(indexedTag.getOffset(), indexedTag.getEnd() - 1);
        if (expandAbbrevs && indexedTag.getTagName().equals("abbrev")) {
            logger.debug("abbrev tag found ");
            AbbrevTag at = (AbbrevTag) indexedTag;
            String expansion = at.getExpansion();
            // only add if an expansion was typed
            if (!expansion.equals("")) {
                expandAbbrevList.put(indexedTag.getEnd(), at.getExpansion());
        if (substituteAbbrevs && indexedTag.getTagName().equals("abbrev")) {
            // logger.debug("abbrev tag found ");
            AbbrevTag at = (AbbrevTag) indexedTag;
            String expansion = at.getExpansion();
            // key is the start of the abbrev
            if (!expansion.equals("")) {
                substituteAbbrevList.put(indexedTag.getOffset(), at);
        if (showSuppliedWithBrackets && indexedTag.getTagName().equals("supplied")) {
            // logger.debug("supplied tag found ");
            SuppliedTag at = (SuppliedTag) indexedTag;
            String text = at.getContainedText();
            // only add if an expansion was typed
            if (!text.equals("")) {
                showSuppliedList.put(indexedTag.getOffset(), text);
        if (ignoreSupplied && indexedTag.getTagName().equals("supplied")) {
            // logger.debug("supplied tag found ");
            SuppliedTag at = (SuppliedTag) indexedTag;
            String text = at.getContainedText();
            // only add if an expansion was typed
            if (!text.equals("")) {
                ignoreSuppliedList.put(indexedTag.getOffset(), text);
        // create index for all choosen tagnames
        if (exportTags && tagnames.contains(indexedTag.getTagName()) && !indexedTag.getTagName().equals("gap")) {
            // logger.debug("export tag as idx entry " + indexedTag.getOffset());
            addValuesToIdxList(idxList, indexedTag.getEnd(), indexedTag);
    // }
    List<TextStyleTag> textStylesTags = element.getTextStyleTags();
    // ArrayList<R> runs = new ArrayList<R>();
    boolean shapeEnded = false;
    // the loop runs one index past the last character (i <= length); the substring further
    // down is guarded accordingly
    for (int i = 0; i <= textStr.length(); ++i) {
        // use of abbrevIdx: this is necessary for the appearance at the end of a textline
        // otherwise the abbrev expansion would not appear at the end of a line because then the index i would be too small
        shapeEnded = (i + 1 >= textStr.length() ? true : false);
			 * is this case the abbrev gets totally replaced by its expansion
			 * so if the start of the abbrev was found the expansion is written and we can break the writing of the abbrev
        if (substituteAbbrevList.containsKey(i)) {
            String exp = substituteAbbrevList.get(i).getExpansion();
            if (rtl) {
                exp = reverseString(exp);
            org.docx4j.wml.Text abbrevText = factory.createText();
            org.docx4j.wml.R abbrevRun = factory.createR();
            // p.getContent().add(abbrevRun);
            // go to end of the abbreviation and proceed with remaining text
            i += substituteAbbrevList.get(i).getLength();
            shapeEnded = (i == textStr.length() ? true : false);
			 * add expansion in brackets behind the abbrev		
			 * the abbrev list contains as key the end index of the abbrev	
        if (expandAbbrevList.containsKey(i)) {
            String exp = expandAbbrevList.get(i);
            if (rtl) {
                exp = reverseString(exp);
            org.docx4j.wml.Text abbrevText = factory.createText();
            abbrevText.setValue("[" + exp + "]");
            org.docx4j.wml.R abbrevRun = factory.createR();
            // p.getContent().add(abbrevRun);
			 * in this case the supplied tag is expanded either with or without brackets
        if (showSuppliedList.containsKey(i)) {
            String exp = showSuppliedList.get(i);
            if (rtl) {
                exp = reverseString(exp);
            org.docx4j.wml.Text suppliedText = factory.createText();
            suppliedText.setValue("[" + exp + "]");
            org.docx4j.wml.R suppliedRun = factory.createR();
            // supplied is handled now - so set i to the end of supplied
            i += showSuppliedList.get(i).length();
            shapeEnded = (i == textStr.length() ? true : false);
			 * in this case the supplied tag gets ignored
			 * this means that index i must be incremented by the length of this supplied tag text
        if (ignoreSuppliedList.containsKey(i)) {
            i += ignoreSuppliedList.get(i).length();
            shapeEnded = (i == textStr.length() ? true : false);
			 * gap is at this position
			 * hence create extra run with [...] as value and then go on
			 * of if suppied attribute is set handle supplied as set in the export settings
        if (gapList.containsKey(i)) {
            org.docx4j.wml.Text t = factory.createText();
            // if (!rtl)
            // t.setValue("[...] ");
            // else
            // t.setValue(" [...]");
            GapTag gt = gapList.get(i);
            String cta = (String) gt.getAttributeValue("supplied");
            // attribute supplied is set in the gap tag -> handle supplied as wanted
            if (cta != null && !cta.equals("")) {
                // may the gap with supplied attribute gets ignored
                if (!ignoreSupplied) {
                    if (showSuppliedWithBrackets) {
                        t.setValue("[" + cta + "]");
                // do not show supplied attribute by default!?
                // else{
                // t.setValue(cta);
                // }
            } else // nothing supplied, so show [...] for the gap tag
            org.docx4j.wml.R run = factory.createR();
            // p.getContent().add(run);
        // begin of unclear word should be marked with [ and end with ]
        if (unclearList.containsKey(i)) {
            org.docx4j.wml.Text t = factory.createText();
            if (!rtl)
            org.docx4j.wml.R run = factory.createR();
            // p.getContent().add(run);
			 * if so we create an index entry for this text string in the docx
        if (idxList.containsKey(i)) {
            addIndexEntry(i, p, textStr, rtl);
        // currText stays empty for the extra iteration at i == textStr.length()
        String currText = "";
        if (i + 1 <= textStr.length()) {
            currText = textStr.substring(i, i + 1);
        // logger.debug("&&&&&&&& current single char : " + currText);
			 * 2nd is (should be) soft hyphen with Unicode U+00AD
			 * First arg is not sign and was initially used for soft hyphen by Diggitexx
			 * need to be at the line end - otherwise 
        if ((currText.equals("¬") || currText.equals("­") || currText.equals("-")) && !preserveLineBreaks && shapeEnded) {
        org.docx4j.wml.Text t = factory.createText();
        org.docx4j.wml.R run = factory.createR();
        // p.getContent().add(run);
        // end of unclear tag
        if (unclearList.containsValue(i)) {
            org.docx4j.wml.Text unclearEnd = factory.createText();
            if (!rtl)
            org.docx4j.wml.R unclearRun = factory.createR();
            // p.getContent().add(unclearRun);
        // the properties of this text section
        org.docx4j.wml.RPr rpr = factory.createRPr();
			 * format according to custom style tag - check for each char in the text if a special style should be set
        for (TextStyleTag styleTag : textStylesTags) {
            // the style applies when i lies inside [offset, offset + length)
            if (i >= styleTag.getOffset() && i < (styleTag.getOffset() + styleTag.getLength())) {
                org.docx4j.wml.BooleanDefaultTrue b = new org.docx4j.wml.BooleanDefaultTrue();
                TextStyleType ts = styleTag.getTextStyle();
                if (ts == null)
                if (CoreUtils.val(ts.isBold())) {
                if (CoreUtils.val(ts.isItalic())) {
                if (CoreUtils.val(ts.isLetterSpaced())) {
                // ????
                if (CoreUtils.val(ts.isMonospace())) {
                // ????
                if (CoreUtils.val(ts.isReverseVideo())) {
                // ????
                if (CoreUtils.val(ts.isSerif())) {
                // ????
                if (CoreUtils.val(ts.isSmallCaps())) {
                if (CoreUtils.val(ts.isStrikethrough())) {
                if (CoreUtils.val(ts.isSubscript())) {
                    org.docx4j.wml.CTVerticalAlignRun al = factory.createCTVerticalAlignRun();
                if (CoreUtils.val(ts.isSuperscript())) {
                    org.docx4j.wml.CTVerticalAlignRun al = factory.createCTVerticalAlignRun();
                if (CoreUtils.val(ts.isUnderlined())) {
                    U u = factory.createU();
            // BooleanDefaultTrue bdt = Context.getWmlObjectFactory().createBooleanDefaultTrue();
            // bdt.setVal(Boolean.TRUE);
            // rpr.setRtl(bdt);
            // rpr.setHighlight(new Highlight());
        // at the run properties (= text styles) to the run
        // find position of footnote/comment
        if (commentList.containsKey(i)) {
            // logger.debug("position of comment: " + i);
            // logger.debug("value of comment: " + commentList.get(i));
            // creates the footnote at the end of the wished text - this position was found at the beginning of this method
            org.docx4j.wml.R fnRun = factory.createR();
            // p.getContent().add(fnRun);
            createFootnote(commentList.get(i), fnRun, mdp);
			 * add space at end of line if line breaks are not preserved
        if (!preserveLineBreaks && shapeEnded) {
            org.docx4j.wml.Text space = factory.createText();
            space.setValue(" ");
            org.docx4j.wml.R runSpace = factory.createR();
            // p.getContent().add(runSpace);
    // runs.add(run);
    if (rtl) {
        PPr paragraphProperties = factory.createPPr();
        Jc justification = factory.createJc();
    // the collected runs are visited from last to first; both branch bodies are missing in this listing
    for (int i = listOfallRuns.size() - 1; i >= 0; i--) {
        if (rtl) {
        } else {
Also used : ArrayList(java.util.ArrayList) CustomTag(eu.transkribus.core.model.beans.customtags.CustomTag) RPr(org.docx4j.wml.RPr) R(org.docx4j.wml.R) U(org.docx4j.wml.U) R(org.docx4j.wml.R) Jc(org.docx4j.wml.Jc) Text(org.docx4j.wml.Text) TextStyleType(eu.transkribus.core.model.beans.pagecontent.TextStyleType) SuppliedTag(eu.transkribus.core.model.beans.customtags.SuppliedTag) CustomTagList(eu.transkribus.core.model.beans.customtags.CustomTagList) IOException(java.io.IOException) CommentTag(eu.transkribus.core.model.beans.customtags.CommentTag) TextStyleTag(eu.transkribus.core.model.beans.customtags.TextStyleTag) PPr(org.docx4j.wml.PPr) GapTag(eu.transkribus.core.model.beans.customtags.GapTag) AbbrevTag(eu.transkribus.core.model.beans.customtags.AbbrevTag)


AbbrevTag (eu.transkribus.core.model.beans.customtags.AbbrevTag)3 CommentTag (eu.transkribus.core.model.beans.customtags.CommentTag)3 GapTag (eu.transkribus.core.model.beans.customtags.GapTag)3 CustomTag (eu.transkribus.core.model.beans.customtags.CustomTag)2 TextStyleTag (eu.transkribus.core.model.beans.customtags.TextStyleTag)2 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)1 CustomTagList (eu.transkribus.core.model.beans.customtags.CustomTagList)1 OrganizationTag (eu.transkribus.core.model.beans.customtags.OrganizationTag)1 PersonTag (eu.transkribus.core.model.beans.customtags.PersonTag)1 PlaceTag (eu.transkribus.core.model.beans.customtags.PlaceTag)1 SpeechTag (eu.transkribus.core.model.beans.customtags.SpeechTag)1 SuppliedTag (eu.transkribus.core.model.beans.customtags.SuppliedTag)1 TextStyleType (eu.transkribus.core.model.beans.pagecontent.TextStyleType)1 Point (java.awt.Point)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 Jc (org.docx4j.wml.Jc)1 PPr (org.docx4j.wml.PPr)1 R (org.docx4j.wml.R)1