Search in sources :

Example 1 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method getTextRegions.

/**
 * Collects the text regions of a page.
 * <p>
 * Walks the page's top-level region list once. Every element that is a
 * {@code TextRegionType} is added to the result, and for every element that is a
 * {@code TableRegionType} all of its table cells are added as well.
 * {@code null} elements are skipped.
 *
 * @param pc the page content to inspect
 * @return a new, mutable list; empty when the page has no regions
 */
public static List<TextRegionType> getTextRegions(PcGtsType pc) {
    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    final List<TextRegionType> result = new ArrayList<>();
    if (pageRegions != null) {
        for (RegionType region : pageRegions) {
            // instanceof is false for null, so null entries fall through both checks
            if (region instanceof TextRegionType) {
                result.add((TextRegionType) region);
            }
            if (region instanceof TableRegionType) {
                result.addAll(((TableRegionType) region).getTableCell());
            }
        }
    }
    return result;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) ArrayList(java.util.ArrayList)

Example 2 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method removeExcludedRegions.

/**
 * Removes from the page every top-level region whose id is not contained in {@code regIds}.
 * <p>
 * The region list obtained from the page is modified directly; regions whose id is listed
 * in {@code regIds} are kept in their original order.
 *
 * @param pc the page content whose region list is filtered
 * @param regIds ids of the regions that are kept
 */
public static void removeExcludedRegions(PcGtsType pc, List<String> regIds) {
    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    if (pageRegions == null) {
        return;
    }
    int idx = 0;
    while (idx < pageRegions.size()) {
        final RegionType candidate = pageRegions.get(idx);
        if (regIds.contains(candidate.getId())) {
            // region is listed: keep it and move on
            idx++;
            continue;
        }
        logger.debug("Removing excluded region: " + candidate.getId());
        // the index is not advanced: the following element shifts into this position
        pageRegions.remove(candidate);
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)

Example 3 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class FinereaderUtils method replaceBadChars.

/**
 * Replaces certain systematic errors in the OCR text of a page.
 * <p>
 * For each text region that has text and at least one line, the region text and every
 * line text are rewritten via {@code replaceChars(text, regexRepl)}. Within a line, every
 * word except the one at the last list position is rewritten with the {@code repl} map;
 * the word at the last position is rewritten with {@code regexRepl}.
 * <p>
 * After the replacement the method compares, ignoring spaces, the line text against the
 * concatenated words and the region text against the concatenated lines (before and after
 * replacement). Mismatches are only logged; the overall outcome is logged at info level.
 * <p>
 * Regions and lines without a text equivalent, or whose text equivalent has no unicode
 * content, are skipped.
 *
 * TODO add parameters to pass custom maps from a search/replace dialog!?
 * @param pc the page content to process; its text is modified in place
 * @return the same {@code pc} instance that was passed in
 */
public static PcGtsType replaceBadChars(PcGtsType pc) {
    List<TrpRegionType> regs = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    boolean success = true;
    for (RegionType r : regs) {
        if (!isTextRegion(r)) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;
        // "||" is required here: with "&&" a null TextEquiv was dereferenced by the second operand
        // and a null unicode string was not skipped at all
        if (tr.getTextEquiv() == null || tr.getTextEquiv().getUnicode() == null) {
            // no text at all
            continue;
        }
        final String textblockBefore = tr.getTextEquiv().getUnicode();
        final String textblockAfter = replaceChars(textblockBefore, regexRepl);
        // iterate lines
        List<TextLineType> lines = tr.getTextLine();
        if (lines == null || lines.isEmpty()) {
            // region text is left untouched when there are no lines
            continue;
        }
        // set the region text
        tr.getTextEquiv().setUnicode(textblockAfter);
        // line texts joined with '\n', collected for the region-level consistency check
        StringBuilder linesBefore = new StringBuilder();
        StringBuilder linesAfter = new StringBuilder();
        boolean isFirstLine = true;
        for (TextLineType l : lines) {
            // same null-safe guard as for the region
            if (l.getTextEquiv() == null || l.getTextEquiv().getUnicode() == null) {
                // empty line
                continue;
            }
            final String textlineBefore = l.getTextEquiv().getUnicode();
            final String textlineAfter = replaceChars(textlineBefore, regexRepl);
            linesBefore.append(isFirstLine ? textlineBefore : "\n" + textlineBefore);
            linesAfter.append(isFirstLine ? textlineAfter : "\n" + textlineAfter);
            isFirstLine = false;
            l.getTextEquiv().setUnicode(textlineAfter);
            // iterate words
            List<WordType> words = l.getWord();
            if (words == null || words.isEmpty()) {
                // nothing to do on word level, go on with next line
                continue;
            }
            // word texts joined with ' ', collected for the line-level consistency check
            boolean isFirstWord = true;
            StringBuilder wordsBefore = new StringBuilder();
            StringBuilder wordsAfter = new StringBuilder();
            for (int i = 0; i < words.size(); i++) {
                WordType w = words.get(i);
                if (w.getTextEquiv() == null || w.getTextEquiv().getUnicode() == null) {
                    continue;
                }
                final String wordText = w.getTextEquiv().getUnicode();
                final String wordTextAfter;
                if (i < words.size() - 1) {
                    // use general replacement map for all words
                    wordTextAfter = replaceChars(wordText, repl);
                } else {
                    // use regex map for EOL words
                    wordTextAfter = replaceChars(wordText, regexRepl);
                }
                wordsBefore.append(isFirstWord ? wordText : " " + wordText);
                wordsAfter.append(isFirstWord ? wordTextAfter : " " + wordTextAfter);
                isFirstWord = false;
                w.getTextEquiv().setUnicode(wordTextAfter);
            }
            // line and words must agree (spaces ignored) both before and after replacement
            boolean lineSuccess = textlineBefore.replace(" ", "").equals(wordsBefore.toString().replace(" ", ""));
            lineSuccess &= textlineAfter.replace(" ", "").equals(wordsAfter.toString().replace(" ", ""));
            if (!lineSuccess) {
                logger.debug("Line before: " + textlineBefore);
                logger.debug("Words before : " + wordsBefore.toString());
                logger.debug("Line after: " + textlineAfter);
                logger.debug("Words after : " + wordsAfter.toString());
            }
            success &= lineSuccess;
            // TODO propagate words -> lines -> regions
        }
        // region and lines must agree (spaces ignored) both before and after replacement
        boolean regionSuccess = textblockBefore.replace(" ", "").equals(linesBefore.toString().replace(" ", ""));
        regionSuccess &= textblockAfter.replace(" ", "").equals(linesAfter.toString().replace(" ", ""));
        if (!regionSuccess) {
            logger.debug("\nblock:\n");
            logger.debug(textblockAfter);
            logger.debug("\nblock from lines:\n");
            logger.debug(linesAfter.toString());
        }
        success &= regionSuccess;
    }
    logger.info("Bad character replacement: " + (success ? "SUCCESS" : "FAILURE"));
    // a failed consistency check is deliberately not treated as an error
    return pc;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType)

Example 4 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method addUniformText.

/**
 * Writes the content of all table and text regions of a page into the OCR layer of the
 * content that lies under the page.
 * <p>
 * The page's region list is sorted in place by reading order. Table regions are handed to
 * {@code exportTable}; for text regions a horizontal start position is determined and the
 * region is handed to {@code addUniformTextFromTextRegion}. Other region types are ignored.
 *
 * @param pc the page content to export
 * @param cutoffLeft passed through unchanged to the table/text export helpers
 * @param cutoffTop passed through unchanged to the table/text export helpers
 * @param cache passed through unchanged to the table/text export helpers
 * @throws DocumentException declared by the original signature
 * @throws IOException declared by the original signature
 */
private void addUniformText(PcGtsType pc, int cutoffLeft, int cutoffTop, ExportCache cache) throws DocumentException, IOException {
    final PdfContentByte canvas = writer.getDirectContentUnder();
    canvas.setColorFill(BaseColor.BLACK);
    canvas.setColorStroke(BaseColor.BLACK);
    canvas.beginLayer(ocrLayer);
    canvas.setFontAndSize(bfArial, 10);

    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    /*
     * Sort by reading order: at this point the reading order is the more trustworthy criterion.
     * The other (coordinate based) sorting is not transitive and occasionally fails with a
     * "Comparison violates its general contract" exception.
     */
    Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));

    for (TrpRegionType region : pageRegions) {
        // TODO add paths for tables etc.
        if (region instanceof TrpTableRegionType) {
            exportTable(region, canvas, cutoffLeft, cutoffTop, true, cache);
            continue;
        }
        if (!(region instanceof TrpTextRegionType)) {
            continue;
        }
        final TrpTextRegionType textRegion = (TrpTextRegionType) region;
        final double left = textRegion.getBoundingBox().getMinX();
        final double right = textRegion.getBoundingBox().getMaxX();
        final double width = textRegion.getBoundingBox().getWidth();

        // x position used as the start point for all lines of this region
        float lineStartX;
        if (!isOnlyRegionInThisRow(pageRegions, textRegion)) {
            logger.debug("several columns found, minX of text region is : " + left);
            lineStartX = getAverageBeginningOfBaselines(textRegion);
            lineStartX += 40;
        } else {
            logger.debug("only one region in this row!!");
            // NOTE(review): twelfthPoints presumably holds grid positions across the page - confirm
            if (left < twelfthPoints[1][0] && twelfthPoints[1][0] < right && width > twelfthPoints[2][0]) {
                // indent the start of the text block
                lineStartX = twelfthPoints[1][0];
            } else if (textRegion.getTextLine().size() == 1) {
                // a text region with a single line is probably a headline
                lineStartX = getPrintregionStartX((float) (left), textRegion.getBoundingBox().getMaxX());
            } else if (twelfthPoints[2][0] < right && width > twelfthPoints[3][0]) {
                lineStartX = twelfthPoints[2][0];
            } else {
                lineStartX = (float) left;
            }
        }
        addUniformTextFromTextRegion(textRegion, canvas, cutoffLeft, cutoffTop, bfArial, lineStartX, cache);
    }
    canvas.endLayer();
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) PdfContentByte(com.itextpdf.text.pdf.PdfContentByte) Point(java.awt.Point)

Example 5 with RegionType

use of eu.transkribus.core.model.beans.pagecontent.RegionType in project TranskribusCore by Transkribus.

the class TrpPdfDocument method isOnlyRegionInThisRow.

/**
 * Tells whether {@code regionToCompare} has no other text-bearing text region beside it,
 * i.e. vertically overlapping with it.
 * <p>
 * Another region counts as being in the same row when the vertical centre of its bounding
 * box lies strictly inside the vertical extent of {@code regionToCompare}'s bounding box, or
 * vice versa. Only {@code TextRegionType} elements are considered; the region with the same
 * id as {@code regionToCompare}, regions without lines and regions whose lines are all empty
 * are ignored.
 *
 * @param regions all regions of the page
 * @param regionToCompare the region that is checked against the others
 * @return {@code true} if {@code regions} has exactly one element or no other text-bearing
 *         text region shares the row; {@code false} otherwise
 */
private boolean isOnlyRegionInThisRow(List<TrpRegionType> regions, TextRegionType regionToCompare) {
    final java.awt.Rectangle compareBlock = regionToCompare.getBoundingBox();
    final float compareMinY = (float) compareBlock.getMinY();
    final float compareMaxY = (float) compareBlock.getMaxY();
    final float compareMeanY = compareMinY + (compareMaxY - compareMinY) / 2;
    if (regions.size() == 1) {
        return true;
    }
    for (RegionType r : regions) {
        // TODO add paths for tables etc.
        if (!(r instanceof TextRegionType)) {
            continue;
        }
        // compare ids by value; the former "!=" compared String references only
        if (java.util.Objects.equals(r.getId(), regionToCompare.getId())) {
            continue;
        }
        final TextRegionType tr = (TextRegionType) r;
        // regions without lines, or with only empty lines, can be ignored
        boolean textFound = false;
        for (TextLineType tlt : tr.getTextLine()) {
            if (!((TrpTextLineType) tlt).getUnicodeText().isEmpty()) {
                textFound = true;
                break;
            }
        }
        if (!textFound) {
            continue;
        }
        final java.awt.Rectangle block = tr.getBoundingBox();
        final float minY = (float) block.getMinY();
        final float maxY = (float) block.getMaxY();
        final float meanY = minY + (maxY - minY) / 2;
        // same row if either vertical centre falls strictly inside the other region's extent
        if ((meanY > compareMinY && meanY < compareMaxY) || (compareMeanY > minY && compareMeanY < maxY)) {
            return false;
        }
    }
    return true;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) Rectangle(java.awt.Rectangle) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) UnknownRegionType(eu.transkribus.core.model.beans.pagecontent.UnknownRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)

Aggregations

RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)18 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)16 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)15 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)15 TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)7 TableRegionType (eu.transkribus.core.model.beans.pagecontent.TableRegionType)6 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)5 UnknownRegionType (eu.transkribus.core.model.beans.pagecontent.UnknownRegionType)5 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)4 Point (java.awt.Point)4 Rectangle (java.awt.Rectangle)4 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 PdfContentByte (com.itextpdf.text.pdf.PdfContentByte)2 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)2 ITrpShapeType (eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Image (com.itextpdf.text.Image)1