Search in sources :

Example 1 with TextRegionType

use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method findLinesByBaseline.

/**
 * Collects, across all text regions of the page, every text line for which
 * {@code doesIntersect(line, baseline)} holds.
 * <p>
 * If more than one line matches, the result is sorted with a
 * {@code TrpElementCoordinatesComparator} before it is returned.
 *
 * @param pc the page whose text regions (as delivered by {@code getTextRegions}) are searched
 * @param baseline the baseline string that is handed to {@code doesIntersect} for every line
 * @return the matching lines; an empty list if no line matched
 */
public static List<TextLineType> findLinesByBaseline(PcGtsType pc, String baseline) {
    final List<TextLineType> hits = new LinkedList<>();
    for (TextRegionType region : getTextRegions(pc)) {
        for (TextLineType line : region.getTextLine()) {
            if (doesIntersect(line, baseline)) {
                hits.add(line);
            }
        }
    }
    // a list with zero or one element is trivially ordered
    if (hits.size() > 1) {
        Collections.sort(hits, new TrpElementCoordinatesComparator<TextLineType>(true));
    }
    return hits;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) FimgStoreReadConnection(eu.transkribus.core.io.FimgStoreReadConnection) TranscriptionLevel(eu.transkribus.core.model.beans.enums.TranscriptionLevel) URL(java.net.URL) Date(java.util.Date) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) Rectangle2D(java.awt.geom.Rectangle2D) TrpPageUnmarshalListener(eu.transkribus.core.model.builder.TrpPageUnmarshalListener) LoggerFactory(org.slf4j.LoggerFactory) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) MarshalException(javax.xml.bind.MarshalException) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) FimgStoreImgMd(org.dea.fimgstoreclient.beans.FimgStoreImgMd) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException) Dimension(java.awt.Dimension) List(java.util.List) TrpElementCoordinatesComparator(eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) SAXException(org.xml.sax.SAXException) TrpObjectFactory(eu.transkribus.core.model.beans.pagecontent_trp.TrpObjectFactory) CustomTagUtil(eu.transkribus.core.model.beans.customtags.CustomTagUtil) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Polygon(java.awt.Polygon) Rectangle(java.awt.Rectangle) TextEquivType(eu.transkribus.core.model.beans.pagecontent.TextEquivType) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TransformerException(javax.xml.transform.TransformerException) CoordsType(eu.transkribus.core.model.beans.pagecontent.CoordsType) TrpPageMarshalListener(eu.transkribus.core.model.builder.TrpPageMarshalListener) Marshaller(javax.xml.bind.Marshaller) HashMap(java.util.HashMap) 
ArrayList(java.util.ArrayList) ValidationEventCollector(javax.xml.bind.util.ValidationEventCollector) TrpTranscriptStatistics(eu.transkribus.core.model.beans.TrpTranscriptStatistics) LinkedList(java.util.LinkedList) TrpPage(eu.transkribus.core.model.beans.TrpPage) JAXBContext(javax.xml.bind.JAXBContext) Unmarshaller(javax.xml.bind.Unmarshaller) Logger(org.slf4j.Logger) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) JAXBElement(javax.xml.bind.JAXBElement) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) FileInputStream(java.io.FileInputStream) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) XmlFormat(eu.transkribus.core.io.formats.XmlFormat) File(java.io.File) MetadataType(eu.transkribus.core.model.beans.pagecontent.MetadataType) StringReader(java.io.StringReader) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) PrintSpaceType(eu.transkribus.core.model.beans.pagecontent.PrintSpaceType) Collections(java.util.Collections) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) ObjectFactory(eu.transkribus.core.model.beans.pagecontent.ObjectFactory) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) InputStream(java.io.InputStream) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpElementCoordinatesComparator(eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator) LinkedList(java.util.LinkedList)

Example 2 with TextRegionType

use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method getTextRegions.

/**
 * Gathers the text regions found among the top-level regions of a page.
 * <p>
 * Every top-level region that is a {@code TextRegionType} is added as is. For every top-level
 * region that is a {@code TableRegionType}, its table cells are added. Null entries are skipped.
 *
 * @param pc the page content to inspect
 * @return a new list with the collected regions in encounter order; empty if the page has no regions
 */
public static List<TextRegionType> getTextRegions(PcGtsType pc) {
    final List<TrpRegionType> pageRegions = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    final List<TextRegionType> collected = new ArrayList<>();
    if (pageRegions == null || pageRegions.isEmpty()) {
        return collected;
    }
    for (RegionType candidate : pageRegions) {
        // instanceof is false for null, so null entries fall through both checks
        if (candidate instanceof TextRegionType) {
            collected.add((TextRegionType) candidate);
        }
        if (candidate instanceof TableRegionType) {
            collected.addAll(((TableRegionType) candidate).getTableCell());
        }
    }
    return collected;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TableRegionType(eu.transkribus.core.model.beans.pagecontent.TableRegionType) ArrayList(java.util.ArrayList)

Example 3 with TextRegionType

use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtils method getLines.

/**
 * Returns all text lines of the page, concatenated region by region in the order in which
 * {@code PageXmlUtils.getTextRegions} delivers the regions.
 *
 * @param pc the page content to read the lines from
 * @return a new list holding the lines of every text region
 */
public static List<TextLineType> getLines(PcGtsType pc) {
    final List<TextLineType> collected = new ArrayList<>();
    PageXmlUtils.getTextRegions(pc).forEach(region -> collected.addAll(region.getTextLine()));
    return collected;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) ArrayList(java.util.ArrayList)

Example 4 with TextRegionType

use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.

the class FinereaderUtils method replaceBadChars.

/**
 * Method for replacing certain systematic errors in OCR Text.
 * Replacements are done with regexes from private static regexReglMap (see above in this class).
 * <p>
 * The page is modified in place: for every text region that has text and at least one line, the
 * region text and each line text are rewritten with {@code replaceChars(text, regexRepl)}. Within a
 * line, the word at the last index is rewritten with {@code regexRepl} (end-of-line word), every
 * other word with the general map {@code repl}.
 * <p>
 * Regions, lines and words without a text equivalent or without Unicode content are skipped.
 * After rewriting, the method checks (ignoring blanks) that the words add up to their line and the
 * lines add up to their region, both before and after replacement. Mismatches are only logged;
 * they do not abort processing.
 *
 * TODO add parameters to pass custom maps from a search/replace dialog!?
 * @param pc the page content whose text is corrected in place
 * @return the same {@code pc} instance that was passed in
 */
public static PcGtsType replaceBadChars(PcGtsType pc) {
    List<TrpRegionType> regs = pc.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
    boolean success = true;
    for (RegionType r : regs) {
        if (!isTextRegion(r)) {
            continue;
        }
        TextRegionType tr = (TextRegionType) r;
        // '||' is required here: with '&&' a null TextEquiv is dereferenced by the second operand,
        // and a null Unicode value slips through and is dereferenced further down.
        if (tr.getTextEquiv() == null || tr.getTextEquiv().getUnicode() == null) {
            // no text at all
            continue;
        }
        final String textblockBefore = tr.getTextEquiv().getUnicode();
        final String textblockAfter = replaceChars(textblockBefore, regexRepl);
        // iterate lines
        List<TextLineType> lines = tr.getTextLine();
        if (lines == null || lines.isEmpty()) {
            // region text is left untouched when the region has no lines
            continue;
        }
        // setRegionText
        tr.getTextEquiv().setUnicode(textblockAfter);
        // line texts joined with '\n', used for the region-level consistency check below
        StringBuilder linesBefore = new StringBuilder();
        StringBuilder linesAfter = new StringBuilder();
        boolean isFirstLine = true;
        for (TextLineType l : lines) {
            // same guard as for regions: skip when either the TextEquiv or its Unicode is missing
            if (l.getTextEquiv() == null || l.getTextEquiv().getUnicode() == null) {
                // empty line
                continue;
            }
            // Build the textRegion for later use
            final String textlineBefore = l.getTextEquiv().getUnicode();
            final String textlineAfter = replaceChars(textlineBefore, regexRepl);
            linesBefore.append(isFirstLine ? textlineBefore : "\n" + textlineBefore);
            linesAfter.append(isFirstLine ? textlineAfter : "\n" + textlineAfter);
            isFirstLine = false;
            l.getTextEquiv().setUnicode(textlineAfter);
            // iterate words
            List<WordType> words = l.getWord();
            if (words == null || words.isEmpty()) {
                // continue with next line
                continue;
            }
            // word texts joined with ' ', used for the line-level consistency check below
            boolean isFirstWord = true;
            StringBuilder wordsBefore = new StringBuilder();
            StringBuilder wordsAfter = new StringBuilder();
            for (int i = 0; i < words.size(); i++) {
                WordType w = words.get(i);
                if (w.getTextEquiv() == null || w.getTextEquiv().getUnicode() == null) {
                    continue;
                }
                final String wordText = w.getTextEquiv().getUnicode();
                final String wordTextAfter;
                if (i < words.size() - 1) {
                    // use general replacement map for all words
                    wordTextAfter = replaceChars(wordText, repl);
                } else {
                    // use regex map for EOL words
                    wordTextAfter = replaceChars(wordText, regexRepl);
                }
                wordsBefore.append(isFirstWord ? wordText : " " + wordText);
                wordsAfter.append(isFirstWord ? wordTextAfter : " " + wordTextAfter);
                isFirstWord = false;
                w.getTextEquiv().setUnicode(wordTextAfter);
            }
            // the concatenated words must equal the line text (blanks ignored), before and after
            boolean lineSuccess = textlineBefore.replace(" ", "").equals(wordsBefore.toString().replace(" ", ""));
            lineSuccess &= textlineAfter.replace(" ", "").equals(wordsAfter.toString().replace(" ", ""));
            if (!lineSuccess) {
                logger.debug("Line before: " + textlineBefore);
                logger.debug("Words before : " + wordsBefore.toString());
                logger.debug("Line after: " + textlineAfter);
                logger.debug("Words after : " + wordsAfter.toString());
            }
            success &= lineSuccess;
            // TODO propagate words -> lines -> regions
        }
        // the concatenated lines must equal the region text (blanks ignored), before and after
        boolean regionSuccess = textblockBefore.replace(" ", "").equals(linesBefore.toString().replace(" ", ""));
        regionSuccess &= textblockAfter.replace(" ", "").equals(linesAfter.toString().replace(" ", ""));
        if (!regionSuccess) {
            logger.debug("\nblock:\n");
            logger.debug(textblockAfter);
            logger.debug("\nblock from lines:\n");
            logger.debug(linesAfter.toString());
        }
        success &= regionSuccess;
    }
    logger.info("Bad character replacement: " + (success ? "SUCCESS" : "FAILURE"));
    // if(!success) throw new IllegalArgumentException();
    return pc;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType)

Example 5 with TextRegionType

use of eu.transkribus.core.model.beans.pagecontent.TextRegionType in project TranskribusCore by Transkribus.

the class PageXmlUtilsTest method testGetTextRegions.

/**
 * Manual check for {@code PageXmlUtils.getTextRegions}: unmarshals a transcript from a fixed URL
 * and prints, for every region returned, its simple class name, its id and its number of lines.
 *
 * @throws Exception if building the URL or unmarshalling the transcript fails
 */
public static void testGetTextRegions() throws Exception {
    final URL source = new URL("https://dbis-thure.uibk.ac.at/f/Get?id=VCLTRLDSWETCXIHQNHKOPRLS");
    final PcGtsType page = PageXmlUtils.unmarshal(source);
    for (TextRegionType region : PageXmlUtils.getTextRegions(page)) {
        String description = "tr: " + region.getClass().getSimpleName();
        description += " id: " + region.getId();
        description += " n-lines: " + region.getTextLine().size();
        System.out.println(description);
    }
}
Also used : TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) URL(java.net.URL)

Aggregations

TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)19 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)16 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)13 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)12 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)9 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)6 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)5 TableRegionType (eu.transkribus.core.model.beans.pagecontent.TableRegionType)5 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)5 UnknownRegionType (eu.transkribus.core.model.beans.pagecontent.UnknownRegionType)4 TrpTableRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType)4 Rectangle (java.awt.Rectangle)4 URL (java.net.URL)4 ArrayList (java.util.ArrayList)4 TrpPage (eu.transkribus.core.model.beans.TrpPage)3 TrpElementCoordinatesComparator (eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator)3 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 TrpTranscriptStatistics (eu.transkribus.core.model.beans.TrpTranscriptStatistics)2 TrpWordType (eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)2 Point (java.awt.Point)2