Search in sources :

Example 11 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpTxtBuilder method writeTxtForSinglePage.

/**
 * Appends the text of a single page to the given text file.
 * <p>
 * The page's regions are sorted with a {@code TrpElementReadingOrderComparator} and then visited in
 * that order. Table regions are skipped (plain-text export of tables is not handled yet). For every
 * text region each line contributes either one entry per word (if {@code wordBased} is set and the
 * line has words) or the line text itself; a line separator entry is added after every text region
 * that contains at least one line. All collected entries are appended to {@code file} in one write.
 *
 * @param file               target file; created if missing, otherwise appended to
 * @param trpPage            page whose regions are exported
 * @param wordBased          if {@code true}, lines that contain words are written word by word
 * @param preserveLineBreaks currently unused by this method
 */
private static void writeTxtForSinglePage(File file, TrpPageType trpPage, boolean wordBased, boolean preserveLineBreaks) {
    // TrpTableRegionType is contained in the regions too
    List<TrpRegionType> regions = trpPage.getRegions();
    Collections.sort(regions, new TrpElementReadingOrderComparator<RegionType>(true));
    List<String> content = new ArrayList<String>();
    for (TrpRegionType r : regions) {
        if (r instanceof TrpTableRegionType) {
            // TODO: for simple txt export: how to handle tables
            continue;
        }
        if (!(r instanceof TrpTextRegionType)) {
            continue;
        }
        TrpTextRegionType tr = (TrpTextRegionType) r;
        List<TextLineType> lines = tr.getTextLine();
        for (TextLineType line : lines) {
            TrpTextLineType trpL = (TrpTextLineType) line;
            String textOfCurrLine = trpL.getUnicodeText();
            if (wordBased && !trpL.getWord().isEmpty()) {
                for (WordType word : trpL.getWord()) {
                    content.add(((ITrpShapeType) word).getUnicodeText());
                }
            } else if (textOfCurrLine != null && !textOfCurrLine.isEmpty()) {
                // compare by content: a reference comparison against "" does not reliably detect empty text
                content.add(textOfCurrLine);
            }
        }
        if (!lines.isEmpty()) {
            // separate the text of consecutive regions
            content.add(System.lineSeparator());
        }
    }
    try {
        logger.debug("path " + Paths.get(file.getAbsolutePath()));
        Files.write(Paths.get(file.getAbsolutePath()), content, utf8, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    } catch (IOException e) {
        // best effort as before: the failure is reported but not propagated to the caller
        logger.error("Could not write page text to " + file.getAbsolutePath(), e);
    }
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ITrpShapeType(eu.transkribus.core.model.beans.pagecontent_trp.ITrpShapeType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) ArrayList(java.util.ArrayList) List(java.util.List)

Example 12 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class PageXmlUtils method extractStats.

/**
 * Computes region, line and word statistics for a page.
 * <p>
 * A region, line or word counts as "transcribed" when its text equivalent has non-blank Unicode
 * text. The "words in regions" and "words in lines" figures are estimated by splitting the trimmed
 * text on runs of whitespace, so leading, trailing or repeated blanks do not produce empty tokens
 * and line breaks inside the text separate words.
 *
 * @param page the page document to analyse
 * @return a statistics object filled with the collected counters
 */
public static TrpTranscriptStatistics extractStats(PcGtsType page) {
    int nrOfTranscribedRegions = 0;
    int nrOfWordsInRegions = 0;
    int nrOfLines = 0;
    int nrOfTranscribedLines = 0;
    int nrOfWordsInLines = 0;
    int nrOfWords = 0;
    int nrOfTranscribedWords = 0;

    List<TextRegionType> regs = PageXmlUtils.getTextRegions(page);
    int nrOfRegions = regs.size();
    for (TextRegionType r : regs) {
        if (r.getTextEquiv() != null && r.getTextEquiv().getUnicode() != null) {
            String regionText = r.getTextEquiv().getUnicode().trim();
            if (!regionText.isEmpty()) {
                nrOfTranscribedRegions += 1;
                // TODO use tokenizer here
                nrOfWordsInRegions += regionText.split("\\s+").length;
            }
        }
        List<TextLineType> lines = r.getTextLine();
        nrOfLines += lines.size();
        for (TextLineType l : lines) {
            if (l.getTextEquiv() != null && l.getTextEquiv().getUnicode() != null) {
                String lineText = l.getTextEquiv().getUnicode().trim();
                if (!lineText.isEmpty()) {
                    nrOfTranscribedLines += 1;
                    // TODO use tokenizer here
                    nrOfWordsInLines += lineText.split("\\s+").length;
                }
            }
            List<WordType> words = l.getWord();
            nrOfWords += words.size();
            for (WordType w : words) {
                if (w.getTextEquiv() != null && w.getTextEquiv().getUnicode() != null && !w.getTextEquiv().getUnicode().trim().isEmpty()) {
                    nrOfTranscribedWords += 1;
                }
            }
        }
    }

    TrpTranscriptStatistics s = new TrpTranscriptStatistics();
    s.setNrOfLines(nrOfLines);
    s.setNrOfRegions(nrOfRegions);
    s.setNrOfTranscribedLines(nrOfTranscribedLines);
    s.setNrOfTranscribedWords(nrOfTranscribedWords);
    s.setNrOfTranscribedRegions(nrOfTranscribedRegions);
    s.setNrOfWords(nrOfWords);
    s.setNrOfWordsInLines(nrOfWordsInLines);
    s.setNrOfWordsInRegions(nrOfWordsInRegions);
    return s;
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTranscriptStatistics(eu.transkribus.core.model.beans.TrpTranscriptStatistics) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Example 13 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class PageXmlUtils method removeAllIndexedTags.

/**
 * Removes the indexed tags from the custom tag lists of all text regions of the page, of all
 * lines inside those regions and of all words inside those lines.
 *
 * @param pc page document whose page is expected to be a {@code TrpPageType}; lines and words are
 *           cast to their {@code Trp*} counterparts, so other element types fail with a
 *           {@code ClassCastException}
 */
public static void removeAllIndexedTags(PcGtsType pc) {
    TrpPageType page = (TrpPageType) pc.getPage();
    // NOTE(review): the meaning of the boolean passed to getTextRegions is not visible here - confirm
    for (TrpTextRegionType region : page.getTextRegions(true)) {
        region.getCustomTagList().removeIndexedTags();
        for (TextLineType line : region.getTextLine()) {
            TrpTextLineType trpLine = (TrpTextLineType) line;
            trpLine.getCustomTagList().removeIndexedTags();
            for (WordType word : trpLine.getWord()) {
                ((TrpWordType) word).getCustomTagList().removeIndexedTags();
            }
        }
    }
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Example 14 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class ExportCache method storeCustomTagMapForDoc.

/**
 * Collects the tags of the selected pages of a document.
 * <p>
 * The previously collected tags are cleared first. For every selected page the transcript is taken
 * from {@code pageTranscripts} when an entry exists there, otherwise it is created from the page's
 * current transcript metadata. The transcript is built and every text line - plus every word of the
 * line when {@code wordBased} is set - is handed to {@code getTagsForShapeElement}.
 *
 * @param doc         document whose pages are processed
 * @param wordBased   if {@code true}, the words of each line are processed in addition to the line
 * @param pageIndices zero-based indices of the pages to process; {@code null} selects all pages
 * @param monitor     optional progress monitor (may be {@code null}); checked for cancellation
 *                    before each selected page and updated after it
 * @param blackening  value stored in {@code doBlackening}
 * @throws JAXBException        declared by the signature; presumably raised while building a transcript
 * @throws IOException          declared by the signature; presumably raised while loading a transcript
 * @throws InterruptedException if the monitor reports that the user canceled
 */
public void storeCustomTagMapForDoc(TrpDoc doc, boolean wordBased, Set<Integer> pageIndices, IProgressMonitor monitor, boolean blackening) throws JAXBException, IOException, InterruptedException {
    doBlackening = blackening;
    tags.clear();
    final List<TrpPage> pages = doc.getPages();
    final int totalPages = pages.size();
    int processed = 0;
    for (int pageIdx = 0; pageIdx < totalPages; pageIdx++) {
        final boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null && monitor.isCanceled()) {
            throw new InterruptedException("User canceled the export");
        }
        // pageTranscripts get fetched before the custom tag map is stored - so normally an entry exists
        final boolean cached = pageTranscripts != null && pageTranscripts.get(pageIdx) != null;
        JAXBPageTranscript transcript;
        if (cached) {
            transcript = pageTranscripts.get(pageIdx);
            transcript.getPageData();
        } else {
            TrpTranscriptMetadata metadata = pages.get(pageIdx).getCurrentTranscript();
            transcript = new JAXBPageTranscript(metadata);
        }
        transcript.build();
        TrpPageType trpPage = transcript.getPage();
        logger.debug("get tags for page " + (pageIdx + 1) + "/" + doc.getNPages());
        for (TrpTextRegionType region : trpPage.getTextRegions(true)) {
            for (TextLineType line : region.getTextLine()) {
                TrpTextLineType trpLine = (TrpTextLineType) line;
                List<WordType> lineWords = trpLine.getWord();
                // the line itself is always processed, words only on request
                getTagsForShapeElement(trpLine);
                if (!wordBased) {
                    continue;
                }
                for (WordType word : lineWords) {
                    TrpWordType trpWord = (TrpWordType) word;
                    getTagsForShapeElement(trpWord);
                }
            }
        }
        if (monitor != null) {
            monitor.setTaskName("Loaded tags for page " + (pageIdx + 1));
            processed += 1;
            monitor.worked(processed);
        }
    }
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 15 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class FinereaderUtils method addTextStyleToWords.

/**
 * Marks speaker names at the beginning of text lines as letter-spaced and writes the modified
 * page XML back to the file it was read from.
 * <p>
 * For every page of the document the current transcript file is unmarshalled. In each text line
 * the words are sorted with a {@code TrpElementCoordinatesComparator}; if the first word matches
 * (via {@code isMatch}) the first token of one of the hard-coded names and the following words
 * match the remaining tokens, all words of that name get {@code letterSpaced = true}. Only the
 * first name that matches completely is marked per line.
 *
 * @param doc document whose page transcripts are processed and overwritten
 * @throws JAXBException         declared by the signature; presumably raised by (un)marshalling
 * @throws FileNotFoundException declared by the signature; presumably raised when a transcript
 *                               file is missing
 */
public static void addTextStyleToWords(TrpDoc doc) throws JAXBException, FileNotFoundException {
    /*
	 * In the following folder I have placed the book in which the speaker names should be marked
	 * automatically as "letter-spaced" (provided this can be done with reasonable effort):
		ftp://ftp.uibk.ac.at/private/x3061015_20140902_78e054475d7532953c204ce6d392d8e9/Andy_Barbara_Bettina/zu_bearbeiten/
		This concerns the following names, provided they stand at the beginning of a line:

		Ernst
		Albrecht
		Preising
		Marschall
		Pappenheim
		Pienzenau
		Bern
		Törring
		Nothafft von Wernberg
		Frauenhoven
		Hans von Läubelfing
		Caspar Bernauer
		Agnes
		Theobald
		Knippeldollinger
		Bürgermeister
		Barbara
		Martha
		Stachus
		Der Kastellan
		Herold
		Legat

		FIXME Der Herold
		FIXME Der Legat

	 */
    String[] names = { "Ernst", "Albrecht", "Preising", "Marschall", "Pappenheim", "Pienzenau", "Bern", "Törring", "Nothafft von Wernberg", "Frauenhoven", "Hans von Läubelfing", "Caspar Bernauer", "Agnes", "Theobald", "Knippeldollinger", "Bürgermeister", "Barbara", "Martha", "Stachus", "Der Kastellan", "Der Herold", "Der Legat" };
    // nameList holds the tokens of every name, nameStartList the first token; both share the same index
    List<String[]> nameList = new ArrayList<>(names.length);
    List<String> nameStartList = new ArrayList<>(names.length);
    for (int i = 0; i < names.length; i++) {
        String[] tokens = names[i].split(" ");
        StringBuilder tokenDump = new StringBuilder("{ ");
        for (String t : tokens) {
            tokenDump.append(t).append("|");
        }
        System.out.println(i + "\t- splitting: " + tokenDump + "}");
        nameList.add(tokens);
        nameStartList.add(tokens[0]);
    }
    TrpElementCoordinatesComparator<WordType> wordComp = new TrpElementCoordinatesComparator<WordType>();
    for (TrpPage p : doc.getPages()) {
        System.out.println("Processing page: " + p.getPageNr());
        URL url = p.getCurrentTranscript().getUrl();
        final String xmlPath = FileUtils.toFile(url).getAbsolutePath();
        File xmlFile = new File(xmlPath);
        PcGtsType pc = JaxbUtils.unmarshal(xmlFile, PcGtsType.class);
        List<TextRegionType> regions = PageXmlUtils.getTextRegions(pc);
        for (TextRegionType r : regions) {
            for (TextLineType l : r.getTextLine()) {
                List<WordType> words = l.getWord();
                if (words == null || words.isEmpty()) {
                    continue;
                }
                Collections.sort(words, wordComp);
                // only the first word of the line can start a speaker name
                WordType w1 = words.get(0);
                String firstText = w1.getTextEquiv() == null ? null : w1.getTextEquiv().getUnicode();
                if (firstText == null) {
                    continue;
                }
                // collect the indices of all names whose first token matches the first word
                List<Integer> candidatesIndex = new ArrayList<>();
                for (int j = 0; j < nameStartList.size(); j++) {
                    if (isMatch(firstText, nameStartList.get(j))) {
                        candidatesIndex.add(j);
                    }
                }
                for (Integer index : candidatesIndex) {
                    String[] name = nameList.get(index);
                    if (name.length == 1) {
                        // Done.
                        // NOTE(review): getTextStyle() may return null for words without a style element - confirm
                        w1.getTextStyle().setLetterSpaced(true);
                        // always report the single token; a shared loop index could be out of range here
                        System.out.println("OK: " + name[0]);
                        break;
                    }
                    if (name.length > words.size()) {
                        // the line has fewer words than this name, so it cannot match
                        continue;
                    }
                    List<WordType> wordList = new ArrayList<>(name.length);
                    wordList.add(w1);
                    StringBuilder nameStr = new StringBuilder(firstText).append(" ");
                    boolean isName = true;
                    // check subsequent words
                    for (int k = 1; k < name.length; k++) {
                        WordType wk = words.get(k);
                        String wordText = wk.getTextEquiv() == null ? null : wk.getTextEquiv().getUnicode();
                        if (wordText != null && isMatch(wordText, name[k])) {
                            nameStr.append(wordText).append(" ");
                            wordList.add(wk);
                        } else {
                            System.out.println("NEGATIVE: " + nameStr + wordText + " != " + name[k]);
                            isName = false;
                            break;
                        }
                    }
                    if (isName) {
                        System.out.println("OK : " + nameStr);
                        for (WordType w : wordList) {
                            w.getTextStyle().setLetterSpaced(true);
                        }
                        break;
                    }
                }
            }
        }
        // write the modified page XML back to the file it was read from
        JaxbUtils.marshalToFile(pc, xmlFile);
    }
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) ArrayList(java.util.ArrayList) TrpElementCoordinatesComparator(eu.transkribus.core.model.beans.pagecontent_trp.TrpElementCoordinatesComparator) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) URL(java.net.URL) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) File(java.io.File)

Aggregations

WordType (eu.transkribus.core.model.beans.pagecontent.WordType)17 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)12 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)9 TrpWordType (eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)9 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)8 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)4 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)4 TrpPage (eu.transkribus.core.model.beans.TrpPage)3 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)3 Rectangle (java.awt.Rectangle)3 ArrayList (java.util.ArrayList)3 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)2 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 TrpBaselineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpBaselineType)2 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)2 Point (java.awt.Point)2 IOException (java.io.IOException)2 Chunk (com.itextpdf.text.Chunk)1 Phrase (com.itextpdf.text.Phrase)1 RtfText (com.tutego.jrtf.RtfText)1