Search in sources :

Example 6 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpTextLineType method getTextFromWords.

/**
 * Builds a text for this line by concatenating the unicode text of all its words,
 * separated by single spaces.
 *
 * @param fillEmptyWords if {@code true}, every word whose text is empty is replaced by
 *                       {@code TrpWordType.EMPTY_WORD_FILL}
 * @return the word texts joined by single spaces (no trailing separator); an empty string
 *         if the line has no words
 */
public String getTextFromWords(boolean fillEmptyWords) {
    // A StringBuilder avoids re-copying the whole text on every appended word.
    StringBuilder text = new StringBuilder();
    for (WordType w : getWord()) {
        String wt = ((TrpWordType) w).getUnicodeText();
        if (fillEmptyWords && wt.isEmpty()) {
            wt = TrpWordType.EMPTY_WORD_FILL;
        }
        text.append(wt).append(' ');
    }
    // Every appended word is followed by a separator; drop only the very last one.
    if (text.length() > 0) {
        text.setLength(text.length() - 1);
    }
    return text.toString();
}
Also used : WordType(eu.transkribus.core.model.beans.pagecontent.WordType)

Example 7 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpXlsxBuilder method writeXlsxForDoc.

/**
 * Writes the custom tags of a document into an xlsx workbook and stores it at the given file.
 * <p>
 * For every exported page the transcript is taken from the cache or, if the cache has none,
 * built from the page's current transcript. Tags are then written per word or per line,
 * depending on {@code wordBased}. Finally the columns of every sheet are auto-sized and the
 * workbook is written to {@code exportFile}.
 *
 * @param doc         the document whose pages are exported
 * @param wordBased   if {@code true}, tags are written for each word of a line; otherwise for the line itself
 * @param exportFile  the target file of the workbook
 * @param pageIndices zero-based indices of the pages to export, or {@code null} to export all pages
 * @param monitor     optional progress monitor; may be {@code null}
 * @param cache       the export cache; must not be {@code null}
 * @throws IllegalArgumentException if {@code cache} is {@code null}
 * @throws NoTagsException          if the cache contains no custom tags for the document
 * @throws InterruptedException     if the monitor reports that the export was canceled
 * @throws IOException              if the workbook contains no sheets or cannot be written
 * @throws Exception                any other failure raised while loading transcripts or writing tags
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }
    // wb is a field declared outside this method; it is replaced on every call.
    // NOTE(review): presumably writeTagsForShapeElement fills this workbook - confirm.
    wb = new XSSFWorkbook();
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript (cache is known to be non-null here)
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
        for (int j = 0; j < textRegions.size(); ++j) {
            TrpTextRegionType r = textRegions.get(j);
            List<TextLineType> lines = r.getTextLine();
            for (int k = 0; k < lines.size(); ++k) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(k);
                List<WordType> words = trpL.getWord();
                if (wordBased) {
                    for (int l = 0; l < words.size(); ++l) {
                        TrpWordType w = (TrpWordType) words.get(l);
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }
        ++c;
        if (monitor != null) {
            // NOTE(review): c is the cumulative number of processed pages. If this monitor follows
            // the Eclipse IProgressMonitor contract, worked() expects an increment (i.e. 1) - confirm
            // against the monitor implementation before changing.
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        // the first row is the header row; it determines how many columns get resized
        if (rowIterator.hasNext()) {
            int numberOfCells = rowIterator.next().getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }
    try {
        // means no tags at all
        if (wb.getNumberOfSheets() == 0) {
            throw new IOException("Sorry - No tags available for export");
        }
        // try-with-resources closes the stream even if writing the workbook fails
        try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
            wb.write(fOut);
        }
    } catch (IOException e) {
        logger.error("Could not write xlsx to: " + exportPath, e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : NoTagsException(eu.transkribus.core.model.builder.NoTagsException) JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) IOException(java.io.IOException) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) FileOutputStream(java.io.FileOutputStream) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) Row(org.apache.poi.ss.usermodel.Row) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 8 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method writeZonesForTextRegion.

/**
 * Writes the zone elements for a text region and, depending on the configured parameters,
 * for its lines and words.
 * <p>
 * A zone is closed immediately when no deeper level is written inside it; otherwise it is
 * left open and closed explicitly after its children have been written.
 *
 * @param sb     the builder the zones are written to
 * @param r      the text region to write zones for
 * @param pageNr the page number used to build the facsimile id
 */
void writeZonesForTextRegion(SebisStringBuilder sb, TextRegionType r, int pageNr) {
    final String facsimileId = FACS_ID_PREFIX + pageNr;
    final boolean withRegions = pars.regionZones;
    final boolean withLines = pars.lineZones;
    final boolean withWords = pars.wordZones;
    final boolean hasNestedZones = withLines || withWords;

    if (withRegions) {
        // close the region zone right away if nothing is nested inside it
        writeZoneForShape(sb, (TrpTextRegionType) r, facsimileId, !hasNestedZones);
    }
    if (!hasNestedZones) {
        return;
    }

    for (TextLineType rawLine : r.getTextLine()) {
        TrpTextLineType line = (TrpTextLineType) rawLine;
        if (withLines) {
            // the line zone stays open only if word zones follow inside it
            writeZoneForShape(sb, line, facsimileId, !withWords);
        }
        if (!withWords) {
            continue;
        }
        for (WordType rawWord : line.getWord()) {
            TrpWordType word = (TrpWordType) rawWord;
            writeZoneForShape(sb, word, facsimileId, true);
        }
        if (withLines) {
            closeElement(sb, "zone");
        }
    }

    if (withRegions) {
        closeElement(sb, "zone");
    }
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Example 9 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method writeTextForTextRegion.

/**
 * Writes the text of a text region, either line by line or, if text is to be written on
 * word level, word by word wrapped into the start and end markup of the enclosing line.
 * Regions without any text line are skipped with a warning.
 *
 * @param sb     the builder the text is written to
 * @param r      the text region whose text is written
 * @param pageNr the page number used to build the facsimile id
 */
void writeTextForTextRegion(SebisStringBuilder sb, TextRegionType r, int pageNr) {
    if (r.getTextLine().isEmpty()) {
        logger.warn("skipping empty region: " + r.getId());
        return;
    }
    final String facsimileId = FACS_ID_PREFIX + pageNr;

    writeTextRegion(sb, r, facsimileId);
    for (TextLineType rawLine : r.getTextLine()) {
        TrpTextLineType line = (TrpTextLineType) rawLine;
        if (commonPars.isWriteTextOnWordLevel()) {
            // word level: line start markup, one entry per word, line end markup
            String lineStart = getLineOrWordStart(line, facsimileId);
            sb.incIndent();
            sb.addLine(lineStart);
            for (WordType rawWord : line.getWord()) {
                writeLineOrWord(sb, (TrpWordType) rawWord, facsimileId);
            }
            String lineEnd = getLineOrWordEnd(line, facsimileId);
            sb.addLine(lineEnd);
            sb.decIndent();
            continue;
        }
        // line level: the line is written as a whole
        writeLineOrWord(sb, line, facsimileId);
    }
    closeTextRegion(sb);
}
Also used : TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Example 10 with WordType

use of eu.transkribus.core.model.beans.pagecontent.WordType in project TranskribusCore by Transkribus.

the class TrpRtfBuilder method getRtfTextForLineFromWords.

// private static void getTagsForShapeElement(ITrpShapeType element) throws IOException{
// 
// String textStr = element.getUnicodeText();
// CustomTagList cl = element.getCustomTagList();
// if (textStr == null || cl == null)
// throw new IOException("Element has no text or custom tag list: "+element+", class: "+element.getClass().getName());
// 
// for (CustomTag nonIndexedTag : cl.getNonIndexedTags()) {
// 
// logger.debug("nonindexed tag found ");
// storeCustomTag(nonIndexedTag, textStr);
// 
// }
// for (CustomTag indexedTag : cl.getIndexedTags()) {
// 
// logger.debug("indexed tag found ");
// storeCustomTag(indexedTag, textStr);
// 
// }
// 
// }
// 
// private static void storeCustomTag(CustomTag currTag, String textStr) {
// if (!currTag.getTagName().equals("textStyle")){
// 
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// tags.put(currTag, textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// else{
// tags.put(currTag, textStr);
// }
// logger.debug("++tag name is " + currTag.getTagName());
// logger.debug("text " + tags.get(currTag));
// }
// 
// if (currTag.getTagName().equals("Person")){
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// persons.add(textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// else{
// logger.debug("with index is something wrong: offset " + currTag.getOffset() + " length " + currTag.getLength()) ;
// //throw new Exception("Something wrong with indexed tag for text: " + textStr);
// }
// }
// else if (currTag.getTagName().equals("Place")){
// if (currTag.getOffset() != -1 && currTag.getLength() != -1 && (currTag.getOffset()+currTag.getLength() <= textStr.length())){
// places.add(textStr.substring(currTag.getOffset(), currTag.getOffset()+currTag.getLength()));
// }
// }
// 
// }
/**
 * Creates the RTF text of a line by combining the RTF texts of its words, in word order.
 *
 * @param line the line whose words are converted
 * @return the combined RTF text of all words of the line
 * @throws IOException declared by this method; presumably raised by the per-word conversion
 */
private static RtfText getRtfTextForLineFromWords(TrpTextLineType line) throws IOException {
    final List<WordType> lineWords = line.getWord();
    final RtfText[] parts = new RtfText[lineWords.size()];
    int idx = 0;
    for (WordType rawWord : lineWords) {
        TrpWordType word = (TrpWordType) rawWord;
        parts[idx++] = getRtfTextForShapeElement(word);
    }
    return RtfText.text(true, parts);
}
Also used : RtfText(com.tutego.jrtf.RtfText) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)

Aggregations

WordType (eu.transkribus.core.model.beans.pagecontent.WordType)17 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)12 TrpTextLineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType)9 TrpWordType (eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType)9 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)8 RegionType (eu.transkribus.core.model.beans.pagecontent.RegionType)4 TextRegionType (eu.transkribus.core.model.beans.pagecontent.TextRegionType)4 TrpPage (eu.transkribus.core.model.beans.TrpPage)3 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)3 Rectangle (java.awt.Rectangle)3 ArrayList (java.util.ArrayList)3 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)2 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)2 TrpBaselineType (eu.transkribus.core.model.beans.pagecontent_trp.TrpBaselineType)2 TrpRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType)2 Point (java.awt.Point)2 IOException (java.io.IOException)2 Chunk (com.itextpdf.text.Chunk)1 Phrase (com.itextpdf.text.Phrase)1 RtfText (com.tutego.jrtf.RtfText)1