Search in sources :

Example 21 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images (only if {@code exportImages} is set)</li>
 * <li>the current PAGE XML transcript of each page (only if {@code exportPage} is set)</li>
 * <li>references to the ALTO files in the {@code alto} subfolder (only if {@code exportAlto} is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to build the METS for
 * @param exportPage if true, a file group with the current transcript of each page is added
 * @param exportAlto if true, a file group referencing {@code alto/<image base name>.xml} is added
 * @param exportImages if true, a file group with the page images is added
 * @param pageIndices zero-based indices into {@code doc.getPages()} of the pages to include, or null for all pages
 * @return the assembled METS object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // The client is only needed inside the page loop, so a remote document
    // without pages must not fail on pages.get(0).
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /* only the most recent transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // ALTO file name = image file name with its extension replaced by ".xml";
            // a name without any '.' is used as-is instead of failing in substring().
            final String imgFileName = p.getImgFileName();
            final int extPos = imgFileName.lastIndexOf(".");
            final String altoFileName = (extPos < 0 ? imgFileName : imgFileName.substring(0, extPos)).concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // Resolve the ALTO file next to the page file to read its modification date.
            // NOTE(review): assumes FileUtils is commons-io, whose toFile() returns null
            // for non-"file" URLs (e.g. remote documents) - confirm.
            String absAltoPath = null;
            File pageFile = FileUtils.toFile(p.getUrl());
            if (pageFile != null) {
                final String path = pageFile.getAbsolutePath();
                absAltoPath = path.substring(0, path.lastIndexOf(File.separator)).concat("/alto/").concat(altoFileName);
                if (absAltoPath.startsWith("\\")) {
                    absAltoPath = absAltoPath.substring(1);
                }
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = absAltoPath == null ? null : new File(absAltoPath);
            if (altoTmp != null && altoTmp.exists()) {
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                // CREATED stays unset when the ALTO file cannot be located
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the requested file groups are attached to the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Example 22 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class TrpXlsxBuilder method writeXlsxForDoc.

/**
 * Writes the custom tags of the selected pages of a document into an xlsx workbook
 * and saves it to {@code exportFile}.
 *
 * Tags are collected per word if {@code wordBased} is set, otherwise per text line.
 *
 * @param doc the document to export
 * @param wordBased if true, tags are written for each word; otherwise for each line
 * @param exportFile target file of the workbook
 * @param pageIndices zero-based indices into {@code doc.getPages()} of the pages to export, or null for all pages
 * @param monitor optional progress monitor; may be null
 * @param cache export cache providing the tag map and pre-loaded transcripts; must not be null
 * @throws IllegalArgumentException if {@code cache} is null
 * @throws NoTagsException if the cache holds no tags at all
 * @throws InterruptedException if the monitor reports cancellation
 * @throws IOException if no sheet was produced or the workbook cannot be written
 * @throws Exception declared for backward compatibility
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // prefer the previously loaded JAXB transcript; cache was validated as non-null above
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
        for (int j = 0; j < textRegions.size(); ++j) {
            TrpTextRegionType r = textRegions.get(j);
            List<TextLineType> lines = r.getTextLine();
            for (int k = 0; k < lines.size(); ++k) {
                TrpTextLineType trpL = (TrpTextLineType) lines.get(k);
                List<WordType> words = trpL.getWord();
                if (wordBased) {
                    for (int l = 0; l < words.size(); ++l) {
                        TrpWordType w = (TrpWordType) words.get(l);
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }
        ++c;
        if (monitor != null) {
            // NOTE(review): passes the cumulative page count; if IProgressMonitor is the
            // Eclipse interface, worked() expects an increment - confirm.
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns: the header row determines how many columns each sheet has
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            // get the number of cells in the header row
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }
    // no sheet means no tags at all were written; checked before the target file is created
    if (wb.getNumberOfSheets() == 0) {
        logger.info("No sheets created -> nothing to write to: " + exportPath);
        throw new IOException("Sorry - No tags available for export");
    }
    // try-with-resources closes the stream even when wb.write() fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error("Could not write xlsx to: " + exportPath, e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : NoTagsException(eu.transkribus.core.model.builder.NoTagsException) JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) IOException(java.io.IOException) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) FileOutputStream(java.io.FileOutputStream) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) Row(org.apache.poi.ss.usermodel.Row) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 23 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class TrpTeiStringBuilder method setContent.

/**
 * Builds the facsimile and text sections for the given pages and appends them,
 * facsimile first, to {@code sbTotal}.
 *
 * Pages whose index is not contained in {@code pageIndices} (when that set is non-null)
 * are skipped. Regions of each page are sorted with a reading-order comparator before
 * they are written; zone output is only produced when {@code pars.hasZones()} is true.
 *
 * @param pages all pages of the document
 * @throws JAXBException declared by the original signature
 * @throws InterruptedException if the progress monitor reports cancellation
 */
@Override
protected void setContent(List<TrpPage> pages) throws JAXBException, InterruptedException {
    SebisStringBuilder facsimileOut = new SebisStringBuilder();
    SebisStringBuilder textOut = new SebisStringBuilder();
    // open <text><body>
    textOut.incIndent();
    textOut.addLine("<text>");
    textOut.incIndent();
    textOut.addLine("<body>");

    int totalPages;
    if (pageIndices == null) {
        totalPages = pages.size();
    } else {
        totalPages = pageIndices.size();
    }
    if (monitor != null) {
        monitor.beginTask("Creating TEI", totalPages);
    }

    // number of pages actually processed so far
    int processed = 0;
    for (int pageIdx = 0; pageIdx < pages.size(); ++pageIdx) {
        boolean selected = pageIndices == null || pageIndices.contains(pageIdx);
        if (!selected) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (processed + 1));
        }

        TrpPage page = pages.get(pageIdx);
        logger.debug("1Processing page " + page.getPageNr() + ": " + page.getUrl() + " - XML=" + page.getCurrentTranscript().getUrl());

        // check buffer for transcript or unmarshal the page XML
        PcGtsType pcGts = this.getPcGtsTypeForPage(page);
        if (pars.hasZones()) {
            // one facsimile element per page
            openFacsimileElement(facsimileOut, page, pcGts);
        }
        // page-break element for each page inside the body
        writePageBreak(textOut, page, pcGts);

        List<TrpRegionType> pageRegions = pcGts.getPage().getTextRegionOrImageRegionOrLineDrawingRegion();
        Collections.sort(pageRegions, new TrpElementReadingOrderComparator<RegionType>(true));
        for (TrpRegionType region : pageRegions) {
            if (!(region instanceof TextRegionType)) {
                // non-text regions only contribute a zone
                if (pars.hasZones()) {
                    String facsId = FACS_ID_PREFIX + page.getPageNr();
                    writeZoneForShape(facsimileOut, region, facsId, true);
                }
                continue;
            }
            if (pars.hasZones()) {
                writeZonesForTextRegion(facsimileOut, (TrpTextRegionType) region, page.getPageNr());
            }
            writeTextForTextRegion(textOut, (TrpTextRegionType) region, page.getPageNr());
        }

        if (pars.hasZones()) {
            closeFacsimilieElement(facsimileOut);
        }
        ++processed;
        if (monitor != null) {
            monitor.worked(processed);
        }
    }

    // close </body></text>
    textOut.addLine("</body>");
    textOut.decIndent();
    textOut.addLine("</text>");
    textOut.decIndent();

    sbTotal.sb.append(facsimileOut.toString());
    sbTotal.sb.append(textOut.toString());
}
Also used : TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) RegionType(eu.transkribus.core.model.beans.pagecontent.RegionType) TextRegionType(eu.transkribus.core.model.beans.pagecontent.TextRegionType) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) SebisStringBuilder(eu.transkribus.core.util.SebisStringBuilder) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) Point(java.awt.Point)

Example 24 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class DocxBuilder method writeDocxForDoc.

/**
 * Exports the selected pages of a document into a docx file.
 *
 * The option flags are copied into class-level fields before the pages are written.
 * Each selected page is rendered via {@code writeDocxForTranscriptWithTables}; afterwards
 * an INDEX field is appended, the fields are updated and the package is saved to {@code file}.
 *
 * @param doc the document to export
 * @param wordBased passed through to the page writer
 * @param writeTags stored in the class-level {@code exportTags} flag
 * @param doBlackeningSensibleData stored in the class-level {@code doBlackening} flag
 * @param file target docx file
 * @param pageIndices zero-based indices into {@code doc.getPages()} of the pages to export, or null for all pages
 * @param monitor optional progress monitor; may be null
 * @param createTitle if true, a title page followed by a page break is written before the first exported page
 * @param markUnclear stored in the class-level {@code markUnclearWords} flag
 * @param expandAbbreviations stored in the class-level {@code expandAbbrevs} flag
 * @param replaceAbbrevs stored in the class-level {@code substituteAbbrevs} flag
 * @param keepLineBreaks stored in {@code preserveLineBreaks} and passed to the page writer
 * @param showSuppliedInBrackets stored in the class-level {@code showSuppliedWithBrackets} flag
 * @param ignoreSuppliedTags stored in the class-level {@code ignoreSupplied} flag
 * @param cache export cache providing the selected tag names and pre-loaded transcripts; must not be null
 * @throws IllegalArgumentException if {@code cache} is null
 * @throws InterruptedException if the monitor reports cancellation
 * @throws JAXBException declared by the original signature
 * @throws IOException declared by the original signature
 * @throws Docx4JException declared by the original signature
 */
public static void writeDocxForDoc(TrpDoc doc, boolean wordBased, boolean writeTags, boolean doBlackeningSensibleData, File file, Set<Integer> pageIndices, IProgressMonitor monitor, boolean createTitle, boolean markUnclear, boolean expandAbbreviations, boolean replaceAbbrevs, boolean keepLineBreaks, boolean showSuppliedInBrackets, boolean ignoreSuppliedTags, ExportCache cache) throws JAXBException, IOException, Docx4JException, InterruptedException {
    // The cache is dereferenced unconditionally below, so reject null up front with a
    // clear message instead of failing later with a NullPointerException.
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    // NOTE(review): this forces the logger to DEBUG on every export - confirm this is intended.
    ((ch.qos.logback.classic.Logger) logger).setLevel(ch.qos.logback.classic.Level.DEBUG);
    exportTags = writeTags;
    doBlackening = doBlackeningSensibleData;
    tagnames = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    markUnclearWords = markUnclear;
    expandAbbrevs = expandAbbreviations;
    preserveLineBreaks = keepLineBreaks;
    substituteAbbrevs = replaceAbbrevs;
    showSuppliedWithBrackets = showSuppliedInBrackets;
    ignoreSupplied = ignoreSuppliedTags;
    // main document part
    wordMLPackage = WordprocessingMLPackage.createPackage();
    MainDocumentPart mdp = wordMLPackage.getMainDocumentPart();
    org.docx4j.wml.ObjectFactory factory = Context.getWmlObjectFactory();
    List<TrpPage> pages = doc.getPages();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to docx", totalPages);
    }
    int c = 0;
    boolean atLeastOnePageWritten = false;
    // paragraph holding a page break, used after the title page
    Br objBr = new Br();
    objBr.setType(STBrType.PAGE);
    P pageBreakP = factory.createP();
    pageBreakP.getContent().add(objBr);
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // the title page is emitted once, right before the first exported page
        if (!atLeastOnePageWritten && createTitle) {
            addTitlePage(doc, mdp);
            // add page break
            mdp.addObject(pageBreakP);
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export canceled by the user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        // prefer the previously loaded transcript, otherwise build it from the current transcript
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            TrpPage page = pages.get(i);
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing docx for the page " + (i + 1) + "/" + doc.getNPages());
        writeDocxForTranscriptWithTables(mdp, trpPage, wordBased, preserveLineBreaks);
        atLeastOnePageWritten = true;
        ++c;
        if (monitor != null) {
            monitor.worked(c);
        }
    }
    // append the index field and refresh all fields
    P p = factory.createP();
    mdp.getContent().add(p);
    addComplexField(p, " INDEX \\e \"", "\" \\c \"1\" \\z \"1031\"");
    FieldUpdater updater = new FieldUpdater(wordMLPackage);
    updater.update(true);
    // The per-tag summary that followed here was guarded by `if (false)` and therefore
    // never executed; the unreachable block has been dropped.
    // finally save the file
    wordMLPackage.save(file);
    logger.info("Saved " + file.getAbsolutePath());
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) CustomTag(eu.transkribus.core.model.beans.customtags.CustomTag) Logger(org.slf4j.Logger) RPr(org.docx4j.wml.RPr) P(org.docx4j.wml.P) U(org.docx4j.wml.U) R(org.docx4j.wml.R) Text(org.docx4j.wml.Text) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) P(org.docx4j.wml.P) FieldUpdater(org.docx4j.model.fields.FieldUpdater) TrpPage(eu.transkribus.core.model.beans.TrpPage) MainDocumentPart(org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart) Br(org.docx4j.wml.Br)

Example 25 with TrpPage

use of eu.transkribus.core.model.beans.TrpPage in project TranskribusCore by Transkribus.

the class TrpXlsxTableBuilder method writeXlsxForTables.

/**
 * Writes every table region found on the selected pages of a document into an xlsx
 * workbook and saves it to {@code exportFile}.
 *
 * For each table, the cells of every row are collected into a column-indexed map; cells
 * covered by a row or column span of a neighbouring cell are registered with a null value.
 * The rows are then handed to {@code createTable}.
 *
 * @param doc the document to export
 * @param exportFile target file of the workbook
 * @param pageIndices zero-based indices into {@code doc.getPages()} of the pages to export, or null for all pages
 * @param monitor optional progress monitor; may be null
 * @param cache optional export cache with pre-loaded transcripts; may be null
 * @throws NoTablesException if no sheet was produced, i.e. no table was found
 * @throws IOException if the workbook cannot be written
 * @throws InterruptedException if the monitor reports cancellation
 */
public static void writeXlsxForTables(TrpDoc doc, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTablesException, IOException, InterruptedException {
    // TrpTableRegionType is contained in the regions too
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting tables to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    int c = 0;
    int tableId = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript
        JAXBPageTranscript tr = null;
        if (cache != null) {
            tr = cache.getPageTranscriptAtIndex(i);
        }
        if (tr == null) {
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        List<TrpRegionType> regions = trpPage.getRegions();
        for (int j = 0; j < regions.size(); ++j) {
            TrpRegionType r = regions.get(j);
            if (!(r instanceof TrpTableRegionType)) {
                continue;
            }
            tableId++;
            logger.debug("is table");
            TrpTableRegionType table = (TrpTableRegionType) r;
            int cols = table.getNCols();
            int rows = table.getNRows();
            List<List<TrpTableCellType>> allRowCells = new ArrayList<List<TrpTableCellType>>();
            for (int k = 0; k < rows; k++) {
                allRowCells.add(table.getRowCells(k));
            }
            List<HashMap<Integer, TrpTableCellType>> allRows = new ArrayList<HashMap<Integer, TrpTableCellType>>();
            HashMap<Integer, TrpTableCellType> nextRowMap = new HashMap<Integer, TrpTableCellType>();
            for (List<TrpTableCellType> rowCells : allRowCells) {
                HashMap<Integer, TrpTableCellType> currRowMap = new HashMap<Integer, TrpTableCellType>();
                /*
                 * fill up all cells which are not set in TRP (needed for vertical cell merge)
                 * the nextRowMap contains already all cells which span vertically with the cells above - means they got merged
                 * in the table but have to be considered here
                 */
                currRowMap.putAll(nextRowMap);
                nextRowMap.clear();
                for (TrpTableCellType cell : rowCells) {
                    currRowMap.put(cell.getCol(), cell);
                    // only one row or col span is considered -> FIXME: do it for all spans, but may happens never?
                    if (cell.getRowSpan() > 1) {
                        nextRowMap.put(cell.getCol(), null);
                    }
                    if (cell.getColSpan() > 1) {
                        currRowMap.put(cell.getCol() + 1, null);
                    }
                }
                allRows.add(currRowMap);
            }
            createTable(rows, cols, allRows, tableId);
        }
        // Progress is counted once per page (not once per region), so the counter matches
        // the page total given to beginTask and pages without regions are counted too.
        ++c;
        if (monitor != null) {
            // NOTE(review): passes the cumulative page count; if IProgressMonitor is the
            // Eclipse interface, worked() expects an increment - confirm.
            monitor.worked(c);
        }
    }
    /*
     * auto size the columns: the header row determines how many columns each sheet has
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            // get the number of cells in the header row
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j, true);
            }
        }
    }
    // no sheet means no tables at all; checked before the target file is created
    if (wb.getNumberOfSheets() == 0) {
        throw new NoTablesException("Sorry - No tables available for export");
    }
    // try-with-resources closes the stream even when wb.write() fails
    try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
        wb.write(fOut);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) TrpTableRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableRegionType) NoTablesException(eu.transkribus.core.model.builder.NoTablesException) TrpRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpRegionType) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) ArrayList(java.util.ArrayList) List(java.util.List) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType) TrpTableCellType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTableCellType) TrpPage(eu.transkribus.core.model.beans.TrpPage) IOException(java.io.IOException) FileOutputStream(java.io.FileOutputStream) Row(org.apache.poi.ss.usermodel.Row)

Aggregations

TrpPage (eu.transkribus.core.model.beans.TrpPage)32 TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)14 File (java.io.File)14 IOException (java.io.IOException)14 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)10 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)7 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)7 URL (java.net.URL)7 ArrayList (java.util.ArrayList)7 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)6 TrpDocMetadata (eu.transkribus.core.model.beans.TrpDocMetadata)5 FileType (eu.transkribus.core.model.beans.mets.FileType)5 JAXBException (javax.xml.bind.JAXBException)5 TrpTextRegionType (eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType)4 Dimension (java.awt.Dimension)4 FileNotFoundException (java.io.FileNotFoundException)4 CorruptImageException (eu.transkribus.core.exceptions.CorruptImageException)3 DivType (eu.transkribus.core.model.beans.mets.DivType)3 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)3 FileGrpType (eu.transkribus.core.model.beans.mets.FileGrpType)3