Search in sources :

Example 11 with TrpTranscriptMetadata

use of eu.transkribus.core.model.beans.TrpTranscriptMetadata in project TranskribusCore by Transkribus.

the class Md5SumComputer method computeAndSetMd5Sums.

/**
 * Computes the MD5 checksum of every page file and of every transcript file
 * of the given document and stores each checksum on the respective bean.
 * A status message is published via updateStatus before each computation.
 *
 * @param doc the document to process; must have a local folder set in its metadata
 * @return the same document instance that was passed in
 * @throws IllegalArgumentException if doc is null or has no local folder
 * @throws IOException declared by this method; presumably raised by the checksum computation
 */
public TrpDoc computeAndSetMd5Sums(TrpDoc doc) throws IOException {
    if (doc == null) {
        throw new IllegalArgumentException("doc is null.");
    }
    // a document without a local folder is rejected
    if (doc.getMd().getLocalFolder() == null) {
        throw new IllegalArgumentException("Not a local Document!");
    }
    updateStatus("Computing checksums...");
    for (TrpPage page : doc.getPages()) {
        // checksum of the page file itself
        updateStatus("Computing checksum: " + getFileNameFromUrl(page.getUrl()));
        page.setMd5Sum(ChecksumUtils.getMd5SumHex(page.getUrl()));
        // checksums of all transcripts attached to the page
        for (TrpTranscriptMetadata transcript : page.getTranscripts()) {
            updateStatus("Computing checksum: " + getFileNameFromUrl(transcript.getUrl()));
            transcript.setMd5Sum(ChecksumUtils.getMd5SumHex(transcript.getUrl()));
        }
    }
    return doc;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) File(java.io.File)

Example 12 with TrpTranscriptMetadata

use of eu.transkribus.core.model.beans.TrpTranscriptMetadata in project TranskribusCore by Transkribus.

the class LocalDocWriter method updateTrpPageXml.

/**
 * Writes the page data of the given transcript back to the file its metadata URL points to.
 * Before writing, updateImageDimension is applied to the transcript and the
 * last-change date in the page metadata is refreshed.
 *
 * @param tr the transcript to persist
 * @return a new list containing only the metadata object of the given transcript
 * @throws Exception if the metadata URL cannot be converted to a file, or if a called helper fails
 */
public static List<TrpTranscriptMetadata> updateTrpPageXml(JAXBPageTranscript tr) throws Exception {
    final File targetFile = FileUtils.toFile(tr.getMd().getUrl());
    if (targetFile == null) {
        throw new Exception("Cannot retrieve file url from: " + tr.getMd().getUrl());
    }
    updateImageDimension(tr);
    // refresh the last change date before the data is marshalled
    tr.getPageData().getMetadata().setLastChange(XmlUtils.getXmlGregCal());
    PageXmlUtils.marshalToFile(tr.getPageData(), targetFile);
    final List<TrpTranscriptMetadata> result = new ArrayList<>();
    result.add(tr.getMd());
    return result;
}
Also used : ArrayList(java.util.ArrayList) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) File(java.io.File) IOException(java.io.IOException) JAXBException(javax.xml.bind.JAXBException) FileNotFoundException(java.io.FileNotFoundException)

Example 13 with TrpTranscriptMetadata

use of eu.transkribus.core.model.beans.TrpTranscriptMetadata in project TranskribusCore by Transkribus.

the class TrpMetsBuilder method buildMets.

/**
 * Generate a METS containing
 * <ul>
 * <li>TrpDocMetadata embedded in sourceMd</li>
 * <li>the page images (only if exportImages is set)</li>
 * <li>the current PAGE XML transcript of each page (only if exportPage is set)</li>
 * <li>references to ALTO files named after the page images (only if exportAlto is set)</li>
 * </ul>
 *
 * If a local document is passed, all hrefs will contain the relative paths to files based on the localFolder!
 *
 * @param doc the document to describe; its metadata and its page list are read
 * @param exportPage if true, a file entry for each page's current transcript is added
 * @param exportAlto if true, a file entry pointing to "alto/&lt;image name without extension&gt;.xml" is added per page
 * @param exportImages if true, a file entry for each page image is added
 * @param pageIndices zero-based indices (positions in doc.getPages()) of the pages to include; null includes all pages
 * @return the assembled Mets object
 * @throws IOException if image/xml files can't be accessed for reading the mimetype etc.
 */
public static Mets buildMets(TrpDoc doc, boolean exportPage, boolean exportAlto, boolean exportImages, Set<Integer> pageIndices) throws IOException {
    Mets mets = new Mets();
    TrpDocMetadata md = doc.getMd();
    File localFolder = md.getLocalFolder();
    boolean isLocalDoc = localFolder != null;
    mets.setLABEL(md.getTitle());
    mets.setOBJID("" + md.getDocId());
    mets.setPROFILE(TRP_METS_PROFILE);
    // FIXME remove TYPE
    // mets.setTYPE(TRP_METS_PROFILE);
    // metsHdr
    MetsHdr hdr = buildMetsHdr(md);
    mets.setMetsHdr(hdr);
    // TODO dcmd_elec omitted meanwhile
    // md_orig
    AmdSecType amdSec = new AmdSecType();
    amdSec.setID(SOURCE_MD_ID_CONST);
    MdSecType sourceMdSec = buildSourceMdSec(md);
    amdSec.getSourceMD().add(sourceMdSec);
    mets.getAmdSec().add(amdSec);
    // structmap div, linking to the sourceMd section with dmd
    DivType div = new DivType();
    div.getADMID().add(sourceMdSec);
    div.setID(TRP_DOC_DIV_ID);
    FileSec fileSec = new FileSec();
    StructMapType structMap = new StructMapType();
    structMap.setID(TRP_STRUCTMAP_ID);
    structMap.setTYPE("MANUSCRIPT");
    structMap.setDiv(div);
    List<TrpPage> pages = doc.getPages();
    FimgStoreGetClient client = null;
    // The client is derived from the first page's URL, so it can only be created if
    // there is at least one page. With no pages the loop below never runs and the
    // client is not needed.
    if (!isLocalDoc && !pages.isEmpty()) {
        // TODO maybe we need this stuff in the docMetadata?
        URL url = pages.get(0).getUrl();
        client = new FimgStoreGetClient(url);
    }
    FileGrp masterGrp = new FileGrp();
    masterGrp.setID(MASTER_FILE_GRP_ID);
    FileGrpType imgGrp = new FileGrpType();
    imgGrp.setID(IMG_GROUP_ID);
    FileGrpType pageGrp = new FileGrpType();
    pageGrp.setID(PAGE_GROUP_ID);
    FileGrpType altoGrp = new FileGrpType();
    altoGrp.setID(ALTO_GROUP_ID);
    // i is the zero-based position in the page list; pageIndices is matched against it
    int i = -1;
    for (TrpPage p : pages) {
        i++;
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        // build a page div for the structmap
        DivType pageDiv = new DivType();
        pageDiv.setID("PAGE_" + p.getPageNr());
        pageDiv.setTYPE("SINGLE_PAGE");
        pageDiv.setORDER(BigInteger.valueOf(p.getPageNr()));
        final String imgId = "IMG_" + p.getPageNr();
        final String xmlId = PAGE_GROUP_ID + "_" + p.getPageNr();
        final String altoId = ALTO_GROUP_ID + "_" + p.getPageNr();
        /*
         * only the current transcript is added here for now
         *
         * TODO how to deal with imagestore files? use orig image? right now, it's just the view file...
         * TODO thumbnails not yet included
         */
        if (exportImages) {
            FileType img = buildFileType(localFolder, imgId, p, p.getPageNr(), client);
            imgGrp.getFile().add(img);
            // linking images
            Fptr imgPtr = buildFptr(img);
            pageDiv.getFptr().add(imgPtr);
        }
        // TODO error handling.. if no transcript??
        if (exportPage) {
            // xmlfiletype: just add the transcript chosen for export
            TrpTranscriptMetadata tMd = p.getCurrentTranscript();
            FileType xml = buildFileType(localFolder, xmlId, tMd, p.getPageNr(), client);
            pageGrp.getFile().add(xml);
            Fptr xmlPtr = buildFptr(xml);
            pageDiv.getFptr().add(xmlPtr);
        }
        // create ALTO fileGrp
        if (exportAlto) {
            FileType altoFt = new FileType();
            altoFt.setCHECKSUMTYPE(ChecksumUtils.ChkSumAlg.MD5.toString());
            // TODO calculate checksum
            altoFt.setCHECKSUM("");
            FLocat fLocat = new FLocat();
            fLocat.setLOCTYPE("OTHER");
            fLocat.setOTHERLOCTYPE("FILE");
            altoFt.setID(altoId);
            altoFt.setSEQ(p.getPageNr());
            // The ALTO file is named after the image file with its extension replaced by ".xml".
            // An image name without any '.' is used as a whole instead of failing in substring().
            final String imgFileName = p.getImgFileName();
            final int extIdx = imgFileName.lastIndexOf('.');
            final String altoFileName = (extIdx < 0 ? imgFileName : imgFileName.substring(0, extIdx)).concat(".xml");
            String relAltoPath = "alto".concat(File.separator).concat(altoFileName);
            fLocat.setHref(relAltoPath);
            // The absolute path (an "alto" folder next to the page file) is only used to
            // look up the modification date below; the href stays relative.
            // NOTE(review): this assumes p.getUrl() can be converted to a local file - confirm for remote docs
            final String path = FileUtils.toFile(p.getUrl()).getAbsolutePath();
            String absAltoPath = path.substring(0, path.lastIndexOf(File.separator));
            absAltoPath = absAltoPath.concat("/alto/").concat(altoFileName);
            // strip a single leading backslash
            if (absAltoPath.startsWith("\\")) {
                absAltoPath = absAltoPath.substring(1);
            }
            String mime = MimeTypes.getMimeType("xml");
            altoFt.setMIMETYPE(mime);
            File altoTmp = new File(absAltoPath);
            if (altoTmp.exists()) {
                // use the file's last-modified time as creation date of the METS file entry
                Date date = new Date(altoTmp.lastModified());
                XMLGregorianCalendar cal = JaxbUtils.getXmlCalendar(date);
                altoFt.setCREATED(cal);
            } else {
                logger.info("alto file does not exist at " + absAltoPath);
            }
            altoFt.getFLocat().add(fLocat);
            altoGrp.getFile().add(altoFt);
            Fptr altoPtr = buildFptr(altoFt);
            pageDiv.getFptr().add(altoPtr);
        }
        div.getDiv().add(pageDiv);
    }
    fileSec.getFileGrp().add(masterGrp);
    mets.setFileSec(fileSec);
    // only the groups that were requested are attached to the master group
    if (exportImages) {
        masterGrp.getFileGrp().add(imgGrp);
    }
    if (exportPage) {
        masterGrp.getFileGrp().add(pageGrp);
    }
    if (exportAlto) {
        masterGrp.getFileGrp().add(altoGrp);
    }
    mets.getStructMap().add(structMap);
    return mets;
}
Also used : TrpPage(eu.transkribus.core.model.beans.TrpPage) FileGrp(eu.transkribus.core.model.beans.mets.MetsType.FileSec.FileGrp) Fptr(eu.transkribus.core.model.beans.mets.DivType.Fptr) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) AmdSecType(eu.transkribus.core.model.beans.mets.AmdSecType) MetsHdr(eu.transkribus.core.model.beans.mets.MetsType.MetsHdr) URL(java.net.URL) Date(java.util.Date) MdSecType(eu.transkribus.core.model.beans.mets.MdSecType) DivType(eu.transkribus.core.model.beans.mets.DivType) FileGrpType(eu.transkribus.core.model.beans.mets.FileGrpType) XMLGregorianCalendar(javax.xml.datatype.XMLGregorianCalendar) Mets(eu.transkribus.core.model.beans.mets.Mets) FimgStoreGetClient(org.dea.fimgstoreclient.FimgStoreGetClient) FileType(eu.transkribus.core.model.beans.mets.FileType) FileSec(eu.transkribus.core.model.beans.mets.MetsType.FileSec) TrpDocMetadata(eu.transkribus.core.model.beans.TrpDocMetadata) StructMapType(eu.transkribus.core.model.beans.mets.StructMapType) FLocat(eu.transkribus.core.model.beans.mets.FileType.FLocat) ITrpFile(eu.transkribus.core.model.beans.ITrpFile) File(java.io.File)

Example 14 with TrpTranscriptMetadata

use of eu.transkribus.core.model.beans.TrpTranscriptMetadata in project TranskribusCore by Transkribus.

the class TrpXlsxBuilder method writeXlsxForDoc.

/**
 * Exports the tags of a document into an Excel workbook and writes it to the given file.
 * <p>
 * For every selected page the transcript is taken from the cache if available, otherwise
 * it is built from the page's current transcript. Depending on wordBased, either every
 * word or every line is passed to writeTagsForShapeElement. Afterwards the columns of
 * every sheet are auto-sized (as many columns as the first row has cells) and the
 * workbook is written to exportFile.
 * <p>
 * Note: the workbook is kept in the field {@code wb}, which this static method reassigns
 * on every call, so concurrent calls would share it.
 *
 * @param doc the document whose pages are exported
 * @param wordBased if true, tags are written per word, otherwise per line
 * @param exportFile the file the workbook is written to
 * @param pageIndices zero-based indices of the pages to export; null exports all pages
 * @param monitor optional progress monitor, may be null
 * @param cache export cache providing the tags and possibly preloaded transcripts; must not be null
 * @throws IllegalArgumentException if cache is null
 * @throws NoTagsException if the cache's custom tag map for the document is empty
 * @throws InterruptedException if the monitor reports that the export was canceled
 * @throws IOException if the workbook has no sheets after processing or cannot be written
 * @throws Exception declared for failures of the called helpers
 */
public static void writeXlsxForDoc(TrpDoc doc, boolean wordBased, File exportFile, Set<Integer> pageIndices, IProgressMonitor monitor, ExportCache cache) throws NoTagsException, Exception {
    if (cache == null) {
        throw new IllegalArgumentException("ExportCache must not be null.");
    }
    if (cache.getCustomTagMapForDoc().isEmpty()) {
        logger.info("No tags to store -> Xlsx export cancelled");
        throw new NoTagsException("No tags available to store into Xlsx");
    }
    List<TrpPage> pages = doc.getPages();
    String exportPath = exportFile.getPath();
    Set<String> selectedTags = cache.getOnlySelectedTagnames(ExportUtils.getOnlyWantedTagnames(CustomTagFactory.getRegisteredTagNames()));
    int totalPages = pageIndices == null ? pages.size() : pageIndices.size();
    if (monitor != null) {
        monitor.beginTask("Exporting to Excel", totalPages);
    }
    wb = new XSSFWorkbook();
    // c counts the pages actually processed (i also counts skipped ones)
    int c = 0;
    for (int i = 0; i < pages.size(); ++i) {
        if (pageIndices != null && !pageIndices.contains(i)) {
            continue;
        }
        if (monitor != null) {
            if (monitor.isCanceled()) {
                throw new InterruptedException("Export was canceled by user");
            }
            monitor.subTask("Processing page " + (c + 1));
        }
        TrpPage page = pages.get(i);
        // try to get previously loaded JAXB transcript (cache is known to be non-null here)
        JAXBPageTranscript tr = cache.getPageTranscriptAtIndex(i);
        if (tr == null) {
            // not cached: build it from the page's current transcript
            TrpTranscriptMetadata md = page.getCurrentTranscript();
            tr = new JAXBPageTranscript(md);
            tr.build();
        }
        TrpPageType trpPage = tr.getPage();
        logger.debug("writing xlsx for page " + (i + 1) + "/" + doc.getNPages());
        List<TrpTextRegionType> textRegions = trpPage.getTextRegions(true);
        for (TrpTextRegionType r : textRegions) {
            for (TextLineType line : r.getTextLine()) {
                TrpTextLineType trpL = (TrpTextLineType) line;
                if (wordBased) {
                    for (WordType word : trpL.getWord()) {
                        TrpWordType w = (TrpWordType) word;
                        writeTagsForShapeElement(w, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), w.getId(), selectedTags);
                    }
                } else {
                    writeTagsForShapeElement(trpL, trpL.getUnicodeText(), String.valueOf(doc.getId()), String.valueOf(page.getPageNr()), r.getId(), trpL.getId(), "", selectedTags);
                }
            }
        }
        ++c;
        if (monitor != null) {
            // worked() expects the increment since the last call, not the running total
            monitor.worked(1);
        }
    }
    /*
     * auto size the columns
     */
    for (int i = 0; i < wb.getNumberOfSheets(); i++) {
        Iterator<Row> rowIterator = wb.getSheetAt(i).rowIterator();
        // the first row is the header row; its cell count determines how many columns are sized
        if (rowIterator.hasNext()) {
            Row headerRow = rowIterator.next();
            int numberOfCells = headerRow.getPhysicalNumberOfCells();
            for (int j = 0; j < numberOfCells; j++) {
                wb.getSheetAt(i).autoSizeColumn(j);
            }
        }
    }
    try {
        // means no tags at all
        if (wb.getNumberOfSheets() == 0) {
            throw new IOException("Sorry - No tags available for export");
        }
        // try-with-resources closes the stream even if writing fails
        try (FileOutputStream fOut = new FileOutputStream(exportPath)) {
            wb.write(fOut);
        }
    } catch (IOException e) {
        logger.error("Could not write xlsx to: " + exportPath, e);
        throw e;
    }
    logger.info("wrote xlsx to: " + exportPath);
}
Also used : NoTagsException(eu.transkribus.core.model.builder.NoTagsException) JAXBPageTranscript(eu.transkribus.core.model.beans.JAXBPageTranscript) TrpPage(eu.transkribus.core.model.beans.TrpPage) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) IOException(java.io.IOException) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) WordType(eu.transkribus.core.model.beans.pagecontent.WordType) TrpWordType(eu.transkribus.core.model.beans.pagecontent_trp.TrpWordType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TextLineType(eu.transkribus.core.model.beans.pagecontent.TextLineType) TrpTextLineType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextLineType) TrpTextRegionType(eu.transkribus.core.model.beans.pagecontent_trp.TrpTextRegionType) FileOutputStream(java.io.FileOutputStream) Iterator(java.util.Iterator) XSSFWorkbook(org.apache.poi.xssf.usermodel.XSSFWorkbook) Row(org.apache.poi.ss.usermodel.Row) TrpPageType(eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)

Example 15 with TrpTranscriptMetadata

use of eu.transkribus.core.model.beans.TrpTranscriptMetadata in project TranskribusCore by Transkribus.

the class AltoExporter method exportAltoFile.

/**
 * Exports the current transcript of a page as an ALTO file.
 * <p>
 * The PAGE XML at the transcript's URL is unmarshalled, marshalled to bytes and
 * transformed with an XSLT loaded from the classpath (word-level variant if
 * splitIntoWords is set). The result is written to a file named fileName inside
 * altoOutputDir.
 *
 * @param p the page whose current transcript is exported
 * @param fileName name of the ALTO file to create
 * @param altoOutputDir directory the ALTO file is written to
 * @param splitIntoWords if true, the word-level stylesheet is used
 * @return the written ALTO file
 * @throws IllegalArgumentException if p, fileName or altoOutputDir is null
 * @throws JAXBException declared for the PAGE XML (un)marshalling
 * @throws FileNotFoundException if the output file cannot be opened for writing
 * @throws TransformerException if the stylesheet is missing or cannot be compiled, the transformation
 *         fails, or a stream cannot be closed properly
 */
public File exportAltoFile(TrpPage p, final String fileName, File altoOutputDir, boolean splitIntoWords) throws JAXBException, FileNotFoundException, TransformerException {
    if (p == null || fileName == null || altoOutputDir == null) {
        throw new IllegalArgumentException("An argument is null!");
    }
    TrpTranscriptMetadata t = p.getCurrentTranscript();
    PcGtsType pc = PageXmlUtils.unmarshal(t.getUrl());
    StreamSource mySrc = new StreamSource();
    mySrc.setInputStream(new ByteArrayInputStream(PageXmlUtils.marshalToBytes(pc)));
    final String xsltResource = splitIntoWords ? PAGE_TO_ALTO_WORD_LEVEL_XSLT : PAGE_TO_ALTO_XSLT;
    InputStream is = XslTransformer.class.getClassLoader().getResourceAsStream(xsltResource);
    // getResourceAsStream returns null for a missing resource; fail with a clear message
    if (is == null) {
        throw new TransformerException("XSLT resource not found on classpath: " + xsltResource);
    }
    Transformer trans;
    // the stylesheet stream is only needed while the transformer is created
    try (InputStream xslIS = new BufferedInputStream(is)) {
        // the factory pattern supports different XSLT processors
        TransformerFactory transFact = TransformerFactory.newInstance();
        trans = transFact.newTransformer(new StreamSource(xslIS));
    } catch (java.io.IOException e) {
        // only close() can raise this here
        throw new TransformerException("Could not close XSLT resource: " + xsltResource, e);
    }
    File altoFile = new File(altoOutputDir.getAbsolutePath() + "/" + fileName);
    // close the output stream in any case so that no file handle is leaked
    try (FileOutputStream out = new FileOutputStream(altoFile)) {
        trans.transform(mySrc, new StreamResult(out));
    } catch (FileNotFoundException e) {
        throw e;
    } catch (java.io.IOException e) {
        // raised by close(); the file may be incomplete
        throw new TransformerException("Could not close ALTO output file: " + altoFile, e);
    }
    return altoFile;
}
Also used : TransformerFactory(javax.xml.transform.TransformerFactory) XslTransformer(eu.transkribus.core.util.XslTransformer) Transformer(javax.xml.transform.Transformer) StreamResult(javax.xml.transform.stream.StreamResult) ByteArrayInputStream(java.io.ByteArrayInputStream) BufferedInputStream(java.io.BufferedInputStream) BufferedInputStream(java.io.BufferedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) StreamSource(javax.xml.transform.stream.StreamSource) FileOutputStream(java.io.FileOutputStream) TrpTranscriptMetadata(eu.transkribus.core.model.beans.TrpTranscriptMetadata) PcGtsType(eu.transkribus.core.model.beans.pagecontent.PcGtsType) File(java.io.File)

Aggregations

TrpTranscriptMetadata (eu.transkribus.core.model.beans.TrpTranscriptMetadata)21 TrpPage (eu.transkribus.core.model.beans.TrpPage)14 JAXBPageTranscript (eu.transkribus.core.model.beans.JAXBPageTranscript)11 TrpPageType (eu.transkribus.core.model.beans.pagecontent_trp.TrpPageType)7 IOException (java.io.IOException)7 File (java.io.File)6 URL (java.net.URL)4 PcGtsType (eu.transkribus.core.model.beans.pagecontent.PcGtsType)3 FileOutputStream (java.io.FileOutputStream)3 ArrayList (java.util.ArrayList)3 Date (java.util.Date)3 JAXBException (javax.xml.bind.JAXBException)3 Rtf (com.tutego.jrtf.Rtf)2 TrpDoc (eu.transkribus.core.model.beans.TrpDoc)2 CustomTag (eu.transkribus.core.model.beans.customtags.CustomTag)2 Fptr (eu.transkribus.core.model.beans.mets.DivType.Fptr)2 FileType (eu.transkribus.core.model.beans.mets.FileType)2 Mets (eu.transkribus.core.model.beans.mets.Mets)2 TextLineType (eu.transkribus.core.model.beans.pagecontent.TextLineType)2 WordType (eu.transkribus.core.model.beans.pagecontent.WordType)2