Search in sources :

Example 1 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

the class ZakonyProLidiFulltextWriter method write.

/**
 * Stores a single-page fulltext for every record of the chunk that has none yet.
 *
 * <p>For each id the harvested record is loaded through {@code harvestedRecordDao}; records
 * that already carry at least one {@link FulltextKramerius} are skipped. Otherwise the text is
 * obtained via {@code getNextFulltext(..)} / {@code reader.next()} and, when non-empty,
 * persisted as page {@code "1"} with order {@code 1}. The Hibernate session is flushed and
 * cleared once after the whole chunk.
 *
 * @param items unique ids of the harvested records to enrich
 * @throws Exception whatever the download, the DAO or the session propagate
 */
@Override
public void write(List<? extends HarvestedRecordUniqueId> items) throws Exception {
    for (HarvestedRecordUniqueId uniqueId : items) {
        HarvestedRecord hr = harvestedRecordDao.get(uniqueId);
        if (!hr.getFulltextKramerius().isEmpty()) {
            continue;
        }
        // NOTE(review): presumably this opens `client` and prepares `reader` for the
        // record's URL - confirm against getNextFulltext.
        getNextFulltext(uniqueId.getRecordId());
        try {
            String fulltext = reader.next();
            if (fulltext.isEmpty()) {
                logger.warn("Fulltext from " + FULLTEXT_URL + uniqueId.getRecordId() + " is empty.");
            } else {
                FulltextKramerius fk = new FulltextKramerius();
                // Explicit charset: the no-arg getBytes() encodes with the platform default,
                // which makes the stored bytes depend on the JVM configuration.
                fk.setFulltext(fulltext.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                fk.setUuidPage(uniqueId.getRecordId());
                fk.setPage("1");
                fk.setOrder(1L);
                hr.setFulltextKramerius(Collections.singletonList(fk));
                hr.setUpdated(new Date());
                harvestedRecordDao.persist(hr);
            }
        } finally {
            // Close the client even when reading or persisting fails, so a failing
            // record does not leave it open.
            client.close();
        }
    }
    sessionFactory.getCurrentSession().flush();
    sessionFactory.getCurrentSession().clear();
}
Also used : HarvestedRecordUniqueId(cz.mzk.recordmanager.server.model.HarvestedRecord.HarvestedRecordUniqueId) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) Date(java.util.Date) HarvestedRecord(cz.mzk.recordmanager.server.model.HarvestedRecord)

Example 2 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

the class KrameriusFulltextProcessor method process.

/**
 * Re-reads the given record through the DAO and, when allowed, replaces its Kramerius
 * fulltext pages with freshly downloaded ones.
 *
 * <p>Pages are downloaded only when the record's Kramerius policy is {@code "public"} or
 * {@code downloadPrivateFulltexts} is set. Records whose model is {@code "periodical"} use
 * {@code fulltexter.getFulltextForRoot}; every other model uses
 * {@code fulltexter.getFulltextObjects}. If the raw record cannot be parsed as Dublin Core,
 * or no pages were obtained, the record read from the DAO is returned unmodified.
 *
 * @param item record to process; only its unique id is used for the lookup
 * @return the record read from the DAO, possibly with a new list of fulltext pages
 * @throws Exception whatever the DAO, the parser or the fulltexter propagate
 */
@Override
public HarvestedRecord process(HarvestedRecord item) throws Exception {
    logger.debug("Processing Harvested Record: " + item.toString() + " uniqueId: " + item.getUniqueId());
    String policy;
    String model;
    // read complete HarvestedRecord using DAO
    HarvestedRecord rec = recordDao.findByIdAndHarvestConfiguration(item.getUniqueId().getRecordId(), confId);
    InputStream is = new ByteArrayInputStream(rec.getRawRecord());
    // get Kramerius policy and model from record
    try {
        DublinCoreRecord dcRecord = parser.parseRecord(is);
        MetadataDublinCoreRecord mdrc = new MetadataDublinCoreRecord(dcRecord);
        policy = mdrc.getPolicyKramerius();
        model = mdrc.getModelKramerius();
    } catch (InvalidDcException e) {
        // record is left untouched; the exception is passed along so its cause is not lost
        logger.warn("InvalidDcException for record with id:" + item.getUniqueId(), e);
        return rec;
    }
    // Constant-first comparisons: a record without policy/model metadata is treated as
    // non-public / non-periodical instead of failing with a NullPointerException.
    if ("public".equals(policy) || downloadPrivateFulltexts) {
        logger.debug("Processor: privacy condition fulfilled, reading pages");
        String rootUuid = rec.getUniqueId().getRecordId();
        List<FulltextKramerius> pages;
        if ("periodical".equals(model)) {
            logger.info("Using (periodical) fultexter \"for root\" for uuid " + rootUuid + ".");
            pages = fulltexter.getFulltextForRoot(rootUuid);
        } else {
            logger.info("Using (monograph/default) fultexter \"for parent\" for uuid " + rootUuid + ".");
            pages = fulltexter.getFulltextObjects(rootUuid);
        }
        // nothing downloaded => keep whatever fulltext the record already has
        if (pages == null || pages.isEmpty()) {
            return rec;
        }
        // delete old FulltextKramerius from database before adding new ones
        fmDao.deleteFulltext(rec.getId());
        rec.setFulltextKramerius(pages);
    } else {
        logger.debug("Processor: privacy condition is NOT fulfilled, skipping record");
    }
    return rec;
}
Also used : DublinCoreRecord(cz.mzk.recordmanager.server.dc.DublinCoreRecord) MetadataDublinCoreRecord(cz.mzk.recordmanager.server.metadata.MetadataDublinCoreRecord) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) InvalidDcException(cz.mzk.recordmanager.server.dc.InvalidDcException) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) MetadataDublinCoreRecord(cz.mzk.recordmanager.server.metadata.MetadataDublinCoreRecord) HarvestedRecord(cz.mzk.recordmanager.server.model.HarvestedRecord)

Example 3 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

the class KrameriusFulltexterFedora method getFulltextObjects.

/*
 * Loads the page metadata of rootUuid via getPagesMetadata, numbers the
 * pages 1..n in the order they were returned and attaches the OCR bytes to
 * every page whose OCR may be downloaded. Returns the same list.
 */
@Override
public List<FulltextKramerius> getFulltextObjects(String rootUuid) throws IOException {
    List<FulltextKramerius> pages = getPagesMetadata(rootUuid);
    for (int index = 0; index < pages.size(); index++) {
        FulltextKramerius page = pages.get(index);
        // OCR is requested for public pages, and for private ones only when
        // private downloads are enabled and an auth token is available
        boolean ocrAllowed = !page.isPrivate() || (downloadPrivateFulltexts && authToken != null);
        if (ocrAllowed) {
            page.setFulltext(getOCRBytes(page.getUuidPage(), page.isPrivate()));
        }
        // order is 1-based
        page.setOrder(Long.valueOf(index + 1L));
    }
    return pages;
}
Also used : FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius)

Example 4 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

the class KrameriusFulltexterFedora method getPagesMetadata.

/*
 * Reads the children listing of rootUuid from the Kramerius API
 * (<kramApiUrl>/item/<rootUuid>/children) and returns one FulltextKramerius
 * per child whose model is "page", filled with the page uuid, the trimmed
 * page number and the private flag (policy other than "public").
 *
 * A listing that cannot be read as JSON yields an empty list; a single
 * malformed child entry is logged and skipped without affecting the others.
 */
public List<FulltextKramerius> getPagesMetadata(String rootUuid) throws IOException {
    List<FulltextKramerius> pagesMetadataList = new ArrayList<FulltextKramerius>();
    String pagesListUrl = kramApiUrl + "/item/" + rootUuid + "/children";
    logger.debug("Going to read pages metadata from: {}", pagesListUrl);
    JSONArray pagesJson;
    try {
        pagesJson = readKrameriusJSON(pagesListUrl);
    } catch (JSONException e) {
        logger.warn(e.getMessage());
        pagesJson = new JSONArray();
    }
    /*
     * for each JSONObject in array - extract page id, page number, policy &
     * check model
     */
    for (int i = 0; i < pagesJson.length(); i++) {
        try {
            JSONObject obj = pagesJson.getJSONObject(i);
            // model MUST equal "page" or it will be ignored
            String model = (String) obj.get("model");
            if (!model.equals("page")) {
                logger.debug("Model is not \"page\", Model is \"{}\" for uuid: {}", model, obj.get("pid"));
                continue;
            }
            FulltextKramerius ftm = new FulltextKramerius();
            String policy = (String) obj.get("policy");
            ftm.setPrivate(!policy.equals("public"));
            String pid = (String) obj.get("pid");
            ftm.setUuidPage(pid);
            JSONObject details = (JSONObject) obj.get("details");
            // "pagenumber" is used because "title" is sometimes malformed in Kramerius' JSON
            String page = (String) details.get("pagenumber");
            ftm.setPage(page.trim());
            pagesMetadataList.add(ftm);
        } catch (JSONException | ClassCastException e) {
            // The casts above fail with ClassCastException when a value has an unexpected
            // JSON type (e.g. a number or JSON null); skip that entry like a missing key
            // instead of aborting the whole listing.
            logger.error(e.getMessage());
        }
    }
    return pagesMetadataList;
}
Also used : JSONObject(org.json.JSONObject) JSONArray(org.json.JSONArray) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) ArrayList(java.util.ArrayList) JSONException(org.json.JSONException)

Example 5 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

the class KrameriusFulltexterFedora method getFulltextForRoot.

/**
 * Collects the pages found anywhere below {@code rootUuid} and loads their OCR.
 *
 * <p>The children listing ({@code <kramApiUrl>/item/<uuid>/children}) of the root and of every
 * non-page descendant is read. Children with model {@code "page"} become
 * {@link FulltextKramerius} entries; all other children are scheduled for the same treatment.
 * Non-page siblings are visited in the order the API lists them, depth first, so the
 * sequential {@code order} assigned afterwards follows the listing order of volumes/issues.
 * Page numbers longer than 50 characters are cut to 50 before trimming.
 *
 * @param rootUuid uuid of the top-level object
 * @return pages in traversal order, numbered from 1, with OCR attached where allowed
 * @throws IOException propagated from reading the listings or the OCR
 */
@Override
public List<FulltextKramerius> getFulltextForRoot(String rootUuid) throws IOException {
    List<FulltextKramerius> pagesMetadataList = new ArrayList<FulltextKramerius>();
    // uuids of non-page objects whose children still have to be listed
    LinkedList<String> nonPagesUuids = new LinkedList<String>();
    nonPagesUuids.add(rootUuid);
    while (!nonPagesUuids.isEmpty()) {
        String processedUuid = nonPagesUuids.poll();
        // read json object for processedUuid
        String childrenListUrl = kramApiUrl + "/item/" + processedUuid + "/children";
        JSONArray pagesJson;
        try {
            pagesJson = readKrameriusJSON(childrenListUrl);
        } catch (JSONException e) {
            logger.warn(e.getMessage());
            pagesJson = new JSONArray();
        }
        // non-page children of the current object, in listing order
        List<String> childContainers = new ArrayList<String>();
        for (int i = 0; i < pagesJson.length(); i++) {
            try {
                JSONObject obj = pagesJson.getJSONObject(i);
                String model = (String) obj.get("model");
                String pid = (String) obj.get("pid");
                if (model.equals("page")) {
                    logger.debug("Got periodical page: {}", pid);
                    FulltextKramerius ftm = new FulltextKramerius();
                    String policy = (String) obj.get("policy");
                    ftm.setPrivate(!policy.equals("public"));
                    ftm.setUuidPage(pid);
                    JSONObject details = (JSONObject) obj.get("details");
                    // "pagenumber" is used because "title" is sometimes malformed in Kramerius' JSON
                    String page = (String) details.get("pagenumber");
                    // TODO data sometimes contain garbage values - this should be considered fallback solution
                    page = page.length() > 50 ? page.substring(0, 50) : page;
                    ftm.setPage(page.trim());
                    pagesMetadataList.add(ftm);
                } else {
                    // other models are searched by the while cycle
                    childContainers.add(pid);
                }
            } catch (JSONException | ClassCastException e) {
                logger.error(e.getMessage());
            }
        }
        // Insert at the head while keeping listing order. Pushing the children one by one
        // (head insert) combined with poll() (head remove) visited siblings in reverse,
        // which numbered the pages of later volumes/issues before those of earlier ones.
        nonPagesUuids.addAll(0, childContainers);
    }
    long pageOrder = 0L;
    for (FulltextKramerius fm : pagesMetadataList) {
        pageOrder++;
        /*
         * really try to get OCR only if page is not private(=is public), or
         * download of private fulltext is allowed and authToken is set)
         */
        if (!fm.isPrivate() || (downloadPrivateFulltexts && authToken != null)) {
            byte[] ocr = getOCRBytes(fm.getUuidPage(), fm.isPrivate());
            fm.setFulltext(ocr);
        }
        fm.setOrder(pageOrder);
    }
    return pagesMetadataList;
}
Also used : FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) ArrayList(java.util.ArrayList) JSONArray(org.json.JSONArray) JSONException(org.json.JSONException) LinkedList(java.util.LinkedList) JSONObject(org.json.JSONObject)

Aggregations

FulltextKramerius (cz.mzk.recordmanager.server.model.FulltextKramerius)8 ArrayList (java.util.ArrayList)4 HarvestedRecord (cz.mzk.recordmanager.server.model.HarvestedRecord)3 HarvestedRecordUniqueId (cz.mzk.recordmanager.server.model.HarvestedRecord.HarvestedRecordUniqueId)2 ByteArrayInputStream (java.io.ByteArrayInputStream)2 InputStream (java.io.InputStream)2 Date (java.util.Date)2 JSONArray (org.json.JSONArray)2 JSONException (org.json.JSONException)2 JSONObject (org.json.JSONObject)2 DublinCoreRecord (cz.mzk.recordmanager.server.dc.DublinCoreRecord)1 InvalidDcException (cz.mzk.recordmanager.server.dc.InvalidDcException)1 MetadataDublinCoreRecord (cz.mzk.recordmanager.server.metadata.MetadataDublinCoreRecord)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 LinkedList (java.util.LinkedList)1 DOMSource (javax.xml.transform.dom.DOMSource)1 StreamResult (javax.xml.transform.stream.StreamResult)1 SolrQuery (org.apache.solr.client.solrj.SolrQuery)1 QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)1