use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class ZakonyProLidiFulltextWriter method write.
/**
 * Downloads and stores the fulltext for every record of the chunk that does
 * not have any fulltext yet.
 *
 * For each id the harvested record is loaded; records that already carry
 * fulltext are skipped. Otherwise the fulltext is requested via
 * {@code getNextFulltext}, read from {@code reader} and, when non-empty,
 * stored as a single page (page "1", order 1) on the record, which is then
 * persisted. The Hibernate session is flushed and cleared once per chunk.
 *
 * @param items unique ids of the harvested records to process
 * @throws Exception if downloading, reading or persisting fails
 */
@Override
public void write(List<? extends HarvestedRecordUniqueId> items) throws Exception {
    for (HarvestedRecordUniqueId uniqueId : items) {
        HarvestedRecord hr = harvestedRecordDao.get(uniqueId);
        if (!hr.getFulltextKramerius().isEmpty()) {
            continue;
        }
        // presumably (re)initialises `client` and `reader` for this record - confirm
        getNextFulltext(uniqueId.getRecordId());
        try {
            String fulltext = reader.next();
            if (fulltext == null || fulltext.isEmpty()) {
                logger.warn("Fulltext from " + FULLTEXT_URL + uniqueId.getRecordId() + " is empty.");
            } else {
                FulltextKramerius fk = new FulltextKramerius();
                // explicit charset: the stored bytes must not depend on the JVM default encoding
                fk.setFulltext(fulltext.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                fk.setUuidPage(uniqueId.getRecordId());
                fk.setPage("1");
                fk.setOrder(1L);
                hr.setFulltextKramerius(Collections.singletonList(fk));
                hr.setUpdated(new Date());
                harvestedRecordDao.persist(hr);
            }
        } finally {
            // release the client even when reading or persisting fails
            client.close();
        }
    }
    sessionFactory.getCurrentSession().flush();
    sessionFactory.getCurrentSession().clear();
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltextProcessor method process.
/**
 * Loads the complete harvested record and, when allowed by its Kramerius
 * policy, replaces its stored fulltext pages with freshly downloaded ones.
 *
 * The policy and model are read from the record's Dublin Core metadata.
 * Pages are downloaded only when the policy is "public" or downloading of
 * private fulltexts is enabled. Periodicals are read with the "for root"
 * fulltexter method, everything else with the "for parent" one. Existing
 * fulltext is deleted only after a non-empty page list has been obtained.
 *
 * @param item record whose unique id identifies the record to process
 * @return the record loaded from the DAO, with new fulltext pages set when
 *         they were downloaded, otherwise unchanged
 * @throws Exception if loading the record or downloading pages fails
 */
@Override
public HarvestedRecord process(HarvestedRecord item) throws Exception {
    logger.debug("Processing Harvested Record: " + item.toString() + " uniqueId: " + item.getUniqueId());
    String policy;
    String model;
    // read complete HarvestedRecord using DAO
    HarvestedRecord rec = recordDao.findByIdAndHarvestConfiguration(item.getUniqueId().getRecordId(), confId);
    InputStream is = new ByteArrayInputStream(rec.getRawRecord());
    // get Kramerius policy from record
    try {
        DublinCoreRecord dcRecord = parser.parseRecord(is);
        MetadataDublinCoreRecord mdrc = new MetadataDublinCoreRecord(dcRecord);
        policy = mdrc.getPolicyKramerius();
        model = mdrc.getModelKramerius();
    } catch (InvalidDcException e) {
        logger.warn("InvalidDcException for record with id:" + item.getUniqueId());
        logger.warn(e.getMessage());
        // doesn't do anything, just returns rec from DAO and writes a message into log
        return rec;
    }
    // modify read HarvestedRecord only if following condition is fulfilled;
    // constant-first equals keeps a missing policy from throwing (treated as not public)
    if ("public".equals(policy) || downloadPrivateFulltexts) {
        logger.debug("Processor: privacy condition fulfilled, reading pages");
        String rootUuid = rec.getUniqueId().getRecordId();
        List<FulltextKramerius> pages;
        // a missing model falls through to the monograph/default branch
        if ("periodical".equals(model)) {
            logger.info("Using (periodical) fultexter \"for root\" for uuid " + rootUuid + ".");
            pages = fulltexter.getFulltextForRoot(rootUuid);
        } else {
            logger.info("Using (monograph/default) fultexter \"for parent\" for uuid " + rootUuid + ".");
            pages = fulltexter.getFulltextObjects(rootUuid);
        }
        // if we got no pages => do nothing, return original record
        if (pages == null || pages.isEmpty()) {
            return rec;
        }
        // delete old FulltextKramerius from database before adding new ones
        fmDao.deleteFulltext(rec.getId());
        rec.setFulltextKramerius(pages);
    } else {
        logger.debug("Processor: privacy condition is NOT fulfilled, skipping record");
    }
    return rec;
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltexterFedora method getFulltextObjects.
/**
 * Reads the page metadata for the given root uuid, downloads the OCR text of
 * each page that may be downloaded and numbers the pages from 1 in list order.
 *
 * @param rootUuid uuid whose pages should be loaded
 * @return the pages returned by {@code getPagesMetadata}, with order set on
 *         every page and fulltext set on the downloadable ones
 * @throws IOException if reading metadata or OCR fails
 */
@Override
public List<FulltextKramerius> getFulltextObjects(String rootUuid) throws IOException {
    List<FulltextKramerius> pages = getPagesMetadata(rootUuid);
    long order = 0L;
    for (FulltextKramerius page : pages) {
        order++;
        String uuid = page.getUuidPage();
        boolean restricted = page.isPrivate();
        // OCR is fetched for public pages, or for private ones when private
        // downloads are enabled and an auth token is available
        boolean downloadable = !restricted || (downloadPrivateFulltexts && authToken != null);
        if (downloadable) {
            page.setFulltext(getOCRBytes(uuid, restricted));
        }
        page.setOrder(order);
    }
    return pages;
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltexterFedora method getPagesMetadata.
/**
 * Gets basic page metadata from the JSON children list received from the
 * Kramerius API for the specified uuid.
 *
 * Only children whose model is "page" are returned. For each of them the
 * privacy flag (policy other than "public"), the page uuid and the page
 * number are filled in. A child whose JSON is missing a field or has a field
 * of an unexpected type is logged and skipped. When the children list itself
 * cannot be read as JSON, a warning is logged and an empty list is returned.
 *
 * @param rootUuid uuid of the object whose children are listed
 * @return list of FulltextKramerius objects, one per page child
 * @throws IOException if reading from the Kramerius API fails
 */
public List<FulltextKramerius> getPagesMetadata(String rootUuid) throws IOException {
    // longest page label kept; same cap as applied in getFulltextForRoot
    final int maxPageLength = 50;
    List<FulltextKramerius> pagesMetadataList = new ArrayList<>();
    String pagesListUrl = kramApiUrl + "/item/" + rootUuid + "/children";
    logger.debug("Going to read pages metadata from: {}", pagesListUrl);
    JSONArray pagesJson;
    try {
        pagesJson = readKrameriusJSON(pagesListUrl);
    } catch (JSONException e) {
        logger.warn(e.getMessage());
        pagesJson = new JSONArray();
    }
    /*
     * for each JSONObject in array - extract page id, page number, policy &
     * check model
     */
    for (int i = 0; i < pagesJson.length(); i++) {
        try {
            JSONObject obj = pagesJson.getJSONObject(i);
            // model MUST equal "page" or it will be ignored
            String model = (String) obj.get("model");
            if (!model.equals("page")) {
                logger.debug("Model is not \"page\", Model is \"{}\" for uuid: {}", model, obj.get("pid"));
                continue;
            }
            String policy = (String) obj.get("policy");
            String pid = (String) obj.get("pid");
            JSONObject details = (JSONObject) obj.get("details");
            // "pagenumber" is used because "title" is sometimes malformed in Kramerius' JSON
            String page = (String) details.get("pagenumber");
            // data sometimes contain garbage values - cut them to a sane length
            if (page.length() > maxPageLength) {
                page = page.substring(0, maxPageLength);
            }
            FulltextKramerius ftm = new FulltextKramerius();
            ftm.setPrivate(!policy.equals("public"));
            ftm.setUuidPage(pid);
            ftm.setPage(page.trim());
            pagesMetadataList.add(ftm);
        } catch (JSONException | ClassCastException e) {
            // a field of an unexpected type must skip only this child, not abort the listing
            logger.error(e.getMessage());
        }
    }
    return pagesMetadataList;
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltexterFedora method getFulltextForRoot.
/**
 * Collects all pages found anywhere below the given root uuid and downloads
 * their OCR text.
 *
 * Starting with the root, the children list of each pending uuid is read.
 * Children with model "page" become page entries; all other children are
 * queued so that their own children are examined too. Afterwards the
 * collected pages are numbered from 1 in collection order and OCR is fetched
 * for every page that may be downloaded.
 *
 * @param rootUuid uuid of the top-level object
 * @return all collected pages with order set, and fulltext set where allowed
 * @throws IOException if reading from the Kramerius API fails
 */
@Override
public List<FulltextKramerius> getFulltextForRoot(String rootUuid) throws IOException {
    List<FulltextKramerius> collectedPages = new ArrayList<FulltextKramerius>();
    // uuids of non-page objects whose children still have to be examined;
    // push() and poll() both work on the head, so the newest uuid is taken first
    // NOTE(review): this affects the resulting page order - confirm it is intended
    LinkedList<String> pendingUuids = new LinkedList<String>();
    pendingUuids.add(rootUuid);
    while (!pendingUuids.isEmpty()) {
        String currentUuid = pendingUuids.poll();
        String childrenUrl = kramApiUrl + "/item/" + currentUuid + "/children";
        JSONArray children;
        try {
            children = readKrameriusJSON(childrenUrl);
        } catch (JSONException e) {
            logger.warn(e.getMessage());
            children = new JSONArray();
        }
        for (int idx = 0; idx < children.length(); idx++) {
            try {
                JSONObject child = children.getJSONObject(idx);
                String childModel = (String) child.get("model");
                String childPid = (String) child.get("pid");
                if (!childModel.equals("page")) {
                    // not a page: remember it, the while loop will descend into it
                    pendingUuids.push(childPid);
                    continue;
                }
                logger.debug("Got periodical page: {}", childPid);
                FulltextKramerius pageEntry = new FulltextKramerius();
                String childPolicy = (String) child.get("policy");
                pageEntry.setPrivate(!childPolicy.equals("public"));
                pageEntry.setUuidPage(childPid);
                JSONObject details = (JSONObject) child.get("details");
                // "pagenumber" is used because "title" is sometimes malformed in Kramerius' JSON
                String pageLabel = (String) details.get("pagenumber");
                // TODO data sometimes contain garbage values - this should be considered fallback solution
                if (pageLabel.length() > 50) {
                    pageLabel = pageLabel.substring(0, 50);
                }
                pageEntry.setPage(pageLabel.trim());
                collectedPages.add(pageEntry);
            } catch (JSONException | ClassCastException e) {
                logger.error(e.getMessage());
            }
        }
    }
    long order = 0L;
    for (FulltextKramerius pageEntry : collectedPages) {
        order++;
        String uuid = pageEntry.getUuidPage();
        boolean restricted = pageEntry.isPrivate();
        // OCR is fetched for public pages, or for private ones when private
        // downloads are enabled and an auth token is available
        boolean downloadable = !restricted || (downloadPrivateFulltexts && authToken != null);
        if (downloadable) {
            pageEntry.setFulltext(getOCRBytes(uuid, restricted));
        }
        pageEntry.setOrder(order);
    }
    return collectedPages;
}
Aggregations