use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class ManuscriptoriumFulltextWriter method write.
/**
 * Processes one chunk of harvested records: every record that has no fulltext
 * yet gets a single-page {@link FulltextKramerius} and a refreshed TEI element
 * in its raw XML, and is then persisted. Records that already carry fulltext
 * are skipped untouched.
 *
 * NOTE(review): {@code getNextFulltext} presumably (re)initialises
 * {@code fulltextReader}, {@code teiReader} and {@code client} for the given
 * record id - confirm against its implementation.
 *
 * @param items unique ids of the harvested records to process
 * @throws Exception if reading the fulltext, parsing or serialising the XML,
 *                   or persisting the record fails
 */
@Override
public void write(List<? extends HarvestedRecordUniqueId> items) throws Exception {
    for (HarvestedRecordUniqueId uniqueId : items) {
        HarvestedRecord hr = harvestedRecordDao.get(uniqueId);
        if (!hr.getFulltextKramerius().isEmpty()) {
            continue;
        }
        getNextFulltext(uniqueId.getRecordId());
        try {
            String fulltext = fulltextReader.next();
            if (fulltext.isEmpty()) {
                logger.warn("Fulltext from " + FULLTEXT_URL + uniqueId.getRecordId() + " is empty.");
                // the finally block below still closes the client
                continue;
            }

            FulltextKramerius fk = new FulltextKramerius();
            // Encode explicitly as UTF-8 instead of relying on the platform default charset.
            fk.setFulltext(fulltext.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            fk.setUuidPage(uniqueId.getRecordId());
            fk.setPage("1");
            fk.setOrder(1L);
            hr.setFulltextKramerius(Collections.singletonList(fk));
            hr.setUpdated(new Date());

            InputStream is = new ByteArrayInputStream(hr.getRawRecord());
            Document doc = documentBuilder.parse(removeFormating(is));

            // remove old TEI element from DC
            NodeList tei = doc.getElementsByTagName(TEI);
            if (tei != null && tei.getLength() > 0) {
                Node oldTei = tei.item(0);
                oldTei.getParentNode().removeChild(oldTei);
            }

            // get new TEI element from source document
            Document teiDoc = documentBuilder.parse(removeFormating(teiReader));
            Node newNode = teiDoc.getElementsByTagName(TEI).item(0).cloneNode(true);
            doc.adoptNode(newNode);

            // add TEI element to DC; use the document element rather than the first
            // child, which may be a comment, processing instruction or doctype
            doc.getDocumentElement().appendChild(newNode);

            DOMSource source = new DOMSource(doc.getDocumentElement());
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            StreamResult result = new StreamResult(bos);
            transformer.transform(source, result);
            hr.setRawRecord(bos.toByteArray());
            harvestedRecordDao.persist(hr);
        } finally {
            // release the client even when parsing or transformation fails
            client.close();
        }
    }
    sessionFactory.getCurrentSession().flush();
    sessionFactory.getCurrentSession().clear();
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltexterSolr method getFulltextObjects.
/**
 * Collects the fulltext pages matching {@code field = rootUuid} (restricted to
 * the page model) from Solr, requesting them in windows of {@code MAX_PAGES}
 * rows until all reported documents were fetched or {@code PAGE_LIMIT} is
 * reached.
 *
 * Harvesting is best-effort: if a Solr request fails, the error is logged and
 * the pages gathered so far are returned.
 *
 * @param field    name of the Solr field that holds the parent identifier
 * @param rootUuid identifier value to match in {@code field}
 * @return pages collected so far, possibly empty; never {@code null}
 * @throws IOException declared for callers; not thrown directly by this code
 */
protected List<FulltextKramerius> getFulltextObjects(String field, String rootUuid) throws IOException {
    List<FulltextKramerius> result = new ArrayList<FulltextKramerius>();
    // The query text does not depend on the paging window, so build it only once.
    String queryString = SolrUtils.createEscapedFieldQuery(field, rootUuid) + " AND " + SolrUtils.createEscapedFieldQuery(FEDORA_MODEL_FIELD, FEDORA_MODEL_PAGE);
    int start = 0;
    while (true) {
        logger.debug("Downloading fulltext for pages {} to {}", start, start + MAX_PAGES);
        SolrQuery query = new SolrQuery();
        query.setQuery(queryString);
        query.set("fl", FL_FIELDS);
        query.setRows(MAX_PAGES);
        query.setStart(start);
        long numFound;
        try {
            QueryResponse response = solr.query(query);
            SolrDocumentList documents = response.getResults();
            numFound = documents.getNumFound();
            result.addAll(asPages(documents));
        } catch (Exception ex) {
            // pass the exception itself so the stack trace is not lost
            logger.error("Harvesting of fulltext for uuid: {} FAILED", rootUuid, ex);
            return result;
        }
        start += MAX_PAGES;
        if (start >= numFound) {
            // every reported document has been requested; '>=' avoids one extra
            // empty request when numFound is an exact multiple of MAX_PAGES
            break;
        }
        if (start >= PAGE_LIMIT) {
            // only reported when pages are actually left behind
            logger.error("Harvesting of fulltext for uuid: {} REACHED LIMIT {} for number of pages for one record", rootUuid, PAGE_LIMIT);
            break;
        }
    }
    return result;
}
use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.
the class KrameriusFulltexterSolr method asPages.
/**
 * Converts Solr page documents into {@link FulltextKramerius} entries.
 *
 * The given list is sorted in place with {@code KrameriusPageComparator} and
 * each document then receives a 1-based order following that sorting. When a
 * document has no page number, its order is used as the page label; labels
 * longer than 50 characters are cut to the first 50.
 *
 * @param documents Solr documents of the pages; reordered by this call
 * @return one entry per document, in sorted order
 */
private List<FulltextKramerius> asPages(SolrDocumentList documents) {
    List<FulltextKramerius> converted = new ArrayList<FulltextKramerius>(documents.size());
    Collections.sort(documents, KrameriusPageComparator.INSTANCE);
    long position = 1L;
    for (SolrDocument solrDoc : documents) {
        FulltextKramerius entry = new FulltextKramerius();

        String pageUuid = (String) solrDoc.getFieldValue(PID_FIELD);
        logger.debug("Harvesting fulltext from Kramerius for page uuid: {}", pageUuid);
        String text = (String) solrDoc.getFieldValue(FULLTEXT_FIELD);

        String label = (String) solrDoc.getFieldValue(PAGE_NUMBER_FIELD);
        if (label == null) {
            label = String.valueOf(position);
        }
        // TODO data sometimes contain garbage values - truncation is only a fallback solution
        if (label.length() > 50) {
            label = label.substring(0, 50);
        }

        entry.setUuidPage(pageUuid);
        if (text != null) {
            entry.setFulltext(text.getBytes(Charsets.UTF_8));
        }
        entry.setOrder(position);
        entry.setPage(label);
        converted.add(entry);
        position++;
    }
    return converted;
}
Aggregations