Search in sources :

Example 6 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

The method write of the class ManuscriptoriumFulltextWriter.

/**
 * Stores a fulltext page for every harvested record in the chunk that does not have one yet
 * and replaces the TEI element inside the record's raw XML.
 *
 * <p>Records that already carry at least one {@link FulltextKramerius} entry are skipped.
 * For the remaining ones the fulltext is obtained via {@code getNextFulltext} and
 * {@code fulltextReader}; an empty fulltext is only logged and the record is left untouched.
 * After the whole chunk is processed the current session is flushed and cleared.
 *
 * @param items unique ids of the harvested records to process
 * @throws Exception if loading, parsing, transforming or persisting a record fails
 */
@Override
public void write(List<? extends HarvestedRecordUniqueId> items) throws Exception {
    for (HarvestedRecordUniqueId uniqueId : items) {
        HarvestedRecord hr = harvestedRecordDao.get(uniqueId);
        if (!hr.getFulltextKramerius().isEmpty()) {
            continue;
        }
        getNextFulltext(uniqueId.getRecordId());
        // NOTE(review): client is presumably opened by getNextFulltext - confirm.
        // Closing it in finally releases it even when reading, parsing or
        // transforming below throws.
        try {
            String fulltext = fulltextReader.next();
            if (fulltext.isEmpty()) {
                logger.warn("Fulltext from " + FULLTEXT_URL + uniqueId.getRecordId() + " is empty.");
            } else {
                FulltextKramerius fk = new FulltextKramerius();
                // Explicit charset: the platform default differs between machines.
                fk.setFulltext(fulltext.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                fk.setUuidPage(uniqueId.getRecordId());
                fk.setPage("1");
                fk.setOrder(1L);
                hr.setFulltextKramerius(Collections.singletonList(fk));
                hr.setUpdated(new Date());
                hr.setRawRecord(replaceTeiElement(hr.getRawRecord()));
                harvestedRecordDao.persist(hr);
            }
        } finally {
            client.close();
        }
    }
    sessionFactory.getCurrentSession().flush();
    sessionFactory.getCurrentSession().clear();
}

/**
 * Returns a serialized copy of {@code rawRecord} in which the first TEI element (if any)
 * is removed and the TEI element parsed from {@code teiReader} is appended to the root element.
 *
 * @param rawRecord XML of the harvested record
 * @return the rewritten XML as produced by {@code transformer}
 * @throws Exception if parsing or transforming fails
 */
private byte[] replaceTeiElement(byte[] rawRecord) throws Exception {
    InputStream is = new ByteArrayInputStream(rawRecord);
    Document doc = documentBuilder.parse(removeFormating(is));
    // remove old TEI element from DC
    NodeList oldTei = doc.getElementsByTagName(TEI);
    if (oldTei != null && oldTei.getLength() > 0) {
        Node stale = oldTei.item(0);
        stale.getParentNode().removeChild(stale);
    }
    // get new TEI element from source document
    // NOTE(review): item(0) is null when the source document contains no TEI element,
    // which ends in a NullPointerException here - confirm that this cannot happen.
    Document teiDoc = documentBuilder.parse(removeFormating(teiReader));
    Node newTei = teiDoc.getElementsByTagName(TEI).item(0).cloneNode(true);
    doc.adoptNode(newTei);
    // add TEI element to DC; the document element is used because the first child
    // of a document may be a comment or doctype, which cannot hold child elements
    doc.getDocumentElement().appendChild(newTei);
    DOMSource source = new DOMSource(doc.getDocumentElement());
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    transformer.transform(source, new StreamResult(bos));
    return bos.toByteArray();
}
Also used : DOMSource(javax.xml.transform.dom.DOMSource) HarvestedRecordUniqueId(cz.mzk.recordmanager.server.model.HarvestedRecord.HarvestedRecordUniqueId) StreamResult(javax.xml.transform.stream.StreamResult) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Document(org.w3c.dom.Document) Date(java.util.Date) ByteArrayInputStream(java.io.ByteArrayInputStream) HarvestedRecord(cz.mzk.recordmanager.server.model.HarvestedRecord)

Example 7 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

The method getFulltextObjects of the class KrameriusFulltexterSolr.

/**
 * Downloads the fulltext pages belonging to {@code rootUuid} from Solr, {@code MAX_PAGES}
 * documents per request, and converts them with {@code asPages}.
 *
 * <p>Paging stops when every matching document has been requested or when the offset
 * reaches {@code PAGE_LIMIT}. If a request fails, the failure is logged and the pages
 * collected so far are returned (best effort) instead of propagating the exception.
 *
 * @param field    name of the Solr field that is matched against {@code rootUuid}
 * @param rootUuid uuid of the record whose pages are downloaded
 * @return the collected pages; possibly incomplete when a request failed or the limit was hit
 * @throws IOException declared by the signature; failures inside the Solr request itself
 *                     are caught and logged here
 */
protected List<FulltextKramerius> getFulltextObjects(String field, String rootUuid) throws IOException {
    List<FulltextKramerius> result = new ArrayList<FulltextKramerius>();
    // The query text does not depend on the paging offset, so it is built only once.
    String queryString = SolrUtils.createEscapedFieldQuery(field, rootUuid) + " AND " + SolrUtils.createEscapedFieldQuery(FEDORA_MODEL_FIELD, FEDORA_MODEL_PAGE);
    int start = 0;
    while (true) {
        logger.debug("Downloading fulltext for pages {} to {}", start, start + MAX_PAGES);
        SolrQuery query = new SolrQuery();
        query.setQuery(queryString);
        query.set("fl", FL_FIELDS);
        query.setRows(MAX_PAGES);
        query.setStart(start);
        long numFound;
        try {
            QueryResponse response = solr.query(query);
            SolrDocumentList documents = response.getResults();
            numFound = documents.getNumFound();
            // NOTE(review): asPages numbers the pages of every batch starting from 1, so
            // order values repeat once a record spans several batches - confirm intended.
            result.addAll(asPages(documents));
        } catch (Exception ex) {
            // Pass the exception itself so that the stack trace and cause are not lost.
            logger.error("Harvesting of fulltext for uuid: " + rootUuid + " FAILED", ex);
            return result;
        }
        start += MAX_PAGES;
        if (start >= numFound) {
            // Everything has been requested; ">=" avoids one extra empty request when
            // numFound is an exact multiple of MAX_PAGES.
            return result;
        }
        if (start >= PAGE_LIMIT) {
            // Reported only when pages are really left behind.
            logger.error("Harvesting of fulltext for uuid: {} REACHED LIMIT {} for number of pages for one record", rootUuid, PAGE_LIMIT);
            return result;
        }
    }
}
Also used : QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) ArrayList(java.util.ArrayList) SolrDocumentList(org.apache.solr.common.SolrDocumentList) SolrQuery(org.apache.solr.client.solrj.SolrQuery) IOException(java.io.IOException)

Example 8 with FulltextKramerius

use of cz.mzk.recordmanager.server.model.FulltextKramerius in project RecordManager2 by moravianlibrary.

The method asPages of the class KrameriusFulltexterSolr.

/**
 * Converts Solr documents into {@link FulltextKramerius} pages.
 *
 * <p>The passed list is sorted in place with {@code KrameriusPageComparator.INSTANCE};
 * pages are then numbered from 1 in that order. A missing page number falls back to the
 * order value, and page numbers longer than 50 characters are cut to the first 50.
 * The fulltext is stored as UTF-8 bytes and is left unset when the document has none.
 *
 * @param documents Solr documents of one result batch; reordered by this call
 * @return one page per document, in sorted order
 */
private List<FulltextKramerius> asPages(SolrDocumentList documents) {
    Collections.sort(documents, KrameriusPageComparator.INSTANCE);
    List<FulltextKramerius> converted = new ArrayList<FulltextKramerius>(documents.size());
    long position = 1L;
    for (SolrDocument solrDoc : documents) {
        String pageUuid = (String) solrDoc.getFieldValue(PID_FIELD);
        logger.debug("Harvesting fulltext from Kramerius for page uuid: {}", pageUuid);
        String text = (String) solrDoc.getFieldValue(FULLTEXT_FIELD);
        String label = (String) solrDoc.getFieldValue(PAGE_NUMBER_FIELD);
        if (label == null) {
            // no page number in the index - use the position within the batch
            label = String.valueOf(position);
        }
        // TODO data sometimes contain garbage values - this should be considered fallback solution
        if (label.length() > 50) {
            label = label.substring(0, 50);
        }

        FulltextKramerius current = new FulltextKramerius();
        current.setUuidPage(pageUuid);
        if (text != null) {
            current.setFulltext(text.getBytes(Charsets.UTF_8));
        }
        current.setOrder(position);
        current.setPage(label);
        converted.add(current);
        position++;
    }
    return converted;
}
Also used : SolrDocument(org.apache.solr.common.SolrDocument) FulltextKramerius(cz.mzk.recordmanager.server.model.FulltextKramerius) ArrayList(java.util.ArrayList)

Aggregations

FulltextKramerius (cz.mzk.recordmanager.server.model.FulltextKramerius)8 ArrayList (java.util.ArrayList)4 HarvestedRecord (cz.mzk.recordmanager.server.model.HarvestedRecord)3 HarvestedRecordUniqueId (cz.mzk.recordmanager.server.model.HarvestedRecord.HarvestedRecordUniqueId)2 ByteArrayInputStream (java.io.ByteArrayInputStream)2 InputStream (java.io.InputStream)2 Date (java.util.Date)2 JSONArray (org.json.JSONArray)2 JSONException (org.json.JSONException)2 JSONObject (org.json.JSONObject)2 DublinCoreRecord (cz.mzk.recordmanager.server.dc.DublinCoreRecord)1 InvalidDcException (cz.mzk.recordmanager.server.dc.InvalidDcException)1 MetadataDublinCoreRecord (cz.mzk.recordmanager.server.metadata.MetadataDublinCoreRecord)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 LinkedList (java.util.LinkedList)1 DOMSource (javax.xml.transform.dom.DOMSource)1 StreamResult (javax.xml.transform.stream.StreamResult)1 SolrQuery (org.apache.solr.client.solrj.SolrQuery)1 QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)1