use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.
the class DedupSimpleKeysStepWriter method checkIfUpdateIsNeeded.
/**
 * Decides, from the updated timestamps alone, whether the given harvested record needs an update.
 *
 * @param hr harvested record to inspect, may be {@code null}
 * @return {@code false} when {@code hr} is {@code null}, has no dedup record, or its dedup
 *         record has no updated time; {@code true} when the harvested record itself has no
 *         updated time; otherwise {@code true} only if the dedup record's updated time is
 *         strictly later than the harvested record's updated time
 */
protected boolean checkIfUpdateIsNeeded(HarvestedRecord hr) {
    final DedupRecord dedup = (hr == null) ? null : hr.getDedupRecord();
    final Date dedupUpdated = (dedup == null) ? null : dedup.getUpdated();
    if (dedupUpdated == null) {
        // nothing to compare against
        return false;
    }
    final Date harvestedUpdated = hr.getUpdated();
    // a harvested record without a timestamp is always reported as needing an update
    return harvestedUpdated == null || dedupUpdated.compareTo(harvestedUpdated) > 0;
}
use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.
the class DedupSimpleKeysStepProcessor method process.
/**
 * Loads the harvested records for the given ids, compares every pair of them with
 * {@code matchRecords} and makes each matching pair share a single {@link DedupRecord}.
 * <p>
 * For a matching pair:
 * <ul>
 * <li>both records already have a DedupRecord: if they differ, the one occurring more often in
 * this batch (the outer record's one on a tie) is kept for both records, and the other one is
 * scheduled for re-pointing if {@code harvestedRecordDao.existsByDedupRecord} reports it as still used;</li>
 * <li>neither record has a DedupRecord: a new one is created, persisted and assigned to both;</li>
 * <li>exactly one record has a DedupRecord: it is assigned to both records.</li>
 * </ul>
 * After all pairs are processed, harvested records returned by
 * {@code harvestedRecordDao.getByDedupRecord} for each scheduled DedupRecord are re-pointed to
 * the DedupRecord that replaced it.
 *
 * @param idList ids of harvested records forming the current batch
 * @return the harvested records that were found for the given ids, in the order of {@code idList};
 *         ids without a record are logged and skipped
 * @throws Exception declared by the overridden method
 */
@Override
public List<HarvestedRecord> process(List<Long> idList) throws Exception {
List<HarvestedRecord> hrList = new ArrayList<>();
// count of DedupRecord in current batch
Multiset<DedupRecord> dedupMap = HashMultiset.create();
// Map of records that should be updated after processing of the batch,
// used in merging two different DedupRecords into one:
// key = DedupRecord that is kept, value = DedupRecords replaced by it
Map<DedupRecord, Set<DedupRecord>> updateDedupRecordsMap = new HashMap<>();
// phase 1: load records and count how often each DedupRecord occurs in the batch
for (Long id : idList) {
HarvestedRecord currentHr = harvestedRecordDao.get(id);
if (currentHr == null) {
logger.warn("Missing record with id: " + id);
continue;
}
DedupRecord currentDr = currentHr.getDedupRecord();
if (currentDr != null) {
dedupMap.add(currentDr);
}
hrList.add(currentHr);
}
// phase 2: compare every unordered pair (i < j) of loaded records
for (int i = 0; i < hrList.size(); i++) {
HarvestedRecord outerRec = hrList.get(i);
for (int j = i + 1; j < hrList.size(); j++) {
HarvestedRecord innerRec = hrList.get(j);
if (matchRecords(outerRec, innerRec)) {
// merge records, both already have assigned DedupRecord
if (outerRec.getDedupRecord() != null && innerRec.getDedupRecord() != null) {
if (sameDedupRecords(outerRec.getDedupRecord(), innerRec.getDedupRecord())) {
// equal dedupRecord, nothing to do
} else {
// keep the DedupRecord with the higher count in this batch; on a tie the outer record's one wins
DedupRecord moreFrequented = dedupMap.count(outerRec.getDedupRecord()) >= dedupMap.count(innerRec.getDedupRecord()) ? outerRec.getDedupRecord() : innerRec.getDedupRecord();
DedupRecord lessFrequented = sameDedupRecords(moreFrequented, outerRec.getDedupRecord()) ? innerRec.getDedupRecord() : outerRec.getDedupRecord();
outerRec.setDedupRecord(moreFrequented);
innerRec.setDedupRecord(moreFrequented);
lessFrequented.setUpdated(new Date());
// one record moved from lessFrequented to moreFrequented: adjust the batch counts by one occurrence each
dedupMap.add(moreFrequented);
dedupMap.remove(lessFrequented);
// all occurrences of lessFrequented in database should be updated to moreFrequented later
if (harvestedRecordDao.existsByDedupRecord(lessFrequented)) {
updateDedupRecordsMap.computeIfAbsent(moreFrequented, key -> new HashSet<>()).add(lessFrequented);
}
}
// pair handled, go on with the next inner record
continue;
}
// neither of the records has an assigned DedupRecord: create and persist a new shared one
if (outerRec.getDedupRecord() == null && innerRec.getDedupRecord() == null) {
DedupRecord newDr = new DedupRecord();
newDr.setUpdated(new Date());
newDr = dedupRecordDAO.persist(newDr);
outerRec.setDedupRecord(newDr);
innerRec.setDedupRecord(newDr);
dedupMap.setCount(newDr, 2);
continue;
}
// if we got this far, exactly one of records has assigned DedupRecord
DedupRecord dr = outerRec.getDedupRecord() != null ? outerRec.getDedupRecord() : innerRec.getDedupRecord();
dr.setUpdated(new Date());
outerRec.setDedupRecord(dr);
innerRec.setDedupRecord(dr);
dedupMap.add(dr);
}
}
}
// walk through map and update references
for (Map.Entry<DedupRecord, Set<DedupRecord>> entry : updateDedupRecordsMap.entrySet()) {
for (DedupRecord updatedDR : entry.getValue()) {
for (HarvestedRecord toBeUpdated : harvestedRecordDao.getByDedupRecord(updatedDR)) {
toBeUpdated.setDedupRecord(entry.getKey());
}
}
}
return hrList;
}
use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.
the class UrlEnricherTest method krameriusUrlTest.
/**
 * Enriches an empty merged document from two local documents built from
 * {@code MZKKRAM_PROT_URL} and {@code MZKKRAM_UNKN_URL} and expects the merged URL field
 * to contain exactly {@code MZKKRAM_PROT_URL}.
 */
@Test
public void krameriusUrlTest() {
    List<SolrInputDocument> localDocs = new ArrayList<>();
    localDocs.add(newField(MZKKRAM_PROT_URL));
    localDocs.add(newField(MZKKRAM_UNKN_URL));

    SolrInputDocument mergedDoc = new SolrInputDocument();
    new UrlDedupRecordEnricher().enrich(new DedupRecord(), mergedDoc, localDocs);

    List<String> expectedUrls = new ArrayList<>();
    expectedUrls.add(MZKKRAM_PROT_URL);

    Object[] actualUrls = mergedDoc.getFieldValues(SolrFieldConstants.URL).toArray();
    Assert.assertEquals(actualUrls, expectedUrls.toArray());
}
use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.
the class UrlEnricherTest method notDuplicitUrlTest.
/**
 * Enriches an empty merged document from three local documents with different URLs and
 * expects all three URLs to be present in the merged URL field.
 */
@Test
public void notDuplicitUrlTest() {
    List<String> expectedUrls = new ArrayList<>();
    expectedUrls.add(MZK_ONLINE_MZK_URL);
    expectedUrls.add(MZK_UNKNOWN_TRE_URL);
    expectedUrls.add(MZK_PROTECTED_BRNO_URL);

    List<SolrInputDocument> localDocs = new ArrayList<>();
    localDocs.add(newField(MZK_ONLINE_MZK_URL));
    localDocs.add(newField(MZK_UNKNOWN_TRE_URL));
    localDocs.add(newField(MZK_PROTECTED_BRNO_URL));

    SolrInputDocument mergedDoc = new SolrInputDocument();
    new UrlDedupRecordEnricher().enrich(new DedupRecord(), mergedDoc, localDocs);

    boolean allPresent = mergedDoc.getFieldValues(SolrFieldConstants.URL).containsAll(expectedUrls);
    Assert.assertTrue(allPresent);
}
use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.
the class FulltextKrameriusDAOTest method getFullText.
/**
 * Fetches the full text for the dedup record with id 100 and expects exactly one entry
 * with the text "test indexace fulltextu".
 */
@Test
public void getFullText() {
    List<String> texts = fulltextKrameriusDao.getFullText(dedupRecordDao.get(100L));
    Assert.assertEquals(texts.size(), 1);
    Assert.assertEquals(texts.get(0), "test indexace fulltextu");
}
Aggregations