Search in sources :

Example 1 with DedupRecord

use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.

the class DedupSimpleKeysStepWriter method checkIfUpdateIsNeeded.

/**
 * Tells whether the dedup record assigned to a harvested record carries a newer
 * update timestamp than the harvested record itself.
 *
 * @param hr harvested record to inspect; may be {@code null}
 * @return {@code true} if the dedup record's update time is strictly later than the
 *         harvested record's update time, or if the harvested record has no update
 *         time at all; {@code false} if {@code hr} is {@code null}, has no dedup
 *         record, the dedup record has no update time, or the harvested record was
 *         updated at the same time as or later than its dedup record
 */
protected boolean checkIfUpdateIsNeeded(HarvestedRecord hr) {
    // Without a record, a dedup record, or a dedup timestamp there is nothing to compare.
    DedupRecord dedup = (hr != null) ? hr.getDedupRecord() : null;
    if (dedup == null) {
        return false;
    }
    Date dedupUpdated = dedup.getUpdated();
    if (dedupUpdated == null) {
        return false;
    }

    // A harvested record without its own timestamp counts as older than the dedup record.
    Date harvestedUpdated = hr.getUpdated();
    if (harvestedUpdated == null) {
        return true;
    }
    return dedupUpdated.compareTo(harvestedUpdated) > 0;
}
Also used : DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Date(java.util.Date)

Example 2 with DedupRecord

use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.

the class DedupSimpleKeysStepProcessor method process.

/**
 * Loads the harvested records for the given ids and, for every pair of loaded
 * records that {@code matchRecords} reports as matching, makes both records
 * reference the same {@link DedupRecord}.
 *
 * <p>Three situations are handled for a matching pair:
 * <ul>
 *   <li>both records already have a dedup record: if the two differ, the one
 *       counted more often in this batch wins and the other is scheduled for
 *       re-pointing;</li>
 *   <li>neither record has a dedup record: a new one is persisted and assigned
 *       to both;</li>
 *   <li>exactly one record has a dedup record: it is assigned to the other
 *       record as well.</li>
 * </ul>
 *
 * @param idList ids of harvested records forming the current batch
 * @return the harvested records that were found, in the order of {@code idList};
 *         ids with no record are skipped with a warning
 * @throws Exception declared by the overridden method signature
 */
@Override
public List<HarvestedRecord> process(List<Long> idList) throws Exception {
    List<HarvestedRecord> hrList = new ArrayList<>();
    // count of DedupRecord in current batch
    Multiset<DedupRecord> dedupMap = HashMultiset.create();
    // Map of records that should be updated after processing of the batch;
    // used when merging two different DedupRecords into one
    // (key = surviving DedupRecord, value = DedupRecords to be replaced by it)
    Map<DedupRecord, Set<DedupRecord>> updateDedupRecordsMap = new HashMap<>();
    // Phase 1: load the records and count how often each DedupRecord occurs in the batch.
    for (Long id : idList) {
        HarvestedRecord currentHr = harvestedRecordDao.get(id);
        if (currentHr == null) {
            logger.warn("Missing record with id: " + id);
            continue;
        }
        DedupRecord currentDr = currentHr.getDedupRecord();
        if (currentDr != null) {
            dedupMap.add(currentDr);
        }
        hrList.add(currentHr);
    }
    // Phase 2: compare every unordered pair (i < j) of loaded records.
    for (int i = 0; i < hrList.size(); i++) {
        HarvestedRecord outerRec = hrList.get(i);
        for (int j = i + 1; j < hrList.size(); j++) {
            HarvestedRecord innerRec = hrList.get(j);
            if (matchRecords(outerRec, innerRec)) {
                // merge records, both already have assigned DedupRecord
                if (outerRec.getDedupRecord() != null && innerRec.getDedupRecord() != null) {
                    if (sameDedupRecords(outerRec.getDedupRecord(), innerRec.getDedupRecord())) {
                    // equal dedupRecord, nothing to do
                    } else {
                        // keep the DedupRecord with the higher count in this batch; ties favour the outer record's
                        DedupRecord moreFrequented = dedupMap.count(outerRec.getDedupRecord()) >= dedupMap.count(innerRec.getDedupRecord()) ? outerRec.getDedupRecord() : innerRec.getDedupRecord();
                        DedupRecord lessFrequented = sameDedupRecords(moreFrequented, outerRec.getDedupRecord()) ? innerRec.getDedupRecord() : outerRec.getDedupRecord();
                        outerRec.setDedupRecord(moreFrequented);
                        innerRec.setDedupRecord(moreFrequented);
                        lessFrequented.setUpdated(new Date());
                        // move one occurrence from the losing DedupRecord to the winning one
                        dedupMap.add(moreFrequented);
                        dedupMap.remove(lessFrequented);
                        // all occurrences of lessFrequented in database should be updated to moreFrequented later
                        if (harvestedRecordDao.existsByDedupRecord(lessFrequented)) {
                            updateDedupRecordsMap.computeIfAbsent(moreFrequented, key -> new HashSet<>()).add(lessFrequented);
                        }
                    }
                    continue;
                }
                // neither of the records has an assigned DedupRecord
                if (outerRec.getDedupRecord() == null && innerRec.getDedupRecord() == null) {
                    DedupRecord newDr = new DedupRecord();
                    newDr.setUpdated(new Date());
                    newDr = dedupRecordDAO.persist(newDr);
                    outerRec.setDedupRecord(newDr);
                    innerRec.setDedupRecord(newDr);
                    dedupMap.setCount(newDr, 2);
                    continue;
                }
                // if we got this far, exactly one of records has assigned DedupRecord
                DedupRecord dr = outerRec.getDedupRecord() != null ? outerRec.getDedupRecord() : innerRec.getDedupRecord();
                dr.setUpdated(new Date());
                outerRec.setDedupRecord(dr);
                innerRec.setDedupRecord(dr);
                dedupMap.add(dr);
            }
        }
    }
    // Phase 3: walk through map and update references, i.e. re-point every record
    // returned by the DAO for a replaced DedupRecord to the surviving one
    for (Map.Entry<DedupRecord, Set<DedupRecord>> entry : updateDedupRecordsMap.entrySet()) {
        for (DedupRecord updatedDR : entry.getValue()) {
            for (HarvestedRecord toBeUpdated : harvestedRecordDao.getByDedupRecord(updatedDR)) {
                toBeUpdated.setDedupRecord(entry.getKey());
            }
        }
    }
    return hrList;
}
Also used : Logger(org.slf4j.Logger) Date(java.util.Date) Multiset(com.google.common.collect.Multiset) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) Autowired(org.springframework.beans.factory.annotation.Autowired) HashMap(java.util.HashMap) ItemProcessor(org.springframework.batch.item.ItemProcessor) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) Component(org.springframework.stereotype.Component) HarvestedRecord(cz.mzk.recordmanager.server.model.HarvestedRecord) DedupRecordDAO(cz.mzk.recordmanager.server.oai.dao.DedupRecordDAO) HashMultiset(com.google.common.collect.HashMultiset) DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Map(java.util.Map) HarvestedRecordDAO(cz.mzk.recordmanager.server.oai.dao.HarvestedRecordDAO) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Date(java.util.Date) HashMap(java.util.HashMap) Map(java.util.Map) HarvestedRecord(cz.mzk.recordmanager.server.model.HarvestedRecord) HashSet(java.util.HashSet)

Example 3 with DedupRecord

use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.

the class UrlEnricherTest method krameriusUrlTest.

/**
 * Enriches an empty merged document from two local documents built from
 * {@code MZKKRAM_PROT_URL} and {@code MZKKRAM_UNKN_URL} and expects the merged
 * URL field to contain exactly {@code MZKKRAM_PROT_URL}.
 */
@Test
public void krameriusUrlTest() {
    // arrange
    DedupRecord dedupRecord = new DedupRecord();
    SolrInputDocument mergedDoc = new SolrInputDocument();
    List<SolrInputDocument> localDocs = new ArrayList<>();
    localDocs.add(newField(MZKKRAM_PROT_URL));
    localDocs.add(newField(MZKKRAM_UNKN_URL));
    List<String> expectedUrls = new ArrayList<>();
    expectedUrls.add(MZKKRAM_PROT_URL);

    // act
    UrlDedupRecordEnricher enricher = new UrlDedupRecordEnricher();
    enricher.enrich(dedupRecord, mergedDoc, localDocs);

    // assert: compared as arrays, so both content and order must match
    Object[] actualUrls = mergedDoc.getFieldValues(SolrFieldConstants.URL).toArray();
    Object[] expectedArray = expectedUrls.toArray();
    Assert.assertEquals(actualUrls, expectedArray);
}
Also used : SolrInputDocument(org.apache.solr.common.SolrInputDocument) UrlDedupRecordEnricher(cz.mzk.recordmanager.server.index.enrich.UrlDedupRecordEnricher) ArrayList(java.util.ArrayList) DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Test(org.testng.annotations.Test) AbstractTest(cz.mzk.recordmanager.server.AbstractTest)

Example 4 with DedupRecord

use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.

the class UrlEnricherTest method notDuplicitUrlTest.

/**
 * Enriches an empty merged document from three local documents and expects
 * the merged URL field to contain all three source URLs.
 */
@Test
public void notDuplicitUrlTest() {
    // arrange
    DedupRecord dedupRecord = new DedupRecord();
    SolrInputDocument mergedDoc = new SolrInputDocument();
    List<SolrInputDocument> localDocs = new ArrayList<>();
    localDocs.add(newField(MZK_ONLINE_MZK_URL));
    localDocs.add(newField(MZK_UNKNOWN_TRE_URL));
    localDocs.add(newField(MZK_PROTECTED_BRNO_URL));
    List<String> expectedUrls = new ArrayList<>();
    expectedUrls.add(MZK_ONLINE_MZK_URL);
    expectedUrls.add(MZK_UNKNOWN_TRE_URL);
    expectedUrls.add(MZK_PROTECTED_BRNO_URL);

    // act
    UrlDedupRecordEnricher enricher = new UrlDedupRecordEnricher();
    enricher.enrich(dedupRecord, mergedDoc, localDocs);

    // assert: only containment is checked, not order or exact size
    boolean allPresent = mergedDoc.getFieldValues(SolrFieldConstants.URL).containsAll(expectedUrls);
    Assert.assertTrue(allPresent);
}
Also used : SolrInputDocument(org.apache.solr.common.SolrInputDocument) UrlDedupRecordEnricher(cz.mzk.recordmanager.server.index.enrich.UrlDedupRecordEnricher) ArrayList(java.util.ArrayList) DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Test(org.testng.annotations.Test) AbstractTest(cz.mzk.recordmanager.server.AbstractTest)

Example 5 with DedupRecord

use of cz.mzk.recordmanager.server.model.DedupRecord in project RecordManager2 by moravianlibrary.

the class FulltextKrameriusDAOTest method getFullText.

/**
 * Fetches the full text for the dedup record with id 100 and expects a single
 * entry with the text "test indexace fulltextu".
 */
@Test
public void getFullText() {
    List<String> pages = fulltextKrameriusDao.getFullText(dedupRecordDao.get(100L));
    Assert.assertEquals(pages.size(), 1);
    Assert.assertEquals(pages.get(0), "test indexace fulltextu");
}
Also used : DedupRecord(cz.mzk.recordmanager.server.model.DedupRecord) Test(org.testng.annotations.Test) AbstractTest(cz.mzk.recordmanager.server.AbstractTest)

Aggregations

DedupRecord (cz.mzk.recordmanager.server.model.DedupRecord)14 AbstractTest (cz.mzk.recordmanager.server.AbstractTest)7 ArrayList (java.util.ArrayList)7 SolrInputDocument (org.apache.solr.common.SolrInputDocument)7 Test (org.testng.annotations.Test)7 UrlDedupRecordEnricher (cz.mzk.recordmanager.server.index.enrich.UrlDedupRecordEnricher)5 HarvestedRecord (cz.mzk.recordmanager.server.model.HarvestedRecord)4 Date (java.util.Date)4 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 Logger (org.slf4j.Logger)2 LoggerFactory (org.slf4j.LoggerFactory)2 Autowired (org.springframework.beans.factory.annotation.Autowired)2 Component (org.springframework.stereotype.Component)2 HashMultiset (com.google.common.collect.HashMultiset)1 Multiset (com.google.common.collect.Multiset)1 DedupRecordEnricher (cz.mzk.recordmanager.server.index.enrich.DedupRecordEnricher)1