use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.
the class MetadataMarcRecord method getShortTitles.
/**
 * get {@link ShortTitle} of record
 *
 * @return all 245anp and 240anp values, but only from fields that also contain subfield 'b'
 */
@Override
public List<ShortTitle> getShortTitles() {
    List<ShortTitle> results = new ArrayList<>();
    Long shortTitleCounter = 0L;
    for (String tag : TITLE_TAGS) {
        for (DataField df : underlayingMarc.getDataFields(tag)) {
            if (df.getSubfield('b') == null)
                continue;
            String titleText = parseTitleValue(df, SHORT_TITLE_SUBFIELDS);
            if (!titleText.isEmpty()) {
                results.add(ShortTitle.create(titleText, ++shortTitleCounter, MetadataUtils.similarityEnabled(df, titleText)));
            }
        }
    }
    return results;
}
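To make the selection rule concrete, here is a minimal sketch (not project code) that builds a 245 field with marc4j and reproduces the a/n/p filtering by hand. parseTitleValue and SHORT_TITLE_SUBFIELDS are project internals, so treating the result as a plain concatenation of the selected subfields is an assumption.

import org.marc4j.marc.DataField;
import org.marc4j.marc.MarcFactory;
import org.marc4j.marc.Subfield;

// Illustrative only: mimics the short-title selection on a hand-built 245 field.
public class ShortTitleSketch {
    public static void main(String[] args) {
        MarcFactory factory = MarcFactory.newInstance();
        DataField df = factory.newDataField("245", '1', '0');
        df.addSubfield(factory.newSubfield('a', "Collected works."));
        df.addSubfield(factory.newSubfield('b', "a critical edition")); // subtitle
        df.addSubfield(factory.newSubfield('n', "Volume 2"));
        df.addSubfield(factory.newSubfield('p', "Poems"));
        // getShortTitles() skips any field that has no subfield 'b'
        if (df.getSubfield('b') != null) {
            StringBuilder shortTitle = new StringBuilder();
            for (Subfield sf : df.getSubfields()) {
                if ("anp".indexOf(sf.getCode()) >= 0) {       // keep only subfields a, n, p
                    if (shortTitle.length() > 0) shortTitle.append(' ');
                    shortTitle.append(sf.getData());
                }
            }
            System.out.println(shortTitle); // Collected works. Volume 2 Poems
        }
    }
}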
use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.
the class HashingDedupKeyParser method parse.
@Override
public HarvestedRecord parse(HarvestedRecord record, MetadataRecord metadataRecord) throws DedupKeyParserException {
    record.setShouldBeProcessed(metadataRecord.matchFilter());
    // not a dedup key
    record.setUpvApplicationId(metadataRecord.getUpvApplicationId());
    if (!record.getHarvestedFrom().isGenerateDedupKeys()) {
        return record;
    }
    boolean dedupKeysChanged = false;
    boolean oaiTimestampChanged = false;
    DedupKeysencapsulator encapsulator = new DedupKeysencapsulator();
    List<Title> titles = new ArrayList<>();
    for (Title title : metadataRecord.getTitle()) {
        title.setTitleStr(MetadataUtils.normalizeAndShorten(title.getTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (title.getTitleStr().isEmpty())
            continue;
        if (!titles.contains(title)) {
            titles.add(title);
        }
    }
    encapsulator.setTitles(titles);
    List<ShortTitle> shortTitles = new ArrayList<>();
    for (ShortTitle shortTitle : metadataRecord.getShortTitles()) {
        shortTitle.setShortTitleStr(MetadataUtils.normalizeAndShorten(shortTitle.getShortTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (shortTitle.getShortTitleStr().isEmpty())
            continue;
        if (!shortTitles.contains(shortTitle)) {
            shortTitles.add(shortTitle);
        }
    }
    encapsulator.setShortTitles(shortTitles);
    encapsulator.setIsbns(metadataRecord.getISBNs());
    encapsulator.setIssns(metadataRecord.getISSNs());
    encapsulator.setIsmns(metadataRecord.getISMNs());
    encapsulator.setCnbs(metadataRecord.getCNBs());
    encapsulator.setPublicationYear(metadataRecord.getPublicationYear());
    List<HarvestedRecordFormatEnum> formatEnums = metadataRecord.getDetectedFormatList();
    encapsulator.setFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
    encapsulator.setAuthorAuthKey(MetadataUtils.shorten(metadataRecord.getAuthorAuthKey(), EFFECTIVE_AUTHOR_AUTH_KEY_LENGTH));
    encapsulator.setAuthorString(MetadataUtils.normalizeAndShorten(metadataRecord.getAuthorString(), EFFECTIVE_AUTHOR_LENGTH));
    encapsulator.setScale(metadataRecord.getScale());
    encapsulator.setUuid(metadataRecord.getUUId());
    encapsulator.setPages(metadataRecord.getPageCount());
    encapsulator.setIssnSeries(MetadataUtils.normalize(metadataRecord.getISSNSeries()));
    encapsulator.setIssnSeriesOrder(MetadataUtils.normalize(metadataRecord.getISSNSeriesOrder()));
    encapsulator.setOclcs(metadataRecord.getOclcs());
    encapsulator.setClusterId(metadataRecord.getClusterId());
    encapsulator.setRaw001Id(metadataRecord.getRaw001Id());
    encapsulator.setSourceInfoT(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoT(), EFFECTIVE_SOURCE_INFO_LENGTH));
    encapsulator.setSourceInfoG(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoG(), EFFECTIVE_SOURCE_INFO_LENGTH));
    encapsulator.setSourceInfoX(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoX(), EFFECTIVE_LENGTH_30));
    encapsulator.setEans(metadataRecord.getEANs());
    encapsulator.setPublisherNumbers(metadataRecord.getPublisherNumber());
    encapsulator.setLanguages(new HashSet<>(metadataRecord.getLanguages()));
    String computedHash = computeHashValue(encapsulator);
    String oldHash = record.getDedupKeysHash();
    // temporal hash detects repeated processing of the same record during one batch
    String temporalHash = record.getTemporalDedupHash() == null ? "0000000000000000000000000000000000000000" : record.getTemporalDedupHash();
    if ((!temporalHash.equals(computedHash)) && (oldHash == null || oldHash.isEmpty() || !computedHash.equals(oldHash))) {
        // keys changed, update them in database
        dedupKeysChanged = true;
        // drop old keys
        harvestedRecordDao.dropDedupKeys(record);
        if (record.getHarvestedFrom() != null)
            record.setWeight(metadataRecord.getWeight(record.getHarvestedFrom().getBaseWeight()));
        // assign new keys
        record.setTitles(encapsulator.getTitles());
        record.setIsbns(encapsulator.getIsbns());
        record.setIssns(encapsulator.getIssns());
        record.setIsmns(encapsulator.getIsmns());
        record.setCnb(encapsulator.getCnbs());
        record.setPublicationYear(encapsulator.getPublicationYear());
        record.setPhysicalFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
        record.setAuthorAuthKey(encapsulator.getAuthorAuthKey());
        record.setAuthorString(encapsulator.getAuthorString());
        record.setScale(encapsulator.getScale());
        record.setUuid(encapsulator.getUuid());
        record.setPages(encapsulator.getPages());
        record.setIssnSeries(encapsulator.getIssnSeries());
        record.setIssnSeriesOrder(encapsulator.getIssnSeriesOrder());
        record.setOclcs(encapsulator.getOclcs());
        record.setLanguages(metadataRecord.getLanguages());
        record.setClusterId(encapsulator.getClusterId());
        record.setRaw001Id(encapsulator.getRaw001Id());
        record.setSourceInfoG(encapsulator.getSourceInfoG());
        record.setSourceInfoX(encapsulator.getSourceInfoX());
        record.setSourceInfoT(encapsulator.getSourceInfoT());
        record.setEans(encapsulator.getEans());
        record.setShortTitles(encapsulator.getShortTitles());
        record.setPublisherNumbers(metadataRecord.getPublisherNumber());
        record.setTemporalDedupHash(computedHash);
    }
    record.setDedupKeysHash(computedHash);
    if (record.getOaiTimestamp() != null && record.getTemporalOldOaiTimestamp() != null && !record.getOaiTimestamp().equals(record.getTemporalOldOaiTimestamp())) {
        oaiTimestampChanged = true;
    } else {
        oaiTimestampChanged = false;
    }
    // decide whether record should be deduplicated
    if (dedupKeysChanged) {
        // new record or change in keys
        record.setNextDedupFlag(true);
    } else {
        // keys are equal
        if (oaiTimestampChanged) {
            // keys unchanged but OAI timestamp changed,
            // don't deduplicate
            record.setNextDedupFlag(false);
        } else {
            // neither keys nor OAI timestamp changed,
            // keep previous dedup flag
            // this may happen during repeated harvesting before deduplication
        }
    }
    return record;
}
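The branching above reduces to three outcomes for the next-dedup flag. The helper below is a hypothetical restatement, not part of HashingDedupKeyParser; previousFlag stands for the flag already stored on the record, which the real code simply leaves untouched.

// Minimal sketch of the flag decision at the end of parse(); names are illustrative.
static boolean nextDedupFlag(boolean dedupKeysChanged, boolean oaiTimestampChanged, boolean previousFlag) {
    if (dedupKeysChanged) {
        return true;        // new record or changed keys: deduplicate again
    }
    if (oaiTimestampChanged) {
        return false;       // same keys, only the OAI timestamp moved: skip deduplication
    }
    return previousFlag;    // nothing changed: keep the previously stored flag
}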
use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.
the class HashingDedupKeyParser method computeHashValue.
/**
 * Compute SHA-1 hash of deduplication keys from given {@link DedupKeysencapsulator}
 *
 * @param encapsulator container holding the deduplication keys
 * @return lower-case hexadecimal SHA-1 digest of the keys, or an empty string if the digest cannot be computed
 */
protected String computeHashValue(final DedupKeysencapsulator encapsulator) {
    try {
        // change of hash function also requires changes in database row
        MessageDigest md = MessageDigest.getInstance("SHA-1");
        for (Title t : encapsulator.getTitles()) {
            md.update(t.getTitleStr().getBytes("utf-8"));
        }
        for (Isbn i : encapsulator.getIsbns()) {
            md.update(i.getIsbn().byteValue());
        }
        for (Issn i : encapsulator.getIssns()) {
            md.update(i.getIssn().getBytes());
        }
        for (Ismn i : encapsulator.getIsmns()) {
            md.update(i.getIsmn().byteValue());
        }
        for (Cnb c : encapsulator.getCnbs()) {
            md.update(c.getCnb().getBytes());
        }
        if (encapsulator.getPublicationYear() != null) {
            md.update(encapsulator.getPublicationYear().byteValue());
        }
        for (HarvestedRecordFormat hrfe : encapsulator.getFormats()) {
            md.update(hrfe.getName().getBytes());
        }
        if (encapsulator.getAuthorAuthKey() != null) {
            md.update(encapsulator.getAuthorAuthKey().getBytes());
        }
        if (encapsulator.getAuthorString() != null) {
            md.update(encapsulator.getAuthorString().getBytes());
        }
        if (encapsulator.getScale() != null) {
            md.update(encapsulator.getScale().byteValue());
        }
        if (encapsulator.getUuid() != null) {
            md.update(encapsulator.getUuid().getBytes());
        }
        if (encapsulator.getPages() != null) {
            md.update(encapsulator.getPages().byteValue());
        }
        if (encapsulator.getIssnSeries() != null) {
            md.update(encapsulator.getIssnSeries().getBytes());
        }
        if (encapsulator.getIssnSeriesOrder() != null) {
            md.update(encapsulator.getIssnSeriesOrder().getBytes());
        }
        for (Oclc o : encapsulator.getOclcs()) {
            md.update(o.getOclcStr().getBytes());
        }
        for (String l : encapsulator.getLanguages()) {
            md.update(l.getBytes());
        }
        if (encapsulator.getClusterId() != null) {
            md.update(encapsulator.getClusterId().getBytes());
        }
        if (encapsulator.getRaw001Id() != null) {
            md.update(encapsulator.getRaw001Id().getBytes());
        }
        if (encapsulator.getSourceInfoT() != null) {
            md.update(encapsulator.getSourceInfoT().getBytes());
        }
        if (encapsulator.getSourceInfoX() != null) {
            md.update(encapsulator.getSourceInfoX().getBytes());
        }
        if (encapsulator.getSourceInfoG() != null) {
            md.update(encapsulator.getSourceInfoG().getBytes());
        }
        for (Ean ean : encapsulator.getEans()) {
            md.update(ean.getEan().byteValue());
        }
        for (PublisherNumber publisherNumber : encapsulator.getPublisherNumbers()) {
            md.update(publisherNumber.getPublisherNumber().getBytes("utf-8"));
        }
        for (ShortTitle st : encapsulator.getShortTitles()) {
            md.update(st.getShortTitleStr().getBytes("utf-8"));
        }
        byte[] hash = md.digest();
        StringBuilder sb = new StringBuilder();
        for (byte b : hash) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    } catch (NoSuchAlgorithmException e) {
        // should never be thrown, SHA-1 is required by the Java specification
    } catch (UnsupportedEncodingException uee) {
        throw new DedupKeyParserException("Encoding problems in hash computation", uee);
    }
    return "";
}
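For reference, the digest-and-hex-encode part can be exercised on its own. The snippet below is a stand-alone sketch with made-up key values, not project code; it also uses StandardCharsets.UTF_8 instead of the "utf-8" string literal, which removes the need for the UnsupportedEncodingException branch, although switching the project code over is a separate decision.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Stand-alone sketch: feed key values into a SHA-1 digest in a fixed order, then hex-encode.
public class DedupHashSketch {
    public static void main(String[] args) throws NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("SHA-1");
        for (String key : new String[]{"some title", "9788000000000", "cze"}) { // sample values only
            md.update(key.getBytes(StandardCharsets.UTF_8));
        }
        StringBuilder sb = new StringBuilder();
        for (byte b : md.digest()) {
            sb.append(String.format("%02x", b)); // 40-character lower-case hex string
        }
        System.out.println(sb);
    }
}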
use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.
the class HarvestedRecordDAOHibernate method dropDedupKeys.
@Override
public void dropDedupKeys(HarvestedRecord hr) {
    if (hr == null || hr.getId() == null) {
        return;
    }
    Session session = sessionFactory.getCurrentSession();
    // don't delete keys of entities that are not managed by the session
    if (!session.contains(hr)) {
        System.out.println("NOT CONT");
        return;
    }
    hr.setAuthorAuthKey(null);
    hr.setAuthorString(null);
    hr.setClusterId(null);
    hr.setPages(null);
    hr.setPublicationYear(null);
    hr.setRaw001Id(null);
    hr.setScale(null);
    hr.setUuid(null);
    hr.setSourceInfoT(null);
    hr.setSourceInfoX(null);
    hr.setSourceInfoG(null);
    hr.setIssnSeries(null);
    hr.setIssnSeriesOrder(null);
    hr.setWeight(null);
    List<Title> titles = hr.getTitles();
    hr.setTitles(new ArrayList<>());
    for (Title t : titles) {
        session.delete(t);
    }
    List<ShortTitle> shortTitles = hr.getShortTitles();
    hr.setShortTitles(new ArrayList<>());
    for (ShortTitle st : shortTitles) {
        session.delete(st);
    }
    List<Isbn> isbns = hr.getIsbns();
    hr.setIsbns(new ArrayList<>());
    for (Isbn i : isbns) {
        session.delete(i);
    }
    List<Issn> issns = hr.getIssns();
    hr.setIssns(new ArrayList<>());
    for (Issn i : issns) {
        session.delete(i);
    }
    List<Ismn> ismns = hr.getIsmns();
    hr.setIsmns(new ArrayList<>());
    for (Ismn i : ismns) {
        session.delete(i);
    }
    List<Oclc> oclcs = hr.getOclcs();
    hr.setOclcs(new ArrayList<>());
    for (Oclc o : oclcs) {
        session.delete(o);
    }
    List<Cnb> cnbs = hr.getCnb();
    hr.setCnb(new ArrayList<>());
    for (Cnb c : cnbs) {
        session.delete(c);
    }
    List<Ean> eans = hr.getEans();
    hr.setEans(new ArrayList<>());
    for (Ean ean : eans) {
        session.delete(ean);
    }
    List<HarvestedRecordFormat> physicalFormats = hr.getPhysicalFormats();
    hr.getPhysicalFormats().clear();
    for (HarvestedRecordFormat hrf : physicalFormats) {
        session.delete(hrf);
    }
    hr.setLanguages(new ArrayList<>());
    session.update(hr);
    session.flush();
}
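Each key type above follows the same pattern: replace the collection on the parent with an empty list, delete the now-detached child entities explicitly, and finally update and flush the parent. The helper below only illustrates that pattern and is not part of HarvestedRecordDAOHibernate; the explicit deletes suggest the mappings do not rely on orphanRemoval cascades, which is an assumption here.

import java.util.ArrayList;
import java.util.List;
import org.hibernate.Session;

// Hypothetical helper mirroring the drop pattern used for titles, ISBNs, ISSNs, etc.
final class DedupKeyCleanup {
    static <T> List<T> deleteAll(Session session, List<T> detachedChildren) {
        for (T child : detachedChildren) {
            session.delete(child);   // remove each old key row
        }
        return new ArrayList<>();    // fresh empty list to assign back to the parent
    }
}

// usage sketch: hr.setIsbns(DedupKeyCleanup.deleteAll(session, hr.getIsbns()));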