Search in sources :

Example 1 with ShortTitle

use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.

the class MetadataMarcRecord method getShortTitles.

/**
 * Collects the {@link ShortTitle}s of this record.
 *
 * Every data field of each tag listed in {@code TITLE_TAGS} is visited.
 * Fields that have no subfield 'b' are skipped; for the others the title text
 * is produced by {@code parseTitleValue} using {@code SHORT_TITLE_SUBFIELDS}.
 * Each non-empty text yields one short title, numbered 1, 2, 3, ... in the
 * order the fields are encountered.
 *
 * NOTE(review): presumably a short title is only stored when the field also
 * carries subfield 'b', because otherwise it would equal the full title -
 * confirm against the definition of SHORT_TITLE_SUBFIELDS.
 *
 * @return short titles in encounter order; an empty list when no field qualifies
 */
@Override
public List<ShortTitle> getShortTitles() {
    List<ShortTitle> shortTitles = new ArrayList<>();
    Long orderInRecord = 0L;
    for (String titleTag : TITLE_TAGS) {
        for (DataField field : underlayingMarc.getDataFields(titleTag)) {
            if (field.getSubfield('b') != null) {
                String text = parseTitleValue(field, SHORT_TITLE_SUBFIELDS);
                if (!text.isEmpty()) {
                    // the order index is advanced only for titles that are actually emitted
                    orderInRecord++;
                    shortTitles.add(ShortTitle.create(text, orderInRecord, MetadataUtils.similarityEnabled(field, text)));
                }
            }
        }
    }
    return shortTitles;
}
Also used : ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) DataField(org.marc4j.marc.DataField) ArrayList(java.util.ArrayList)

Example 2 with ShortTitle

use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.

the class HashingDedupKeyParser method parse.

/**
 * Extracts the deduplication keys from {@code metadataRecord}, hashes them and,
 * when the hash differs from the hashes already stored on {@code record},
 * replaces the record's keys with the freshly extracted ones.
 *
 * The "should be processed" flag and the UPV application id are always copied.
 * When the record's source does not generate dedup keys, nothing else is done.
 *
 * @param record harvested record to update; it is modified in place
 * @param metadataRecord parsed metadata the keys are read from; its titles and
 *        short titles are normalized in place
 * @return the same {@code record} instance
 * @throws DedupKeyParserException declared by the interface; may be raised by the hash computation
 */
@Override
public HarvestedRecord parse(HarvestedRecord record, MetadataRecord metadataRecord) throws DedupKeyParserException {
    record.setShouldBeProcessed(metadataRecord.matchFilter());
    // the UPV application id is stored regardless of dedup key generation
    record.setUpvApplicationId(metadataRecord.getUpvApplicationId());
    if (!record.getHarvestedFrom().isGenerateDedupKeys()) {
        return record;
    }

    DedupKeysencapsulator keys = new DedupKeysencapsulator();

    // titles: normalize and shorten, drop empty ones and duplicates
    List<Title> uniqueTitles = new ArrayList<>();
    for (Title candidate : metadataRecord.getTitle()) {
        candidate.setTitleStr(MetadataUtils.normalizeAndShorten(candidate.getTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (!candidate.getTitleStr().isEmpty() && !uniqueTitles.contains(candidate)) {
            uniqueTitles.add(candidate);
        }
    }
    keys.setTitles(uniqueTitles);

    // short titles: same treatment as titles
    List<ShortTitle> uniqueShortTitles = new ArrayList<>();
    for (ShortTitle candidate : metadataRecord.getShortTitles()) {
        candidate.setShortTitleStr(MetadataUtils.normalizeAndShorten(candidate.getShortTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (!candidate.getShortTitleStr().isEmpty() && !uniqueShortTitles.contains(candidate)) {
            uniqueShortTitles.add(candidate);
        }
    }
    keys.setShortTitles(uniqueShortTitles);

    // remaining keys are copied, normalized and/or shortened where applicable
    keys.setIsbns(metadataRecord.getISBNs());
    keys.setIssns(metadataRecord.getISSNs());
    keys.setIsmns(metadataRecord.getISMNs());
    keys.setCnbs(metadataRecord.getCNBs());
    keys.setPublicationYear(metadataRecord.getPublicationYear());
    List<HarvestedRecordFormatEnum> detectedFormats = metadataRecord.getDetectedFormatList();
    keys.setFormats(harvestedRecordFormatDAO.getFormatsFromEnums(detectedFormats));
    keys.setAuthorAuthKey(MetadataUtils.shorten(metadataRecord.getAuthorAuthKey(), EFFECTIVE_AUTHOR_AUTH_KEY_LENGTH));
    keys.setAuthorString(MetadataUtils.normalizeAndShorten(metadataRecord.getAuthorString(), EFFECTIVE_AUTHOR_LENGTH));
    keys.setScale(metadataRecord.getScale());
    keys.setUuid(metadataRecord.getUUId());
    keys.setPages(metadataRecord.getPageCount());
    keys.setIssnSeries(MetadataUtils.normalize(metadataRecord.getISSNSeries()));
    keys.setIssnSeriesOrder(MetadataUtils.normalize(metadataRecord.getISSNSeriesOrder()));
    keys.setOclcs(metadataRecord.getOclcs());
    keys.setClusterId(metadataRecord.getClusterId());
    keys.setRaw001Id(metadataRecord.getRaw001Id());
    keys.setSourceInfoT(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoT(), EFFECTIVE_SOURCE_INFO_LENGTH));
    keys.setSourceInfoG(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoG(), EFFECTIVE_SOURCE_INFO_LENGTH));
    keys.setSourceInfoX(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoX(), EFFECTIVE_LENGTH_30));
    keys.setEans(metadataRecord.getEANs());
    keys.setPublisherNumbers(metadataRecord.getPublisherNumber());
    keys.setLanguages(new HashSet<>(metadataRecord.getLanguages()));

    String computedHash = computeHashValue(keys);
    String oldHash = record.getDedupKeysHash();
    String storedTemporalHash = record.getTemporalDedupHash();
    // a missing temporal hash is replaced by 40 zeros
    String temporalHash = storedTemporalHash != null ? storedTemporalHash : "0000000000000000000000000000000000000000";

    // keys count as changed only when the new hash differs from the temporal hash
    // AND from the persisted hash (a missing or empty persisted hash always differs)
    boolean differsFromTemporal = !temporalHash.equals(computedHash);
    boolean differsFromPersisted = oldHash == null || oldHash.isEmpty() || !computedHash.equals(oldHash);
    boolean dedupKeysChanged = differsFromTemporal && differsFromPersisted;

    if (dedupKeysChanged) {
        // remove the old keys first, then attach the new ones
        harvestedRecordDao.dropDedupKeys(record);
        if (record.getHarvestedFrom() != null) {
            record.setWeight(metadataRecord.getWeight(record.getHarvestedFrom().getBaseWeight()));
        }
        record.setTitles(keys.getTitles());
        record.setIsbns(keys.getIsbns());
        record.setIssns(keys.getIssns());
        record.setIsmns(keys.getIsmns());
        record.setCnb(keys.getCnbs());
        record.setPublicationYear(keys.getPublicationYear());
        record.setPhysicalFormats(harvestedRecordFormatDAO.getFormatsFromEnums(detectedFormats));
        record.setAuthorAuthKey(keys.getAuthorAuthKey());
        record.setAuthorString(keys.getAuthorString());
        record.setScale(keys.getScale());
        record.setUuid(keys.getUuid());
        record.setPages(keys.getPages());
        record.setIssnSeries(keys.getIssnSeries());
        record.setIssnSeriesOrder(keys.getIssnSeriesOrder());
        record.setOclcs(keys.getOclcs());
        record.setLanguages(metadataRecord.getLanguages());
        record.setClusterId(keys.getClusterId());
        record.setRaw001Id(keys.getRaw001Id());
        record.setSourceInfoG(keys.getSourceInfoG());
        record.setSourceInfoX(keys.getSourceInfoX());
        record.setSourceInfoT(keys.getSourceInfoT());
        record.setEans(keys.getEans());
        record.setShortTitles(keys.getShortTitles());
        record.setPublisherNumbers(metadataRecord.getPublisherNumber());
        record.setTemporalDedupHash(computedHash);
    }
    record.setDedupKeysHash(computedHash);

    // the timestamp counts as changed only when both values are present and differ
    boolean oaiTimestampChanged = record.getOaiTimestamp() != null
            && record.getTemporalOldOaiTimestamp() != null
            && !record.getOaiTimestamp().equals(record.getTemporalOldOaiTimestamp());

    // decide whether the record should be deduplicated
    if (dedupKeysChanged) {
        // new record or change in keys
        record.setNextDedupFlag(true);
    } else if (oaiTimestampChanged) {
        // keys are equal and the OAI timestamp changed: do not deduplicate
        // NOTE(review): the previous comments described this branch as "neither keys
        // nor timestamp changed" and the untouched case as "timestamp changed" -
        // confirm which of code or comments states the intended rule
        record.setNextDedupFlag(false);
    }
    // otherwise (keys equal, timestamp not detected as changed) the previous flag is kept
    return record;
}
Also used : ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) ArrayList(java.util.ArrayList) Title(cz.mzk.recordmanager.server.model.Title) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) HarvestedRecordFormatEnum(cz.mzk.recordmanager.server.model.HarvestedRecordFormat.HarvestedRecordFormatEnum)

Example 3 with ShortTitle

use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.

the class HashingDedupKeyParser method computeHashValue.

/**
 * Compute the SHA-1 hash of the deduplication keys held by the given
 * {@link DedupKeysencapsulator}.
 *
 * The keys are fed to the digest in a fixed order, so the order of the
 * statements below is part of the hash definition and must not be changed.
 * Every textual key is encoded as UTF-8, which keeps the result independent of
 * the JVM default charset.
 *
 * NOTE(review): numeric keys (ISBN, ISMN, EAN, publication year, scale, pages)
 * contribute only the single byte returned by {@code byteValue()}, so changes
 * in their higher bytes do not alter the hash. This is kept as is because
 * changing it would alter every previously stored hash - confirm whether the
 * truncation is intended.
 *
 * @param encapsulator holder of the keys to hash; its collection-valued keys
 *        are iterated directly and therefore must not be {@code null}
 * @return the digest as a lower-case hexadecimal string
 * @throws DedupKeyParserException when the SHA-1 algorithm is not available
 */
protected String computeHashValue(final DedupKeysencapsulator encapsulator) {
    final MessageDigest md;
    try {
        // change of hash function also requires changes in database row
        md = MessageDigest.getInstance("SHA-1");
    } catch (NoSuchAlgorithmException e) {
        // SHA-1 is required by the Java specification; fail loudly instead of
        // handing an empty string back as if it were a valid hash
        throw new DedupKeyParserException("SHA-1 digest is not available", e);
    }
    // one explicit charset for all textual keys
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;

    for (Title t : encapsulator.getTitles()) {
        md.update(t.getTitleStr().getBytes(utf8));
    }
    for (Isbn i : encapsulator.getIsbns()) {
        md.update(i.getIsbn().byteValue());
    }
    for (Issn i : encapsulator.getIssns()) {
        md.update(i.getIssn().getBytes(utf8));
    }
    for (Ismn i : encapsulator.getIsmns()) {
        md.update(i.getIsmn().byteValue());
    }
    for (Cnb c : encapsulator.getCnbs()) {
        md.update(c.getCnb().getBytes(utf8));
    }
    if (encapsulator.getPublicationYear() != null) {
        md.update(encapsulator.getPublicationYear().byteValue());
    }
    for (HarvestedRecordFormat hrfe : encapsulator.getFormats()) {
        md.update(hrfe.getName().getBytes(utf8));
    }
    if (encapsulator.getAuthorAuthKey() != null) {
        md.update(encapsulator.getAuthorAuthKey().getBytes(utf8));
    }
    if (encapsulator.getAuthorString() != null) {
        md.update(encapsulator.getAuthorString().getBytes(utf8));
    }
    if (encapsulator.getScale() != null) {
        md.update(encapsulator.getScale().byteValue());
    }
    if (encapsulator.getUuid() != null) {
        md.update(encapsulator.getUuid().getBytes(utf8));
    }
    if (encapsulator.getPages() != null) {
        md.update(encapsulator.getPages().byteValue());
    }
    if (encapsulator.getIssnSeries() != null) {
        md.update(encapsulator.getIssnSeries().getBytes(utf8));
    }
    if (encapsulator.getIssnSeriesOrder() != null) {
        md.update(encapsulator.getIssnSeriesOrder().getBytes(utf8));
    }
    for (Oclc o : encapsulator.getOclcs()) {
        md.update(o.getOclcStr().getBytes(utf8));
    }
    for (String l : encapsulator.getLanguages()) {
        md.update(l.getBytes(utf8));
    }
    if (encapsulator.getClusterId() != null) {
        md.update(encapsulator.getClusterId().getBytes(utf8));
    }
    if (encapsulator.getRaw001Id() != null) {
        md.update(encapsulator.getRaw001Id().getBytes(utf8));
    }
    if (encapsulator.getSourceInfoT() != null) {
        md.update(encapsulator.getSourceInfoT().getBytes(utf8));
    }
    if (encapsulator.getSourceInfoX() != null) {
        md.update(encapsulator.getSourceInfoX().getBytes(utf8));
    }
    if (encapsulator.getSourceInfoG() != null) {
        md.update(encapsulator.getSourceInfoG().getBytes(utf8));
    }
    for (Ean ean : encapsulator.getEans()) {
        md.update(ean.getEan().byteValue());
    }
    for (PublisherNumber publisherNumber : encapsulator.getPublisherNumbers()) {
        md.update(publisherNumber.getPublisherNumber().getBytes(utf8));
    }
    for (ShortTitle st : encapsulator.getShortTitles()) {
        md.update(st.getShortTitleStr().getBytes(utf8));
    }

    // render the digest as two lower-case hex digits per byte
    byte[] hash = md.digest();
    StringBuilder sb = new StringBuilder(hash.length * 2);
    for (byte b : hash) {
        sb.append(String.format("%02x", b));
    }
    return sb.toString();
}
Also used : Issn(cz.mzk.recordmanager.server.model.Issn) Title(cz.mzk.recordmanager.server.model.Title) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) Ismn(cz.mzk.recordmanager.server.model.Ismn) UnsupportedEncodingException(java.io.UnsupportedEncodingException) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) Oclc(cz.mzk.recordmanager.server.model.Oclc) Ean(cz.mzk.recordmanager.server.model.Ean) Isbn(cz.mzk.recordmanager.server.model.Isbn) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) Cnb(cz.mzk.recordmanager.server.model.Cnb) HarvestedRecordFormat(cz.mzk.recordmanager.server.model.HarvestedRecordFormat) MessageDigest(java.security.MessageDigest) PublisherNumber(cz.mzk.recordmanager.server.model.PublisherNumber)

Example 4 with ShortTitle

use of cz.mzk.recordmanager.server.model.ShortTitle in project RecordManager2 by moravianlibrary.

the class HarvestedRecordDAOHibernate method dropDedupKeys.

/**
 * Removes the deduplication keys of the given record.
 *
 * Nothing happens when the record is {@code null}, has no id, or is not
 * contained in the current session. Otherwise the scalar keys are set to
 * {@code null}, each key collection is replaced by an empty list and its
 * former elements are deleted through the session, and finally the record is
 * updated and the session flushed.
 *
 * @param hr record whose keys are dropped; may be {@code null}
 */
@Override
public void dropDedupKeys(HarvestedRecord hr) {
    if (hr == null || hr.getId() == null) {
        return;
    }
    Session currentSession = sessionFactory.getCurrentSession();
    // keys are dropped only for entities managed by the current session
    if (!currentSession.contains(hr)) {
        System.out.println("NOT CONT");
        return;
    }

    // scalar keys
    hr.setAuthorAuthKey(null);
    hr.setAuthorString(null);
    hr.setClusterId(null);
    hr.setPages(null);
    hr.setPublicationYear(null);
    hr.setRaw001Id(null);
    hr.setScale(null);
    hr.setUuid(null);
    hr.setSourceInfoT(null);
    hr.setSourceInfoX(null);
    hr.setSourceInfoG(null);
    hr.setIssnSeries(null);
    hr.setIssnSeriesOrder(null);
    hr.setWeight(null);

    // collection keys: detach the old list from the record, then delete its elements
    List<Title> oldTitles = hr.getTitles();
    hr.setTitles(new ArrayList<>());
    oldTitles.forEach(currentSession::delete);

    List<ShortTitle> oldShortTitles = hr.getShortTitles();
    hr.setShortTitles(new ArrayList<>());
    oldShortTitles.forEach(currentSession::delete);

    List<Isbn> oldIsbns = hr.getIsbns();
    hr.setIsbns(new ArrayList<>());
    oldIsbns.forEach(currentSession::delete);

    List<Issn> oldIssns = hr.getIssns();
    hr.setIssns(new ArrayList<>());
    oldIssns.forEach(currentSession::delete);

    List<Ismn> oldIsmns = hr.getIsmns();
    hr.setIsmns(new ArrayList<>());
    oldIsmns.forEach(currentSession::delete);

    List<Oclc> oldOclcs = hr.getOclcs();
    hr.setOclcs(new ArrayList<>());
    oldOclcs.forEach(currentSession::delete);

    List<Cnb> oldCnbs = hr.getCnb();
    hr.setCnb(new ArrayList<>());
    oldCnbs.forEach(currentSession::delete);

    List<Ean> oldEans = hr.getEans();
    hr.setEans(new ArrayList<>());
    oldEans.forEach(currentSession::delete);

    // NOTE(review): unlike the collections above, the format list is cleared in place.
    // If getPhysicalFormats() returns the same list instance on both calls, the list
    // is already empty when iterated and nothing is deleted here - confirm whether
    // formats are meant to be deleted at all or only detached from the record.
    List<HarvestedRecordFormat> oldFormats = hr.getPhysicalFormats();
    hr.getPhysicalFormats().clear();
    oldFormats.forEach(currentSession::delete);

    hr.setLanguages(new ArrayList<>());
    currentSession.update(hr);
    currentSession.flush();
}
Also used : Issn(cz.mzk.recordmanager.server.model.Issn) Title(cz.mzk.recordmanager.server.model.Title) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) Ismn(cz.mzk.recordmanager.server.model.Ismn) Oclc(cz.mzk.recordmanager.server.model.Oclc) Ean(cz.mzk.recordmanager.server.model.Ean) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) Isbn(cz.mzk.recordmanager.server.model.Isbn) Cnb(cz.mzk.recordmanager.server.model.Cnb) HarvestedRecordFormat(cz.mzk.recordmanager.server.model.HarvestedRecordFormat) Session(org.hibernate.Session)

Aggregations

ShortTitle (cz.mzk.recordmanager.server.model.ShortTitle)4 Title (cz.mzk.recordmanager.server.model.Title)3 Cnb (cz.mzk.recordmanager.server.model.Cnb)2 Ean (cz.mzk.recordmanager.server.model.Ean)2 HarvestedRecordFormat (cz.mzk.recordmanager.server.model.HarvestedRecordFormat)2 Isbn (cz.mzk.recordmanager.server.model.Isbn)2 Ismn (cz.mzk.recordmanager.server.model.Ismn)2 Issn (cz.mzk.recordmanager.server.model.Issn)2 Oclc (cz.mzk.recordmanager.server.model.Oclc)2 ArrayList (java.util.ArrayList)2 HarvestedRecordFormatEnum (cz.mzk.recordmanager.server.model.HarvestedRecordFormat.HarvestedRecordFormatEnum)1 PublisherNumber (cz.mzk.recordmanager.server.model.PublisherNumber)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 MessageDigest (java.security.MessageDigest)1 NoSuchAlgorithmException (java.security.NoSuchAlgorithmException)1 Session (org.hibernate.Session)1 DataField (org.marc4j.marc.DataField)1