Search in sources :

Example 1 with Title

use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.

the class MarcDSL method getSortableTitle.

/**
 * Builds the sortable form of the record title.
 *
 * <p>The first title reported by {@code metadataRecord} is taken, the
 * {@code END_PUNCTUATION} and {@code NUMBERS} patterns are applied, the text is
 * lowercased, and the number of leading characters given by the second
 * indicator of the first 245 field (non-filing characters) is skipped when the
 * title is longer than that count. Finally the {@code SUPPRESS},
 * {@code TO_BLANK}, {@code LEAD_SPACE} and {@code PACK_SPACES} patterns are
 * applied and the result is trimmed.
 *
 * @return the normalized title; an empty string when the record has no usable
 *         245 field; {@code null} when the metadata record yields no title
 *         (or a title without text), or when nothing is left of the title
 *         after the first normalization steps
 *
 * @see SolrIndexer#getTitle
 */
public String getSortableTitle() {
    List<DataField> titleFields = record.getAllFields().get("245");
    if (titleFields == null || titleFields.isEmpty()) {
        return "";
    }
    DataField titleField = titleFields.get(0);
    if (titleField == null) {
        return "";
    }
    int nonFilingInt = getInd2AsInt(titleField);
    List<Title> titles = metadataRecord.getTitle();
    if (titles == null || titles.isEmpty()) {
        return null;
    }
    // Reuse the list that was just validated instead of asking the metadata
    // record for its titles a second time.
    String title = titles.get(0).getTitleStr();
    if (title == null) {
        // A title object without text is treated like a missing title.
        return null;
    }
    title = title.replaceAll(END_PUNCTUATION, EMPTY_SEPARATOR);
    title = title.replaceAll(NUMBERS, "$1$2");
    title = title.toLowerCase();
    // Skip non-filing chars, if possible.
    if (title.length() > nonFilingInt) {
        title = title.substring(nonFilingInt);
    }
    if (title.length() == 0) {
        return null;
    }
    title = title.replaceAll(SUPPRESS, EMPTY_SEPARATOR);
    title = title.replaceAll(TO_BLANK, SPACE_SEPARATOR);
    title = title.replaceAll(LEAD_SPACE, EMPTY_SEPARATOR);
    title = title.replaceAll(PACK_SPACES, SPACE_SEPARATOR);
    return title.trim();
}
Also used : DataField(org.marc4j.marc.DataField) Title(cz.mzk.recordmanager.server.model.Title)

Example 2 with Title

use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.

the class MetadataMarcRecord method getTitle.

/**
 * Collects the {@link Title}s of this record.
 *
 * <p>Every data field of every tag in {@code TITLE_TAGS} is parsed with
 * {@code TITLE_SUBFIELDS}; fields that yield an empty text are skipped. The
 * remaining titles are numbered consecutively, starting at 1, in the order in
 * which they are encountered.
 *
 * @return all 245abnp and 240abnp titles, possibly an empty list
 */
@Override
public List<Title> getTitle() {
    final List<Title> collected = new ArrayList<>();
    Long order = 0L;
    for (String tag : TITLE_TAGS) {
        for (DataField field : underlayingMarc.getDataFields(tag)) {
            final String text = parseTitleValue(field, TITLE_SUBFIELDS);
            if (text.isEmpty()) {
                continue;
            }
            // Only titles that are actually emitted consume an order number.
            order++;
            collected.add(Title.create(text, order, MetadataUtils.similarityEnabled(field, text)));
        }
    }
    return collected;
}
Also used : DataField(org.marc4j.marc.DataField) ArrayList(java.util.ArrayList) Title(cz.mzk.recordmanager.server.model.Title) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle)

Example 3 with Title

use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.

the class DedupIdentifierClustersProcessor method matchRecords.

/**
 * On the input are two {@link HarvestedRecord}s, having same identifier (CNB,ISBN,ISSN,OCLC) and common format.
 *
 * Records match when at least one title of the first record is similar enough
 * (TITLE_MATCH_BOUNDARY, TITLE_PREFIX_BOUNDARY) to at least one title of the
 * second record.
 *
 * @param hrA first record to compare
 * @param hrB second record to compare
 * @return {@code true} if any pair of titles matches; {@code false} if either
 *         record has no titles or no pair matches
 */
@Override
protected boolean matchRecords(HarvestedRecord hrA, HarvestedRecord hrB) {
    List<Title> aTitles = hrA.getTitles();
    List<Title> bTitles = hrB.getTitles();
    if (aTitles.isEmpty() || bTitles.isEmpty()) {
        return false;
    }
    for (Title aTitle : aTitles) {
        for (Title bTitle : bTitles) {
            // One matching pair is sufficient, so stop comparing as soon as it is found.
            if (StringUtils.simmilarTitleMatch(aTitle, bTitle, TITLE_MATCH_BOUNDARY, TITLE_PREFIX_BOUNDARY)) {
                return true;
            }
        }
    }
    return false;
}
Also used : Title(cz.mzk.recordmanager.server.model.Title)

Example 4 with Title

use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.

the class HashingDedupKeyParser method parse.

/**
 * Recomputes the deduplication keys of a harvested record from its metadata.
 *
 * <p>The processing filter result and the UPV application id are always copied
 * to the record. If the record's source does not generate dedup keys, nothing
 * else is done. Otherwise all keys are gathered, hashed, and the hash is
 * compared with the record's temporal hash and its stored hash. When the keys
 * are considered changed, the old keys are dropped through the DAO and the new
 * ones are assigned. The stored hash is always updated, and the next-dedup
 * flag is adjusted according to whether the keys and/or the OAI timestamp
 * changed.
 *
 * @param record         harvested record to update; it is modified in place
 * @param metadataRecord metadata view the keys are read from
 * @return the same {@code record} instance
 * @throws DedupKeyParserException declared by the signature; not thrown
 *         directly by the statements of this method
 */
@Override
public HarvestedRecord parse(HarvestedRecord record, MetadataRecord metadataRecord) throws DedupKeyParserException {
    record.setShouldBeProcessed(metadataRecord.matchFilter());
    // not dedup key
    record.setUpvApplicationId(metadataRecord.getUpvApplicationId());
    if (!record.getHarvestedFrom().isGenerateDedupKeys()) {
        return record;
    }
    boolean dedupKeysChanged = false;
    DedupKeysencapsulator encapsulator = new DedupKeysencapsulator();

    // Titles: normalize and shorten in place, drop empty ones and duplicates.
    List<Title> titles = new ArrayList<>();
    for (Title title : metadataRecord.getTitle()) {
        title.setTitleStr(MetadataUtils.normalizeAndShorten(title.getTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (title.getTitleStr().isEmpty()) {
            continue;
        }
        if (!titles.contains(title)) {
            titles.add(title);
        }
    }
    encapsulator.setTitles(titles);

    // Short titles: same treatment as the titles above.
    List<ShortTitle> shortTitles = new ArrayList<>();
    for (ShortTitle shortTitle : metadataRecord.getShortTitles()) {
        shortTitle.setShortTitleStr(MetadataUtils.normalizeAndShorten(shortTitle.getShortTitleStr(), EFFECTIVE_TITLE_LENGTH));
        if (shortTitle.getShortTitleStr().isEmpty()) {
            continue;
        }
        if (!shortTitles.contains(shortTitle)) {
            shortTitles.add(shortTitle);
        }
    }
    encapsulator.setShortTitles(shortTitles);

    encapsulator.setIsbns(metadataRecord.getISBNs());
    encapsulator.setIssns(metadataRecord.getISSNs());
    encapsulator.setIsmns(metadataRecord.getISMNs());
    encapsulator.setCnbs(metadataRecord.getCNBs());
    encapsulator.setPublicationYear(metadataRecord.getPublicationYear());
    List<HarvestedRecordFormatEnum> formatEnums = metadataRecord.getDetectedFormatList();
    encapsulator.setFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
    encapsulator.setAuthorAuthKey(MetadataUtils.shorten(metadataRecord.getAuthorAuthKey(), EFFECTIVE_AUTHOR_AUTH_KEY_LENGTH));
    encapsulator.setAuthorString(MetadataUtils.normalizeAndShorten(metadataRecord.getAuthorString(), EFFECTIVE_AUTHOR_LENGTH));
    encapsulator.setScale(metadataRecord.getScale());
    encapsulator.setUuid(metadataRecord.getUUId());
    encapsulator.setPages(metadataRecord.getPageCount());
    encapsulator.setIssnSeries(MetadataUtils.normalize(metadataRecord.getISSNSeries()));
    encapsulator.setIssnSeriesOrder(MetadataUtils.normalize(metadataRecord.getISSNSeriesOrder()));
    encapsulator.setOclcs(metadataRecord.getOclcs());
    encapsulator.setClusterId(metadataRecord.getClusterId());
    encapsulator.setRaw001Id(metadataRecord.getRaw001Id());
    encapsulator.setSourceInfoT(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoT(), EFFECTIVE_SOURCE_INFO_LENGTH));
    encapsulator.setSourceInfoG(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoG(), EFFECTIVE_SOURCE_INFO_LENGTH));
    encapsulator.setSourceInfoX(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoX(), EFFECTIVE_LENGTH_30));
    encapsulator.setEans(metadataRecord.getEANs());
    encapsulator.setPublisherNumbers(metadataRecord.getPublisherNumber());
    encapsulator.setLanguages(new HashSet<>(metadataRecord.getLanguages()));

    String computedHash = computeHashValue(encapsulator);
    String oldHash = record.getDedupKeysHash();
    // A missing temporal hash is replaced by an all-zero placeholder.
    String storedTemporalHash = record.getTemporalDedupHash();
    String temporalHash = storedTemporalHash == null ? "0000000000000000000000000000000000000000" : storedTemporalHash;
    // Keys count as changed only if the computed hash differs from the temporal
    // hash AND from the stored hash (or no hash is stored yet).
    // NOTE(review): the temporal hash presumably guards against handling the
    // same record twice during one batch - confirm.
    if ((!temporalHash.equals(computedHash)) && (oldHash == null || oldHash.isEmpty() || !computedHash.equals(oldHash))) {
        // keys changed, updated in database
        dedupKeysChanged = true;
        // drop old keys
        harvestedRecordDao.dropDedupKeys(record);
        if (record.getHarvestedFrom() != null) {
            record.setWeight(metadataRecord.getWeight(record.getHarvestedFrom().getBaseWeight()));
        }
        // assign new keys
        record.setTitles(encapsulator.getTitles());
        record.setIsbns(encapsulator.getIsbns());
        record.setIssns(encapsulator.getIssns());
        record.setIsmns(encapsulator.getIsmns());
        record.setCnb(encapsulator.getCnbs());
        record.setPublicationYear(encapsulator.getPublicationYear());
        record.setPhysicalFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
        record.setAuthorAuthKey(encapsulator.getAuthorAuthKey());
        record.setAuthorString(encapsulator.getAuthorString());
        record.setScale(encapsulator.getScale());
        record.setUuid(encapsulator.getUuid());
        record.setPages(encapsulator.getPages());
        record.setIssnSeries(encapsulator.getIssnSeries());
        record.setIssnSeriesOrder(encapsulator.getIssnSeriesOrder());
        record.setOclcs(encapsulator.getOclcs());
        record.setLanguages(metadataRecord.getLanguages());
        record.setClusterId(encapsulator.getClusterId());
        record.setRaw001Id(encapsulator.getRaw001Id());
        record.setSourceInfoG(encapsulator.getSourceInfoG());
        record.setSourceInfoX(encapsulator.getSourceInfoX());
        record.setSourceInfoT(encapsulator.getSourceInfoT());
        record.setEans(encapsulator.getEans());
        record.setShortTitles(encapsulator.getShortTitles());
        record.setPublisherNumbers(metadataRecord.getPublisherNumber());
        record.setTemporalDedupHash(computedHash);
    }
    record.setDedupKeysHash(computedHash);

    // The timestamp counts as changed only when both the current and the
    // previous OAI timestamp are known and differ.
    boolean oaiTimestampChanged = record.getOaiTimestamp() != null
            && record.getTemporalOldOaiTimestamp() != null
            && !record.getOaiTimestamp().equals(record.getTemporalOldOaiTimestamp());

    // decide whether record should be deduplicated
    if (dedupKeysChanged) {
        // new record or change in keys
        record.setNextDedupFlag(true);
    } else if (oaiTimestampChanged) {
        // keys are equal but the OAI timestamp changed: don't deduplicate
        record.setNextDedupFlag(false);
    }
    // Otherwise neither the keys nor the timestamp changed (or a timestamp is
    // unknown): keep the previous dedup flag. This may happen during repeated
    // harvesting.
    return record;
}
Also used : ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) ArrayList(java.util.ArrayList) Title(cz.mzk.recordmanager.server.model.Title) ShortTitle(cz.mzk.recordmanager.server.model.ShortTitle) HarvestedRecordFormatEnum(cz.mzk.recordmanager.server.model.HarvestedRecordFormat.HarvestedRecordFormatEnum)

Example 5 with Title

use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.

the class MetadataUtilsTest method testSimilarityEnabled.

/**
 * Checks {@code MetadataUtils.similarityEnabled(Title)} against titles that
 * contain the forbidden word "book", a word merely starting with "book", a
 * digit, and plain text.
 */
@Test
public void testSimilarityEnabled() {
    final Title probe = new Title();

    // "book" is a forbidden word, whatever its position or letter case
    final String[] withForbiddenWord = { "book drg asd", "asd bOoK drg asd", "asD aSd BooK" };
    for (String text : withForbiddenWord) {
        probe.setTitleStr(text);
        Assert.assertFalse(MetadataUtils.similarityEnabled(probe));
    }

    // "book" only as the beginning of a longer word is accepted
    probe.setTitleStr("bookrg asd");
    Assert.assertTrue(MetadataUtils.similarityEnabled(probe));

    // a title containing a number is rejected
    probe.setTitleStr("asdrg8 asd");
    Assert.assertFalse(MetadataUtils.similarityEnabled(probe));

    // plain words without digits or forbidden words are accepted
    probe.setTitleStr("asdrg asd asd");
    Assert.assertTrue(MetadataUtils.similarityEnabled(probe));
}
Also used : Title(cz.mzk.recordmanager.server.model.Title) Test(org.testng.annotations.Test)

Aggregations

Title (cz.mzk.recordmanager.server.model.Title) 11, ShortTitle (cz.mzk.recordmanager.server.model.ShortTitle) 4, Test (org.testng.annotations.Test) 4, AbstractTest (cz.mzk.recordmanager.server.AbstractTest) 3, MetadataRecord (cz.mzk.recordmanager.server.metadata.MetadataRecord) 3, InputStream (java.io.InputStream) 3, ArrayList (java.util.ArrayList) 3, Cnb (cz.mzk.recordmanager.server.model.Cnb) 2, Ean (cz.mzk.recordmanager.server.model.Ean) 2, HarvestedRecordFormat (cz.mzk.recordmanager.server.model.HarvestedRecordFormat) 2, HarvestedRecordFormatEnum (cz.mzk.recordmanager.server.model.HarvestedRecordFormat.HarvestedRecordFormatEnum) 2, Isbn (cz.mzk.recordmanager.server.model.Isbn) 2, Ismn (cz.mzk.recordmanager.server.model.Ismn) 2, Issn (cz.mzk.recordmanager.server.model.Issn) 2, Oclc (cz.mzk.recordmanager.server.model.Oclc) 2, DataField (org.marc4j.marc.DataField) 2, MetadataRecordFactory (cz.mzk.recordmanager.server.metadata.MetadataRecordFactory) 1, HarvestedRecord (cz.mzk.recordmanager.server.model.HarvestedRecord) 1, PublisherNumber (cz.mzk.recordmanager.server.model.PublisherNumber) 1, UnsupportedEncodingException (java.io.UnsupportedEncodingException) 1