use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.
the class MarcDSL method getSortableTitle.
/**
 * Builds the sortable form of the record's title.
 *
 * <p>The number of leading non-filing characters is read from the 2nd
 * indicator of the first 245 field; the title text itself is taken from the
 * first {@link Title} returned by {@code metadataRecord.getTitle()}. The
 * text has {@code END_PUNCTUATION} removed, {@code NUMBERS} rewritten, is
 * lowercased, has the non-filing prefix skipped (only when the title is
 * longer than that prefix), and is finally cleaned up with the
 * {@code SUPPRESS}, {@code TO_BLANK}, {@code LEAD_SPACE} and
 * {@code PACK_SPACES} patterns and trimmed.
 *
 * @return the normalized sortable title; an empty string when the record has
 *         no 245 field (or its first 245 entry is {@code null});
 *         {@code null} when the metadata record provides no title or the
 *         title is empty after punctuation/number handling
 *
 * @see SolrIndexer#getTitle
 */
public String getSortableTitle() {
    List<DataField> titleFields = record.getAllFields().get("245");
    if (titleFields == null || titleFields.isEmpty()) {
        return "";
    }
    DataField titleField = titleFields.get(0);
    if (titleField == null) {
        return "";
    }
    int nonFilingInt = getInd2AsInt(titleField);

    List<Title> titles = metadataRecord.getTitle();
    if (titles == null || titles.isEmpty()) {
        return null;
    }
    // Reuse the list fetched above instead of asking the metadata record
    // for its titles a second time.
    String title = titles.get(0).getTitleStr();
    title = title.replaceAll(END_PUNCTUATION, EMPTY_SEPARATOR);
    title = title.replaceAll(NUMBERS, "$1$2");
    title = title.toLowerCase();

    // Skip non-filing chars, if possible.
    if (title.length() > nonFilingInt) {
        title = title.substring(nonFilingInt);
    }
    if (title.length() == 0) {
        return null;
    }

    title = title.replaceAll(SUPPRESS, EMPTY_SEPARATOR);
    title = title.replaceAll(TO_BLANK, SPACE_SEPARATOR);
    title = title.replaceAll(LEAD_SPACE, EMPTY_SEPARATOR);
    title = title.replaceAll(PACK_SPACES, SPACE_SEPARATOR);
    return title.trim();
}
use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.
the class MetadataMarcRecord method getTitle.
/**
 * Collects the {@link Title}s of this record.
 *
 * <p>Walks {@code TITLE_TAGS} in order and, for every data field of the
 * underlying MARC record carrying that tag, parses the title text from
 * {@code TITLE_SUBFIELDS}. Fields whose parsed text is empty are skipped;
 * every remaining title receives the next sequential order number, starting
 * at 1, together with the similarity flag computed by
 * {@code MetadataUtils.similarityEnabled}.
 *
 * @return all 245abnp and 240abnp titles in the order they were found;
 *         an empty list when there are none
 */
@Override
public List<Title> getTitle() {
    List<Title> titles = new ArrayList<>();
    Long order = 0L;
    for (String tag : TITLE_TAGS) {
        for (DataField field : underlayingMarc.getDataFields(tag)) {
            String text = parseTitleValue(field, TITLE_SUBFIELDS);
            if (text.isEmpty()) {
                // nothing usable in this field, it does not consume an order number
                continue;
            }
            Title parsed = Title.create(text, ++order, MetadataUtils.similarityEnabled(field, text));
            titles.add(parsed);
        }
    }
    return titles;
}
use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.
the class DedupIdentifierClustersProcessor method matchRecords.
/**
 * On the input are two {@link HarvestedRecord}s, having same identifier
 * (CNB,ISBN,ISSN,OCLC) and common format.
 *
 * <p>The records match when at least one title of {@code hrA} and one title
 * of {@code hrB} are similar enough according to
 * {@code StringUtils.simmilarTitleMatch} with {@code TITLE_MATCH_BOUNDARY}
 * and {@code TITLE_PREFIX_BOUNDARY}. The search stops at the first matching
 * pair, so the remaining pairs are not compared.
 *
 * @param hrA first record
 * @param hrB second record
 * @return {@code true} when some pair of titles matches; {@code false} when
 *         either record has no titles or no pair matches
 */
@Override
protected boolean matchRecords(HarvestedRecord hrA, HarvestedRecord hrB) {
    List<Title> aTitles = hrA.getTitles();
    List<Title> bTitles = hrB.getTitles();
    if (aTitles.isEmpty() || bTitles.isEmpty()) {
        return false;
    }
    for (Title aTitle : aTitles) {
        for (Title bTitle : bTitles) {
            // One similar pair is enough; no need to compare the rest.
            if (StringUtils.simmilarTitleMatch(aTitle, bTitle, TITLE_MATCH_BOUNDARY, TITLE_PREFIX_BOUNDARY)) {
                return true;
            }
        }
    }
    return false;
}
use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.
the class HashingDedupKeyParser method parse.
/**
 * Extracts deduplication keys from {@code metadataRecord} and, when they
 * differ from what is already stored, writes them to {@code record}.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>{@code shouldBeProcessed} and {@code upvApplicationId} are always
 * copied from the metadata record.</li>
 * <li>If the record's source does not generate dedup keys, the record is
 * returned right away.</li>
 * <li>All key values are gathered (normalized / shortened where applicable)
 * into a {@code DedupKeysencapsulator} and a hash is computed from it.</li>
 * <li>If that hash differs from the temporal hash and the stored hash is
 * missing, empty or different, the old keys are dropped and the new ones are
 * assigned to the record.</li>
 * <li>The stored hash is always overwritten with the computed one and the
 * next-dedup flag is decided from whether keys and/or OAI timestamp
 * changed.</li>
 * </ol>
 *
 * <p>Note that title and short-title strings are normalized in place on the
 * objects returned by {@code metadataRecord}.
 *
 * @param record harvested record to update
 * @param metadataRecord source of the key values
 * @return the same {@code record} instance
 * @throws DedupKeyParserException declared by the signature; the throwing
 *         site is not visible in this method (presumably raised by a
 *         callee - confirm)
 */
@Override
public HarvestedRecord parse(HarvestedRecord record, MetadataRecord metadataRecord) throws DedupKeyParserException {
record.setShouldBeProcessed(metadataRecord.matchFilter());
// not a dedup key, therefore set before the early return below
record.setUpvApplicationId(metadataRecord.getUpvApplicationId());
if (!record.getHarvestedFrom().isGenerateDedupKeys()) {
return record;
}
boolean dedupKeysChanged = false;
boolean oaiTimestampChanged = false;
// collects every key value so that a single hash can be computed over them
DedupKeysencapsulator encapsulator = new DedupKeysencapsulator();
// normalize titles in place, drop the empty ones and skip duplicates
// (duplicate detection relies on Title.equals, which is not visible here)
List<Title> titles = new ArrayList<>();
for (Title title : metadataRecord.getTitle()) {
title.setTitleStr(MetadataUtils.normalizeAndShorten(title.getTitleStr(), EFFECTIVE_TITLE_LENGTH));
if (title.getTitleStr().isEmpty())
continue;
if (!titles.contains(title)) {
titles.add(title);
}
}
encapsulator.setTitles(titles);
// same treatment for short titles
List<ShortTitle> shortTitles = new ArrayList<>();
for (ShortTitle shortTitle : metadataRecord.getShortTitles()) {
shortTitle.setShortTitleStr(MetadataUtils.normalizeAndShorten(shortTitle.getShortTitleStr(), EFFECTIVE_TITLE_LENGTH));
if (shortTitle.getShortTitleStr().isEmpty())
continue;
if (!shortTitles.contains(shortTitle)) {
shortTitles.add(shortTitle);
}
}
encapsulator.setShortTitles(shortTitles);
encapsulator.setIsbns(metadataRecord.getISBNs());
encapsulator.setIssns(metadataRecord.getISSNs());
encapsulator.setIsmns(metadataRecord.getISMNs());
encapsulator.setCnbs(metadataRecord.getCNBs());
encapsulator.setPublicationYear(metadataRecord.getPublicationYear());
List<HarvestedRecordFormatEnum> formatEnums = metadataRecord.getDetectedFormatList();
encapsulator.setFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
encapsulator.setAuthorAuthKey(MetadataUtils.shorten(metadataRecord.getAuthorAuthKey(), EFFECTIVE_AUTHOR_AUTH_KEY_LENGTH));
encapsulator.setAuthorString(MetadataUtils.normalizeAndShorten(metadataRecord.getAuthorString(), EFFECTIVE_AUTHOR_LENGTH));
encapsulator.setScale(metadataRecord.getScale());
encapsulator.setUuid(metadataRecord.getUUId());
encapsulator.setPages(metadataRecord.getPageCount());
encapsulator.setIssnSeries(MetadataUtils.normalize(metadataRecord.getISSNSeries()));
encapsulator.setIssnSeriesOrder(MetadataUtils.normalize(metadataRecord.getISSNSeriesOrder()));
encapsulator.setOclcs(metadataRecord.getOclcs());
encapsulator.setClusterId(metadataRecord.getClusterId());
encapsulator.setRaw001Id(metadataRecord.getRaw001Id());
encapsulator.setSourceInfoT(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoT(), EFFECTIVE_SOURCE_INFO_LENGTH));
encapsulator.setSourceInfoG(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoG(), EFFECTIVE_SOURCE_INFO_LENGTH));
encapsulator.setSourceInfoX(MetadataUtils.normalizeAndShorten(metadataRecord.getSourceInfoX(), EFFECTIVE_LENGTH_30));
encapsulator.setEans(metadataRecord.getEANs());
encapsulator.setPublisherNumbers(metadataRecord.getPublisherNumber());
encapsulator.setLanguages(new HashSet<>(metadataRecord.getLanguages()));
String computedHash = computeHashValue(encapsulator);
String oldHash = record.getDedupKeysHash();
// a missing temporal hash is replaced by an all-zero placeholder so that equals() below is null-safe
String temporalHash = record.getTemporalDedupHash() == null ? "0000000000000000000000000000000000000000" : record.getTemporalDedupHash();
// during one batch
// (the temporal hash is written at the end of this branch, so a second pass
// with the same computed hash does not enter it again)
if ((!temporalHash.equals(computedHash)) && (oldHash == null || oldHash.isEmpty() || !computedHash.equals(oldHash))) {
// keys changed, updated in database
dedupKeysChanged = true;
// drop old keys
harvestedRecordDao.dropDedupKeys(record);
// NOTE(review): getHarvestedFrom() was already dereferenced at the top of
// this method, so this null check looks redundant - confirm
if (record.getHarvestedFrom() != null)
record.setWeight(metadataRecord.getWeight(record.getHarvestedFrom().getBaseWeight()));
// assign new keys
record.setTitles(encapsulator.getTitles());
record.setIsbns(encapsulator.getIsbns());
record.setIssns(encapsulator.getIssns());
record.setIsmns(encapsulator.getIsmns());
record.setCnb(encapsulator.getCnbs());
record.setPublicationYear(encapsulator.getPublicationYear());
record.setPhysicalFormats(harvestedRecordFormatDAO.getFormatsFromEnums(formatEnums));
record.setAuthorAuthKey(encapsulator.getAuthorAuthKey());
record.setAuthorString(encapsulator.getAuthorString());
record.setScale(encapsulator.getScale());
record.setUuid(encapsulator.getUuid());
record.setPages(encapsulator.getPages());
record.setIssnSeries(encapsulator.getIssnSeries());
record.setIssnSeriesOrder(encapsulator.getIssnSeriesOrder());
record.setOclcs(encapsulator.getOclcs());
record.setLanguages(metadataRecord.getLanguages());
record.setClusterId(encapsulator.getClusterId());
record.setRaw001Id(encapsulator.getRaw001Id());
record.setSourceInfoG(encapsulator.getSourceInfoG());
record.setSourceInfoX(encapsulator.getSourceInfoX());
record.setSourceInfoT(encapsulator.getSourceInfoT());
record.setEans(encapsulator.getEans());
record.setShortTitles(encapsulator.getShortTitles());
record.setPublisherNumbers(metadataRecord.getPublisherNumber());
record.setTemporalDedupHash(computedHash);
}
// the stored hash is refreshed regardless of whether the keys were rewritten
record.setDedupKeysHash(computedHash);
// the timestamp counts as changed only when both values are present and differ
if (record.getOaiTimestamp() != null && record.getTemporalOldOaiTimestamp() != null && !record.getOaiTimestamp().equals(record.getTemporalOldOaiTimestamp())) {
oaiTimestampChanged = true;
} else {
oaiTimestampChanged = false;
}
// decide whether record should be deduplicated
if (dedupKeysChanged) {
// new record or change in keys
record.setNextDedupFlag(true);
} else {
// keys are equal
if (oaiTimestampChanged) {
// keys are the same but the OAI timestamp changed,
// don't deduplicate
// NOTE(review): the earlier comments on this branch and on the else
// branch described the opposite conditions; they now describe what
// the code does - confirm the code (not the old comments) is the intent
record.setNextDedupFlag(false);
} else {
// neither the keys nor the OAI timestamp changed (or a timestamp is missing),
// keep previous dedup flag
// original note: this may happen during repeated harvesting
}
}
return record;
}
use of cz.mzk.recordmanager.server.model.Title in project RecordManager2 by moravianlibrary.
the class MetadataUtilsTest method testSimilarityEnabled.
/**
 * Runs {@code MetadataUtils.similarityEnabled(Title)} over a fixed table of
 * title strings, reusing a single {@link Title} instance, and checks the
 * expected flag for each of them in order.
 */
@Test
public void testSimilarityEnabled() {
    String[] titleTexts = {
            // book is forbidden word
            "book drg asd",
            "asd bOoK drg asd",
            "asD aSd BooK",
            "bookrg asd",
            // number
            "asdrg8 asd",
            "asdrg asd asd",
    };
    boolean[] expectedEnabled = {
            false,
            false,
            false,
            true,
            false,
            true,
    };

    Title probe = new Title();
    for (int i = 0; i < titleTexts.length; i++) {
        probe.setTitleStr(titleTexts[i]);
        if (expectedEnabled[i]) {
            Assert.assertTrue(MetadataUtils.similarityEnabled(probe));
        } else {
            Assert.assertFalse(MetadataUtils.similarityEnabled(probe));
        }
    }
}
Aggregations