Use of org.ambraproject.rhino.service.taxonomy.WeightedTerm in project rhino by PLOS.
Class TaxonomyClassificationServiceImplTest, method testParseVectorElement.
/**
 * Checks that {@code parseVectorElement} extracts the category path and the weight from raw
 * {@code <TERM>} lines.
 *
 * In every case the expected weight is the parenthesized number directly after the {@code |}
 * separator, not the per-keyword counts that follow it. The last case has whitespace before the
 * path and before the weight, and expects a path without that whitespace.
 */
@Test
public void testParseVectorElement() throws Exception {
  // Each row: raw <TERM> line, expected category path, expected weight.
  Object[][] cases = {
      {"<TERM>/Biology and life sciences/Computational biology/Computational neuroscience/Single neuron function|(5) neuron*(5)</TERM>",
          "/Biology and life sciences/Computational biology/Computational neuroscience/Single neuron function", 5},
      {"<TERM>/Medicine and health sciences/Anesthesiology/Anesthesia|(5) anesthesia(5)</TERM>",
          "/Medicine and health sciences/Anesthesiology/Anesthesia", 5},
      {"<TERM>/Medicine and health sciences/Geriatrics/Frailty|(19) frailty(18) frail*(1)</TERM>",
          "/Medicine and health sciences/Geriatrics/Frailty", 19},
      {"<TERM>/Biology and life sciences/Anatomy/Head/Face/Nose|(311) nose(311)</TERM>",
          "/Biology and life sciences/Anatomy/Head/Face/Nose", 311},
      {"<TERM>/People and places/Demography|(7) demographics(7)</TERM>",
          "/People and places/Demography", 7},
      {"<TERM>/Medicine and health sciences/Neurology/Cognitive neurology|(2) cognit*(2)</TERM>",
          "/Medicine and health sciences/Neurology/Cognitive neurology", 2},
      {"<TERM> /Medicine and health sciences/Neurology/Cognitive neurology| (67) cognit*(2)</TERM>",
          "/Medicine and health sciences/Neurology/Cognitive neurology", 67},
  };

  for (Object[] row : cases) {
    String rawLine = (String) row[0];
    WeightedTerm actual = TaxonomyClassificationServiceImpl.parseVectorElement(rawLine);
    WeightedTerm expected = new WeightedTerm((String) row[1], (Integer) row[2]);
    assertEquals(actual, expected);
  }
}
Use of org.ambraproject.rhino.service.taxonomy.WeightedTerm in project rhino by PLOS.
Class TaxonomyClassificationServiceImpl, method persistCategories.
/**
 * Synchronizes the article's category assignments with the given weighted terms.
 *
 * For each term, the matching {@code Category} is looked up by path (and created if none was
 * found); the article's assignment to that category is then either created or has its weight
 * updated. Any pre-existing assignment whose category is not among the given terms is deleted.
 *
 * @param terms   the weighted terms that the article should end up assigned to
 * @param article the article whose assignments are being rewritten
 */
private void persistCategories(List<WeightedTerm> terms, Article article) {
  Set<String> paths = terms.stream().map(WeightedTerm::getPath).collect(Collectors.toSet());

  // Load every already-persisted category whose path matches one of the incoming terms.
  Collection<Category> knownCategories = hibernateTemplate.execute(session -> {
    Query categoryQuery = session.createQuery("FROM Category WHERE path IN (:terms)");
    categoryQuery.setParameterList("terms", paths);
    return (Collection<Category>) categoryQuery.list();
  });
  Map<String, Category> categoriesByPath = Maps.uniqueIndex(knownCategories, Category::getPath);

  Collection<ArticleCategoryAssignment> currentAssignments = getAssignmentsForArticle(article);
  Map<Category, ArticleCategoryAssignment> indexedAssignments =
      Maps.uniqueIndex(currentAssignments, ArticleCategoryAssignment::getCategory);
  // Mutable copy: entries are removed as they are matched, so whatever remains afterwards is stale.
  Map<Category, ArticleCategoryAssignment> unmatched = new HashMap<>(indexedAssignments);

  for (WeightedTerm term : terms) {
    Category category = categoriesByPath.get(term.getPath());
    if (category == null) {
      /*
       * A new category from the taxonomy server, which is not yet persisted in our system. Create it now.
       *
       * This risks a race condition if two articles are being populated concurrently and both have the same new
       * category, which can cause a "MySQLIntegrityConstraintViolationException: Duplicate entry" error.
       */
      category = new Category();
      category.setPath(term.getPath());
      hibernateTemplate.save(category);
    }

    ArticleCategoryAssignment current = unmatched.remove(category);
    if (current != null) {
      current.setWeight(term.getWeight());
      hibernateTemplate.update(current);
    } else {
      hibernateTemplate.save(new ArticleCategoryAssignment(category, article, term.getWeight()));
    }
  }

  // Assignments never matched above are not among the new terms, so they are deleted.
  for (ArticleCategoryAssignment stale : unmatched.values()) {
    hibernateTemplate.delete(stale);
  }
}
Use of org.ambraproject.rhino.service.taxonomy.WeightedTerm in project rhino by PLOS.
Class TaxonomyClassificationServiceImpl, method classifyArticle.
/**
 * {@inheritDoc}
 */
@Override
public List<WeightedTerm> classifyArticle(Article article, Document articleXml) {
  RuntimeConfiguration.TaxonomyConfiguration configuration = getTaxonomyConfiguration();
  List<String> rawTerms = getRawTerms(articleXml, article, false);
  List<WeightedTerm> accepted = new ArrayList<>(rawTerms.size());

  for (String rawTerm : rawTerms) {
    WeightedTerm parsed = parseVectorElement(rawTerm);
    String path = parsed.getPath();
    if (path == null) {
      continue;
    }

    // A term is dropped when its path begins with any blacklisted category prefix.
    boolean allowed = true;
    for (String blacklistedPrefix : configuration.getCategoryBlacklist()) {
      if (path.startsWith(blacklistedPrefix)) {
        allowed = false;
        break;
      }
    }
    if (allowed) {
      accepted.add(parsed);
    }
  }
  return accepted;
}
Use of org.ambraproject.rhino.service.taxonomy.WeightedTerm in project rhino by PLOS.
Class TaxonomyClassificationServiceImpl, method parseVectorElement.
/**
 * Extracts the term and its weight from one line of the taxonomy server's XML response.
 *
 * @param vectorElement the text body of a single line of the response
 * @return a {@link WeightedTerm} built from the first (term text) and second (integer weight)
 *         capture groups of {@code TERM_PATTERN}
 * @throws TaxonomyRemoteServiceInvalidBehaviorException if the line does not match {@code TERM_PATTERN}
 */
@VisibleForTesting
static WeightedTerm parseVectorElement(String vectorElement) {
  Matcher termMatcher = TERM_PATTERN.matcher(vectorElement);
  if (!termMatcher.find()) {
    // Bad term
    throw new TaxonomyRemoteServiceInvalidBehaviorException("Invalid syntax: " + vectorElement);
  }
  String path = termMatcher.group(1);
  int weight = Integer.parseInt(termMatcher.group(2));
  return new WeightedTerm(path, weight);
}
Use of org.ambraproject.rhino.service.taxonomy.WeightedTerm in project rhino by PLOS.
Class TaxonomyClassificationServiceImpl, method populateCategories.
/**
 * {@inheritDoc}
 */
@Override
public void populateCategories(ArticleRevision revision) {
  ArticleIngestion ingestion = revision.getIngestion();
  Article article = ingestion.getArticle();
  Document manuscript = articleCrudService.getManuscriptXml(ingestion);
  String doi = article.getDoi();

  //todo: fix or remove this when we find a home for article types
  boolean isAmendment = false;
  if (isAmendment) {
    return;
  }

  List<WeightedTerm> classified = classifyArticle(article, manuscript);
  if (classified == null || classified.isEmpty()) {
    // Nothing to persist; the article's existing categories are left as they are.
    log.error("Taxonomy server returned 0 terms. Cannot populate Categories. " + doi);
    return;
  }

  List<WeightedTerm> leafNodes = getDistinctLeafNodes(CATEGORY_COUNT, classified);
  persistCategories(leafNodes, article);
}
Aggregations