Use of org.molgenis.ontology.core.meta.OntologyMetaData.ONTOLOGY in project molgenis by molgenis.
From the class SortaServiceImpl, method findSynonymWithHighestNgramScore.
/**
 * Finds the synonym of an ontology term that best matches a query string according to the NGram
 * similarity score.
 *
 * <p>Every synonym of the ontology term is copied into a fresh entity and scored against the cleaned
 * query string. Starting from the highest-scoring synonym, each remaining synonym (in descending score
 * order) is appended to the current best label; the combination is kept whenever it scores higher than
 * the best score so far. The returned entity carries the resulting label, its {@code SCORE}, and a
 * {@code COMBINED_SCORE} that is the {@code SCORE} plus the weights of the query words that also occur
 * in the matched label.
 *
 * @param ontologyIri        IRI of the ontology, passed to the information content service to obtain
 *                           the word weights
 * @param queryString        the raw query string to match the synonyms against
 * @param ontologyTermEntity the ontology term entity whose synonyms are evaluated
 * @return a copy of the best matching synonym entity with {@code SCORE} and {@code COMBINED_SCORE}
 *         set, or {@code null} if the ontology term has no synonyms
 */
private Entity findSynonymWithHighestNgramScore(String ontologyIri, String queryString, Entity ontologyTermEntity) {
    Iterable<Entity> entities = ontologyTermEntity.getEntities(OntologyTermMetaData.ONTOLOGY_TERM_SYNONYM);
    // Only probe for a first element instead of counting every synonym just to test for emptiness.
    if (Iterables.isEmpty(entities)) {
        return null;
    }

    String cleanedQueryString = removeIllegalCharWithSingleWhiteSpace(queryString);

    // Calculate the NGram similarity score for all the synonyms and sort them in descending order.
    // Each synonym is copied first so that the score is never written to the original entity.
    List<Entity> synonymEntities = FluentIterable.from(entities).transform(ontologyTermSynonymEntity -> {
        Entity mapEntity = ontologyTermSynonymFactory.create();
        mapEntity.set(ontologyTermSynonymEntity);
        String ontologyTermSynonym = removeIllegalCharWithSingleWhiteSpace(ontologyTermSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR));
        mapEntity.set(SCORE, NGramDistanceAlgorithm.stringMatching(cleanedQueryString, ontologyTermSynonym));
        return mapEntity;
    }).toSortedList((left, right) -> right.getDouble(SCORE).compareTo(left.getDouble(SCORE)));

    // The list is non-empty here, so the head can be taken directly without building a default entity.
    Entity firstMatchedSynonymEntity = synonymEntities.get(0);
    double topNgramScore = firstMatchedSynonymEntity.getDouble(SCORE);
    String topMatchedSynonym = firstMatchedSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);

    // Try to improve the score by appending each of the remaining synonyms to the best label so far;
    // a combination is only kept when it scores strictly higher.
    for (Entity nextMatchedSynonymEntity : Iterables.skip(synonymEntities, 1)) {
        String nextMatchedSynonym = nextMatchedSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);
        String combinedSynonym = topMatchedSynonym + SINGLE_WHITESPACE + nextMatchedSynonym;
        double newScore = NGramDistanceAlgorithm.stringMatching(cleanedQueryString, removeIllegalCharWithSingleWhiteSpace(combinedSynonym));
        if (newScore > topNgramScore) {
            topNgramScore = newScore;
            topMatchedSynonym = combinedSynonym;
        }
    }

    // The similarity scores are adjusted based on the inverse document frequency of the words.
    // The idea is that all the words from the query string are weighted (important words occur fewer
    // times across all ontology terms than common words); the final score should be compensated for
    // according to the word weight.
    Map<String, Double> weightedWordSimilarity = informationContentService.redistributedNGramScore(cleanedQueryString, ontologyIri);
    Set<String> synonymStemmedWords = informationContentService.createStemmedWordSet(topMatchedSynonym);
    Set<String> queryStemmedWords = informationContentService.createStemmedWordSet(cleanedQueryString);

    // Accumulate locally and write the entity once, instead of a get/set round trip per word.
    double combinedScore = topNgramScore;
    for (String word : queryStemmedWords) {
        if (synonymStemmedWords.contains(word) && weightedWordSimilarity.containsKey(word)) {
            combinedScore += weightedWordSimilarity.get(word);
        }
    }

    firstMatchedSynonymEntity.set(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR, topMatchedSynonym);
    firstMatchedSynonymEntity.set(SCORE, topNgramScore);
    firstMatchedSynonymEntity.set(COMBINED_SCORE, combinedScore);
    return firstMatchedSynonymEntity;
}
Aggregations