Search in sources :

Example 1 with COMBINED_SCORE

Use of org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData.COMBINED_SCORE in the project molgenis by molgenis.

The method findSynonymWithHighestNgramScore of the class SortaServiceImpl.

/**
 * Finds the synonym (or concatenation of synonyms) of an ontology term that best matches the query string
 * according to the NGram similarity score.
 *
 * <p>Every synonym is scored against the cleaned query string and the highest-scoring synonym is taken as the
 * starting point. The remaining synonyms are then visited in descending score order and appended to the
 * current best match whenever the concatenation scores strictly higher than the best score so far. Finally,
 * the combined score is raised by the weight of every stemmed query word that also occurs in the best match.
 *
 * @param ontologyIri        IRI of the ontology, passed to the information content service to obtain the
 *                           per-word weights
 * @param queryString        the raw query string; illegal characters are replaced before matching
 * @param ontologyTermEntity the ontology term whose synonyms are evaluated
 * @return a newly created synonym entity populated from the best matching synonym, with the synonym
 *         attribute, {@code SCORE} and {@code COMBINED_SCORE} set, or {@code null} if the ontology term has
 *         no synonyms
 */
private Entity findSynonymWithHighestNgramScore(String ontologyIri, String queryString, Entity ontologyTermEntity) {
    Iterable<Entity> entities = ontologyTermEntity.getEntities(OntologyTermMetaData.ONTOLOGY_TERM_SYNONYM);
    // Only emptiness matters here, so do not walk the whole iterable just to count its elements.
    if (Iterables.isEmpty(entities)) {
        return null;
    }

    String cleanedQueryString = removeIllegalCharWithSingleWhiteSpace(queryString);

    // Calculate the NGram similarity score for all the synonyms and sort them in descending order.
    List<Entity> synonymEntities = FluentIterable.from(entities).transform(ontologyTermSynonymEntity -> {
        Entity mapEntity = ontologyTermSynonymFactory.create();
        mapEntity.set(ontologyTermSynonymEntity);
        String ontologyTermSynonym = removeIllegalCharWithSingleWhiteSpace(ontologyTermSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR));
        mapEntity.set(SCORE, NGramDistanceAlgorithm.stringMatching(cleanedQueryString, ontologyTermSynonym));
        return mapEntity;
    }).toSortedList((left, right) -> right.getDouble(SCORE).compareTo(left.getDouble(SCORE)));

    // The source iterable is non-empty, so the sorted list has at least one element; no default entity
    // needs to be created.
    Entity bestSynonymEntity = synonymEntities.get(0);
    double topNgramScore = bestSynonymEntity.getDouble(SCORE);
    String topMatchedSynonym = bestSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);

    // Try to improve the best match by appending each of the remaining synonyms in turn; a concatenation is
    // kept only when it scores strictly higher than the best score so far.
    for (Entity nextSynonymEntity : synonymEntities.subList(1, synonymEntities.size())) {
        String nextMatchedSynonym = nextSynonymEntity.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);
        String combinedSynonym = topMatchedSynonym + SINGLE_WHITESPACE + nextMatchedSynonym;
        double newScore = NGramDistanceAlgorithm.stringMatching(cleanedQueryString, removeIllegalCharWithSingleWhiteSpace(combinedSynonym));
        if (newScore > topNgramScore) {
            topNgramScore = newScore;
            topMatchedSynonym = combinedSynonym;
        }
    }

    // The similarity scores are adjusted based on the inverse document frequency of the words.
    // The idea is that all the words from the query string are weighted (important words occur fewer times
    // across all ontology terms than common words); the final score is compensated according to the word
    // weight.
    Map<String, Double> weightedWordSimilarity = informationContentService.redistributedNGramScore(cleanedQueryString, ontologyIri);
    Set<String> synonymStemmedWords = informationContentService.createStemmedWordSet(topMatchedSynonym);
    Set<String> queryStemmedWords = informationContentService.createStemmedWordSet(cleanedQueryString);

    // Accumulate locally instead of repeatedly reading and writing the entity attribute.
    double combinedScore = topNgramScore;
    for (String queryWord : queryStemmedWords) {
        if (Iterables.contains(synonymStemmedWords, queryWord) && weightedWordSimilarity.containsKey(queryWord)) {
            combinedScore += weightedWordSimilarity.get(queryWord);
        }
    }

    bestSynonymEntity.set(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR, topMatchedSynonym);
    bestSynonymEntity.set(SCORE, topNgramScore);
    bestSynonymEntity.set(COMBINED_SCORE, combinedScore);
    return bestSynonymEntity;
}
Also used : NGramDistanceAlgorithm(org.molgenis.semanticsearch.string.NGramDistanceAlgorithm) Iterables(com.google.common.collect.Iterables) java.util(java.util) COMBINED_SCORE(org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData.COMBINED_SCORE) Operator(org.molgenis.data.QueryRule.Operator) SCORE(org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData.SCORE) QueryImpl(org.molgenis.data.support.QueryImpl) StringUtils(org.apache.commons.lang3.StringUtils) FluentIterable(com.google.common.collect.FluentIterable) Objects.requireNonNull(java.util.Objects.requireNonNull) ONTOLOGY_TERM_DYNAMIC_ANNOTATION(org.molgenis.ontology.core.meta.OntologyTermDynamicAnnotationMetaData.ONTOLOGY_TERM_DYNAMIC_ANNOTATION) Stemmer(org.molgenis.semanticsearch.string.Stemmer) ONTOLOGY_TERM(org.molgenis.ontology.core.meta.OntologyTermMetaData.ONTOLOGY_TERM) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) Stream(java.util.stream.Stream) OntologyTermHitEntity(org.molgenis.ontology.sorta.bean.OntologyTermHitEntity) DataService(org.molgenis.data.DataService) org.molgenis.ontology.core.meta(org.molgenis.ontology.core.meta) OntologyTermHitMetaData(org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData) ONTOLOGY(org.molgenis.ontology.core.meta.OntologyMetaData.ONTOLOGY) QueryRule(org.molgenis.data.QueryRule) Entity(org.molgenis.data.Entity) InformationContentService(org.molgenis.ontology.roc.InformationContentService) SortaService(org.molgenis.ontology.sorta.service.SortaService) OntologyTermHitEntity(org.molgenis.ontology.sorta.bean.OntologyTermHitEntity) Entity(org.molgenis.data.Entity)

Aggregations

FluentIterable (com.google.common.collect.FluentIterable)1 Iterables (com.google.common.collect.Iterables)1 Sets (com.google.common.collect.Sets)1 java.util (java.util)1 Objects.requireNonNull (java.util.Objects.requireNonNull)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 StringUtils (org.apache.commons.lang3.StringUtils)1 DataService (org.molgenis.data.DataService)1 Entity (org.molgenis.data.Entity)1 QueryRule (org.molgenis.data.QueryRule)1 Operator (org.molgenis.data.QueryRule.Operator)1 QueryImpl (org.molgenis.data.support.QueryImpl)1 org.molgenis.ontology.core.meta (org.molgenis.ontology.core.meta)1 ONTOLOGY (org.molgenis.ontology.core.meta.OntologyMetaData.ONTOLOGY)1 ONTOLOGY_TERM_DYNAMIC_ANNOTATION (org.molgenis.ontology.core.meta.OntologyTermDynamicAnnotationMetaData.ONTOLOGY_TERM_DYNAMIC_ANNOTATION)1 ONTOLOGY_TERM (org.molgenis.ontology.core.meta.OntologyTermMetaData.ONTOLOGY_TERM)1 InformationContentService (org.molgenis.ontology.roc.InformationContentService)1 OntologyTermHitEntity (org.molgenis.ontology.sorta.bean.OntologyTermHitEntity)1 OntologyTermHitMetaData (org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData)1