Use of org.apache.commons.text.similarity.LevenshteinDistance in the project goci by EBISPOT.
Class DiseaseTraitService, method similaritySearch.
/**
 * Scores every submitted term against every disease trait returned by the repository and
 * collects the pairs whose similarity reaches {@code threshold}.
 *
 * <p>For each pair two percentages (0-100, rounded) are computed: one from the cosine distance
 * and one from the Levenshtein distance normalised by the longer string's length. The larger of
 * the two is stored as the pair's degree.
 *
 * <p>The method is annotated for caching under {@code diseaseTraitAnalysis}, keyed by
 * {@code analysisId}.
 *
 * @param diseaseTraitAnalysisDTOS the submitted entries; only their user term is read
 * @param analysisId               identifier copied into the result and used as the cache key
 * @param threshold                minimum similarity percentage a pair needs to be reported
 * @return the analysis id together with all pairs that met the threshold
 */
@Cacheable(value = "diseaseTraitAnalysis", key = "#analysisId")
public AnalysisCacheDto similaritySearch(List<AnalysisDTO> diseaseTraitAnalysisDTOS, String analysisId, double threshold) {
    LevenshteinDistance editDistance = new LevenshteinDistance();
    CosineDistance cosine = new CosineDistance();
    List<DiseaseTrait> knownTraits = diseaseTraitRepository.findAll();
    List<AnalysisDTO> matches = new ArrayList<>();

    for (AnalysisDTO submitted : diseaseTraitAnalysisDTOS) {
        for (DiseaseTrait knownTrait : knownTraits) {
            String trait = knownTrait.getTrait();
            String userTerm = submitted.getUserTerm();

            double cosineGap = cosine.apply(userTerm, trait);
            // Normalise the raw edit count by the longer of the two strings.
            double rawEdits = (double) editDistance.apply(userTerm, trait);
            double editGap = rawEdits / Math.max(userTerm.length(), trait.length());

            double cosinePercent = Math.round((1 - cosineGap) * 100);
            double editPercent = Math.round((1 - editGap) * 100);
            double chosen = Math.max(cosinePercent, editPercent);

            if (chosen >= threshold) {
                matches.add(AnalysisDTO.builder()
                        .userTerm(userTerm)
                        .similarTerm(trait)
                        .degree(chosen)
                        .build());
            }
        }
    }

    return AnalysisCacheDto.builder()
            .uniqueId(analysisId)
            .analysisResult(matches)
            .build();
}
Use of org.apache.commons.text.similarity.LevenshteinDistance in the project drools by kiegroup.
Class KiePMMLTextIndexTest, method evaluateRawNoTokenize.
/**
 * Verifies {@code KiePMMLTextIndex.evaluateRaw} for every local term weight, always passing
 * {@code false} as the second flag, across three combinations of the first flag and the
 * word-separator expression.
 */
@Test
public void evaluateRawNoTokenize() {
    final LevenshteinDistance maxTwoEdits = new LevenshteinDistance(2);
    final double tolerance = 0.0000001;

    // First flag true, whitespace separator: frequency 3, max frequency 2.
    expectedRawNoTokenizeResults(3.0, 2).forEach((weights, wanted) ->
            assertEquals(wanted, KiePMMLTextIndex.evaluateRaw(true, false, TERM_0, TEXT_0, "\\s+", weights, COUNT_HITS.ALL_HITS, maxTwoEdits), tolerance));

    // First flag false, whitespace separator: frequency 3, max frequency 3.
    expectedRawNoTokenizeResults(3.0, 3).forEach((weights, wanted) ->
            assertEquals(wanted, KiePMMLTextIndex.evaluateRaw(false, false, TERM_0, TEXT_0, "\\s+", weights, COUNT_HITS.ALL_HITS, maxTwoEdits), tolerance));

    // First flag false, whitespace-or-hyphen separator: same expectations as the previous case.
    expectedRawNoTokenizeResults(3.0, 3).forEach((weights, wanted) ->
            assertEquals(wanted, KiePMMLTextIndex.evaluateRaw(false, false, TERM_0, TEXT_0, "[\\s\\-]", weights, COUNT_HITS.ALL_HITS, maxTwoEdits), tolerance));
}

/**
 * Builds the expected value for each local term weight from a term frequency and the maximum
 * frequency used for normalisation.
 */
private static Map<LOCAL_TERM_WEIGHTS, Double> expectedRawNoTokenizeResults(double frequency, int maxFrequency) {
    Map<LOCAL_TERM_WEIGHTS, Double> expected = new HashMap<>();
    expected.put(TERM_FREQUENCY, frequency);
    expected.put(BINARY, 1.0);
    expected.put(LOGARITHMIC, Math.log10(1 + frequency));
    // Explicit cast to double keeps the division floating-point (java:S2184).
    expected.put(AUGMENTED_NORMALIZED_TERM_FREQUENCY, 0.5 * (1 + (frequency / (double) maxFrequency)));
    return expected;
}
Use of org.apache.commons.text.similarity.LevenshteinDistance in the project drools by kiegroup.
Class KiePMMLTextIndexTest, method evaluateLevenshteinDistanceSplitText.
/**
 * Verifies {@code KiePMMLTextIndex.evaluateLevenshteinDistance} for the search phrase
 * "brown fox" against three scanned phrases, each checked with thresholds 0, 1 and 2.
 * An expected value of -1 marks the combinations the test expects to be rejected.
 */
@Test
public void evaluateLevenshteinDistanceSplitText() {
    String toSearch = "brown fox";
    String[] scannedPhrases = {"brown fox", "brown foxy", "browny foxy"};
    // expectedByThreshold[p][t] is the expected result for scannedPhrases[p] with threshold t.
    int[][] expectedByThreshold = {
            {0, 0, 0},
            {-1, 1, 1},
            {-1, -1, 2}
    };

    for (int phrase = 0; phrase < scannedPhrases.length; phrase++) {
        String toScan = scannedPhrases[phrase];
        for (int threshold = 0; threshold <= 2; threshold++) {
            LevenshteinDistance levenshteinDistance = new LevenshteinDistance(threshold);
            assertEquals(expectedByThreshold[phrase][threshold], KiePMMLTextIndex.evaluateLevenshteinDistance(levenshteinDistance, toSearch, toScan));
        }
    }
}
Use of org.apache.commons.text.similarity.LevenshteinDistance in the project drools by kiegroup.
Class KiePMMLRow, method replace.
/**
 * Slides a window of consecutive tokens over {@code original} and, for every window that
 * {@code evaluateLevenshteinDistance} accepts against {@code term} (result greater than -1),
 * replaces that window's text with the space-joined tokens of {@code replacement}.
 *
 * @param original                 text to scan and rewrite
 * @param replacement              text substituted for each accepted window
 * @param term                     term each window is compared against
 * @param isCaseSensitive          when {@code false} and {@code tokenize} is set, the separator
 *                                 pattern is compiled with {@code CASE_INSENSITIVE}
 * @param maxLevenshteinDistance   threshold handed to {@link LevenshteinDistance}
 * @param tokenize                 whether to split on {@code wordSeparatorCharacterRE} instead
 *                                 of {@code DEFAULT_TOKENIZER}
 * @param wordSeparatorCharacterRE separator regular expression used when {@code tokenize} is set
 * @return {@code original} with every accepted window replaced
 */
String replace(String original, String replacement, String term, boolean isCaseSensitive, int maxLevenshteinDistance, boolean tokenize, String wordSeparatorCharacterRE) {
    logger.debug("replace {} {} {} {} {}", original, replacement, term, isCaseSensitive, maxLevenshteinDistance);

    Pattern splitter;
    if (tokenize) {
        splitter = Pattern.compile(wordSeparatorCharacterRE, isCaseSensitive ? 0 : CASE_INSENSITIVE);
    } else {
        splitter = Pattern.compile(DEFAULT_TOKENIZER);
    }

    // NOTE(review): the window size comes from the replacement's token count while the
    // comparison below is against `term` - confirm this is intended.
    List<String> replacementTokens = splitText(replacement, splitter);
    String replacementToUse = String.join(" ", replacementTokens);
    List<String> originalTokens = splitText(original, splitter);
    int windowSize = replacementTokens.size();

    LevenshteinDistance matcher = new LevenshteinDistance(maxLevenshteinDistance);
    String result = original;
    for (int start = 0; start + windowSize <= originalTokens.size(); start++) {
        String window = String.join(" ", originalTokens.subList(start, start + windowSize));
        boolean accepted = evaluateLevenshteinDistance(matcher, term, window) > -1;
        if (accepted) {
            result = result.replace(window, replacementToUse);
        }
    }
    return result;
}
Use of org.apache.commons.text.similarity.LevenshteinDistance in the project goci by EBISPOT.
Class PublicationController, method matchPublication.
/**
 * Looks up a publication by PubMed id and scores every known submission against it.
 *
 * <p>When the lookup succeeds, the response contains a {@code search} entry describing the
 * publication and a {@code data} list with one entry per submission, carrying its identifying
 * fields plus three scores ({@code cosScore}, {@code levDistance}, {@code jwScore}) computed
 * between the publication's and the submission's author/title search strings. The list is
 * sorted by {@code cosScore}, highest first. When the lookup reports an error, an {@code error}
 * message is returned instead and {@code data} is empty.
 *
 * <p>The response status is always {@code 200 OK}; {@code baseUrl} is added to the model.
 *
 * @param model    model that receives the {@code baseUrl} attribute
 * @param pubmedId PubMed id taken from the request body
 * @return the JSON response map wrapped in a {@link ResponseEntity}
 */
@RequestMapping(value = "/match", produces = MediaType.APPLICATION_JSON_VALUE, method = RequestMethod.POST)
@ResponseBody
public ResponseEntity<Map<String, Object>> matchPublication(Model model, @RequestBody String pubmedId) {
    Map<String, Object> results = new HashMap<>();
    CosineDistance cosScore = new CosineDistance();
    LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
    JaroWinklerSimilarity jwDistance = new JaroWinklerSimilarity();
    EuropePMCData europePMCResult = europepmcPubMedSearchService.createStudyByPubmed(pubmedId);
    Map<String, String> searchProps = new HashMap<>();
    List<Map<String, String>> data = new ArrayList<>();
    if (!europePMCResult.getError()) {
        try {
            String searchTitle = europePMCResult.getPublication().getTitle();
            String searchAuthor = europePMCResult.getFirstAuthor().getFullname();
            searchProps.put("pubMedID", europePMCResult.getPublication().getPubmedId());
            searchProps.put("author", searchAuthor);
            searchProps.put("title", searchTitle);
            searchProps.put("doi", europePMCResult.getDoi());
            results.put("search", searchProps);
            CharSequence searchString = buildSearch(searchAuthor, searchTitle);
            Map<String, Submission> submissionMap = submissionService.getSubmissionsBasic();
            for (Map.Entry<String, Submission> entry : submissionMap.entrySet()) {
                Map<String, String> props = new HashMap<>();
                Submission submission = entry.getValue();
                String matchTitle = submission.getTitle();
                String matchAuthor = submission.getAuthor();
                CharSequence matchString = buildSearch(matchAuthor, matchTitle);
                props.put("submissionID", submission.getId());
                props.put("pubMedID", submission.getPubMedID());
                props.put("author", matchAuthor);
                props.put("title", matchTitle);
                props.put("doi", submission.getDoi());
                // NOTE(review): this emptiness check relies on buildSearch returning a String;
                // other CharSequence implementations would never equal "" - confirm.
                if (matchString.equals("")) {
                    // Nothing to compare against: report zero for every score.
                    String zero = Integer.toString(0);
                    props.put("cosScore", zero);
                    props.put("levDistance", zero);
                    props.put("jwScore", zero);
                } else {
                    double score = cosScore.apply(searchString, matchString) * 100;
                    Integer ldScore = levenshteinDistance.apply(searchString, matchString);
                    double jwScore = jwDistance.apply(searchString, matchString) * 100;
                    props.put("cosScore", normalizeScore((int) score).toString());
                    props.put("levDistance", normalizeScore(ldScore).toString());
                    // Integer.toString replaces the deprecated new Integer(int) boxing.
                    props.put("jwScore", Integer.toString((int) jwScore));
                }
                data.add(props);
            }
            // Highest cosine score first.
            data.sort((o1, o2) -> Integer.decode(o2.get("cosScore")).compareTo(Integer.decode(o1.get("cosScore"))));
        } catch (IOException e) {
            // NOTE(review): the failure is only printed and the request still answers 200 with
            // whatever was collected so far; consider routing this through the class logger.
            e.printStackTrace();
        }
    } else {
        results.put("error", "ID " + pubmedId + " not found");
    }
    results.put("data", data);
    model.addAttribute("baseUrl", depositionUiURL);
    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add("Content-Type", "application/json; charset=utf-8");
    return new ResponseEntity<>(results, responseHeaders, HttpStatus.OK);
}
Aggregations