Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Cli, method run().
@Override
public void run() {
synchronized (lock) {
++numThreads;
lockDone = true;
}
try {
PreProcessor pp = new PreProcessor(args.getPreProcessorArgs(), stopwords);
Mapper mapper = new Mapper(processedConcepts);
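// Work-stealing loop: each worker claims the next unprocessed query index under the shared "queries" lock, so no two threads map the same query; results are written back at the claimed index under the "mappings" lock.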
while (true) {
Query query;
int localIndex;
synchronized (queries) {
if (index >= queries.size()) {
break;
}
query = queries.get(index);
localIndex = index;
++index;
}
logger.info("{}/{} @ {}s", localIndex + 1, queries.size(), (System.currentTimeMillis() - start) / 1000.0);
QueryProcessed processedQuery = processor.getProcessedQuery(query, args.getType(), pp, idf, args.getFetcherArgs());
Mapping mapping = mapper.map(query, processedQuery, args.getMapperArgs());
synchronized (mappings) {
webpages.set(localIndex, processedQuery.getWebpages());
docs.set(localIndex, processedQuery.getDocs());
publications.set(localIndex, processedQuery.getPublications());
mappings.set(localIndex, mapping);
}
}
} finally {
synchronized (lock) {
--numThreads;
lock.notifyAll();
}
}
}
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Cli, method run(Version).
private static void run(Version version) throws IOException, ParseException {
List<Param> paramsMain = new ArrayList<>();
paramsMain.add(new Param("Ontology file", CliArgs.EDAM, new File(args.getEdam()).getName(), "https://github.com/edamontology/edamontology/tree/master/releases"));
if (Input.isProtocol(args.getQuery())) {
paramsMain.add(new Param("Query file", CliArgs.QUERY, args.getQuery(), args.getQuery()));
} else {
paramsMain.add(new Param("Query file", CliArgs.QUERY, new File(args.getQuery()).getName()));
}
paramsMain.add(new Param("Type", CliArgs.TYPE, args.getType().toString()));
paramsMain.add(new Param("Output file", CliArgs.OUTPUT, new File(args.getOutput()).getName()));
paramsMain.add(new Param("Report file", CliArgs.REPORT, new File(args.getReport()).getName()));
paramsMain.add(new Param("Report page size", CliArgs.REPORT_PAGE_SIZE, args.getReportPageSize(), 0.0, null));
paramsMain.add(new Param("Report pagination size", CliArgs.REPORT_PAGINATION_SIZE, args.getReportPaginationSize(), 0.0, null));
paramsMain.add(new Param("Number of threads", CliArgs.THREADS, args.getThreads(), 0.0, null));
Output output = new Output(args.getOutput(), args.getReport(), false);
stopwords = PreProcessor.getStopwords(args.getPreProcessorArgs().getStopwords());
processor = new Processor(args.getProcessorArgs());
idf = null;
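// Pick the IDF file that matches the preprocessing mode: stemmed IDF statistics when stemming is enabled, the plain IDF file otherwise.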
if (args.getPreProcessorArgs().isStemming()) {
if (args.getProcessorArgs().getIdfStemmed() != null && !args.getProcessorArgs().getIdfStemmed().isEmpty()) {
idf = new Idf(args.getProcessorArgs().getIdfStemmed());
}
} else {
if (args.getProcessorArgs().getIdf() != null && !args.getProcessorArgs().getIdf().isEmpty()) {
idf = new Idf(args.getProcessorArgs().getIdf());
}
}
logger.info("Loading concepts");
Map<EdamUri, Concept> concepts = Edam.load(args.getEdam());
logger.info("Processing {} concepts", concepts.size());
processedConcepts = processor.getProcessedConcepts(concepts, args.getMapperArgs().getIdfArgs(), args.getMapperArgs().getMultiplierArgs(), new PreProcessor(args.getPreProcessorArgs(), stopwords));
logger.info("Loading queries");
queries = QueryLoader.get(args.getQuery(), args.getType(), concepts, args.getFetcherArgs().getTimeout(), args.getFetcherArgs().getPrivateArgs().getUserAgent());
publications = new ArrayList<>(queries.size());
webpages = new ArrayList<>(queries.size());
docs = new ArrayList<>(queries.size());
mappings = new ArrayList<>(queries.size());
for (int i = 0; i < queries.size(); ++i) {
publications.add(null);
webpages.add(null);
docs.add(null);
mappings.add(null);
}
start = System.currentTimeMillis();
logger.info("Start: {}", Instant.ofEpochMilli(start));
logger.info("Starting mapper threads");
for (int i = 0; i < args.getThreads(); ++i) {
Thread t = new Thread(new Cli());
t.setDaemon(true);
t.start();
}
synchronized (lock) {
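// Wait until at least one worker has signalled startup (lockDone) and all workers have finished; without lockDone the main thread could observe numThreads == 0 before any worker has incremented it.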
while (!lockDone || numThreads > 0) {
try {
lock.wait();
} catch (InterruptedException e) {
// TODO exit threads cleanly? give timeout for threads to exit? close db? print that exiting and waiting for threads to terminate?
logger.error("Exception!", e);
System.exit(1);
}
}
}
logger.info("All mapper threads stopped");
long stop = System.currentTimeMillis();
logger.info("Stop: {}", Instant.ofEpochMilli(stop));
logger.info("Mapping took {}s", (stop - start) / 1000.0);
Results results = Benchmark.calculate(queries, mappings);
logger.info("Outputting results");
output.output(args, paramsMain, args.getType(), args.getReportPageSize(), args.getReportPaginationSize(), concepts, queries, webpages, docs, publications, results, start, stop, version);
logger.info("{} : {}", results.toStringMeasure(Measure.recall), Measure.recall);
logger.info("{} : {}", results.toStringMeasure(Measure.AveP), Measure.AveP);
}
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class PubMedApps, method getResults().
private static List<Result> getResults(PreProcessorArgs preProcessorArgs, String queryIdf, String queryPath, QueryType queryType, FetcherArgs fetcherArgs, List<Publication> publications) throws IOException, ParseException {
List<Result> results = new ArrayList<>();
List<String> hostIgnore = getResource("host_ignore.txt");
List<String> beforeTier1 = getResource("before_tier1.txt");
List<String> beforeTier2 = getResource("before_tier2.txt");
List<String> beforeTier3 = getResource("before_tier3.txt");
List<String> afterTier1 = getResource("after_tier1.txt");
List<String> afterTier2 = getResource("after_tier2.txt");
List<String> afterTier3 = getResource("after_tier3.txt");
PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
Idf idf = new Idf(queryIdf);
List<Query> queries = QueryLoader.get(queryPath, queryType, fetcherArgs.getTimeout(), fetcherArgs.getPrivateArgs().getUserAgent());
List<List<String>> queryNamesExtracted = new ArrayList<>();
List<String> queryNamesProcessed = new ArrayList<>();
List<List<String>> queryLinks = new ArrayList<>();
for (Query query : queries) {
List<String> queryNameExtracted = preProcessor.extract(query.getName());
List<String> queryNameProcessed = preProcessor.process(query.getName(), queryNameExtracted);
queryNamesExtracted.add(Arrays.asList(BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(String.join(" ", queryNameExtracted)).replaceFirst("").split(" ")));
queryNamesProcessed.add(BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(String.join(" ", queryNameProcessed)).replaceFirst(""));
List<Link> links = new ArrayList<>();
links.addAll(query.getWebpageUrls());
links.addAll(query.getDocUrls());
queryLinks.add(links.stream().map(l -> BIOTOOLS_LINK_TRIM_START.matcher(l.getUrl()).replaceFirst("")).map(l -> BIOTOOLS_LINK_TRIM_END.matcher(l).replaceFirst("")).filter(l -> !l.isEmpty()).collect(Collectors.toList()));
}
for (int publicationIndex = 0; publicationIndex < publications.size(); ++publicationIndex) {
double percentage = (publicationIndex + 1) / (double) publications.size() * 100;
percentage = Math.round(percentage * 10) / 10.0;
// TODO
System.err.print("\rMaking results: " + percentage + "%");
Publication publication = publications.get(publicationIndex);
String toolTitle = null;
String toolTitleTwo = null;
String toolTitleAcronym = null;
String toolTitleTwoAcronym = null;
String toolTitlePruned = null;
String toolTitleTwoPruned = null;
long toolTitleWordsTotal = 0;
String title = publication.getTitle().getContent();
int from = 0;
Matcher matcher = TITLE_SEPARATOR.matcher(title);
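// Scan the publication title segment by segment (delimited by TITLE_SEPARATOR); each segment yields a candidate tool name, possibly split in two around " and " or " & " and with a detected acronym pulled out, and the candidate with the fewest words wins.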
while (from < title.length() && matcher.find(from)) {
String currentToolTitle = title.substring(from, matcher.start()).trim();
String currentToolTitleTwo = null;
String currentToolTitleAcronym = null;
String currentToolTitleTwoAcronym = null;
String currentToolTitlePruned = null;
String currentToolTitleTwoPruned = null;
String separatorString = " and ";
int separator = currentToolTitle.indexOf(separatorString);
if (separator < 0) {
separatorString = " & ";
separator = currentToolTitle.indexOf(separatorString);
}
if (separator > -1) {
currentToolTitleTwo = currentToolTitle.substring(separator + separatorString.length(), currentToolTitle.length());
currentToolTitle = currentToolTitle.substring(0, separator);
}
List<String> currentToolTitleExtracted = preProcessor.extract(currentToolTitle);
// call process so that the indexes of currentToolTitleExtracted stay aligned with the processed words (the returned list is not needed here)
preProcessor.process(currentToolTitle, currentToolTitleExtracted);
List<String> currentToolTitleTwoExtracted = null;
if (currentToolTitleTwo != null) {
currentToolTitleTwoExtracted = preProcessor.extract(currentToolTitleTwo);
// call process so that the indexes of currentToolTitleTwoExtracted stay aligned with the processed words (the returned list is not needed here)
preProcessor.process(currentToolTitleTwo, currentToolTitleTwoExtracted);
}
Integer firstAcronymIndex = firstAcronymIndex(currentToolTitle, preProcessor);
if (firstAcronymIndex != null) {
currentToolTitleAcronym = currentToolTitleExtracted.remove(firstAcronymIndex.intValue());
}
if (currentToolTitleTwo != null) {
Integer firstAcronymIndexTwo = firstAcronymIndex(currentToolTitleTwo, preProcessor);
if (firstAcronymIndexTwo != null) {
currentToolTitleTwoAcronym = currentToolTitleTwoExtracted.remove(firstAcronymIndexTwo.intValue());
}
}
currentToolTitle = String.join(" ", currentToolTitleExtracted);
if (currentToolTitleTwo != null) {
currentToolTitleTwo = String.join(" ", currentToolTitleTwoExtracted);
}
currentToolTitlePruned = toolTitlePrune(currentToolTitleExtracted);
if (currentToolTitleTwo != null) {
currentToolTitleTwoPruned = toolTitlePrune(currentToolTitleTwoExtracted);
}
if (currentToolTitleTwo != null && (currentToolTitleExtracted.size() > 1 || currentToolTitleTwoExtracted.size() > 1) && (!currentToolTitle.isEmpty() && !currentToolTitleTwo.isEmpty())) {
currentToolTitle += " " + currentToolTitleTwo;
currentToolTitleTwo = null;
if (!currentToolTitlePruned.isEmpty() && !currentToolTitleTwoPruned.isEmpty()) {
currentToolTitlePruned += " " + currentToolTitleTwoPruned;
currentToolTitleTwoPruned = null;
} else if (!currentToolTitleTwoPruned.isEmpty()) {
currentToolTitlePruned = currentToolTitleTwoPruned;
currentToolTitleTwoPruned = null;
}
currentToolTitleExtracted.addAll(currentToolTitleTwoExtracted);
currentToolTitleTwoExtracted = null;
}
long currentToolTitleWordsTotal = currentToolTitleExtracted.size();
if (currentToolTitleTwoExtracted != null) {
currentToolTitleWordsTotal += currentToolTitleTwoExtracted.size();
}
if (currentToolTitleWordsTotal < toolTitleWordsTotal || toolTitle == null) {
toolTitle = currentToolTitle == null || currentToolTitle.isEmpty() ? null : currentToolTitle;
toolTitleTwo = currentToolTitleTwo == null || currentToolTitleTwo.isEmpty() ? null : currentToolTitleTwo;
toolTitleAcronym = currentToolTitleAcronym == null || currentToolTitleAcronym.isEmpty() ? null : currentToolTitleAcronym;
toolTitleTwoAcronym = currentToolTitleTwoAcronym == null || currentToolTitleTwoAcronym.isEmpty() ? null : currentToolTitleTwoAcronym;
toolTitlePruned = currentToolTitlePruned == null || currentToolTitlePruned.isEmpty() ? null : currentToolTitlePruned;
toolTitleTwoPruned = currentToolTitleTwoPruned == null || currentToolTitleTwoPruned.isEmpty() ? null : currentToolTitleTwoPruned;
toolTitleWordsTotal = currentToolTitleWordsTotal;
}
from = matcher.end();
}
String theAbstract = publication.getAbstract().getContent();
String titleWithoutLinks = preProcessor.removeLinks(title);
String abstractWithoutLinks = preProcessor.removeLinks(theAbstract);
if (from > 0) {
title = title.substring(from).trim();
}
List<String> titleAbstractSentences = preProcessor.sentences(preProcessor.removeLinks(title) + ". " + abstractWithoutLinks);
List<List<String>> extracted = new ArrayList<>();
List<List<String>> processed = new ArrayList<>();
for (String sentence : titleAbstractSentences) {
List<String> sentenceExtracted = preProcessor.extract(sentence);
List<String> sentenceProcessed = preProcessor.process(sentence, sentenceExtracted);
extracted.add(sentenceExtracted);
processed.add(sentenceProcessed);
}
Map<String, Double> scores = new HashMap<>();
Map<String, String> processedToExtracted = new HashMap<>();
Map<String, Set<String>> processedToExtractedBegin = new HashMap<>();
Map<String, List<String>> processedToExtractedWithin = new HashMap<>();
for (int i = 0; i < processed.size(); ++i) {
List<String> sentenceExtracted = extracted.get(i);
List<String> sentenceProcessed = processed.get(i);
for (int j = 0; j < COMPOUND_WORDS; ++j) {
for (int k = 0; k < sentenceProcessed.size() - j; ++k) {
String wordExtracted = sentenceExtracted.get(k);
String wordProcessed = sentenceProcessed.get(k);
for (int l = k + 1; l <= k + j; ++l) wordExtracted += " " + sentenceExtracted.get(l);
for (int l = k + 1; l <= k + j; ++l) wordProcessed += " " + sentenceProcessed.get(l);
Double value;
if (j == 0) {
value = Math.pow(idf.getIdf(sentenceProcessed.get(k)), QUERY_IDF_SCALING);
} else {
value = scores.get(sentenceProcessed.get(k));
for (int l = k + 1; l <= k + j; ++l) value *= scores.get(sentenceProcessed.get(l));
value /= COMPOUND_DIVIDER;
}
scores.merge(wordProcessed, value, Double::sum);
if (i == 0 || k == 0) {
Set<String> wordsExtracted = processedToExtractedBegin.get(wordProcessed);
if (wordsExtracted == null) {
wordsExtracted = new LinkedHashSet<>();
processedToExtractedBegin.put(wordProcessed, wordsExtracted);
}
wordsExtracted.add(wordExtracted);
} else {
List<String> wordsExtracted = processedToExtractedWithin.get(wordProcessed);
if (wordsExtracted == null) {
wordsExtracted = new ArrayList<>();
processedToExtractedWithin.put(wordProcessed, wordsExtracted);
}
wordsExtracted.add(wordExtracted);
}
}
}
}
// add the "within" keys before the "begin" keys so that, when counts are equal, the variant first seen within a sentence comes first after the stable sort and wins (both collections preserve insertion order)
Set<String> processedToExtractedKeys = new LinkedHashSet<>();
processedToExtractedKeys.addAll(processedToExtractedWithin.keySet());
processedToExtractedKeys.addAll(processedToExtractedBegin.keySet());
for (String key : processedToExtractedKeys) {
Map<String, Integer> extractedCount = new LinkedHashMap<>();
List<String> extractedWithins = processedToExtractedWithin.get(key);
if (extractedWithins != null) {
for (String extractedWithin : extractedWithins) {
extractedCount.merge(extractedWithin, 1, Integer::sum);
}
}
Set<String> extractedBegins = processedToExtractedBegin.get(key);
if (extractedBegins != null) {
for (String extractedBegin : extractedBegins) {
extractedCount.merge(extractedBegin, 1, Integer::sum);
}
}
extractedCount = extractedCount.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
processedToExtracted.put(key, extractedCount.keySet().iterator().next());
}
if (toolTitle != null && toolTitlePruned != null) {
boolean existing = toolTitleScore(toolTitle, preProcessor, scores, processedToExtracted, false);
if (!existing && !toolTitlePruned.equals(toolTitle)) {
toolTitleScore(toolTitlePruned, preProcessor, scores, processedToExtracted, true);
}
}
if (toolTitleTwo != null && toolTitleTwoPruned != null) {
boolean existing = toolTitleScore(toolTitleTwo, preProcessor, scores, processedToExtracted, false);
if (!existing && !toolTitleTwoPruned.equals(toolTitleTwo)) {
toolTitleScore(toolTitleTwoPruned, preProcessor, scores, processedToExtracted, true);
}
}
if (toolTitleAcronym != null) {
toolTitleScore(toolTitleAcronym, preProcessor, scores, processedToExtracted, false);
}
if (toolTitleTwoAcronym != null) {
toolTitleScore(toolTitleTwoAcronym, preProcessor, scores, processedToExtracted, false);
}
Map<String, Double> beforeAfterAdded = new HashMap<>();
for (int i = 0; i < processed.size(); ++i) {
List<String> sentenceProcessed = processed.get(i);
boolean acronymsDone = false;
List<Integer> acronyms = null;
for (int j = 0; j < sentenceProcessed.size(); ++j) {
String wordProcessed = sentenceProcessed.get(j);
boolean inBeforeTier1 = beforeTier1.contains(wordProcessed);
boolean inBeforeTier2 = beforeTier2.contains(wordProcessed);
boolean inBeforeTier3 = beforeTier3.contains(wordProcessed);
if (j + 1 < sentenceProcessed.size() && (inBeforeTier1 || inBeforeTier2 || inBeforeTier3)) {
if (!acronymsDone) {
acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
acronymsDone = true;
}
boolean acronymFound = false;
String acronym = null;
if (acronyms.contains(j + 1)) {
acronym = sentenceProcessed.get(j + 1);
acronymFound = true;
} else if (acronyms.contains(-(j + 1))) {
acronym = sentenceProcessed.get(j + 1);
acronymFound = true;
} else if (j + 2 < sentenceProcessed.size()) {
if (acronyms.contains(j + 2)) {
acronym = sentenceProcessed.get(j + 2);
acronymFound = true;
} else if (acronyms.contains(-(j + 2))) {
acronym = sentenceProcessed.get(j + 2);
acronymFound = true;
}
}
if (acronymFound) {
beforeAfterScore(acronym, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, true);
} else {
String nextWord = sentenceProcessed.get(j + 1);
beforeAfterScore(nextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
if (j + 2 < sentenceProcessed.size()) {
acronymFound = false;
for (int k = 1; k <= COMPOUND_WORDS && j + 2 + k < sentenceProcessed.size(); ++k) {
if (acronyms.contains(-(j + 2 + k))) {
String nextNextWord = sentenceProcessed.get(j + 2 + k);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
acronymFound = true;
break;
}
}
if (!acronymFound) {
String nextNextWord = sentenceProcessed.get(j + 2);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
String nextCompoundWord = nextWord + " " + nextNextWord;
beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
}
}
}
}
boolean inAfterTier1 = afterTier1.contains(wordProcessed);
boolean inAfterTier2 = afterTier2.contains(wordProcessed);
boolean inAfterTier3 = afterTier3.contains(wordProcessed);
if (j - 1 >= 0 && (inAfterTier1 || inAfterTier2 || inAfterTier3)) {
if (!acronymsDone) {
acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
acronymsDone = true;
}
boolean acronymFound = false;
String acronym = null;
if (acronyms.contains(j - 1)) {
acronym = sentenceProcessed.get(j - 1);
acronymFound = true;
} else if (acronyms.contains(-(j - 1))) {
acronym = sentenceProcessed.get(j - 1);
acronymFound = true;
} else if (j - 2 >= 0) {
if (acronyms.contains(j - 2)) {
acronym = sentenceProcessed.get(j - 2);
acronymFound = true;
} else if (acronyms.contains(-(j - 2))) {
acronym = sentenceProcessed.get(j - 2);
acronymFound = true;
}
}
if (acronymFound) {
beforeAfterScore(acronym, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, true);
} else {
String nextWord = sentenceProcessed.get(j - 1);
beforeAfterScore(nextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
if (j - 2 >= 0) {
acronymFound = false;
for (int k = 1; k <= COMPOUND_WORDS && j - 2 - k >= 0; ++k) {
if (acronyms.contains(-(j - 2 - k))) {
String nextNextWord = sentenceProcessed.get(j - 2 - k);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
acronymFound = true;
break;
}
}
if (!acronymFound) {
String nextNextWord = sentenceProcessed.get(j - 2);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
String nextCompoundWord = nextNextWord + " " + nextWord;
beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
}
}
}
}
}
}
List<String> titleAbstractLinks = preProcessor.links(title);
titleAbstractLinks.addAll(preProcessor.links(theAbstract));
List<String> fulltextLinks = preProcessor.links(publication.getFulltext().getContent());
for (int i = 0; i < titleAbstractLinks.size(); ++i) {
String titleAbstractLink = titleAbstractLinks.get(i);
Iterator<String> it = fulltextLinks.iterator();
while (it.hasNext()) {
String fulltextLink = it.next();
if (fulltextLink.equals(titleAbstractLink)) {
it.remove();
break;
}
String start = "";
Matcher startTitleAbstractLink = LINK_COMPARE_START.matcher(titleAbstractLink);
if (startTitleAbstractLink.find()) {
start = titleAbstractLink.substring(0, startTitleAbstractLink.end());
titleAbstractLink = titleAbstractLink.substring(startTitleAbstractLink.end());
}
Matcher startFulltextLink = LINK_COMPARE_START.matcher(fulltextLink);
if (startFulltextLink.find()) {
String startFulltext = fulltextLink.substring(0, startFulltextLink.end());
if (startFulltext.length() > start.length()) {
start = startFulltext;
}
fulltextLink = fulltextLink.substring(startFulltextLink.end());
}
if (fulltextLink.equals(titleAbstractLink)) {
titleAbstractLinks.set(i, start + titleAbstractLink);
it.remove();
break;
}
if (fulltextLink.startsWith(titleAbstractLink)) {
String rest = fulltextLink.substring(titleAbstractLink.length());
if (rest.startsWith("/")) {
titleAbstractLinks.set(i, start + fulltextLink);
}
it.remove();
break;
}
if (fulltextLink.contains(titleAbstractLink)) {
titleAbstractLinks.set(i, start + fulltextLink);
it.remove();
break;
}
if (titleAbstractLink.startsWith(fulltextLink)) {
String rest = titleAbstractLink.substring(fulltextLink.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
titleAbstractLinks.set(i, start + fulltextLink);
}
Matcher schemaMatcher = LINK_COMPARE_SCHEMA.matcher(rest);
if (schemaMatcher.find()) {
titleAbstractLinks.set(i, start + fulltextLink);
titleAbstractLinks.add(i + 1, rest.substring(schemaMatcher.start()));
}
it.remove();
break;
}
if (titleAbstractLink.contains(fulltextLink)) {
it.remove();
break;
}
}
}
Map<String, List<String>> linksAbstract = links(titleAbstractLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
Map<String, List<String>> linksFulltext = links(fulltextLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
double score = scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_ABSTRACT * linkEntry.getValue().size();
if (score > LINK_MULTIPLIER_ABSTRACT_MINIMUM) {
scores.put(linkEntry.getKey(), score);
} else {
scores.put(linkEntry.getKey(), LINK_MULTIPLIER_ABSTRACT_MINIMUM);
}
}
boolean genericLinkAugmentation = linksAbstract.isEmpty();
for (String link : titleAbstractLinks) {
boolean present = false;
for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
if (linkEntry.getValue().contains(link)) {
present = true;
break;
}
}
if (!present) {
if (genericLinkAugmentation) {
for (Map.Entry<String, Double> scoreEntry : scores.entrySet()) {
scores.put(scoreEntry.getKey(), scoreEntry.getValue() * LINK_MULTIPLIER_ABSTRACT_AUGMENTATION);
}
genericLinkAugmentation = false;
}
String fromLink = fromLink(link, preProcessor, idf, hostIgnore);
if (!fromLink.isEmpty()) {
List<String> fromLinkExtracted = preProcessor.extract(fromLink);
List<String> fromLinkProcessed = preProcessor.process(fromLink, fromLinkExtracted);
String fromLinkExtractedString = String.join(" ", fromLinkExtracted);
String fromLinkProcessedString = String.join(" ", fromLinkProcessed);
if (!fromLinkProcessedString.isEmpty()) {
scores.merge(fromLinkProcessedString, LINK_MULTIPLIER_ABSTRACT_NEW / fromLinkProcessed.size(), (d1, d2) -> d1 * d2);
String wordExtracted = processedToExtracted.get(fromLinkProcessedString);
if (wordExtracted == null) {
processedToExtracted.put(fromLinkProcessedString, fromLinkExtractedString);
}
}
}
}
}
for (Map.Entry<String, List<String>> linkEntry : linksFulltext.entrySet()) {
long multiplier = linkEntry.getValue().stream().filter(link -> !LINK_TWO_PART.matcher(link).matches()).count();
if (multiplier > 0) {
if (multiplier > 2) {
multiplier = 2;
}
scores.put(linkEntry.getKey(), scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_FULLTEXT * multiplier);
}
}
Map<String, Double> sortedScores = scores.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
Result result = new Result();
result.setPmid(publication.getPmid().getContent());
result.setPmcid(publication.getPmcid().getContent());
result.setDoi(publication.getDoi().getContent());
List<String> resultLinks = new ArrayList<>();
List<String> suggestionsProcessed = new ArrayList<>();
Iterator<Map.Entry<String, Double>> sortedScoresIterator = sortedScores.entrySet().iterator();
if (sortedScoresIterator.hasNext()) {
Map.Entry<String, Double> topEntry = sortedScoresIterator.next();
double topScore = topEntry.getValue();
result.setScore(topScore);
result.setSuggestion(processedToExtracted.get(topEntry.getKey()));
List<String> linksFromAbstract = linksAbstract.get(topEntry.getKey());
if (linksFromAbstract != null) {
for (String link : linksFromAbstract) {
resultLinks.add(link);
}
}
List<String> linksFromFulltext = linksFulltext.get(topEntry.getKey());
if (linksFromFulltext != null) {
for (String link : linksFromFulltext) {
resultLinks.add(link);
}
}
suggestionsProcessed.add(topEntry.getKey());
for (int i = 1; i < SUGGESTION_LIMIT && sortedScoresIterator.hasNext(); ++i) {
topEntry = sortedScoresIterator.next();
if (topEntry.getValue() * TOP_SCORE_LIMIT < topScore) {
break;
}
result.addOtherSuggestion(processedToExtracted.get(topEntry.getKey()));
suggestionsProcessed.add(topEntry.getKey());
}
}
List<String> resultOtherLinks = new ArrayList<>();
for (List<String> linksFromAbstract : linksAbstract.values()) {
for (String link : linksFromAbstract) {
if (!resultLinks.contains(link)) {
resultOtherLinks.add(link);
}
}
}
for (List<String> linksFromFulltext : linksFulltext.values()) {
for (String link : linksFromFulltext) {
if (!resultLinks.contains(link)) {
resultOtherLinks.add(link);
}
}
}
List<String> resultLeftoverLinks = new ArrayList<>();
for (String link : titleAbstractLinks) {
if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
resultLeftoverLinks.add(link);
}
}
for (String link : fulltextLinks) {
if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
resultLeftoverLinks.add(link);
}
}
List<String> resultAllLinks = new ArrayList<>();
resultAllLinks.addAll(resultLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
resultAllLinks.addAll(resultOtherLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
resultAllLinks.addAll(resultLeftoverLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
result.addLinks(breakLinks(resultLinks, resultAllLinks));
result.addOtherLinks(breakLinks(resultOtherLinks, resultAllLinks));
result.addLeftoverLinks(breakLinks(resultLeftoverLinks, resultAllLinks));
for (int i = 0; i < queries.size(); ++i) {
Query query = queries.get(i);
for (PublicationIds pubIds : query.getPublicationIds()) {
if (!pubIds.getPmid().isEmpty() && !result.getPmid().isEmpty() && pubIds.getPmid().equals(result.getPmid()) || !pubIds.getPmcid().isEmpty() && !result.getPmcid().isEmpty() && pubIds.getPmcid().equals(result.getPmcid()) || !pubIds.getDoi().isEmpty() && !result.getDoi().isEmpty() && pubIds.getDoi().equals(result.getDoi())) {
result.addExistingName(query.getId(), query.getName());
for (String link : result.getLinks()) {
String linkTrimmed = BIOTOOLS_LINK_TRIM_START.matcher(link).replaceFirst("");
linkTrimmed = BIOTOOLS_LINK_TRIM_END.matcher(linkTrimmed).replaceFirst("");
boolean found = false;
for (String queryLink : queryLinks.get(i)) {
if (linkTrimmed.equalsIgnoreCase(queryLink)) {
found = true;
break;
} else if (linkTrimmed.startsWith(queryLink)) {
String rest = linkTrimmed.substring(queryLink.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
found = true;
break;
}
}
}
if (!found) {
// TODO queryLinks is not complete
// result.addNewLink(link);
}
}
break;
}
}
}
for (String suggestionProcessed : suggestionsProcessed) {
suggestionProcessed = BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(suggestionProcessed).replaceFirst("");
if (suggestionProcessed.isEmpty())
continue;
for (int i = 0; i < queryNamesProcessed.size(); ++i) {
if (suggestionProcessed.equals(queryNamesProcessed.get(i))) {
String possiblyExistingId = queries.get(i).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
result.addPossiblyExisting(possiblyExistingId, queries.get(i).getName());
}
}
}
}
List<String> suggestionsExtracted = new ArrayList<>();
suggestionsExtracted.add(result.getSuggestion());
suggestionsExtracted.addAll(result.getOtherSuggestions());
for (String suggestionExtracted : suggestionsExtracted) {
suggestionExtracted = BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(suggestionExtracted).replaceFirst("");
if (suggestionExtracted.isEmpty())
continue;
for (String suggestionExtractedWord : suggestionExtracted.split(" ")) {
Map<String, String> possiblyExisting = new LinkedHashMap<>();
for (int i = 0; i < queryNamesExtracted.size(); ++i) {
List<String> queryNameExtracted = queryNamesExtracted.get(i);
if (queryNameExtracted.contains(suggestionExtractedWord)) {
String possiblyExistingId = queries.get(i).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
possiblyExisting.put(possiblyExistingId, queries.get(i).getName());
}
}
}
if (possiblyExisting.size() >= 1 && possiblyExisting.size() <= POSSIBLY_EXISTING_VALID_LIMIT) {
for (Map.Entry<String, String> possiblyExistingEntry : possiblyExisting.entrySet()) {
result.addPossiblyExisting(possiblyExistingEntry.getKey(), possiblyExistingEntry.getValue());
}
}
}
}
List<String> resultLinksOtherLinks = new ArrayList<>();
resultLinksOtherLinks.addAll(result.getLinks());
resultLinksOtherLinks.addAll(result.getOtherLinks());
for (int i = 0; i < resultLinksOtherLinks.size(); ++i) {
String resultLink = resultLinksOtherLinks.get(i);
resultLink = BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("");
resultLink = BIOTOOLS_LINK_TRIM_END.matcher(resultLink).replaceFirst("");
for (int j = 0; j < queryLinks.size(); ++j) {
String possiblyExistingId = queries.get(j).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
List<String> queryLink = queryLinks.get(j);
for (String link : queryLink) {
if (resultLink.equalsIgnoreCase(link)) {
result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
} else if (resultLink.startsWith(link)) {
String rest = resultLink.substring(link.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
}
}
}
}
}
}
if (!(result.getExistingNames().size() == 1 && !result.getSuggestion().isEmpty() && result.getExistingNames().values().iterator().next().equals(result.getSuggestion()) && result.getNewLinks().isEmpty())) {
results.add(result);
}
}
// TODO
System.err.println();
results = results.stream().sorted(Comparator.comparing(Result::getScore).reversed()).collect(Collectors.toList());
for (int i = 0; i < results.size() - 1; ++i) {
Result resultI = results.get(i);
for (int j = i + 1; j < results.size(); ++j) {
Result resultJ = results.get(j);
if (resultI.getSuggestion().equals(resultJ.getSuggestion())) {
resultI.addSameSuggestion(resultJ.getPmid());
resultJ.addSameSuggestion(resultI.getPmid());
}
}
}
return results;
}
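To make the compound-word scoring in getResults easier to follow, here is a standalone sketch of the same loop structure. This is not edammap code: the constants and idf values are illustrative stand-ins, and the idf map replaces Idf.getIdf.
import java.util.*;

class CompoundScoreSketch {
    static final int COMPOUND_WORDS = 5;
    static final double COMPOUND_DIVIDER = 2;
    static final double QUERY_IDF_SCALING = 2;

    public static void main(String[] args) {
        // made-up idf values for one processed sentence
        Map<String, Double> idf = Map.of("genome", 2.0, "assembly", 3.0, "browser", 1.5);
        List<String> sentence = List.of("genome", "assembly", "browser");
        Map<String, Double> scores = new HashMap<>();
        // j is the compound length minus one; single words are scored first (j == 0)
        for (int j = 0; j < COMPOUND_WORDS; ++j) {
            for (int k = 0; k + j < sentence.size(); ++k) {
                String word = String.join(" ", sentence.subList(k, k + j + 1));
                double value;
                if (j == 0) {
                    value = Math.pow(idf.getOrDefault(word, 1.0), QUERY_IDF_SCALING);
                } else {
                    // compounds get the product of their parts' single-word scores, damped
                    value = scores.get(sentence.get(k));
                    for (int l = k + 1; l <= k + j; ++l) value *= scores.get(sentence.get(l));
                    value /= COMPOUND_DIVIDER;
                }
                scores.merge(word, value, Double::sum);
            }
        }
        scores.forEach((word, score) -> System.out.println(word + " -> " + score));
    }
}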
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Resource, method runPost().
private Response runPost(MultivaluedMap<String, String> params, Request request) throws IOException, ParseException, URISyntaxException {
logger.info("POST {} from {}", params, request.getRemoteAddr());
long start = System.currentTimeMillis();
logger.info("Start: {}", Instant.ofEpochMilli(start));
CoreArgs coreArgs = new CoreArgs();
ParamParse.parseParams(params, coreArgs);
coreArgs.setProcessorArgs(Server.args.getProcessorArgs());
coreArgs.getFetcherArgs().setPrivateArgs(Server.args.getFetcherPrivateArgs());
ServerInput serverInput = new ServerInput(ParamParse.getParamString(params, "name"), ParamParse.getParamString(params, "keywords"), ParamParse.getParamString(params, "description"), ParamParse.getParamString(params, "webpage-urls"), ParamParse.getParamString(params, "doc-urls"), ParamParse.getParamString(params, "publication-ids"), ParamParse.getParamString(params, "annotations"));
if (serverInput.getName() != null && serverInput.getName().length() > MAX_NAME_LENGTH) {
throw new IllegalArgumentException("Name length (" + serverInput.getName().length() + ") is greater than maximum allowed (" + MAX_NAME_LENGTH + ")");
}
if (serverInput.getKeywords() != null && serverInput.getKeywords().length() > MAX_KEYWORDS_LENGTH) {
throw new IllegalArgumentException("Keywords length (" + serverInput.getKeywords().length() + ") is greater than maximum allowed (" + MAX_KEYWORDS_LENGTH + ")");
}
if (serverInput.getDescription() != null && serverInput.getDescription().length() > MAX_DESCRIPTION_LENGTH) {
throw new IllegalArgumentException("Description length (" + serverInput.getDescription().length() + ") is greater than maximum allowed (" + MAX_DESCRIPTION_LENGTH + ")");
}
if (serverInput.getWebpageUrls() != null && serverInput.getWebpageUrls().length() > MAX_LINKS_LENGTH) {
throw new IllegalArgumentException("Webpage URLs length (" + serverInput.getWebpageUrls().length() + ") is greater than maximum allowed (" + MAX_LINKS_LENGTH + ")");
}
if (serverInput.getDocUrls() != null && serverInput.getDocUrls().length() > MAX_LINKS_LENGTH) {
throw new IllegalArgumentException("Doc URLs length (" + serverInput.getDocUrls().length() + ") is greater than maximum allowed (" + MAX_LINKS_LENGTH + ")");
}
if (serverInput.getPublicationIds() != null && serverInput.getPublicationIds().length() > MAX_PUBLICATION_IDS_LENGTH) {
throw new IllegalArgumentException("Publication IDs length (" + serverInput.getPublicationIds().length() + ") is greater than maximum allowed (" + MAX_PUBLICATION_IDS_LENGTH + ")");
}
if (serverInput.getAnnotations() != null && serverInput.getAnnotations().length() > MAX_ANNOTATIONS_LENGTH) {
throw new IllegalArgumentException("Annotations length (" + serverInput.getAnnotations().length() + ") is greater than maximum allowed (" + MAX_ANNOTATIONS_LENGTH + ")");
}
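// Generate a collision-free result directory of the form <version>/<random UUID> under the server's files directory.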
String uuid;
String uuidDir;
do {
uuid = Server.version.getVersion() + "/" + UUID.randomUUID().toString();
uuidDir = Server.args.getFiles() + "/" + uuid;
} while (Files.exists(Paths.get(uuidDir)));
Files.createDirectory(Paths.get(uuidDir));
serverInput.setId(uuid);
logger.info("UUID: {}", uuid);
Output output = new Output(uuidDir + "/results.txt", uuidDir, true);
// TODO params to choose if HTML or TXT output desired
PreProcessor preProcessor = new PreProcessor(coreArgs.getPreProcessorArgs(), Server.stopwordsAll.get(coreArgs.getPreProcessorArgs().getStopwords()));
logger.info("Processing {} concepts", Server.concepts.size());
Map<EdamUri, ConceptProcessed> processedConcepts = Server.processor.getProcessedConcepts(Server.concepts, coreArgs.getMapperArgs().getIdfArgs(), coreArgs.getMapperArgs().getMultiplierArgs(), preProcessor);
logger.info("Loading query");
Query query = QueryLoader.fromServer(serverInput, Server.concepts, MAX_KEYWORDS_SIZE, MAX_LINKS_SIZE, MAX_PUBLICATION_IDS_SIZE);
Idf idf;
if (coreArgs.getPreProcessorArgs().isStemming()) {
idf = Server.idfStemmed;
} else {
idf = Server.idf;
}
QueryProcessed processedQuery = Server.processor.getProcessedQuery(query, QueryType.server, preProcessor, idf, coreArgs.getFetcherArgs());
logger.info("Mapping query");
Mapping mapping = new Mapper(processedConcepts).map(query, processedQuery, coreArgs.getMapperArgs());
List<Query> queries = Collections.singletonList(query);
List<List<Webpage>> webpages = Collections.singletonList(processedQuery.getWebpages());
List<List<Webpage>> docs = Collections.singletonList(processedQuery.getDocs());
List<List<Publication>> publications = Collections.singletonList(processedQuery.getPublications());
List<Mapping> mappings = Collections.singletonList(mapping);
Results results = Benchmark.calculate(queries, mappings);
long stop = System.currentTimeMillis();
logger.info("Stop: {}", Instant.ofEpochMilli(stop));
logger.info("Mapping took {}s", (stop - start) / 1000.0);
logger.info("Outputting results");
output.output(coreArgs, Server.paramsMain, QueryType.server, 1, 1, Server.concepts, queries, webpages, docs, publications, results, start, stop, Server.version);
URI location = new URI("/" + Server.args.getPath() + "/" + uuid + "/");
logger.info("POSTED {}", location);
return Response.seeOther(location).build();
}
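For orientation, a request against this endpoint could be built as in the following sketch. This is not edammap code: the address and path are hypothetical, and it assumes the endpoint accepts URL-encoded form data; only the field names ("name", "keywords", and so on) are the ones read via ParamParse above.
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.stream.Collectors;

class PostSketch {
    public static void main(String[] args) throws Exception {
        // example field values; any subset of the parameters read by runPost may be sent
        Map<String, String> form = Map.of(
            "name", "ExampleTool",
            "keywords", "gene set enrichment",
            "webpage-urls", "https://example.org/exampletool/");
        String body = form.entrySet().stream()
            .map(e -> URLEncoder.encode(e.getKey(), StandardCharsets.UTF_8) + "="
                + URLEncoder.encode(e.getValue(), StandardCharsets.UTF_8))
            .collect(Collectors.joining("&"));
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:8080/edammap/api")) // hypothetical address and path
            .header("Content-Type", "application/x-www-form-urlencoded")
            .POST(HttpRequest.BodyPublishers.ofString(body))
            .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        // runPost replies with 303 See Other pointing at the generated results page
        System.out.println(response.statusCode() + " "
            + response.headers().firstValue("Location").orElse(""));
    }
}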
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class PubMedApps, method beforeAfter().
private static void beforeAfter(PreProcessorArgs preProcessorArgs, String queryIdf, String database, List<String> pubFile) throws IOException {
PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
Idf idf = new Idf(queryIdf);
List<Publication> publications = getPublications(database, pubFile);
Map<String, Integer> before = new HashMap<>();
Map<String, Integer> after = new HashMap<>();
Map<String, Integer> all = new HashMap<>();
Map<String, Double> allBeforeScores = new HashMap<>();
int allBeforeScoresSum = 0;
Map<String, Double> allAfterScores = new HashMap<>();
int allAfterScoresSum = 0;
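// For every publication whose title has a one-word tool name before the separator, count which words tend to occur one or two positions before or after that name in the abstract; the all* score maps accumulate, for each word, the idf-derived scores of its neighbours, used to normalise precision in the printed tables.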
for (Publication publication : publications) {
String toolTitle = publication.getTitle().getContent();
Matcher titleSeparator = TITLE_SEPARATOR.matcher(toolTitle);
if (titleSeparator.find()) {
toolTitle = toolTitle.substring(0, titleSeparator.start()).trim();
} else {
continue;
}
List<String> toolTitleProcessedWords = preProcessor.process(toolTitle);
if (toolTitleProcessedWords.size() != 1)
continue;
String toolTitleProcessed = toolTitleProcessedWords.get(0);
List<String> abstractSentences = preProcessor.sentences(preProcessor.removeLinks(publication.getAbstract().getContent()));
List<List<String>> processed = new ArrayList<>();
for (String sentence : abstractSentences) {
processed.add(preProcessor.process(sentence));
}
Map<String, Double> scores = new HashMap<>();
for (List<String> sentence : processed) {
for (String word : sentence) {
scores.merge(word, Math.pow(idf.getIdf(word), QUERY_IDF_SCALING), Double::sum);
}
}
for (List<String> sentenceProcessed : processed) {
for (int i = 0; i < sentenceProcessed.size(); ++i) {
if (sentenceProcessed.get(i).equals(toolTitleProcessed)) {
if (i - 1 >= 0)
before.merge(sentenceProcessed.get(i - 1), 1, Integer::sum);
if (i - 2 >= 0)
before.merge(sentenceProcessed.get(i - 2), 1, Integer::sum);
if (i + 1 < sentenceProcessed.size())
after.merge(sentenceProcessed.get(i + 1), 1, Integer::sum);
if (i + 2 < sentenceProcessed.size())
after.merge(sentenceProcessed.get(i + 2), 1, Integer::sum);
}
}
}
for (List<String> sentenceProcessed : processed) {
for (int i = 0; i < sentenceProcessed.size(); ++i) {
String wordProcessed = sentenceProcessed.get(i);
all.merge(wordProcessed, 1, Integer::sum);
if (i - 1 >= 0) {
allBeforeScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i - 1)), Double::sum);
++allBeforeScoresSum;
}
if (i - 2 >= 0) {
allBeforeScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i - 2)), Double::sum);
++allBeforeScoresSum;
}
if (i + 1 < sentenceProcessed.size()) {
allAfterScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i + 1)), Double::sum);
++allAfterScoresSum;
}
if (i + 2 < sentenceProcessed.size()) {
allAfterScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i + 2)), Double::sum);
++allAfterScoresSum;
}
}
}
}
Map<String, Integer> beforeSorted = before.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
System.out.println("BEFORE_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE");
for (Map.Entry<String, Integer> bs : beforeSorted.entrySet()) {
String word = bs.getKey();
int count = bs.getValue();
int total = all.get(word);
double precision = count / (double) total;
Double totalScore = allAfterScores.get(word);
double averageScore = (totalScore != null ? totalScore / allAfterScoresSum : 0);
System.out.printf(Locale.ROOT, "%16s\t%d\t%d\t%.6f\t%.6f\t%8.1f\n", word, count, total, precision, averageScore, precision / averageScore);
}
System.out.println();
Map<String, Integer> afterSorted = after.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
System.out.println("AFTER_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE");
for (Map.Entry<String, Integer> as : afterSorted.entrySet()) {
String word = as.getKey();
int count = as.getValue();
int total = all.get(word);
double precision = count / (double) total;
Double totalScore = allBeforeScores.get(word);
double averageScore = (totalScore != null ? totalScore / allBeforeScoresSum : 0);
System.out.printf(Locale.ROOT, "%16s\t%d\t%d\t%.6f\t%.6f\t%8.1f\n", word, count, total, precision, averageScore, precision / averageScore);
}
}
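The sort-by-value idiom used above for scores, beforeSorted and afterSorted deserves a note: the merge function that throws AssertionError can never run, because the stream's keys come from a map and are therefore unique. A minimal standalone sketch of the idiom (not edammap code):
import java.util.*;
import java.util.stream.Collectors;

class SortByValueSketch {
    public static void main(String[] args) {
        Map<String, Integer> counts = Map.of("solver", 3, "aligner", 7, "browser", 5);
        Map<String, Integer> sorted = counts.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
                // keys are unique, so this merge function is unreachable
                (k, v) -> { throw new AssertionError(); },
                LinkedHashMap::new));
        System.out.println(sorted); // {aligner=7, browser=5, solver=3}
    }
}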