Search in sources :

Example 1 with FetcherArgs

use of org.edamontology.pubfetcher.FetcherArgs in project edammap by edamontology.

The method writeArticle of the class Report.

/**
 * Writes one query of the report as an HTML {@code <article>} element.
 * <p>
 * The article is made of a heading (optional rank, escaped name, id or bio.tools link and
 * next/previous/current navigation elements), an optional "query" section (keywords grouped
 * by type, description, links, documentation, publications) and a "mapping" section that is
 * filled in by {@code writeMatches}.
 *
 * @param args source of the fetcher arguments and of the score arguments given to the helper writers
 * @param type query type; for {@code QueryType.biotools} the id is rendered as a link instead of text
 * @param writer destination of the generated HTML
 * @param concepts handed through to {@code writeMatches}
 * @param query the query being rendered
 * @param queriesSize total number of queries; the rank is only shown when it is greater than 1
 * @param publications handed through to {@code writePublications} and {@code writeMatches}
 * @param webpages handed through to {@code writeLinks} together with the webpage URLs
 * @param docs handed through to {@code writeLinks} together with the documentation URLs
 * @param mapping handed through to {@code writeMatches}
 * @param page page number used when linking to the following or preceding index page
 * @param nr number of this article; used as its anchor id and as the displayed rank
 * @param nrMin smallest article number reachable with a same-page anchor (presumably the first on this page)
 * @param nrMax largest article number reachable with a same-page anchor (presumably the last on this page)
 * @throws IOException if writing to {@code writer} fails or one of the helper writers throws it
 */
private static void writeArticle(CoreArgs args, QueryType type, Writer writer, Map<EdamUri, Concept> concepts, Query query, int queriesSize, List<Publication> publications, List<Webpage> webpages, List<Webpage> docs, MappingTest mapping, int page, int nr, int nrMin, int nrMax) throws IOException {
    FetcherArgs fetcher = args.getFetcherArgs();

    // Heading: rank (only when there are several queries), name and id.
    writer.write("<article>\n");
    writer.write("\t<h2 id=\"" + nr + "\"><span>");
    if (queriesSize > 1) {
        writer.write("<span class=\"rank\">" + nr + ". </span>");
    }
    writer.write("<span>" + (query.getName() == null ? "" : FetcherCommon.escapeHtml(query.getName())) + "</span>");
    if (query.getId() != null) {
        if (type != QueryType.biotools) {
            writer.write("<span> (" + FetcherCommon.escapeHtml(query.getId()) + ")</span>");
        } else {
            writer.write("<a href=\"" + FetcherCommon.escapeHtmlAttribute(QueryLoader.BIOTOOLS + query.getId()) + "\" class=\"biotools-link\"></a>");
        }
    }
    writer.write("</span><span>");

    // Navigation: a same-page anchor where possible, otherwise a link into the neighbouring
    // index page, otherwise (no neighbour at all) an empty placeholder span.
    String nextHref;
    if (nr < nrMax) {
        nextHref = "#" + (nr + 1);
    } else if (nr == nrMax && nr < queriesSize) {
        nextHref = "index" + (page + 1) + ".html#" + (nr + 1);
    } else {
        nextHref = "";
    }
    writer.write(nextHref.isEmpty() ? "<span class=\"next\"></span>" : "<a href=\"" + nextHref + "\" class=\"next\"></a>");

    String previousHref;
    if (nr > nrMin) {
        previousHref = "#" + (nr - 1);
    } else if (nr == nrMin && nr > 1) {
        // the first index page carries no number in its file name
        String previousPage = page == 2 ? "" : String.valueOf(page - 1);
        previousHref = "index" + previousPage + ".html#" + (nr - 1);
    } else {
        previousHref = "";
    }
    writer.write(previousHref.isEmpty() ? "<span class=\"previous\"></span>" : "<a href=\"" + previousHref + "\" class=\"previous\"></a>");

    writer.write("<a href=\"#" + nr + "\" class=\"current\"></a>");
    writer.write("</span></h2>\n");

    // Work out which optional parts of the query section have any content.
    boolean hasKeywords = query.getKeywords() != null && !query.getKeywords().isEmpty();
    boolean hasDescription = query.getDescription() != null && !query.getDescription().isEmpty();
    boolean hasWebpages = anyLinkHasUrl(query.getWebpageUrls());
    boolean hasDocs = anyLinkHasUrl(query.getDocUrls());
    boolean hasMisc = hasKeywords || hasDescription || hasWebpages || hasDocs;

    boolean hasPublications = false;
    if (query.getPublicationIds() != null) {
        for (PublicationIdsQuery ids : query.getPublicationIds()) {
            if (ids == null || ids.isEmpty()) {
                continue;
            }
            hasPublications = true;
            break;
        }
    }

    if (hasMisc || hasPublications) {
        writer.write(hasPublications ? "\t<section class=\"query\">\n" : "\t<section class=\"query query-no-publications\">\n");

        if (hasMisc) {
            writer.write("\t\t<section class=\"misc\">\n");

            if (hasKeywords) {
                // Group the keywords by type, keeping the order in which types first appear.
                Map<String, List<Keyword>> keywordsByType = new LinkedHashMap<>();
                for (Keyword keyword : query.getKeywords()) {
                    keywordsByType.computeIfAbsent(keyword.getType(), unused -> new ArrayList<>()).add(keyword);
                }
                for (Map.Entry<String, List<Keyword>> group : keywordsByType.entrySet()) {
                    writer.write("\t\t\t<div class=\"generic\">\n");
                    writer.write("\t\t\t\t<h3>" + FetcherCommon.escapeHtml(group.getKey()) + "</h3><br>\n");
                    writer.write("\t\t\t\t<div>");
                    StringBuilder joined = new StringBuilder();
                    boolean first = true;
                    for (Keyword keyword : group.getValue()) {
                        if (!first) {
                            joined.append("; ");
                        }
                        first = false;
                        joined.append(FetcherCommon.getLinkHtml(keyword.getUrl(), keyword.getValue()));
                    }
                    writer.write(joined.toString());
                    writer.write("</div>\n"
                        + "\t\t\t</div>\n");
                }
            }

            if (hasDescription) {
                writer.write("\t\t\t<div class=\"generic\">\n"
                    + "\t\t\t\t<h3>Description</h3><br>\n");
                writer.write("\t\t\t\t<div>" + FetcherCommon.getParagraphsHtml(query.getDescription()) + "</div>\n");
                writer.write("\t\t\t</div>\n");
            }

            if (hasWebpages) {
                writer.write("\t\t\t<div class=\"links\">\n"
                    + "\t\t\t\t<h3>Links</h3><br>\n"
                    + "\t\t\t\t<div>\n");
                writeLinks(fetcher, writer, query.getWebpageUrls(), webpages);
                writer.write("\t\t\t\t</div>\n"
                    + "\t\t\t</div>\n");
            }

            if (hasDocs) {
                writer.write("\t\t\t<div class=\"links\">\n"
                    + "\t\t\t\t<h3>Documentation</h3><br>\n"
                    + "\t\t\t\t<div>\n");
                writeLinks(fetcher, writer, query.getDocUrls(), docs);
                writer.write("\t\t\t\t</div>\n"
                    + "\t\t\t</div>\n");
            }

            writer.write("\t\t</section>\n");
        }

        if (hasPublications) {
            writer.write("\t\t<section class=\"publications\">\n");
            writePublications(fetcher, writer, query.getPublicationIds(), publications);
            writer.write("\t\t</section>\n");
        }

        writer.write("\t</section>\n");
    }

    // The mapping section is always written, whatever the query contains.
    writer.write("\t<section class=\"mapping\">\n");
    writeMatches(args.getMapperArgs().getScoreArgs(), writer, concepts, query, publications, mapping);
    writer.write("\t</section>\n"
        + "</article>\n\n");
}

/**
 * Tells whether the given links contain at least one non-null link whose URL is neither
 * {@code null} nor empty. A {@code null} argument yields {@code false}.
 */
private static boolean anyLinkHasUrl(Iterable<? extends Link> links) {
    if (links == null) {
        return false;
    }
    for (Link candidate : links) {
        if (candidate != null && candidate.getUrl() != null && !candidate.getUrl().isEmpty()) {
            return true;
        }
    }
    return false;
}
Also used : PublicationIdsQuery(org.edamontology.edammap.core.query.PublicationIdsQuery) FetcherArgs(org.edamontology.pubfetcher.FetcherArgs) Keyword(org.edamontology.edammap.core.query.Keyword) ArrayList(java.util.ArrayList) List(java.util.List) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) Link(org.edamontology.edammap.core.query.Link) LinkedHashMap(java.util.LinkedHashMap)

Example 2 with FetcherArgs

use of org.edamontology.pubfetcher.FetcherArgs in project edammap by edamontology.

The method getResults of the class PubMedApps.

private static List<Result> getResults(PreProcessorArgs preProcessorArgs, String queryIdf, String queryPath, QueryType queryType, FetcherArgs fetcherArgs, List<Publication> publications) throws IOException, ParseException {
    List<Result> results = new ArrayList<>();
    List<String> hostIgnore = getResource("host_ignore.txt");
    List<String> beforeTier1 = getResource("before_tier1.txt");
    List<String> beforeTier2 = getResource("before_tier2.txt");
    List<String> beforeTier3 = getResource("before_tier3.txt");
    List<String> afterTier1 = getResource("after_tier1.txt");
    List<String> afterTier2 = getResource("after_tier2.txt");
    List<String> afterTier3 = getResource("after_tier3.txt");
    PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
    Idf idf = new Idf(queryIdf);
    List<Query> queries = QueryLoader.get(queryPath, queryType, fetcherArgs.getTimeout(), fetcherArgs.getPrivateArgs().getUserAgent());
    List<List<String>> queryNamesExtracted = new ArrayList<>();
    List<String> queryNamesProcessed = new ArrayList<>();
    List<List<String>> queryLinks = new ArrayList<>();
    for (Query query : queries) {
        List<String> queryNameExtracted = preProcessor.extract(query.getName());
        List<String> queryNameProcessed = preProcessor.process(query.getName(), queryNameExtracted);
        queryNamesExtracted.add(Arrays.asList(BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(String.join(" ", queryNameExtracted)).replaceFirst("").split(" ")));
        queryNamesProcessed.add(BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(String.join(" ", queryNameProcessed)).replaceFirst(""));
        List<Link> links = new ArrayList<>();
        links.addAll(query.getWebpageUrls());
        links.addAll(query.getDocUrls());
        queryLinks.add(links.stream().map(l -> BIOTOOLS_LINK_TRIM_START.matcher(l.getUrl()).replaceFirst("")).map(l -> BIOTOOLS_LINK_TRIM_END.matcher(l).replaceFirst("")).filter(l -> !l.isEmpty()).collect(Collectors.toList()));
    }
    for (int publicationIndex = 0; publicationIndex < publications.size(); ++publicationIndex) {
        double percentage = (publicationIndex + 1) / (double) publications.size() * 100;
        percentage = Math.round(percentage * 10) / 10.0;
        // TODO
        System.err.print("\rMaking results: " + percentage + "%");
        Publication publication = publications.get(publicationIndex);
        String toolTitle = null;
        String toolTitleTwo = null;
        String toolTitleAcronym = null;
        String toolTitleTwoAcronym = null;
        String toolTitlePruned = null;
        String toolTitleTwoPruned = null;
        long toolTitleWordsTotal = 0;
        String title = publication.getTitle().getContent();
        int from = 0;
        Matcher matcher = TITLE_SEPARATOR.matcher(title);
        while (from < title.length() && matcher.find(from)) {
            String currentToolTitle = title.substring(from, matcher.start()).trim();
            String currentToolTitleTwo = null;
            String currentToolTitleAcronym = null;
            String currentToolTitleTwoAcronym = null;
            String currentToolTitlePruned = null;
            String currentToolTitleTwoPruned = null;
            String separatorString = " and ";
            int separator = currentToolTitle.indexOf(separatorString);
            if (separator < 0) {
                separatorString = " & ";
                separator = currentToolTitle.indexOf(separatorString);
            }
            if (separator > -1) {
                currentToolTitleTwo = currentToolTitle.substring(separator + separatorString.length(), currentToolTitle.length());
                currentToolTitle = currentToolTitle.substring(0, separator);
            }
            List<String> currentToolTitleExtracted = preProcessor.extract(currentToolTitle);
            // align indexes
            preProcessor.process(currentToolTitle, currentToolTitleExtracted);
            List<String> currentToolTitleTwoExtracted = null;
            if (currentToolTitleTwo != null) {
                currentToolTitleTwoExtracted = preProcessor.extract(currentToolTitleTwo);
                // align indexes
                preProcessor.process(currentToolTitleTwo, currentToolTitleTwoExtracted);
            }
            Integer firstAcronymIndex = firstAcronymIndex(currentToolTitle, preProcessor);
            if (firstAcronymIndex != null) {
                currentToolTitleAcronym = currentToolTitleExtracted.remove(firstAcronymIndex.intValue());
            }
            if (currentToolTitleTwo != null) {
                Integer firstAcronymIndexTwo = firstAcronymIndex(currentToolTitleTwo, preProcessor);
                if (firstAcronymIndexTwo != null) {
                    currentToolTitleTwoAcronym = currentToolTitleTwoExtracted.remove(firstAcronymIndexTwo.intValue());
                }
            }
            currentToolTitle = String.join(" ", currentToolTitleExtracted);
            if (currentToolTitleTwo != null) {
                currentToolTitleTwo = String.join(" ", currentToolTitleTwoExtracted);
            }
            currentToolTitlePruned = toolTitlePrune(currentToolTitleExtracted);
            if (currentToolTitleTwo != null) {
                currentToolTitleTwoPruned = toolTitlePrune(currentToolTitleTwoExtracted);
            }
            if (currentToolTitleTwo != null && (currentToolTitleExtracted.size() > 1 || currentToolTitleTwoExtracted.size() > 1) && (!currentToolTitle.isEmpty() && !currentToolTitleTwo.isEmpty())) {
                currentToolTitle += " " + currentToolTitleTwo;
                currentToolTitleTwo = null;
                if (!currentToolTitlePruned.isEmpty() && !currentToolTitleTwoPruned.isEmpty()) {
                    currentToolTitlePruned += " " + currentToolTitleTwoPruned;
                    currentToolTitleTwoPruned = null;
                } else if (!currentToolTitleTwoPruned.isEmpty()) {
                    currentToolTitlePruned = currentToolTitleTwoPruned;
                    currentToolTitleTwoPruned = null;
                }
                currentToolTitleExtracted.addAll(currentToolTitleTwoExtracted);
                currentToolTitleTwoExtracted = null;
            }
            long currentToolTitleWordsTotal = currentToolTitleExtracted.size();
            if (currentToolTitleTwoExtracted != null) {
                currentToolTitleWordsTotal += currentToolTitleTwoExtracted.size();
            }
            if (currentToolTitleWordsTotal < toolTitleWordsTotal || toolTitle == null) {
                toolTitle = currentToolTitle == null || currentToolTitle.isEmpty() ? null : currentToolTitle;
                toolTitleTwo = currentToolTitleTwo == null || currentToolTitleTwo.isEmpty() ? null : currentToolTitleTwo;
                toolTitleAcronym = currentToolTitleAcronym == null || currentToolTitleAcronym.isEmpty() ? null : currentToolTitleAcronym;
                toolTitleTwoAcronym = currentToolTitleTwoAcronym == null || currentToolTitleTwoAcronym.isEmpty() ? null : currentToolTitleTwoAcronym;
                toolTitlePruned = currentToolTitlePruned == null || currentToolTitlePruned.isEmpty() ? null : currentToolTitlePruned;
                toolTitleTwoPruned = currentToolTitleTwoPruned == null || currentToolTitleTwoPruned.isEmpty() ? null : currentToolTitleTwoPruned;
                toolTitleWordsTotal = currentToolTitleWordsTotal;
            }
            from = matcher.end();
        }
        String theAbstract = publication.getAbstract().getContent();
        String titleWithoutLinks = preProcessor.removeLinks(title);
        String abstractWithoutLinks = preProcessor.removeLinks(theAbstract);
        if (from > 0) {
            title = title.substring(from).trim();
        }
        List<String> titleAbstractSentences = preProcessor.sentences(preProcessor.removeLinks(title) + ". " + abstractWithoutLinks);
        List<List<String>> extracted = new ArrayList<>();
        List<List<String>> processed = new ArrayList<>();
        for (String sentence : titleAbstractSentences) {
            List<String> sentenceExtracted = preProcessor.extract(sentence);
            List<String> sentenceProcessed = preProcessor.process(sentence, sentenceExtracted);
            extracted.add(sentenceExtracted);
            processed.add(sentenceProcessed);
        }
        Map<String, Double> scores = new HashMap<>();
        Map<String, String> processedToExtracted = new HashMap<>();
        Map<String, Set<String>> processedToExtractedBegin = new HashMap<>();
        Map<String, List<String>> processedToExtractedWithin = new HashMap<>();
        for (int i = 0; i < processed.size(); ++i) {
            List<String> sentenceExtracted = extracted.get(i);
            List<String> sentenceProcessed = processed.get(i);
            for (int j = 0; j < COMPOUND_WORDS; ++j) {
                for (int k = 0; k < sentenceProcessed.size() - j; ++k) {
                    String wordExtracted = sentenceExtracted.get(k);
                    String wordProcessed = sentenceProcessed.get(k);
                    for (int l = k + 1; l <= k + j; ++l) wordExtracted += " " + sentenceExtracted.get(l);
                    for (int l = k + 1; l <= k + j; ++l) wordProcessed += " " + sentenceProcessed.get(l);
                    Double value;
                    if (j == 0) {
                        value = Math.pow(idf.getIdf(sentenceProcessed.get(k)), QUERY_IDF_SCALING);
                    } else {
                        value = scores.get(sentenceProcessed.get(k));
                        for (int l = k + 1; l <= k + j; ++l) value *= scores.get(sentenceProcessed.get(l));
                        value /= COMPOUND_DIVIDER;
                    }
                    scores.merge(wordProcessed, value, Double::sum);
                    if (i == 0 || k == 0) {
                        Set<String> wordsExtracted = processedToExtractedBegin.get(wordProcessed);
                        if (wordsExtracted == null) {
                            wordsExtracted = new LinkedHashSet<>();
                            processedToExtractedBegin.put(wordProcessed, wordsExtracted);
                        }
                        wordsExtracted.add(wordExtracted);
                    } else {
                        List<String> wordsExtracted = processedToExtractedWithin.get(wordProcessed);
                        if (wordsExtracted == null) {
                            wordsExtracted = new ArrayList<>();
                            processedToExtractedWithin.put(wordProcessed, wordsExtracted);
                        }
                        wordsExtracted.add(wordExtracted);
                    }
                }
            }
        }
        // put within before begin so that in case of equality option from within wins (because order-preserving sets)
        Set<String> processedToExtractedKeys = new LinkedHashSet<>();
        processedToExtractedKeys.addAll(processedToExtractedWithin.keySet());
        processedToExtractedKeys.addAll(processedToExtractedBegin.keySet());
        for (String key : processedToExtractedKeys) {
            Map<String, Integer> extractedCount = new LinkedHashMap<>();
            List<String> extractedWithins = processedToExtractedWithin.get(key);
            if (extractedWithins != null) {
                for (String extractedWithin : extractedWithins) {
                    extractedCount.merge(extractedWithin, 1, Integer::sum);
                }
            }
            Set<String> extractedBegins = processedToExtractedBegin.get(key);
            if (extractedBegins != null) {
                for (String extractedBegin : extractedBegins) {
                    extractedCount.merge(extractedBegin, 1, Integer::sum);
                }
            }
            extractedCount = extractedCount.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
                throw new AssertionError();
            }, LinkedHashMap::new));
            processedToExtracted.put(key, extractedCount.keySet().iterator().next());
        }
        if (toolTitle != null && toolTitlePruned != null) {
            boolean existing = toolTitleScore(toolTitle, preProcessor, scores, processedToExtracted, false);
            if (!existing && !toolTitlePruned.equals(toolTitle)) {
                toolTitleScore(toolTitlePruned, preProcessor, scores, processedToExtracted, true);
            }
        }
        if (toolTitleTwo != null && toolTitleTwoPruned != null) {
            boolean existing = toolTitleScore(toolTitleTwo, preProcessor, scores, processedToExtracted, false);
            if (!existing && !toolTitleTwoPruned.equals(toolTitleTwo)) {
                toolTitleScore(toolTitleTwoPruned, preProcessor, scores, processedToExtracted, true);
            }
        }
        if (toolTitleAcronym != null) {
            toolTitleScore(toolTitleAcronym, preProcessor, scores, processedToExtracted, false);
        }
        if (toolTitleTwoAcronym != null) {
            toolTitleScore(toolTitleTwoAcronym, preProcessor, scores, processedToExtracted, false);
        }
        Map<String, Double> beforeAfterAdded = new HashMap<>();
        for (int i = 0; i < processed.size(); ++i) {
            List<String> sentenceProcessed = processed.get(i);
            boolean acronymsDone = false;
            List<Integer> acronyms = null;
            for (int j = 0; j < sentenceProcessed.size(); ++j) {
                String wordProcessed = sentenceProcessed.get(j);
                boolean inBeforeTier1 = beforeTier1.contains(wordProcessed);
                boolean inBeforeTier2 = beforeTier2.contains(wordProcessed);
                boolean inBeforeTier3 = beforeTier3.contains(wordProcessed);
                if (j + 1 < sentenceProcessed.size() && (inBeforeTier1 || inBeforeTier2 || inBeforeTier3)) {
                    if (!acronymsDone) {
                        acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
                        acronymsDone = true;
                    }
                    boolean acronymFound = false;
                    String acronym = null;
                    if (acronyms.contains(j + 1)) {
                        acronym = sentenceProcessed.get(j + 1);
                        acronymFound = true;
                    } else if (acronyms.contains(-(j + 1))) {
                        acronym = sentenceProcessed.get(j + 1);
                        acronymFound = true;
                    } else if (j + 2 < sentenceProcessed.size()) {
                        if (acronyms.contains(j + 2)) {
                            acronym = sentenceProcessed.get(j + 2);
                            acronymFound = true;
                        } else if (acronyms.contains(-(j + 2))) {
                            acronym = sentenceProcessed.get(j + 2);
                            acronymFound = true;
                        }
                    }
                    if (acronymFound) {
                        beforeAfterScore(acronym, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, true);
                    } else {
                        String nextWord = sentenceProcessed.get(j + 1);
                        beforeAfterScore(nextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
                        if (j + 2 < sentenceProcessed.size()) {
                            acronymFound = false;
                            for (int k = 1; k <= COMPOUND_WORDS && j + 2 + k < sentenceProcessed.size(); ++k) {
                                if (acronyms.contains(-(j + 2 + k))) {
                                    String nextNextWord = sentenceProcessed.get(j + 2 + k);
                                    beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
                                    acronymFound = true;
                                    break;
                                }
                            }
                            if (!acronymFound) {
                                String nextNextWord = sentenceProcessed.get(j + 2);
                                beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
                                String nextCompoundWord = nextWord + " " + nextNextWord;
                                beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
                            }
                        }
                    }
                }
                boolean inAfterTier1 = afterTier1.contains(wordProcessed);
                boolean inAfterTier2 = afterTier2.contains(wordProcessed);
                boolean inAfterTier3 = afterTier3.contains(wordProcessed);
                if (j - 1 >= 0 && (inAfterTier1 || inAfterTier2 || inAfterTier3)) {
                    if (!acronymsDone) {
                        acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
                        acronymsDone = true;
                    }
                    boolean acronymFound = false;
                    String acronym = null;
                    if (acronyms.contains(j - 1)) {
                        acronym = sentenceProcessed.get(j - 1);
                        acronymFound = true;
                    } else if (acronyms.contains(-(j - 1))) {
                        acronym = sentenceProcessed.get(j - 1);
                        acronymFound = true;
                    } else if (j - 2 >= 0) {
                        if (acronyms.contains(j - 2)) {
                            acronym = sentenceProcessed.get(j - 2);
                            acronymFound = true;
                        } else if (acronyms.contains(-(j - 2))) {
                            acronym = sentenceProcessed.get(j - 2);
                            acronymFound = true;
                        }
                    }
                    if (acronymFound) {
                        beforeAfterScore(acronym, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, true);
                    } else {
                        String nextWord = sentenceProcessed.get(j - 1);
                        beforeAfterScore(nextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
                        if (j - 2 >= 0) {
                            acronymFound = false;
                            for (int k = 1; k <= COMPOUND_WORDS && j - 2 - k >= 0; ++k) {
                                if (acronyms.contains(-(j - 2 - k))) {
                                    String nextNextWord = sentenceProcessed.get(j - 2 - k);
                                    beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
                                    acronymFound = true;
                                    break;
                                }
                            }
                            if (!acronymFound) {
                                String nextNextWord = sentenceProcessed.get(j - 2);
                                beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
                                String nextCompoundWord = nextNextWord + " " + nextWord;
                                beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
                            }
                        }
                    }
                }
            }
        }
        List<String> titleAbstractLinks = preProcessor.links(title);
        titleAbstractLinks.addAll(preProcessor.links(theAbstract));
        List<String> fulltextLinks = preProcessor.links(publication.getFulltext().getContent());
        for (int i = 0; i < titleAbstractLinks.size(); ++i) {
            String titleAbstractLink = titleAbstractLinks.get(i);
            Iterator<String> it = fulltextLinks.iterator();
            while (it.hasNext()) {
                String fulltextLink = it.next();
                if (fulltextLink.equals(titleAbstractLink)) {
                    it.remove();
                    break;
                }
                String start = "";
                Matcher startTitleAbstractLink = LINK_COMPARE_START.matcher(titleAbstractLink);
                if (startTitleAbstractLink.find()) {
                    start = titleAbstractLink.substring(0, startTitleAbstractLink.end());
                    titleAbstractLink = titleAbstractLink.substring(startTitleAbstractLink.end());
                }
                Matcher startFulltextLink = LINK_COMPARE_START.matcher(fulltextLink);
                if (startFulltextLink.find()) {
                    String startFulltext = fulltextLink.substring(0, startFulltextLink.end());
                    if (startFulltext.length() > start.length()) {
                        start = startFulltext;
                    }
                    fulltextLink = fulltextLink.substring(startFulltextLink.end());
                }
                if (fulltextLink.equals(titleAbstractLink)) {
                    titleAbstractLinks.set(i, start + titleAbstractLink);
                    it.remove();
                    break;
                }
                if (fulltextLink.startsWith(titleAbstractLink)) {
                    String rest = fulltextLink.substring(titleAbstractLink.length());
                    if (rest.startsWith("/")) {
                        titleAbstractLinks.set(i, start + fulltextLink);
                    }
                    it.remove();
                    break;
                }
                if (fulltextLink.contains(titleAbstractLink)) {
                    titleAbstractLinks.set(i, start + fulltextLink);
                    it.remove();
                    break;
                }
                if (titleAbstractLink.startsWith(fulltextLink)) {
                    String rest = titleAbstractLink.substring(fulltextLink.length() - 1);
                    if (LINK_COMPARE_REST.matcher(rest).matches()) {
                        titleAbstractLinks.set(i, start + fulltextLink);
                    }
                    Matcher schemaMatcher = LINK_COMPARE_SCHEMA.matcher(rest);
                    if (schemaMatcher.find()) {
                        titleAbstractLinks.set(i, start + fulltextLink);
                        titleAbstractLinks.add(i + 1, rest.substring(schemaMatcher.start()));
                    }
                    it.remove();
                    break;
                }
                if (titleAbstractLink.contains(fulltextLink)) {
                    it.remove();
                    break;
                }
            }
        }
        Map<String, List<String>> linksAbstract = links(titleAbstractLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
        Map<String, List<String>> linksFulltext = links(fulltextLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
        for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
            double score = scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_ABSTRACT * linkEntry.getValue().size();
            if (score > LINK_MULTIPLIER_ABSTRACT_MINIMUM) {
                scores.put(linkEntry.getKey(), score);
            } else {
                scores.put(linkEntry.getKey(), LINK_MULTIPLIER_ABSTRACT_MINIMUM);
            }
        }
        boolean genericLinkAugmentation = linksAbstract.isEmpty();
        for (String link : titleAbstractLinks) {
            boolean present = false;
            for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
                if (linkEntry.getValue().contains(link)) {
                    present = true;
                    break;
                }
            }
            if (!present) {
                if (genericLinkAugmentation) {
                    for (Map.Entry<String, Double> scoreEntry : scores.entrySet()) {
                        scores.put(scoreEntry.getKey(), scoreEntry.getValue() * LINK_MULTIPLIER_ABSTRACT_AUGMENTATION);
                    }
                    genericLinkAugmentation = false;
                }
                String fromLink = fromLink(link, preProcessor, idf, hostIgnore);
                if (!fromLink.isEmpty()) {
                    List<String> fromLinkExtracted = preProcessor.extract(fromLink);
                    List<String> fromLinkProcessed = preProcessor.process(fromLink, fromLinkExtracted);
                    String fromLinkExtractedString = String.join(" ", fromLinkExtracted);
                    String fromLinkProcessedString = String.join(" ", fromLinkProcessed);
                    if (!fromLinkProcessedString.isEmpty()) {
                        scores.merge(fromLinkProcessedString, LINK_MULTIPLIER_ABSTRACT_NEW / fromLinkProcessed.size(), (d1, d2) -> d1 * d2);
                        String wordExtracted = processedToExtracted.get(fromLinkProcessedString);
                        if (wordExtracted == null) {
                            processedToExtracted.put(fromLinkProcessedString, fromLinkExtractedString);
                        }
                    }
                }
            }
        }
        for (Map.Entry<String, List<String>> linkEntry : linksFulltext.entrySet()) {
            long multiplier = linkEntry.getValue().stream().filter(link -> !LINK_TWO_PART.matcher(link).matches()).count();
            if (multiplier > 0) {
                if (multiplier > 2) {
                    multiplier = 2;
                }
                scores.put(linkEntry.getKey(), scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_FULLTEXT * multiplier);
            }
        }
        Map<String, Double> sortedScores = scores.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
            throw new AssertionError();
        }, LinkedHashMap::new));
        Result result = new Result();
        result.setPmid(publication.getPmid().getContent());
        result.setPmcid(publication.getPmcid().getContent());
        result.setDoi(publication.getDoi().getContent());
        List<String> resultLinks = new ArrayList<>();
        List<String> suggestionsProcessed = new ArrayList<>();
        Iterator<Map.Entry<String, Double>> sortedScoresIterator = sortedScores.entrySet().iterator();
        if (sortedScoresIterator.hasNext()) {
            Map.Entry<String, Double> topEntry = sortedScoresIterator.next();
            double topScore = topEntry.getValue();
            result.setScore(topScore);
            result.setSuggestion(processedToExtracted.get(topEntry.getKey()));
            List<String> linksFromAbstract = linksAbstract.get(topEntry.getKey());
            if (linksFromAbstract != null) {
                for (String link : linksFromAbstract) {
                    resultLinks.add(link);
                }
            }
            List<String> linksFromFulltext = linksFulltext.get(topEntry.getKey());
            if (linksFromFulltext != null) {
                for (String link : linksFromFulltext) {
                    resultLinks.add(link);
                }
            }
            suggestionsProcessed.add(topEntry.getKey());
            for (int i = 1; i < SUGGESTION_LIMIT && sortedScoresIterator.hasNext(); ++i) {
                topEntry = sortedScoresIterator.next();
                if (topEntry.getValue() * TOP_SCORE_LIMIT < topScore) {
                    break;
                }
                result.addOtherSuggestion(processedToExtracted.get(topEntry.getKey()));
                suggestionsProcessed.add(topEntry.getKey());
            }
        }
        List<String> resultOtherLinks = new ArrayList<>();
        for (List<String> linksFromAbstract : linksAbstract.values()) {
            for (String link : linksFromAbstract) {
                if (!resultLinks.contains(link)) {
                    resultOtherLinks.add(link);
                }
            }
        }
        for (List<String> linksFromFulltext : linksFulltext.values()) {
            for (String link : linksFromFulltext) {
                if (!resultLinks.contains(link)) {
                    resultOtherLinks.add(link);
                }
            }
        }
        List<String> resultLeftoverLinks = new ArrayList<>();
        for (String link : titleAbstractLinks) {
            if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
                resultLeftoverLinks.add(link);
            }
        }
        for (String link : fulltextLinks) {
            if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
                resultLeftoverLinks.add(link);
            }
        }
        List<String> resultAllLinks = new ArrayList<>();
        resultAllLinks.addAll(resultLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
        resultAllLinks.addAll(resultOtherLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
        resultAllLinks.addAll(resultLeftoverLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
        result.addLinks(breakLinks(resultLinks, resultAllLinks));
        result.addOtherLinks(breakLinks(resultOtherLinks, resultAllLinks));
        result.addLeftoverLinks(breakLinks(resultLeftoverLinks, resultAllLinks));
        for (int i = 0; i < queries.size(); ++i) {
            Query query = queries.get(i);
            for (PublicationIds pubIds : query.getPublicationIds()) {
                if (!pubIds.getPmid().isEmpty() && !result.getPmid().isEmpty() && pubIds.getPmid().equals(result.getPmid()) || !pubIds.getPmcid().isEmpty() && !result.getPmcid().isEmpty() && pubIds.getPmcid().equals(result.getPmcid()) || !pubIds.getDoi().isEmpty() && !result.getDoi().isEmpty() && pubIds.getDoi().equals(result.getDoi())) {
                    result.addExistingName(query.getId(), query.getName());
                    for (String link : result.getLinks()) {
                        String linkTrimmed = BIOTOOLS_LINK_TRIM_START.matcher(link).replaceFirst("");
                        linkTrimmed = BIOTOOLS_LINK_TRIM_END.matcher(linkTrimmed).replaceFirst("");
                        boolean found = false;
                        for (String queryLink : queryLinks.get(i)) {
                            if (linkTrimmed.equalsIgnoreCase(queryLink)) {
                                found = true;
                                break;
                            } else if (linkTrimmed.startsWith(queryLink)) {
                                String rest = linkTrimmed.substring(queryLink.length() - 1);
                                if (LINK_COMPARE_REST.matcher(rest).matches()) {
                                    found = true;
                                    break;
                                }
                            }
                        }
                        if (!found) {
                        // TODO queryLinks is not complete
                        // result.addNewLink(link);
                        }
                    }
                    break;
                }
            }
        }
        for (String suggestionProcessed : suggestionsProcessed) {
            suggestionProcessed = BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(suggestionProcessed).replaceFirst("");
            if (suggestionProcessed.isEmpty())
                continue;
            for (int i = 0; i < queryNamesProcessed.size(); ++i) {
                if (suggestionProcessed.equals(queryNamesProcessed.get(i))) {
                    String possiblyExistingId = queries.get(i).getId();
                    if (!result.getExistingNames().keySet().contains(possiblyExistingId)) {
                        result.addPossiblyExisting(possiblyExistingId, queries.get(i).getName());
                    }
                }
            }
        }
        List<String> suggestionsExtracted = new ArrayList<>();
        suggestionsExtracted.add(result.getSuggestion());
        suggestionsExtracted.addAll(result.getOtherSuggestions());
        for (String suggestionExtracted : suggestionsExtracted) {
            suggestionExtracted = BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(suggestionExtracted).replaceFirst("");
            if (suggestionExtracted.isEmpty())
                continue;
            for (String suggestionExtractedWord : suggestionExtracted.split(" ")) {
                Map<String, String> possiblyExisting = new LinkedHashMap<>();
                for (int i = 0; i < queryNamesExtracted.size(); ++i) {
                    List<String> queryNameExtracted = queryNamesExtracted.get(i);
                    if (queryNameExtracted.contains(suggestionExtractedWord)) {
                        String possiblyExistingId = queries.get(i).getId();
                        if (!result.getExistingNames().keySet().contains(possiblyExistingId)) {
                            possiblyExisting.put(possiblyExistingId, queries.get(i).getName());
                        }
                    }
                }
                if (possiblyExisting.size() >= 1 && possiblyExisting.size() <= POSSIBLY_EXISTING_VALID_LIMIT) {
                    for (Map.Entry<String, String> possiblyExistingEntry : possiblyExisting.entrySet()) {
                        result.addPossiblyExisting(possiblyExistingEntry.getKey(), possiblyExistingEntry.getValue());
                    }
                }
            }
        }
        List<String> resultLinksOtherLinks = new ArrayList<>();
        resultLinksOtherLinks.addAll(result.getLinks());
        resultLinksOtherLinks.addAll(result.getOtherLinks());
        for (int i = 0; i < resultLinksOtherLinks.size(); ++i) {
            String resultLink = resultLinksOtherLinks.get(i);
            resultLink = BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("");
            resultLink = BIOTOOLS_LINK_TRIM_END.matcher(resultLink).replaceFirst("");
            for (int j = 0; j < queryLinks.size(); ++j) {
                String possiblyExistingId = queries.get(j).getId();
                if (!result.getExistingNames().keySet().contains(possiblyExistingId)) {
                    List<String> queryLink = queryLinks.get(j);
                    for (String link : queryLink) {
                        if (resultLink.equalsIgnoreCase(link)) {
                            result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
                        } else if (resultLink.startsWith(link)) {
                            String rest = resultLink.substring(link.length() - 1);
                            if (LINK_COMPARE_REST.matcher(rest).matches()) {
                                result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
                            }
                        }
                    }
                }
            }
        }
        if (!(result.getExistingNames().size() == 1 && !result.getSuggestion().isEmpty() && result.getExistingNames().values().iterator().next().equals(result.getSuggestion()) && result.getNewLinks().isEmpty())) {
            results.add(result);
        }
    }
    // TODO
    System.err.println();
    results = results.stream().sorted(Comparator.comparing(Result::getScore).reversed()).collect(Collectors.toList());
    for (int i = 0; i < results.size() - 1; ++i) {
        Result resultI = results.get(i);
        for (int j = i + 1; j < results.size(); ++j) {
            Result resultJ = results.get(j);
            if (resultI.getSuggestion().equals(resultJ.getSuggestion())) {
                resultI.addSameSuggestion(resultJ.getPmid());
                resultJ.addSameSuggestion(resultI.getPmid());
            }
        }
    }
    return results;
}
Also used : Arrays(java.util.Arrays) URISyntaxException(java.net.URISyntaxException) FetcherUtil(org.edamontology.pubfetcher.FetcherUtil) Version(org.edamontology.pubfetcher.Version) Matcher(java.util.regex.Matcher) FetcherArgs(org.edamontology.pubfetcher.FetcherArgs) Locale(java.util.Locale) Map(java.util.Map) Element(org.jsoup.nodes.Element) FetcherCommon(org.edamontology.pubfetcher.FetcherCommon) URI(java.net.URI) ParseException(java.text.ParseException) Path(java.nio.file.Path) Link(org.edamontology.edammap.core.query.Link) Idf(org.edamontology.edammap.core.idf.Idf) MissingResourceException(java.util.MissingResourceException) Set(java.util.Set) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) PublicationIds(org.edamontology.pubfetcher.PublicationIds) List(java.util.List) Logger(org.apache.logging.log4j.Logger) CharsetEncoder(java.nio.charset.CharsetEncoder) Document(org.jsoup.nodes.Document) Pattern(java.util.regex.Pattern) Query(org.edamontology.edammap.core.query.Query) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) PreProcessor(org.edamontology.edammap.core.preprocessing.PreProcessor) QueryLoader(org.edamontology.edammap.core.query.QueryLoader) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) CodingErrorAction(java.nio.charset.CodingErrorAction) OutputStreamWriter(java.io.OutputStreamWriter) LinkedHashSet(java.util.LinkedHashSet) BasicArgs(org.edamontology.pubfetcher.BasicArgs) Iterator(java.util.Iterator) Files(java.nio.file.Files) BufferedWriter(java.io.BufferedWriter) Fetcher(org.edamontology.pubfetcher.Fetcher) PreProcessorArgs(org.edamontology.edammap.core.preprocessing.PreProcessorArgs) IOException(java.io.IOException) Database(org.edamontology.pubfetcher.Database) Field(java.lang.reflect.Field) InputStreamReader(java.io.InputStreamReader) QueryType(org.edamontology.edammap.core.query.QueryType) Publication(org.edamontology.pubfetcher.Publication) 
BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) LogManager(org.apache.logging.log4j.LogManager) InputStream(java.io.InputStream) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) LinkedHashSet(java.util.LinkedHashSet) Query(org.edamontology.edammap.core.query.Query) Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) PreProcessor(org.edamontology.edammap.core.preprocessing.PreProcessor) Idf(org.edamontology.edammap.core.idf.Idf) LinkedHashMap(java.util.LinkedHashMap) List(java.util.List) ArrayList(java.util.ArrayList) Publication(org.edamontology.pubfetcher.Publication) PublicationIds(org.edamontology.pubfetcher.PublicationIds) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Link(org.edamontology.edammap.core.query.Link)

Example 3 with FetcherArgs

use of org.edamontology.pubfetcher.FetcherArgs in project edammap by edamontology.

the class Resource method patch.

/* TODO JSON
	// curl -H "Content-Type: application/json" -X POST -d '{"threads":2,"reportPaginationSize":"7","mapperArgs":{"algorithmArgs":{"compoundWords":2}}}' http://localhost:8080/api
	@POST
	@Consumes(MediaType.APPLICATION_JSON)
	@Produces(MediaType.TEXT_PLAIN)
	public String json(JsonObject json) {
		StringBuilder sb = new StringBuilder();
		for (Map.Entry<String, JsonValue> entry : json.entrySet()) {
			if (entry.getValue().getValueType() == JsonValue.ValueType.STRING || entry.getValue().getValueType() == JsonValue.ValueType.NUMBER) {
				sb.append(entry.getKey()).append(" --> ").append(entry.getValue().toString()).append("\n");
			}
		}
		return sb.toString();
	}
*/
/**
 * Serves a PATCH request for the given resource.
 *
 * <p>Logs the incoming request, creates a fresh {@code FetcherArgs} carrying the server's
 * private fetcher args, obtains the database entries for what
 * {@code QueryLoader.fromServerEntry} loads from {@code requestString}, and answers with a
 * text body holding one {@code "<id> : <STATUS>"} line per entry (status upper-cased with
 * {@code Locale.ROOT}, lines joined by {@code "\n"}).
 *
 * @param requestString raw request content, passed on to {@code QueryLoader.fromServerEntry}
 * @param request the HTTP request; only its remote address is read, for logging
 * @param resource resource name, used in log messages only
 * @param clazz entry class, passed on to the loader and to the database lookup
 * @param doc flag passed on unchanged to {@code getDatabaseEntries}
 * @param max limit passed on unchanged to {@code QueryLoader.fromServerEntry}
 * @return an OK response with the status lines, or a BAD_REQUEST response whose entity is the
 *         exception message when an {@code IllegalArgumentException} is thrown
 */
private Response patch(String requestString, Request request, String resource, Class<?> clazz, boolean doc, int max) {
    try {
        logger.info("PATCH {} {} from {}", resource, requestString, request.getRemoteAddr());
        // TODO get actual args from form
        FetcherArgs fetcherArgs = new FetcherArgs();
        fetcherArgs.setPrivateArgs(Server.args.getFetcherPrivateArgs());
        // Build the response body first: one "<id> : <STATUS>" line per database entry.
        String statusLines = Server.processor
                .getDatabaseEntries(QueryLoader.fromServerEntry(requestString, clazz, max), fetcherArgs, clazz, doc)
                .stream()
                .map(entry -> entry.toStringId() + " : " + entry.getStatusString(fetcherArgs).toUpperCase(Locale.ROOT))
                .collect(Collectors.joining("\n"));
        Response response = Response.ok(statusLines).build();
        logger.info("PATCHED {} {}", resource, response.getEntity());
        return response;
    } catch (IllegalArgumentException badRequest) {
        // Invalid input is reported to the client instead of propagating.
        logger.error("Exception!", badRequest);
        return Response.status(Status.BAD_REQUEST).entity(badRequest.getMessage()).build();
    } catch (Throwable unexpected) {
        // Anything else is logged here and rethrown unchanged.
        logger.error("Exception!", unexpected);
        throw unexpected;
    }
}
Also used : Response(javax.ws.rs.core.Response) Results(org.edamontology.edammap.core.benchmarking.Results) Query(org.edamontology.edammap.core.query.Query) Request(org.glassfish.grizzly.http.server.Request) Produces(javax.ws.rs.Produces) GET(javax.ws.rs.GET) URISyntaxException(java.net.URISyntaxException) Path(javax.ws.rs.Path) Benchmark(org.edamontology.edammap.core.benchmarking.Benchmark) PreProcessor(org.edamontology.edammap.core.preprocessing.PreProcessor) Output(org.edamontology.edammap.core.output.Output) ConceptProcessed(org.edamontology.edammap.core.processing.ConceptProcessed) QueryLoader(org.edamontology.edammap.core.query.QueryLoader) MediaType(javax.ws.rs.core.MediaType) Consumes(javax.ws.rs.Consumes) FetcherArgs(org.edamontology.pubfetcher.FetcherArgs) Locale(java.util.Locale) Map(java.util.Map) URI(java.net.URI) PATCH(javax.ws.rs.PATCH) ParseException(java.text.ParseException) Status(javax.ws.rs.core.Response.Status) POST(javax.ws.rs.POST) Context(javax.ws.rs.core.Context) Files(java.nio.file.Files) Idf(org.edamontology.edammap.core.idf.Idf) IOException(java.io.IOException) QueryProcessed(org.edamontology.edammap.core.processing.QueryProcessed) UUID(java.util.UUID) Mapping(org.edamontology.edammap.core.mapping.Mapping) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) EdamUri(org.edamontology.edammap.core.edam.EdamUri) CoreArgs(org.edamontology.edammap.core.args.CoreArgs) QueryType(org.edamontology.edammap.core.query.QueryType) MultivaluedMap(javax.ws.rs.core.MultivaluedMap) ServerInput(org.edamontology.edammap.core.input.ServerInput) List(java.util.List) Logger(org.apache.logging.log4j.Logger) Response(javax.ws.rs.core.Response) Webpage(org.edamontology.pubfetcher.Webpage) Paths(java.nio.file.Paths) Publication(org.edamontology.pubfetcher.Publication) UriInfo(javax.ws.rs.core.UriInfo) Collections(java.util.Collections) LogManager(org.apache.logging.log4j.LogManager) Mapper(org.edamontology.edammap.core.mapping.Mapper) 
FetcherArgs(org.edamontology.pubfetcher.FetcherArgs)

Aggregations

List (java.util.List)3 Map (java.util.Map)3 IOException (java.io.IOException)2 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 Files (java.nio.file.Files)2 ParseException (java.text.ParseException)2 ArrayList (java.util.ArrayList)2 LinkedHashMap (java.util.LinkedHashMap)2 Locale (java.util.Locale)2 Collectors (java.util.stream.Collectors)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 Idf (org.edamontology.edammap.core.idf.Idf)2 PreProcessor (org.edamontology.edammap.core.preprocessing.PreProcessor)2 Link (org.edamontology.edammap.core.query.Link)2 FetcherArgs (org.edamontology.pubfetcher.FetcherArgs)2 Parameter (com.beust.jcommander.Parameter)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1