Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Cli, method run().
@Override
public void run() {
synchronized (lock) {
++numThreads;
lockDone = true;
}
try {
PreProcessor pp = new PreProcessor(args.getPreProcessorArgs(), stopwords);
Mapper mapper = new Mapper(processedConcepts);
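// Work-stealing loop: each worker claims the next unprocessed query index under the shared "queries" lock, so no two threads map the same query; results are written back at the claimed index under the "mappings" lock.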
while (true) {
Query query;
int localIndex;
synchronized (queries) {
if (index >= queries.size()) {
break;
}
query = queries.get(index);
localIndex = index;
++index;
}
logger.info("{}/{} @ {}s", localIndex + 1, queries.size(), (System.currentTimeMillis() - start) / 1000.0);
QueryProcessed processedQuery = processor.getProcessedQuery(query, args.getType(), pp, idf, args.getFetcherArgs());
Mapping mapping = mapper.map(query, processedQuery, args.getMapperArgs());
synchronized (mappings) {
webpages.set(localIndex, processedQuery.getWebpages());
docs.set(localIndex, processedQuery.getDocs());
publications.set(localIndex, processedQuery.getPublications());
mappings.set(localIndex, mapping);
}
}
} finally {
synchronized (lock) {
--numThreads;
lock.notifyAll();
}
}
}
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Cli, method run(Version).
private static void run(Version version) throws IOException, ParseException {
List<Param> paramsMain = new ArrayList<>();
paramsMain.add(new Param("Ontology file", CliArgs.EDAM, new File(args.getEdam()).getName(), "https://github.com/edamontology/edamontology/tree/master/releases"));
if (Input.isProtocol(args.getQuery())) {
paramsMain.add(new Param("Query file", CliArgs.QUERY, args.getQuery(), args.getQuery()));
} else {
paramsMain.add(new Param("Query file", CliArgs.QUERY, new File(args.getQuery()).getName()));
}
paramsMain.add(new Param("Type", CliArgs.TYPE, args.getType().toString()));
paramsMain.add(new Param("Output file", CliArgs.OUTPUT, new File(args.getOutput()).getName()));
paramsMain.add(new Param("Report file", CliArgs.REPORT, new File(args.getReport()).getName()));
paramsMain.add(new Param("Report page size", CliArgs.REPORT_PAGE_SIZE, args.getReportPageSize(), 0.0, null));
paramsMain.add(new Param("Report pagination size", CliArgs.REPORT_PAGINATION_SIZE, args.getReportPaginationSize(), 0.0, null));
paramsMain.add(new Param("Number of threads", CliArgs.THREADS, args.getThreads(), 0.0, null));
Output output = new Output(args.getOutput(), args.getReport(), false);
stopwords = PreProcessor.getStopwords(args.getPreProcessorArgs().getStopwords());
processor = new Processor(args.getProcessorArgs());
idf = null;
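// Pick the IDF file that matches the preprocessing mode: stemmed IDF statistics when stemming is enabled, the plain IDF file otherwise.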
if (args.getPreProcessorArgs().isStemming()) {
if (args.getProcessorArgs().getIdfStemmed() != null && !args.getProcessorArgs().getIdfStemmed().isEmpty()) {
idf = new Idf(args.getProcessorArgs().getIdfStemmed());
}
} else {
if (args.getProcessorArgs().getIdf() != null && !args.getProcessorArgs().getIdf().isEmpty()) {
idf = new Idf(args.getProcessorArgs().getIdf());
}
}
logger.info("Loading concepts");
Map<EdamUri, Concept> concepts = Edam.load(args.getEdam());
logger.info("Processing {} concepts", concepts.size());
processedConcepts = processor.getProcessedConcepts(concepts, args.getMapperArgs().getIdfArgs(), args.getMapperArgs().getMultiplierArgs(), new PreProcessor(args.getPreProcessorArgs(), stopwords));
logger.info("Loading queries");
queries = QueryLoader.get(args.getQuery(), args.getType(), concepts, args.getFetcherArgs().getTimeout(), args.getFetcherArgs().getPrivateArgs().getUserAgent());
publications = new ArrayList<>(queries.size());
webpages = new ArrayList<>(queries.size());
docs = new ArrayList<>(queries.size());
mappings = new ArrayList<>(queries.size());
for (int i = 0; i < queries.size(); ++i) {
publications.add(null);
webpages.add(null);
docs.add(null);
mappings.add(null);
}
start = System.currentTimeMillis();
logger.info("Start: {}", Instant.ofEpochMilli(start));
logger.info("Starting mapper threads");
for (int i = 0; i < args.getThreads(); ++i) {
Thread t = new Thread(new Cli());
t.setDaemon(true);
t.start();
}
synchronized (lock) {
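// Wait until at least one worker has signalled startup (lockDone) and all workers have finished; without lockDone the main thread could observe numThreads == 0 before any worker has incremented it.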
while (!lockDone || numThreads > 0) {
try {
lock.wait();
} catch (InterruptedException e) {
// TODO exit threads cleanly? give timeout for threads to exit? close db? print that exiting and waiting for threads to terminate?
logger.error("Exception!", e);
System.exit(1);
}
}
}
logger.info("All mapper threads stopped");
long stop = System.currentTimeMillis();
logger.info("Stop: {}", Instant.ofEpochMilli(stop));
logger.info("Mapping took {}s", (stop - start) / 1000.0);
Results results = Benchmark.calculate(queries, mappings);
logger.info("Outputting results");
output.output(args, paramsMain, args.getType(), args.getReportPageSize(), args.getReportPaginationSize(), concepts, queries, webpages, docs, publications, results, start, stop, version);
logger.info("{} : {}", results.toStringMeasure(Measure.recall), Measure.recall);
logger.info("{} : {}", results.toStringMeasure(Measure.AveP), Measure.AveP);
}
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class PubMedApps, method getResults().
private static List<Result> getResults(PreProcessorArgs preProcessorArgs, String queryIdf, String queryPath, QueryType queryType, FetcherArgs fetcherArgs, List<Publication> publications) throws IOException, ParseException {
List<Result> results = new ArrayList<>();
List<String> hostIgnore = getResource("host_ignore.txt");
List<String> beforeTier1 = getResource("before_tier1.txt");
List<String> beforeTier2 = getResource("before_tier2.txt");
List<String> beforeTier3 = getResource("before_tier3.txt");
List<String> afterTier1 = getResource("after_tier1.txt");
List<String> afterTier2 = getResource("after_tier2.txt");
List<String> afterTier3 = getResource("after_tier3.txt");
PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
Idf idf = new Idf(queryIdf);
List<Query> queries = QueryLoader.get(queryPath, queryType, fetcherArgs.getTimeout(), fetcherArgs.getPrivateArgs().getUserAgent());
List<List<String>> queryNamesExtracted = new ArrayList<>();
List<String> queryNamesProcessed = new ArrayList<>();
List<List<String>> queryLinks = new ArrayList<>();
for (Query query : queries) {
List<String> queryNameExtracted = preProcessor.extract(query.getName());
List<String> queryNameProcessed = preProcessor.process(query.getName(), queryNameExtracted);
queryNamesExtracted.add(Arrays.asList(BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(String.join(" ", queryNameExtracted)).replaceFirst("").split(" ")));
queryNamesProcessed.add(BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(String.join(" ", queryNameProcessed)).replaceFirst(""));
List<Link> links = new ArrayList<>();
links.addAll(query.getWebpageUrls());
links.addAll(query.getDocUrls());
queryLinks.add(links.stream().map(l -> BIOTOOLS_LINK_TRIM_START.matcher(l.getUrl()).replaceFirst("")).map(l -> BIOTOOLS_LINK_TRIM_END.matcher(l).replaceFirst("")).filter(l -> !l.isEmpty()).collect(Collectors.toList()));
}
for (int publicationIndex = 0; publicationIndex < publications.size(); ++publicationIndex) {
double percentage = (publicationIndex + 1) / (double) publications.size() * 100;
percentage = Math.round(percentage * 10) / 10.0;
// TODO
System.err.print("\rMaking results: " + percentage + "%");
Publication publication = publications.get(publicationIndex);
String toolTitle = null;
String toolTitleTwo = null;
String toolTitleAcronym = null;
String toolTitleTwoAcronym = null;
String toolTitlePruned = null;
String toolTitleTwoPruned = null;
long toolTitleWordsTotal = 0;
String title = publication.getTitle().getContent();
int from = 0;
Matcher matcher = TITLE_SEPARATOR.matcher(title);
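// Scan the publication title segment by segment (delimited by TITLE_SEPARATOR); each segment yields a candidate tool name, possibly split in two around " and " or " & " and with a detected acronym pulled out, and the candidate with the fewest words wins.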
while (from < title.length() && matcher.find(from)) {
String currentToolTitle = title.substring(from, matcher.start()).trim();
String currentToolTitleTwo = null;
String currentToolTitleAcronym = null;
String currentToolTitleTwoAcronym = null;
String currentToolTitlePruned = null;
String currentToolTitleTwoPruned = null;
String separatorString = " and ";
int separator = currentToolTitle.indexOf(separatorString);
if (separator < 0) {
separatorString = " & ";
separator = currentToolTitle.indexOf(separatorString);
}
if (separator > -1) {
currentToolTitleTwo = currentToolTitle.substring(separator + separatorString.length(), currentToolTitle.length());
currentToolTitle = currentToolTitle.substring(0, separator);
}
List<String> currentToolTitleExtracted = preProcessor.extract(currentToolTitle);
// call process so that the indexes of currentToolTitleExtracted stay aligned with the processed words (the returned list is not needed here)
preProcessor.process(currentToolTitle, currentToolTitleExtracted);
List<String> currentToolTitleTwoExtracted = null;
if (currentToolTitleTwo != null) {
currentToolTitleTwoExtracted = preProcessor.extract(currentToolTitleTwo);
// call process so that the indexes of currentToolTitleTwoExtracted stay aligned with the processed words (the returned list is not needed here)
preProcessor.process(currentToolTitleTwo, currentToolTitleTwoExtracted);
}
Integer firstAcronymIndex = firstAcronymIndex(currentToolTitle, preProcessor);
if (firstAcronymIndex != null) {
currentToolTitleAcronym = currentToolTitleExtracted.remove(firstAcronymIndex.intValue());
}
if (currentToolTitleTwo != null) {
Integer firstAcronymIndexTwo = firstAcronymIndex(currentToolTitleTwo, preProcessor);
if (firstAcronymIndexTwo != null) {
currentToolTitleTwoAcronym = currentToolTitleTwoExtracted.remove(firstAcronymIndexTwo.intValue());
}
}
currentToolTitle = String.join(" ", currentToolTitleExtracted);
if (currentToolTitleTwo != null) {
currentToolTitleTwo = String.join(" ", currentToolTitleTwoExtracted);
}
currentToolTitlePruned = toolTitlePrune(currentToolTitleExtracted);
if (currentToolTitleTwo != null) {
currentToolTitleTwoPruned = toolTitlePrune(currentToolTitleTwoExtracted);
}
if (currentToolTitleTwo != null && (currentToolTitleExtracted.size() > 1 || currentToolTitleTwoExtracted.size() > 1) && (!currentToolTitle.isEmpty() && !currentToolTitleTwo.isEmpty())) {
currentToolTitle += " " + currentToolTitleTwo;
currentToolTitleTwo = null;
if (!currentToolTitlePruned.isEmpty() && !currentToolTitleTwoPruned.isEmpty()) {
currentToolTitlePruned += " " + currentToolTitleTwoPruned;
currentToolTitleTwoPruned = null;
} else if (!currentToolTitleTwoPruned.isEmpty()) {
currentToolTitlePruned = currentToolTitleTwoPruned;
currentToolTitleTwoPruned = null;
}
currentToolTitleExtracted.addAll(currentToolTitleTwoExtracted);
currentToolTitleTwoExtracted = null;
}
long currentToolTitleWordsTotal = currentToolTitleExtracted.size();
if (currentToolTitleTwoExtracted != null) {
currentToolTitleWordsTotal += currentToolTitleTwoExtracted.size();
}
if (currentToolTitleWordsTotal < toolTitleWordsTotal || toolTitle == null) {
toolTitle = currentToolTitle == null || currentToolTitle.isEmpty() ? null : currentToolTitle;
toolTitleTwo = currentToolTitleTwo == null || currentToolTitleTwo.isEmpty() ? null : currentToolTitleTwo;
toolTitleAcronym = currentToolTitleAcronym == null || currentToolTitleAcronym.isEmpty() ? null : currentToolTitleAcronym;
toolTitleTwoAcronym = currentToolTitleTwoAcronym == null || currentToolTitleTwoAcronym.isEmpty() ? null : currentToolTitleTwoAcronym;
toolTitlePruned = currentToolTitlePruned == null || currentToolTitlePruned.isEmpty() ? null : currentToolTitlePruned;
toolTitleTwoPruned = currentToolTitleTwoPruned == null || currentToolTitleTwoPruned.isEmpty() ? null : currentToolTitleTwoPruned;
toolTitleWordsTotal = currentToolTitleWordsTotal;
}
from = matcher.end();
}
String theAbstract = publication.getAbstract().getContent();
String titleWithoutLinks = preProcessor.removeLinks(title);
String abstractWithoutLinks = preProcessor.removeLinks(theAbstract);
if (from > 0) {
title = title.substring(from).trim();
}
List<String> titleAbstractSentences = preProcessor.sentences(preProcessor.removeLinks(title) + ". " + abstractWithoutLinks);
List<List<String>> extracted = new ArrayList<>();
List<List<String>> processed = new ArrayList<>();
for (String sentence : titleAbstractSentences) {
List<String> sentenceExtracted = preProcessor.extract(sentence);
List<String> sentenceProcessed = preProcessor.process(sentence, sentenceExtracted);
extracted.add(sentenceExtracted);
processed.add(sentenceProcessed);
}
Map<String, Double> scores = new HashMap<>();
Map<String, String> processedToExtracted = new HashMap<>();
Map<String, Set<String>> processedToExtractedBegin = new HashMap<>();
Map<String, List<String>> processedToExtractedWithin = new HashMap<>();
for (int i = 0; i < processed.size(); ++i) {
List<String> sentenceExtracted = extracted.get(i);
List<String> sentenceProcessed = processed.get(i);
for (int j = 0; j < COMPOUND_WORDS; ++j) {
for (int k = 0; k < sentenceProcessed.size() - j; ++k) {
String wordExtracted = sentenceExtracted.get(k);
String wordProcessed = sentenceProcessed.get(k);
for (int l = k + 1; l <= k + j; ++l) wordExtracted += " " + sentenceExtracted.get(l);
for (int l = k + 1; l <= k + j; ++l) wordProcessed += " " + sentenceProcessed.get(l);
Double value;
if (j == 0) {
value = Math.pow(idf.getIdf(sentenceProcessed.get(k)), QUERY_IDF_SCALING);
} else {
value = scores.get(sentenceProcessed.get(k));
for (int l = k + 1; l <= k + j; ++l) value *= scores.get(sentenceProcessed.get(l));
value /= COMPOUND_DIVIDER;
}
scores.merge(wordProcessed, value, Double::sum);
if (i == 0 || k == 0) {
Set<String> wordsExtracted = processedToExtractedBegin.get(wordProcessed);
if (wordsExtracted == null) {
wordsExtracted = new LinkedHashSet<>();
processedToExtractedBegin.put(wordProcessed, wordsExtracted);
}
wordsExtracted.add(wordExtracted);
} else {
List<String> wordsExtracted = processedToExtractedWithin.get(wordProcessed);
if (wordsExtracted == null) {
wordsExtracted = new ArrayList<>();
processedToExtractedWithin.put(wordProcessed, wordsExtracted);
}
wordsExtracted.add(wordExtracted);
}
}
}
}
// add the "within" keys before the "begin" keys so that, when counts are equal, the variant first seen within a sentence comes first after the stable sort and wins (both collections preserve insertion order)
Set<String> processedToExtractedKeys = new LinkedHashSet<>();
processedToExtractedKeys.addAll(processedToExtractedWithin.keySet());
processedToExtractedKeys.addAll(processedToExtractedBegin.keySet());
for (String key : processedToExtractedKeys) {
Map<String, Integer> extractedCount = new LinkedHashMap<>();
List<String> extractedWithins = processedToExtractedWithin.get(key);
if (extractedWithins != null) {
for (String extractedWithin : extractedWithins) {
extractedCount.merge(extractedWithin, 1, Integer::sum);
}
}
Set<String> extractedBegins = processedToExtractedBegin.get(key);
if (extractedBegins != null) {
for (String extractedBegin : extractedBegins) {
extractedCount.merge(extractedBegin, 1, Integer::sum);
}
}
extractedCount = extractedCount.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
processedToExtracted.put(key, extractedCount.keySet().iterator().next());
}
if (toolTitle != null && toolTitlePruned != null) {
boolean existing = toolTitleScore(toolTitle, preProcessor, scores, processedToExtracted, false);
if (!existing && !toolTitlePruned.equals(toolTitle)) {
toolTitleScore(toolTitlePruned, preProcessor, scores, processedToExtracted, true);
}
}
if (toolTitleTwo != null && toolTitleTwoPruned != null) {
boolean existing = toolTitleScore(toolTitleTwo, preProcessor, scores, processedToExtracted, false);
if (!existing && !toolTitleTwoPruned.equals(toolTitleTwo)) {
toolTitleScore(toolTitleTwoPruned, preProcessor, scores, processedToExtracted, true);
}
}
if (toolTitleAcronym != null) {
toolTitleScore(toolTitleAcronym, preProcessor, scores, processedToExtracted, false);
}
if (toolTitleTwoAcronym != null) {
toolTitleScore(toolTitleTwoAcronym, preProcessor, scores, processedToExtracted, false);
}
Map<String, Double> beforeAfterAdded = new HashMap<>();
for (int i = 0; i < processed.size(); ++i) {
List<String> sentenceProcessed = processed.get(i);
boolean acronymsDone = false;
List<Integer> acronyms = null;
for (int j = 0; j < sentenceProcessed.size(); ++j) {
String wordProcessed = sentenceProcessed.get(j);
boolean inBeforeTier1 = beforeTier1.contains(wordProcessed);
boolean inBeforeTier2 = beforeTier2.contains(wordProcessed);
boolean inBeforeTier3 = beforeTier3.contains(wordProcessed);
if (j + 1 < sentenceProcessed.size() && (inBeforeTier1 || inBeforeTier2 || inBeforeTier3)) {
if (!acronymsDone) {
acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
acronymsDone = true;
}
boolean acronymFound = false;
String acronym = null;
if (acronyms.contains(j + 1)) {
acronym = sentenceProcessed.get(j + 1);
acronymFound = true;
} else if (acronyms.contains(-(j + 1))) {
acronym = sentenceProcessed.get(j + 1);
acronymFound = true;
} else if (j + 2 < sentenceProcessed.size()) {
if (acronyms.contains(j + 2)) {
acronym = sentenceProcessed.get(j + 2);
acronymFound = true;
} else if (acronyms.contains(-(j + 2))) {
acronym = sentenceProcessed.get(j + 2);
acronymFound = true;
}
}
if (acronymFound) {
beforeAfterScore(acronym, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, true);
} else {
String nextWord = sentenceProcessed.get(j + 1);
beforeAfterScore(nextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
if (j + 2 < sentenceProcessed.size()) {
acronymFound = false;
for (int k = 1; k <= COMPOUND_WORDS && j + 2 + k < sentenceProcessed.size(); ++k) {
if (acronyms.contains(-(j + 2 + k))) {
String nextNextWord = sentenceProcessed.get(j + 2 + k);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
acronymFound = true;
break;
}
}
if (!acronymFound) {
String nextNextWord = sentenceProcessed.get(j + 2);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
String nextCompoundWord = nextWord + " " + nextNextWord;
beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inBeforeTier1, inBeforeTier2, inBeforeTier3, false);
}
}
}
}
boolean inAfterTier1 = afterTier1.contains(wordProcessed);
boolean inAfterTier2 = afterTier2.contains(wordProcessed);
boolean inAfterTier3 = afterTier3.contains(wordProcessed);
if (j - 1 >= 0 && (inAfterTier1 || inAfterTier2 || inAfterTier3)) {
if (!acronymsDone) {
acronyms = acronyms(titleAbstractSentences.get(i), preProcessor);
acronymsDone = true;
}
boolean acronymFound = false;
String acronym = null;
if (acronyms.contains(j - 1)) {
acronym = sentenceProcessed.get(j - 1);
acronymFound = true;
} else if (acronyms.contains(-(j - 1))) {
acronym = sentenceProcessed.get(j - 1);
acronymFound = true;
} else if (j - 2 >= 0) {
if (acronyms.contains(j - 2)) {
acronym = sentenceProcessed.get(j - 2);
acronymFound = true;
} else if (acronyms.contains(-(j - 2))) {
acronym = sentenceProcessed.get(j - 2);
acronymFound = true;
}
}
if (acronymFound) {
beforeAfterScore(acronym, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, true);
} else {
String nextWord = sentenceProcessed.get(j - 1);
beforeAfterScore(nextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
if (j - 2 >= 0) {
acronymFound = false;
for (int k = 1; k <= COMPOUND_WORDS && j - 2 - k >= 0; ++k) {
if (acronyms.contains(-(j - 2 - k))) {
String nextNextWord = sentenceProcessed.get(j - 2 - k);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
acronymFound = true;
break;
}
}
if (!acronymFound) {
String nextNextWord = sentenceProcessed.get(j - 2);
beforeAfterScore(nextNextWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
String nextCompoundWord = nextNextWord + " " + nextWord;
beforeAfterScore(nextCompoundWord, scores, beforeAfterAdded, inAfterTier1, inAfterTier2, inAfterTier3, false);
}
}
}
}
}
}
List<String> titleAbstractLinks = preProcessor.links(title);
titleAbstractLinks.addAll(preProcessor.links(theAbstract));
List<String> fulltextLinks = preProcessor.links(publication.getFulltext().getContent());
for (int i = 0; i < titleAbstractLinks.size(); ++i) {
String titleAbstractLink = titleAbstractLinks.get(i);
Iterator<String> it = fulltextLinks.iterator();
while (it.hasNext()) {
String fulltextLink = it.next();
if (fulltextLink.equals(titleAbstractLink)) {
it.remove();
break;
}
String start = "";
Matcher startTitleAbstractLink = LINK_COMPARE_START.matcher(titleAbstractLink);
if (startTitleAbstractLink.find()) {
start = titleAbstractLink.substring(0, startTitleAbstractLink.end());
titleAbstractLink = titleAbstractLink.substring(startTitleAbstractLink.end());
}
Matcher startFulltextLink = LINK_COMPARE_START.matcher(fulltextLink);
if (startFulltextLink.find()) {
String startFulltext = fulltextLink.substring(0, startFulltextLink.end());
if (startFulltext.length() > start.length()) {
start = startFulltext;
}
fulltextLink = fulltextLink.substring(startFulltextLink.end());
}
if (fulltextLink.equals(titleAbstractLink)) {
titleAbstractLinks.set(i, start + titleAbstractLink);
it.remove();
break;
}
if (fulltextLink.startsWith(titleAbstractLink)) {
String rest = fulltextLink.substring(titleAbstractLink.length());
if (rest.startsWith("/")) {
titleAbstractLinks.set(i, start + fulltextLink);
}
it.remove();
break;
}
if (fulltextLink.contains(titleAbstractLink)) {
titleAbstractLinks.set(i, start + fulltextLink);
it.remove();
break;
}
if (titleAbstractLink.startsWith(fulltextLink)) {
String rest = titleAbstractLink.substring(fulltextLink.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
titleAbstractLinks.set(i, start + fulltextLink);
}
Matcher schemaMatcher = LINK_COMPARE_SCHEMA.matcher(rest);
if (schemaMatcher.find()) {
titleAbstractLinks.set(i, start + fulltextLink);
titleAbstractLinks.add(i + 1, rest.substring(schemaMatcher.start()));
}
it.remove();
break;
}
if (titleAbstractLink.contains(fulltextLink)) {
it.remove();
break;
}
}
}
Map<String, List<String>> linksAbstract = links(titleAbstractLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
Map<String, List<String>> linksFulltext = links(fulltextLinks, preProcessor, idf, hostIgnore, scores.keySet(), processedToExtracted, processed, titleWithoutLinks, abstractWithoutLinks, toolTitle, toolTitleTwo, toolTitleAcronym, toolTitleTwoAcronym, toolTitlePruned, toolTitleTwoPruned);
for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
double score = scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_ABSTRACT * linkEntry.getValue().size();
if (score > LINK_MULTIPLIER_ABSTRACT_MINIMUM) {
scores.put(linkEntry.getKey(), score);
} else {
scores.put(linkEntry.getKey(), LINK_MULTIPLIER_ABSTRACT_MINIMUM);
}
}
boolean genericLinkAugmentation = linksAbstract.isEmpty();
for (String link : titleAbstractLinks) {
boolean present = false;
for (Map.Entry<String, List<String>> linkEntry : linksAbstract.entrySet()) {
if (linkEntry.getValue().contains(link)) {
present = true;
break;
}
}
if (!present) {
if (genericLinkAugmentation) {
for (Map.Entry<String, Double> scoreEntry : scores.entrySet()) {
scores.put(scoreEntry.getKey(), scoreEntry.getValue() * LINK_MULTIPLIER_ABSTRACT_AUGMENTATION);
}
genericLinkAugmentation = false;
}
String fromLink = fromLink(link, preProcessor, idf, hostIgnore);
if (!fromLink.isEmpty()) {
List<String> fromLinkExtracted = preProcessor.extract(fromLink);
List<String> fromLinkProcessed = preProcessor.process(fromLink, fromLinkExtracted);
String fromLinkExtractedString = String.join(" ", fromLinkExtracted);
String fromLinkProcessedString = String.join(" ", fromLinkProcessed);
if (!fromLinkProcessedString.isEmpty()) {
scores.merge(fromLinkProcessedString, LINK_MULTIPLIER_ABSTRACT_NEW / fromLinkProcessed.size(), (d1, d2) -> d1 * d2);
String wordExtracted = processedToExtracted.get(fromLinkProcessedString);
if (wordExtracted == null) {
processedToExtracted.put(fromLinkProcessedString, fromLinkExtractedString);
}
}
}
}
}
for (Map.Entry<String, List<String>> linkEntry : linksFulltext.entrySet()) {
long multiplier = linkEntry.getValue().stream().filter(link -> !LINK_TWO_PART.matcher(link).matches()).count();
if (multiplier > 0) {
if (multiplier > 2) {
multiplier = 2;
}
scores.put(linkEntry.getKey(), scores.get(linkEntry.getKey()) * LINK_MULTIPLIER_FULLTEXT * multiplier);
}
}
Map<String, Double> sortedScores = scores.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
Result result = new Result();
result.setPmid(publication.getPmid().getContent());
result.setPmcid(publication.getPmcid().getContent());
result.setDoi(publication.getDoi().getContent());
List<String> resultLinks = new ArrayList<>();
List<String> suggestionsProcessed = new ArrayList<>();
Iterator<Map.Entry<String, Double>> sortedScoresIterator = sortedScores.entrySet().iterator();
if (sortedScoresIterator.hasNext()) {
Map.Entry<String, Double> topEntry = sortedScoresIterator.next();
double topScore = topEntry.getValue();
result.setScore(topScore);
result.setSuggestion(processedToExtracted.get(topEntry.getKey()));
List<String> linksFromAbstract = linksAbstract.get(topEntry.getKey());
if (linksFromAbstract != null) {
for (String link : linksFromAbstract) {
resultLinks.add(link);
}
}
List<String> linksFromFulltext = linksFulltext.get(topEntry.getKey());
if (linksFromFulltext != null) {
for (String link : linksFromFulltext) {
resultLinks.add(link);
}
}
suggestionsProcessed.add(topEntry.getKey());
for (int i = 1; i < SUGGESTION_LIMIT && sortedScoresIterator.hasNext(); ++i) {
topEntry = sortedScoresIterator.next();
if (topEntry.getValue() * TOP_SCORE_LIMIT < topScore) {
break;
}
result.addOtherSuggestion(processedToExtracted.get(topEntry.getKey()));
suggestionsProcessed.add(topEntry.getKey());
}
}
List<String> resultOtherLinks = new ArrayList<>();
for (List<String> linksFromAbstract : linksAbstract.values()) {
for (String link : linksFromAbstract) {
if (!resultLinks.contains(link)) {
resultOtherLinks.add(link);
}
}
}
for (List<String> linksFromFulltext : linksFulltext.values()) {
for (String link : linksFromFulltext) {
if (!resultLinks.contains(link)) {
resultOtherLinks.add(link);
}
}
}
List<String> resultLeftoverLinks = new ArrayList<>();
for (String link : titleAbstractLinks) {
if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
resultLeftoverLinks.add(link);
}
}
for (String link : fulltextLinks) {
if (!resultLinks.contains(link) && !resultOtherLinks.contains(link)) {
resultLeftoverLinks.add(link);
}
}
List<String> resultAllLinks = new ArrayList<>();
resultAllLinks.addAll(resultLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
resultAllLinks.addAll(resultOtherLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
resultAllLinks.addAll(resultLeftoverLinks.stream().map(resultLink -> BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("")).collect(Collectors.toList()));
result.addLinks(breakLinks(resultLinks, resultAllLinks));
result.addOtherLinks(breakLinks(resultOtherLinks, resultAllLinks));
result.addLeftoverLinks(breakLinks(resultLeftoverLinks, resultAllLinks));
for (int i = 0; i < queries.size(); ++i) {
Query query = queries.get(i);
for (PublicationIds pubIds : query.getPublicationIds()) {
if (!pubIds.getPmid().isEmpty() && !result.getPmid().isEmpty() && pubIds.getPmid().equals(result.getPmid()) || !pubIds.getPmcid().isEmpty() && !result.getPmcid().isEmpty() && pubIds.getPmcid().equals(result.getPmcid()) || !pubIds.getDoi().isEmpty() && !result.getDoi().isEmpty() && pubIds.getDoi().equals(result.getDoi())) {
result.addExistingName(query.getId(), query.getName());
for (String link : result.getLinks()) {
String linkTrimmed = BIOTOOLS_LINK_TRIM_START.matcher(link).replaceFirst("");
linkTrimmed = BIOTOOLS_LINK_TRIM_END.matcher(linkTrimmed).replaceFirst("");
boolean found = false;
for (String queryLink : queryLinks.get(i)) {
if (linkTrimmed.equalsIgnoreCase(queryLink)) {
found = true;
break;
} else if (linkTrimmed.startsWith(queryLink)) {
String rest = linkTrimmed.substring(queryLink.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
found = true;
break;
}
}
}
if (!found) {
// TODO queryLinks is not complete
// result.addNewLink(link);
}
}
break;
}
}
}
for (String suggestionProcessed : suggestionsProcessed) {
suggestionProcessed = BIOTOOLS_PROCESSED_VERSION_TRIM.matcher(suggestionProcessed).replaceFirst("");
if (suggestionProcessed.isEmpty())
continue;
for (int i = 0; i < queryNamesProcessed.size(); ++i) {
if (suggestionProcessed.equals(queryNamesProcessed.get(i))) {
String possiblyExistingId = queries.get(i).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
result.addPossiblyExisting(possiblyExistingId, queries.get(i).getName());
}
}
}
}
List<String> suggestionsExtracted = new ArrayList<>();
suggestionsExtracted.add(result.getSuggestion());
suggestionsExtracted.addAll(result.getOtherSuggestions());
for (String suggestionExtracted : suggestionsExtracted) {
suggestionExtracted = BIOTOOLS_EXTRACTED_VERSION_TRIM.matcher(suggestionExtracted).replaceFirst("");
if (suggestionExtracted.isEmpty())
continue;
for (String suggestionExtractedWord : suggestionExtracted.split(" ")) {
Map<String, String> possiblyExisting = new LinkedHashMap<>();
for (int i = 0; i < queryNamesExtracted.size(); ++i) {
List<String> queryNameExtracted = queryNamesExtracted.get(i);
if (queryNameExtracted.contains(suggestionExtractedWord)) {
String possiblyExistingId = queries.get(i).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
possiblyExisting.put(possiblyExistingId, queries.get(i).getName());
}
}
}
if (possiblyExisting.size() >= 1 && possiblyExisting.size() <= POSSIBLY_EXISTING_VALID_LIMIT) {
for (Map.Entry<String, String> possiblyExistingEntry : possiblyExisting.entrySet()) {
result.addPossiblyExisting(possiblyExistingEntry.getKey(), possiblyExistingEntry.getValue());
}
}
}
}
List<String> resultLinksOtherLinks = new ArrayList<>();
resultLinksOtherLinks.addAll(result.getLinks());
resultLinksOtherLinks.addAll(result.getOtherLinks());
for (int i = 0; i < resultLinksOtherLinks.size(); ++i) {
String resultLink = resultLinksOtherLinks.get(i);
resultLink = BIOTOOLS_LINK_TRIM_START.matcher(resultLink).replaceFirst("");
resultLink = BIOTOOLS_LINK_TRIM_END.matcher(resultLink).replaceFirst("");
for (int j = 0; j < queryLinks.size(); ++j) {
String possiblyExistingId = queries.get(j).getId();
if (!result.getExistingNames().containsKey(possiblyExistingId)) {
List<String> queryLink = queryLinks.get(j);
for (String link : queryLink) {
if (resultLink.equalsIgnoreCase(link)) {
result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
} else if (resultLink.startsWith(link)) {
String rest = resultLink.substring(link.length() - 1);
if (LINK_COMPARE_REST.matcher(rest).matches()) {
result.addPossiblyExisting(possiblyExistingId, queries.get(j).getName());
}
}
}
}
}
}
if (!(result.getExistingNames().size() == 1 && !result.getSuggestion().isEmpty() && result.getExistingNames().values().iterator().next().equals(result.getSuggestion()) && result.getNewLinks().isEmpty())) {
results.add(result);
}
}
// TODO
System.err.println();
results = results.stream().sorted(Comparator.comparing(Result::getScore).reversed()).collect(Collectors.toList());
for (int i = 0; i < results.size() - 1; ++i) {
Result resultI = results.get(i);
for (int j = i + 1; j < results.size(); ++j) {
Result resultJ = results.get(j);
if (resultI.getSuggestion().equals(resultJ.getSuggestion())) {
resultI.addSameSuggestion(resultJ.getPmid());
resultJ.addSameSuggestion(resultI.getPmid());
}
}
}
return results;
}
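To make the compound-word scoring in getResults easier to follow, here is a standalone sketch of the same loop structure. This is not edammap code: the constants and idf values are illustrative stand-ins, and the idf map replaces Idf.getIdf.
import java.util.*;

class CompoundScoreSketch {
    static final int COMPOUND_WORDS = 5;
    static final double COMPOUND_DIVIDER = 2;
    static final double QUERY_IDF_SCALING = 2;

    public static void main(String[] args) {
        // made-up idf values for one processed sentence
        Map<String, Double> idf = Map.of("genome", 2.0, "assembly", 3.0, "browser", 1.5);
        List<String> sentence = List.of("genome", "assembly", "browser");
        Map<String, Double> scores = new HashMap<>();
        // j is the compound length minus one; single words are scored first (j == 0)
        for (int j = 0; j < COMPOUND_WORDS; ++j) {
            for (int k = 0; k + j < sentence.size(); ++k) {
                String word = String.join(" ", sentence.subList(k, k + j + 1));
                double value;
                if (j == 0) {
                    value = Math.pow(idf.getOrDefault(word, 1.0), QUERY_IDF_SCALING);
                } else {
                    // compounds get the product of their parts' single-word scores, damped
                    value = scores.get(sentence.get(k));
                    for (int l = k + 1; l <= k + j; ++l) value *= scores.get(sentence.get(l));
                    value /= COMPOUND_DIVIDER;
                }
                scores.merge(word, value, Double::sum);
            }
        }
        scores.forEach((word, score) -> System.out.println(word + " -> " + score));
    }
}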
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class Resource, method runPost().
private Response runPost(MultivaluedMap<String, String> params, Request request) throws IOException, ParseException, URISyntaxException {
logger.info("POST {} from {}", params, request.getRemoteAddr());
long start = System.currentTimeMillis();
logger.info("Start: {}", Instant.ofEpochMilli(start));
CoreArgs coreArgs = new CoreArgs();
ParamParse.parseParams(params, coreArgs);
coreArgs.setProcessorArgs(Server.args.getProcessorArgs());
coreArgs.getFetcherArgs().setPrivateArgs(Server.args.getFetcherPrivateArgs());
ServerInput serverInput = new ServerInput(ParamParse.getParamString(params, "name"), ParamParse.getParamString(params, "keywords"), ParamParse.getParamString(params, "description"), ParamParse.getParamString(params, "webpage-urls"), ParamParse.getParamString(params, "doc-urls"), ParamParse.getParamString(params, "publication-ids"), ParamParse.getParamString(params, "annotations"));
if (serverInput.getName() != null && serverInput.getName().length() > MAX_NAME_LENGTH) {
throw new IllegalArgumentException("Name length (" + serverInput.getName().length() + ") is greater than maximum allowed (" + MAX_NAME_LENGTH + ")");
}
if (serverInput.getKeywords() != null && serverInput.getKeywords().length() > MAX_KEYWORDS_LENGTH) {
throw new IllegalArgumentException("Keywords length (" + serverInput.getKeywords().length() + ") is greater than maximum allowed (" + MAX_KEYWORDS_LENGTH + ")");
}
if (serverInput.getDescription() != null && serverInput.getDescription().length() > MAX_DESCRIPTION_LENGTH) {
throw new IllegalArgumentException("Description length (" + serverInput.getDescription().length() + ") is greater than maximum allowed (" + MAX_DESCRIPTION_LENGTH + ")");
}
if (serverInput.getWebpageUrls() != null && serverInput.getWebpageUrls().length() > MAX_LINKS_LENGTH) {
throw new IllegalArgumentException("Webpage URLs length (" + serverInput.getWebpageUrls().length() + ") is greater than maximum allowed (" + MAX_LINKS_LENGTH + ")");
}
if (serverInput.getDocUrls() != null && serverInput.getDocUrls().length() > MAX_LINKS_LENGTH) {
throw new IllegalArgumentException("Doc URLs length (" + serverInput.getDocUrls().length() + ") is greater than maximum allowed (" + MAX_LINKS_LENGTH + ")");
}
if (serverInput.getPublicationIds() != null && serverInput.getPublicationIds().length() > MAX_PUBLICATION_IDS_LENGTH) {
throw new IllegalArgumentException("Publication IDs length (" + serverInput.getPublicationIds().length() + ") is greater than maximum allowed (" + MAX_PUBLICATION_IDS_LENGTH + ")");
}
if (serverInput.getAnnotations() != null && serverInput.getAnnotations().length() > MAX_ANNOTATIONS_LENGTH) {
throw new IllegalArgumentException("Annotations length (" + serverInput.getAnnotations().length() + ") is greater than maximum allowed (" + MAX_ANNOTATIONS_LENGTH + ")");
}
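// Generate a collision-free result directory of the form <version>/<random UUID> under the server's files directory.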
String uuid;
String uuidDir;
do {
uuid = Server.version.getVersion() + "/" + UUID.randomUUID().toString();
uuidDir = Server.args.getFiles() + "/" + uuid;
} while (Files.exists(Paths.get(uuidDir)));
Files.createDirectory(Paths.get(uuidDir));
serverInput.setId(uuid);
logger.info("UUID: {}", uuid);
Output output = new Output(uuidDir + "/results.txt", uuidDir, true);
// TODO params to choose if HTML or TXT output desired
PreProcessor preProcessor = new PreProcessor(coreArgs.getPreProcessorArgs(), Server.stopwordsAll.get(coreArgs.getPreProcessorArgs().getStopwords()));
logger.info("Processing {} concepts", Server.concepts.size());
Map<EdamUri, ConceptProcessed> processedConcepts = Server.processor.getProcessedConcepts(Server.concepts, coreArgs.getMapperArgs().getIdfArgs(), coreArgs.getMapperArgs().getMultiplierArgs(), preProcessor);
logger.info("Loading query");
Query query = QueryLoader.fromServer(serverInput, Server.concepts, MAX_KEYWORDS_SIZE, MAX_LINKS_SIZE, MAX_PUBLICATION_IDS_SIZE);
Idf idf;
if (coreArgs.getPreProcessorArgs().isStemming()) {
idf = Server.idfStemmed;
} else {
idf = Server.idf;
}
QueryProcessed processedQuery = Server.processor.getProcessedQuery(query, QueryType.server, preProcessor, idf, coreArgs.getFetcherArgs());
logger.info("Mapping query");
Mapping mapping = new Mapper(processedConcepts).map(query, processedQuery, coreArgs.getMapperArgs());
List<Query> queries = Collections.singletonList(query);
List<List<Webpage>> webpages = Collections.singletonList(processedQuery.getWebpages());
List<List<Webpage>> docs = Collections.singletonList(processedQuery.getDocs());
List<List<Publication>> publications = Collections.singletonList(processedQuery.getPublications());
List<Mapping> mappings = Collections.singletonList(mapping);
Results results = Benchmark.calculate(queries, mappings);
long stop = System.currentTimeMillis();
logger.info("Stop: {}", Instant.ofEpochMilli(stop));
logger.info("Mapping took {}s", (stop - start) / 1000.0);
logger.info("Outputting results");
output.output(coreArgs, Server.paramsMain, QueryType.server, 1, 1, Server.concepts, queries, webpages, docs, publications, results, start, stop, Server.version);
URI location = new URI("/" + Server.args.getPath() + "/" + uuid + "/");
logger.info("POSTED {}", location);
return Response.seeOther(location).build();
}
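For orientation, a request against this endpoint could be built as in the following sketch. This is not edammap code: the address and path are hypothetical, and it assumes the endpoint accepts URL-encoded form data; only the field names ("name", "keywords", and so on) are the ones read via ParamParse above.
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.stream.Collectors;

class PostSketch {
    public static void main(String[] args) throws Exception {
        // example field values; any subset of the parameters read by runPost may be sent
        Map<String, String> form = Map.of(
            "name", "ExampleTool",
            "keywords", "gene set enrichment",
            "webpage-urls", "https://example.org/exampletool/");
        String body = form.entrySet().stream()
            .map(e -> URLEncoder.encode(e.getKey(), StandardCharsets.UTF_8) + "="
                + URLEncoder.encode(e.getValue(), StandardCharsets.UTF_8))
            .collect(Collectors.joining("&"));
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:8080/edammap/api")) // hypothetical address and path
            .header("Content-Type", "application/x-www-form-urlencoded")
            .POST(HttpRequest.BodyPublishers.ofString(body))
            .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        // runPost replies with 303 See Other pointing at the generated results page
        System.out.println(response.statusCode() + " "
            + response.headers().firstValue("Location").orElse(""));
    }
}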
Use of org.edamontology.edammap.core.preprocessing.PreProcessor in project edammap by edamontology.
The class PubMedApps, method beforeAfter().
private static void beforeAfter(PreProcessorArgs preProcessorArgs, String queryIdf, String database, List<String> pubFile) throws IOException {
PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
Idf idf = new Idf(queryIdf);
List<Publication> publications = getPublications(database, pubFile);
Map<String, Integer> before = new HashMap<>();
Map<String, Integer> after = new HashMap<>();
Map<String, Integer> all = new HashMap<>();
Map<String, Double> allBeforeScores = new HashMap<>();
int allBeforeScoresSum = 0;
Map<String, Double> allAfterScores = new HashMap<>();
int allAfterScoresSum = 0;
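// For every publication whose title has a one-word tool name before the separator, count which words tend to occur one or two positions before or after that name in the abstract; the all* score maps accumulate, for each word, the idf-derived scores of its neighbours, used to normalise precision in the printed tables.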
for (Publication publication : publications) {
String toolTitle = publication.getTitle().getContent();
Matcher titleSeparator = TITLE_SEPARATOR.matcher(toolTitle);
if (titleSeparator.find()) {
toolTitle = toolTitle.substring(0, titleSeparator.start()).trim();
} else {
continue;
}
List<String> toolTitleProcessedWords = preProcessor.process(toolTitle);
if (toolTitleProcessedWords.size() != 1)
continue;
String toolTitleProcessed = toolTitleProcessedWords.get(0);
List<String> abstractSentences = preProcessor.sentences(preProcessor.removeLinks(publication.getAbstract().getContent()));
List<List<String>> processed = new ArrayList<>();
for (String sentence : abstractSentences) {
processed.add(preProcessor.process(sentence));
}
Map<String, Double> scores = new HashMap<>();
for (List<String> sentence : processed) {
for (String word : sentence) {
scores.merge(word, Math.pow(idf.getIdf(word), QUERY_IDF_SCALING), Double::sum);
}
}
for (List<String> sentenceProcessed : processed) {
for (int i = 0; i < sentenceProcessed.size(); ++i) {
if (sentenceProcessed.get(i).equals(toolTitleProcessed)) {
if (i - 1 >= 0)
before.merge(sentenceProcessed.get(i - 1), 1, Integer::sum);
if (i - 2 >= 0)
before.merge(sentenceProcessed.get(i - 2), 1, Integer::sum);
if (i + 1 < sentenceProcessed.size())
after.merge(sentenceProcessed.get(i + 1), 1, Integer::sum);
if (i + 2 < sentenceProcessed.size())
after.merge(sentenceProcessed.get(i + 2), 1, Integer::sum);
}
}
}
for (List<String> sentenceProcessed : processed) {
for (int i = 0; i < sentenceProcessed.size(); ++i) {
String wordProcessed = sentenceProcessed.get(i);
all.merge(wordProcessed, 1, Integer::sum);
if (i - 1 >= 0) {
allBeforeScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i - 1)), Double::sum);
++allBeforeScoresSum;
}
if (i - 2 >= 0) {
allBeforeScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i - 2)), Double::sum);
++allBeforeScoresSum;
}
if (i + 1 < sentenceProcessed.size()) {
allAfterScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i + 1)), Double::sum);
++allAfterScoresSum;
}
if (i + 2 < sentenceProcessed.size()) {
allAfterScores.merge(wordProcessed, scores.get(sentenceProcessed.get(i + 2)), Double::sum);
++allAfterScoresSum;
}
}
}
}
Map<String, Integer> beforeSorted = before.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
System.out.println("BEFORE_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE");
for (Map.Entry<String, Integer> bs : beforeSorted.entrySet()) {
String word = bs.getKey();
int count = bs.getValue();
int total = all.get(word);
double precision = count / (double) total;
Double totalScore = allAfterScores.get(word);
double averageScore = (totalScore != null ? totalScore / allAfterScoresSum : 0);
System.out.printf(Locale.ROOT, "%16s\t%d\t%d\t%.6f\t%.6f\t%8.1f\n", word, count, total, precision, averageScore, precision / averageScore);
}
System.out.println();
Map<String, Integer> afterSorted = after.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k, v) -> {
throw new AssertionError();
}, LinkedHashMap::new));
System.out.println("AFTER_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE");
for (Map.Entry<String, Integer> as : afterSorted.entrySet()) {
String word = as.getKey();
int count = as.getValue();
int total = all.get(word);
double precision = count / (double) total;
Double totalScore = allBeforeScores.get(word);
double averageScore = (totalScore != null ? totalScore / allBeforeScoresSum : 0);
System.out.printf(Locale.ROOT, "%16s\t%d\t%d\t%.6f\t%.6f\t%8.1f\n", word, count, total, precision, averageScore, precision / averageScore);
}
}
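The sort-by-value idiom used above for scores, beforeSorted and afterSorted deserves a note: the merge function that throws AssertionError can never run, because the stream's keys come from a map and are therefore unique. A minimal standalone sketch of the idiom (not edammap code):
import java.util.*;
import java.util.stream.Collectors;

class SortByValueSketch {
    public static void main(String[] args) {
        Map<String, Integer> counts = Map.of("solver", 3, "aligner", 7, "browser", 5);
        Map<String, Integer> sorted = counts.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
                // keys are unique, so this merge function is unreachable
                (k, v) -> { throw new AssertionError(); },
                LinkedHashMap::new));
        System.out.println(sorted); // {aligner=7, browser=5, solver=3}
    }
}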