Search in sources :

Example 1 with Database

Use of org.edamontology.pubfetcher.Database in the project edammap by edamontology.

The method beforeAfter of the class PubMedApps.

/**
 * Prints to standard output two tab-separated tables listing the processed words that occur up
 * to two positions before, respectively after, a tool name inside publication abstracts.
 *
 * <p>A publication contributes to the before/after counts only if {@code TITLE_SEPARATOR} matches
 * its title and the part of the title in front of that match is pre-processed to exactly one
 * token; that token is the tool name searched for in the abstract. Publications failing either
 * condition are skipped entirely.
 *
 * <p>Each table row contains: the word, how often it was a neighbour of the tool name (COUNT), how
 * often it occurred in the abstracts of the considered publications (TOTAL), COUNT/TOTAL
 * (PRECISION), an AVERAGE_SCORE derived from the IDF-based scores of the word's own neighbours,
 * and PRECISION/AVERAGE_SCORE. Rows are ordered by COUNT, highest first.
 *
 * @param preProcessorArgs arguments used to construct the {@code PreProcessor}
 * @param queryIdf argument passed to the {@code Idf} constructor
 * @param database passed through to {@code getPublications}
 * @param pubFile passed through to {@code getPublications}
 * @throws IOException declared by the original signature; raised by the called helpers
 */
private static void beforeAfter(PreProcessorArgs preProcessorArgs, String queryIdf, String database, List<String> pubFile) throws IOException {
    PreProcessor preProcessor = new PreProcessor(preProcessorArgs);
    Idf idf = new Idf(queryIdf);
    List<Publication> publications = getPublications(database, pubFile);

    // How many neighbouring positions are inspected on each side of a word.
    final int window = 2;

    // Neighbours of the tool name, counted over all considered publications.
    Map<String, Integer> precedingCounts = new HashMap<>();
    Map<String, Integer> followingCounts = new HashMap<>();
    // Occurrences of every word, plus the summed scores of each word's own neighbours.
    Map<String, Integer> occurrences = new HashMap<>();
    Map<String, Double> scoresOfPreceding = new HashMap<>();
    Map<String, Double> scoresOfFollowing = new HashMap<>();
    // Number of (word, neighbour) pairs that went into the two score maps above.
    int precedingPairs = 0;
    int followingPairs = 0;

    for (Publication publication : publications) {
        String fullTitle = publication.getTitle().getContent();
        Matcher separator = TITLE_SEPARATOR.matcher(fullTitle);
        if (!separator.find()) {
            continue;
        }
        List<String> nameTokens = preProcessor.process(fullTitle.substring(0, separator.start()).trim());
        if (nameTokens.size() != 1) {
            continue;
        }
        String toolName = nameTokens.get(0);

        String abstractText = preProcessor.removeLinks(publication.getAbstract().getContent());
        List<List<String>> tokenised = new ArrayList<>();
        for (String sentence : preProcessor.sentences(abstractText)) {
            tokenised.add(preProcessor.process(sentence));
        }

        // Per-publication score of a word: its scaled IDF added up once per occurrence.
        Map<String, Double> wordScores = new HashMap<>();
        for (List<String> words : tokenised) {
            for (String word : words) {
                wordScores.merge(word, Math.pow(idf.getIdf(word), QUERY_IDF_SCALING), Double::sum);
            }
        }

        // First pass: neighbours of the tool name only.
        for (List<String> words : tokenised) {
            int size = words.size();
            for (int pos = 0; pos < size; ++pos) {
                if (!words.get(pos).equals(toolName)) {
                    continue;
                }
                for (int d = 1; d <= window; ++d) {
                    if (pos - d >= 0) {
                        precedingCounts.merge(words.get(pos - d), 1, Integer::sum);
                    }
                }
                for (int d = 1; d <= window; ++d) {
                    if (pos + d < size) {
                        followingCounts.merge(words.get(pos + d), 1, Integer::sum);
                    }
                }
            }
        }

        // Second pass: baseline statistics for every word of the abstract.
        for (List<String> words : tokenised) {
            int size = words.size();
            for (int pos = 0; pos < size; ++pos) {
                String current = words.get(pos);
                occurrences.merge(current, 1, Integer::sum);
                for (int d = 1; d <= window; ++d) {
                    if (pos - d >= 0) {
                        scoresOfPreceding.merge(current, wordScores.get(words.get(pos - d)), Double::sum);
                        ++precedingPairs;
                    }
                }
                for (int d = 1; d <= window; ++d) {
                    if (pos + d < size) {
                        scoresOfFollowing.merge(current, wordScores.get(words.get(pos + d)), Double::sum);
                        ++followingPairs;
                    }
                }
            }
        }
    }

    // A word found before the tool name is paired with the scores of the words that follow it
    // (and vice versa) -- presumably because the tool name sits in that window; kept as in the original.
    printNeighbourTable("BEFORE_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE",
            precedingCounts, occurrences, scoresOfFollowing, followingPairs);
    System.out.println();
    printNeighbourTable("AFTER_TOOL_TITLE\tCOUNT\tTOTAL\tPRECISION\tAVERAGE_SCORE\tPRECISION/AVERAGE_SCORE",
            followingCounts, occurrences, scoresOfPreceding, precedingPairs);
}

/**
 * Prints one table of {@code beforeAfter}: the header line followed by one row per word of
 * {@code neighbourCounts}, ordered by descending count.
 *
 * @param header the header line, printed verbatim
 * @param neighbourCounts word to number of times it was a neighbour of the tool name
 * @param occurrences word to total number of occurrences; must contain every key of {@code neighbourCounts}
 * @param scoreTotals word to summed neighbour scores; a missing word yields an average of 0
 * @param scorePairs divisor applied to the summed scores to obtain AVERAGE_SCORE
 */
private static void printNeighbourTable(String header, Map<String, Integer> neighbourCounts, Map<String, Integer> occurrences, Map<String, Double> scoreTotals, int scorePairs) {
    List<Map.Entry<String, Integer>> ranked = neighbourCounts.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
            .collect(Collectors.toList());
    System.out.println(header);
    for (Map.Entry<String, Integer> row : ranked) {
        String word = row.getKey();
        int count = row.getValue();
        int total = occurrences.get(word);
        double precision = count / (double) total;
        Double scoreTotal = scoreTotals.get(word);
        double averageScore = 0;
        if (scoreTotal != null) {
            averageScore = scoreTotal / scorePairs;
        }
        System.out.printf(Locale.ROOT, "%16s\t%d\t%d\t%.6f\t%.6f\t%8.1f\n", word, count, total, precision, averageScore, precision / averageScore);
    }
}
Also used : Arrays(java.util.Arrays) URISyntaxException(java.net.URISyntaxException) FetcherUtil(org.edamontology.pubfetcher.FetcherUtil) Version(org.edamontology.pubfetcher.Version) Matcher(java.util.regex.Matcher) FetcherArgs(org.edamontology.pubfetcher.FetcherArgs) Locale(java.util.Locale) Map(java.util.Map) Element(org.jsoup.nodes.Element) FetcherCommon(org.edamontology.pubfetcher.FetcherCommon) URI(java.net.URI) ParseException(java.text.ParseException) Path(java.nio.file.Path) Link(org.edamontology.edammap.core.query.Link) Idf(org.edamontology.edammap.core.idf.Idf) MissingResourceException(java.util.MissingResourceException) Set(java.util.Set) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) PublicationIds(org.edamontology.pubfetcher.PublicationIds) List(java.util.List) Logger(org.apache.logging.log4j.Logger) CharsetEncoder(java.nio.charset.CharsetEncoder) Document(org.jsoup.nodes.Document) Pattern(java.util.regex.Pattern) Query(org.edamontology.edammap.core.query.Query) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) PreProcessor(org.edamontology.edammap.core.preprocessing.PreProcessor) QueryLoader(org.edamontology.edammap.core.query.QueryLoader) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) CodingErrorAction(java.nio.charset.CodingErrorAction) OutputStreamWriter(java.io.OutputStreamWriter) LinkedHashSet(java.util.LinkedHashSet) BasicArgs(org.edamontology.pubfetcher.BasicArgs) Iterator(java.util.Iterator) Files(java.nio.file.Files) BufferedWriter(java.io.BufferedWriter) Fetcher(org.edamontology.pubfetcher.Fetcher) PreProcessorArgs(org.edamontology.edammap.core.preprocessing.PreProcessorArgs) IOException(java.io.IOException) Database(org.edamontology.pubfetcher.Database) Field(java.lang.reflect.Field) InputStreamReader(java.io.InputStreamReader) QueryType(org.edamontology.edammap.core.query.QueryType) Publication(org.edamontology.pubfetcher.Publication) 
BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) LogManager(org.apache.logging.log4j.LogManager) InputStream(java.io.InputStream) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) PreProcessor(org.edamontology.edammap.core.preprocessing.PreProcessor) Publication(org.edamontology.pubfetcher.Publication) Idf(org.edamontology.edammap.core.idf.Idf) List(java.util.List) ArrayList(java.util.ArrayList) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Field (java.lang.reflect.Field)1 URI (java.net.URI)1 URISyntaxException (java.net.URISyntaxException)1 CharsetEncoder (java.nio.charset.CharsetEncoder)1 CodingErrorAction (java.nio.charset.CodingErrorAction)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 ParseException (java.text.ParseException)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Comparator (java.util.Comparator)1 HashMap (java.util.HashMap)1