Search in sources :

Example 6 with Ranking

use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.

the class InitializeGenesetsOfInterestTask method initializeSets.

/**
	 * filter the genesets, restricting them to only those passing the user
	 * specified thresholds.
	 * 
	 * @return true if successful and false otherwise.
	 */
public boolean initializeSets(TaskMonitor tm) {
    if (tm == null)
        tm = new NullTaskMonitor();
    DiscreteTaskMonitor taskMonitor = new DiscreteTaskMonitor(tm, map.getDataSetCount());
    //create subset of genesets that contains only the genesets of interest with pvalue and qbalue less than values specified by the user.
    //Go through each Dataset populating the Gene set of interest in each dataset object
    Map<String, EMDataSet> datasets = map.getDataSets();
    // count how many experiments (DataSets) contain the geneset
    Optional<Integer> minExperiments = map.getParams().getMinExperiments();
    Map<String, Integer> occurrences = minExperiments.isPresent() ? new HashMap<>() : null;
    for (String datasetName : datasets.keySet()) {
        taskMonitor.inc();
        EMDataSet dataset = datasets.get(datasetName);
        // all these maps use the geneset name as key
        Map<String, EnrichmentResult> enrichmentResults = dataset.getEnrichments().getEnrichments();
        Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
        Map<String, GeneSet> genesetsOfInterest = dataset.getGeneSetsOfInterest().getGeneSets();
        // If there are no genesets associated with this dataset then get the complete set assumption being that the gmt file applies to all datasets.
        if (genesets == null || genesets.isEmpty()) {
            genesets = map.getAllGeneSets();
        }
        //if there are no enrichment Results then do nothing
        if (enrichmentResults == null || enrichmentResults.isEmpty()) {
            return false;
        }
        //iterate through the GSEA Results to figure out which genesets we want to use
        for (String genesetName : enrichmentResults.keySet()) {
            EnrichmentResult result = enrichmentResults.get(genesetName);
            // update rank at max for leading edge calculation
            if (dataset.getMethod() == Method.GSEA) {
                Ranking ranks = dataset.getExpressionSets().getRanksByName(datasetName);
                updateRankAtMax((GSEAResult) result, ranks);
            }
            if (result.geneSetOfInterest(map.getParams())) {
                GeneSet geneset = genesets.get(genesetName);
                if (geneset != null) {
                    // while we are checking, update the size of the genesets based on post filtered data
                    result.setGsSize(geneset.getGenes().size());
                    if (occurrences != null) {
                        occurrences.merge(genesetName, 1, (v, d) -> v + 1);
                    }
                    genesetsOfInterest.put(genesetName, geneset);
                } else if (throwIfMissing) {
                    throw new IllegalThreadStateException("The Geneset: " + genesetName + " is not found in the GMT file.");
                }
            }
        }
    }
    // Remove gene-sets that don't pass the minimum occurrence cutoff
    if (occurrences != null) {
        for (EMDataSet dataset : datasets.values()) {
            Map<String, GeneSet> genesetsOfInterest = dataset.getGeneSetsOfInterest().getGeneSets();
            genesetsOfInterest.keySet().removeIf(geneset -> occurrences.getOrDefault(geneset, 0) < minExperiments.get());
        }
    }
    return true;
}
Also used : DiscreteTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.DiscreteTaskMonitor) EnrichmentResult(org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult) Ranking(org.baderlab.csplugins.enrichmentmap.model.Ranking) EMDataSet(org.baderlab.csplugins.enrichmentmap.model.EMDataSet) GeneSet(org.baderlab.csplugins.enrichmentmap.model.GeneSet) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Example 7 with Ranking

use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.

the class CreateDiseaseSignatureTaskParallel method mannWhitney.

private void mannWhitney(Set<Integer> intersection, GenesetSimilarity comparison, EMDataSet dataSet) {
    String rankFile = params.getDataSetToRankFile().get(dataSet.getName());
    Ranking ranks = dataSet.getExpressionSets().getRanks().get(rankFile);
    // Calculate Mann-Whitney U pValue for Overlap
    Integer[] overlapGeneIds = intersection.toArray(new Integer[intersection.size()]);
    double[] overlapGeneScores = new double[overlapGeneIds.length];
    int j = 0;
    for (Integer geneId : overlapGeneIds) {
        Double score = ranks.getScore(geneId);
        if (score != null) {
            // unbox
            overlapGeneScores[j++] = score;
        }
    }
    overlapGeneScores = Arrays.copyOf(overlapGeneScores, j);
    if (ranks.isEmpty()) {
        // avoid NoDataException
        comparison.setMannWhitPValueTwoSided(1.5);
        comparison.setMannWhitPValueGreater(1.5);
        comparison.setMannWhitPValueLess(1.5);
        comparison.setMannWhitMissingRanks(true);
    } else {
        double[] scores = ranks.getScores();
        MannWhitneyTestResult result = mannWhitneyCache.mannWhitneyUTestBatch(overlapGeneScores, scores);
        comparison.setMannWhitPValueTwoSided(result.twoSided);
        comparison.setMannWhitPValueGreater(result.greater);
        comparison.setMannWhitPValueLess(result.less);
    }
}
Also used : Ranking(org.baderlab.csplugins.enrichmentmap.model.Ranking) MannWhitneyTestResult(org.baderlab.csplugins.mannwhit.MannWhitneyTestResult)

Example 8 with Ranking

use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.

the class RanksFileReaderTask method parse.

/**
	 * parse the rank file
	 */
public void parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    List<String> lines = DatasetLineParser.readLines(RankFileName);
    int lineNumber = 0;
    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Rank file - " + maxValue + " rows");
    EnrichmentMap map = dataset.getMap();
    // we don't know the number of scores in the rank file yet, but it can't be more than the number of lines.
    Double[] score_collector = new Double[lines.size()];
    boolean gseaDefinedRanks = false;
    Map<Integer, Rank> ranks = new HashMap<>();
    /*
		 * there are two possible Rank files: If loaded through the rpt file the
		 * file is the one generated by GSEA and will have 5 columns (name,
		 * description, empty,empty,score) If the user loaded it through the
		 * generic of specifying advanced options then it will 2 columns
		 * (name,score). The score in either case should be a double and the
		 * name a string so check for either option.
		 */
    //number of found scores
    int nScores = 0;
    for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        //check to see if the line is commented out and should be ignored.
        if (line.startsWith("#")) {
            // look for ranks_name in comment line e.g.: "# Ranks Name : My Ranks"
            if (Pattern.matches("^# *Ranks[ _-]?Name *:.+", line)) {
                this.ranks_name = line.split(":", 2)[1];
                while (this.ranks_name.startsWith(" ")) this.ranks_name = this.ranks_name.substring(1);
            }
            //ignore comment line
            continue;
        }
        String[] tokens = line.split("\t");
        String name = tokens[0].toUpperCase();
        double score = 0;
        //if there are 5 columns in the data then the rank is the last column
        if (tokens.length == 5) {
            //ignore rows where the expected rank value is not a valid double
            try {
                //gseaDefinedRanks = true;
                score = Double.parseDouble(tokens[4]);
            } catch (NumberFormatException nfe) {
                if (lineNumber == 0) {
                    lineNumber++;
                    continue;
                } else
                    throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number");
            }
            nScores++;
        } else //if there are 2 columns in the data then the rank is the 2 column
        if (tokens.length == 2) {
            try {
                score = Double.parseDouble(tokens[1]);
            } catch (NumberFormatException nfe) {
                if (lineNumber == 0) {
                    lineNumber++;
                    continue;
                } else
                    throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number");
            }
            nScores++;
        } else {
            System.out.println("Invalid number of tokens line of Rank File (should be 5 or 2)");
            //skip invalid line
            continue;
        }
        if ((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap))
            gseaDefinedRanks = true;
        //add score to array of scores
        score_collector[nScores - 1] = score;
        //check to see if the gene is in the genelist
        Integer genekey = map.getHashFromGene(name);
        if (genekey != null) {
            Rank current_ranking;
            // edge compatible files.
            if ((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap)) {
                current_ranking = new Rank(name, score, nScores);
            } else {
                current_ranking = new Rank(name, score);
            }
            ranks.put(genekey, current_ranking);
        }
        // Calculate Percentage.  This must be a value between 0..100.
        int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
        taskMonitor.setProgress(percentComplete);
        currentProgress++;
    }
    //the none of the genes are in the gene list
    if (ranks.isEmpty()) {
        throw new IllegalThreadStateException("None of the genes in the rank file are found in the expression file.  Make sure the identifiers of the two files match.");
    }
    //remove Null values from collector
    Double[] sort_scores = new Double[nScores];
    double[] scores = new double[nScores];
    for (int i = 0; i < nScores; i++) {
        sort_scores[i] = score_collector[i];
        scores[i] = (double) score_collector[i];
    }
    //after we have loaded in all the scores, sort the score to compute ranks
    //create hash of scores to ranks.
    HashMap<Double, Integer> score2ranks = new HashMap<Double, Integer>();
    //sorts the array in descending order
    Arrays.sort(sort_scores, Collections.reverseOrder());
    //just signed statistics for instance as it will sort them in the opposite direction.
    if (sort_scores[0] <= 1 && sort_scores[sort_scores.length - 1] >= -1)
        Arrays.sort(sort_scores);
    for (int j = 0; j < sort_scores.length; j++) {
        //check to see if this score is already enter
        if (!score2ranks.containsKey(sort_scores[j]))
            score2ranks.put(sort_scores[j], j);
    }
    //only update the ranks if we haven't already defined them using order of scores in file
    if (!gseaDefinedRanks) {
        for (Iterator<Integer> k = ranks.keySet().iterator(); k.hasNext(); ) {
            Integer gene_key = k.next();
            Rank current_ranking = ranks.get(gene_key);
            Integer rank = score2ranks.get(current_ranking.getScore());
            current_ranking.setRank(rank);
        // update rank2gene and gene2score as well
        }
    }
    //check to see if some of the dataset genes are not in this rank file
    Set<Integer> current_genes = dataset.getDataSetGenes();
    Set<Integer> current_ranks = ranks.keySet();
    //intersect the genes with the ranks.  only retain the genes that have ranks.
    Set<Integer> intersection = new HashSet<>(current_genes);
    intersection.retainAll(current_ranks);
    //see if there more genes than there are ranks
    if (!(intersection.size() == current_genes.size())) {
    //JOptionPane.showMessageDialog(Cytoscape.getDesktop(),"Ranks for some of the genes/proteins listed in the expression file are missing. \n These genes/proteins will be excluded from ranked listing in the heat map.");
    }
    //create a new Ranking
    Ranking new_ranking = new Ranking();
    ranks.forEach(new_ranking::addRank);
    //add the Ranks to the expression file ranking
    dataset.getExpressionSets().addRanks(ranks_name, new_ranking);
}
Also used : HashMap(java.util.HashMap) Rank(org.baderlab.csplugins.enrichmentmap.model.Rank) EnrichmentMap(org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap) Ranking(org.baderlab.csplugins.enrichmentmap.model.Ranking) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor) HashSet(java.util.HashSet)

Aggregations

Ranking (org.baderlab.csplugins.enrichmentmap.model.Ranking)8 EMDataSet (org.baderlab.csplugins.enrichmentmap.model.EMDataSet)5 EnrichmentMap (org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap)4 Rank (org.baderlab.csplugins.enrichmentmap.model.Rank)4 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 List (java.util.List)3 Map (java.util.Map)3 Method (org.baderlab.csplugins.enrichmentmap.model.EMDataSet.Method)3 EnrichmentResult (org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult)3 GeneSet (org.baderlab.csplugins.enrichmentmap.model.GeneSet)3 CyNetwork (org.cytoscape.model.CyNetwork)3 Inject (com.google.inject.Inject)2 ArrayList (java.util.ArrayList)2 Collection (java.util.Collection)2 Collections (java.util.Collections)2 Set (java.util.Set)2 DataSetFiles (org.baderlab.csplugins.enrichmentmap.model.DataSetFiles)2 EnrichmentMapManager (org.baderlab.csplugins.enrichmentmap.model.EnrichmentMapManager)2 GSEAResult (org.baderlab.csplugins.enrichmentmap.model.GSEAResult)2