use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.
the class InitializeGenesetsOfInterestTask method initializeSets.
/**
* filter the genesets, restricting them to only those passing the user
* specified thresholds.
*
* @return true if successful and false otherwise.
*/
public boolean initializeSets(TaskMonitor tm) {
if (tm == null)
tm = new NullTaskMonitor();
DiscreteTaskMonitor taskMonitor = new DiscreteTaskMonitor(tm, map.getDataSetCount());
//create subset of genesets that contains only the genesets of interest with pvalue and qbalue less than values specified by the user.
//Go through each Dataset populating the Gene set of interest in each dataset object
Map<String, EMDataSet> datasets = map.getDataSets();
// count how many experiments (DataSets) contain the geneset
Optional<Integer> minExperiments = map.getParams().getMinExperiments();
Map<String, Integer> occurrences = minExperiments.isPresent() ? new HashMap<>() : null;
for (String datasetName : datasets.keySet()) {
taskMonitor.inc();
EMDataSet dataset = datasets.get(datasetName);
// all these maps use the geneset name as key
Map<String, EnrichmentResult> enrichmentResults = dataset.getEnrichments().getEnrichments();
Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
Map<String, GeneSet> genesetsOfInterest = dataset.getGeneSetsOfInterest().getGeneSets();
// If there are no genesets associated with this dataset then get the complete set assumption being that the gmt file applies to all datasets.
if (genesets == null || genesets.isEmpty()) {
genesets = map.getAllGeneSets();
}
//if there are no enrichment Results then do nothing
if (enrichmentResults == null || enrichmentResults.isEmpty()) {
return false;
}
//iterate through the GSEA Results to figure out which genesets we want to use
for (String genesetName : enrichmentResults.keySet()) {
EnrichmentResult result = enrichmentResults.get(genesetName);
// update rank at max for leading edge calculation
if (dataset.getMethod() == Method.GSEA) {
Ranking ranks = dataset.getExpressionSets().getRanksByName(datasetName);
updateRankAtMax((GSEAResult) result, ranks);
}
if (result.geneSetOfInterest(map.getParams())) {
GeneSet geneset = genesets.get(genesetName);
if (geneset != null) {
// while we are checking, update the size of the genesets based on post filtered data
result.setGsSize(geneset.getGenes().size());
if (occurrences != null) {
occurrences.merge(genesetName, 1, (v, d) -> v + 1);
}
genesetsOfInterest.put(genesetName, geneset);
} else if (throwIfMissing) {
throw new IllegalThreadStateException("The Geneset: " + genesetName + " is not found in the GMT file.");
}
}
}
}
// Remove gene-sets that don't pass the minimum occurrence cutoff
if (occurrences != null) {
for (EMDataSet dataset : datasets.values()) {
Map<String, GeneSet> genesetsOfInterest = dataset.getGeneSetsOfInterest().getGeneSets();
genesetsOfInterest.keySet().removeIf(geneset -> occurrences.getOrDefault(geneset, 0) < minExperiments.get());
}
}
return true;
}
use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.
the class CreateDiseaseSignatureTaskParallel method mannWhitney.
private void mannWhitney(Set<Integer> intersection, GenesetSimilarity comparison, EMDataSet dataSet) {
String rankFile = params.getDataSetToRankFile().get(dataSet.getName());
Ranking ranks = dataSet.getExpressionSets().getRanks().get(rankFile);
// Calculate Mann-Whitney U pValue for Overlap
Integer[] overlapGeneIds = intersection.toArray(new Integer[intersection.size()]);
double[] overlapGeneScores = new double[overlapGeneIds.length];
int j = 0;
for (Integer geneId : overlapGeneIds) {
Double score = ranks.getScore(geneId);
if (score != null) {
// unbox
overlapGeneScores[j++] = score;
}
}
overlapGeneScores = Arrays.copyOf(overlapGeneScores, j);
if (ranks.isEmpty()) {
// avoid NoDataException
comparison.setMannWhitPValueTwoSided(1.5);
comparison.setMannWhitPValueGreater(1.5);
comparison.setMannWhitPValueLess(1.5);
comparison.setMannWhitMissingRanks(true);
} else {
double[] scores = ranks.getScores();
MannWhitneyTestResult result = mannWhitneyCache.mannWhitneyUTestBatch(overlapGeneScores, scores);
comparison.setMannWhitPValueTwoSided(result.twoSided);
comparison.setMannWhitPValueGreater(result.greater);
comparison.setMannWhitPValueLess(result.less);
}
}
use of org.baderlab.csplugins.enrichmentmap.model.Ranking in project EnrichmentMapApp by BaderLab.
the class RanksFileReaderTask method parse.
/**
* parse the rank file
*/
public void parse(TaskMonitor taskMonitor) throws IOException {
if (taskMonitor == null)
taskMonitor = new NullTaskMonitor();
List<String> lines = DatasetLineParser.readLines(RankFileName);
int lineNumber = 0;
int currentProgress = 0;
int maxValue = lines.size();
taskMonitor.setStatusMessage("Parsing Rank file - " + maxValue + " rows");
EnrichmentMap map = dataset.getMap();
// we don't know the number of scores in the rank file yet, but it can't be more than the number of lines.
Double[] score_collector = new Double[lines.size()];
boolean gseaDefinedRanks = false;
Map<Integer, Rank> ranks = new HashMap<>();
/*
* there are two possible Rank files: If loaded through the rpt file the
* file is the one generated by GSEA and will have 5 columns (name,
* description, empty,empty,score) If the user loaded it through the
* generic of specifying advanced options then it will 2 columns
* (name,score). The score in either case should be a double and the
* name a string so check for either option.
*/
//number of found scores
int nScores = 0;
for (int i = 0; i < lines.size(); i++) {
String line = lines.get(i);
//check to see if the line is commented out and should be ignored.
if (line.startsWith("#")) {
// look for ranks_name in comment line e.g.: "# Ranks Name : My Ranks"
if (Pattern.matches("^# *Ranks[ _-]?Name *:.+", line)) {
this.ranks_name = line.split(":", 2)[1];
while (this.ranks_name.startsWith(" ")) this.ranks_name = this.ranks_name.substring(1);
}
//ignore comment line
continue;
}
String[] tokens = line.split("\t");
String name = tokens[0].toUpperCase();
double score = 0;
//if there are 5 columns in the data then the rank is the last column
if (tokens.length == 5) {
//ignore rows where the expected rank value is not a valid double
try {
//gseaDefinedRanks = true;
score = Double.parseDouble(tokens[4]);
} catch (NumberFormatException nfe) {
if (lineNumber == 0) {
lineNumber++;
continue;
} else
throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number");
}
nScores++;
} else //if there are 2 columns in the data then the rank is the 2 column
if (tokens.length == 2) {
try {
score = Double.parseDouble(tokens[1]);
} catch (NumberFormatException nfe) {
if (lineNumber == 0) {
lineNumber++;
continue;
} else
throw new IllegalThreadStateException("rank value for" + tokens[0] + "is not a valid number");
}
nScores++;
} else {
System.out.println("Invalid number of tokens line of Rank File (should be 5 or 2)");
//skip invalid line
continue;
}
if ((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap))
gseaDefinedRanks = true;
//add score to array of scores
score_collector[nScores - 1] = score;
//check to see if the gene is in the genelist
Integer genekey = map.getHashFromGene(name);
if (genekey != null) {
Rank current_ranking;
// edge compatible files.
if ((tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap)) {
current_ranking = new Rank(name, score, nScores);
} else {
current_ranking = new Rank(name, score);
}
ranks.put(genekey, current_ranking);
}
// Calculate Percentage. This must be a value between 0..100.
int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
taskMonitor.setProgress(percentComplete);
currentProgress++;
}
//the none of the genes are in the gene list
if (ranks.isEmpty()) {
throw new IllegalThreadStateException("None of the genes in the rank file are found in the expression file. Make sure the identifiers of the two files match.");
}
//remove Null values from collector
Double[] sort_scores = new Double[nScores];
double[] scores = new double[nScores];
for (int i = 0; i < nScores; i++) {
sort_scores[i] = score_collector[i];
scores[i] = (double) score_collector[i];
}
//after we have loaded in all the scores, sort the score to compute ranks
//create hash of scores to ranks.
HashMap<Double, Integer> score2ranks = new HashMap<Double, Integer>();
//sorts the array in descending order
Arrays.sort(sort_scores, Collections.reverseOrder());
//just signed statistics for instance as it will sort them in the opposite direction.
if (sort_scores[0] <= 1 && sort_scores[sort_scores.length - 1] >= -1)
Arrays.sort(sort_scores);
for (int j = 0; j < sort_scores.length; j++) {
//check to see if this score is already enter
if (!score2ranks.containsKey(sort_scores[j]))
score2ranks.put(sort_scores[j], j);
}
//only update the ranks if we haven't already defined them using order of scores in file
if (!gseaDefinedRanks) {
for (Iterator<Integer> k = ranks.keySet().iterator(); k.hasNext(); ) {
Integer gene_key = k.next();
Rank current_ranking = ranks.get(gene_key);
Integer rank = score2ranks.get(current_ranking.getScore());
current_ranking.setRank(rank);
// update rank2gene and gene2score as well
}
}
//check to see if some of the dataset genes are not in this rank file
Set<Integer> current_genes = dataset.getDataSetGenes();
Set<Integer> current_ranks = ranks.keySet();
//intersect the genes with the ranks. only retain the genes that have ranks.
Set<Integer> intersection = new HashSet<>(current_genes);
intersection.retainAll(current_ranks);
//see if there more genes than there are ranks
if (!(intersection.size() == current_genes.size())) {
//JOptionPane.showMessageDialog(Cytoscape.getDesktop(),"Ranks for some of the genes/proteins listed in the expression file are missing. \n These genes/proteins will be excluded from ranked listing in the heat map.");
}
//create a new Ranking
Ranking new_ranking = new Ranking();
ranks.forEach(new_ranking::addRank);
//add the Ranks to the expression file ranking
dataset.getExpressionSets().addRanks(ranks_name, new_ranking);
}
Aggregations