use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class InitializeGenesetsOfInterestTask method initializeSets.
/**
 * Selects, for every data set in the map, the gene sets whose enrichment
 * results pass the user-specified thresholds and stores them as that data
 * set's "gene sets of interest". When a minimum-experiments cutoff is
 * configured, gene sets selected in fewer data sets than the cutoff are
 * removed again afterwards.
 *
 * @param tm monitor used for progress reporting; a {@link NullTaskMonitor}
 *           is substituted when this is null
 * @return false as soon as a data set without enrichment results is
 *         encountered, true otherwise
 * @throws IllegalThreadStateException if {@code throwIfMissing} is set and a
 *         gene set of interest cannot be found among the available gene sets
 */
public boolean initializeSets(TaskMonitor tm) {
    TaskMonitor monitor = (tm != null) ? tm : new NullTaskMonitor();
    // One progress step per data set.
    DiscreteTaskMonitor progress = new DiscreteTaskMonitor(monitor, map.getDataSetCount());

    Map<String, EMDataSet> datasets = map.getDataSets();

    // Only track per-gene-set occurrence counts when a cutoff was requested.
    Optional<Integer> minExperiments = map.getParams().getMinExperiments();
    Map<String, Integer> occurrences = minExperiments.isPresent() ? new HashMap<>() : null;

    for (Map.Entry<String, EMDataSet> datasetEntry : datasets.entrySet()) {
        progress.inc();
        String datasetName = datasetEntry.getKey();
        EMDataSet dataset = datasetEntry.getValue();

        // All three maps are keyed by gene set name.
        Map<String, EnrichmentResult> enrichments = dataset.getEnrichments().getEnrichments();
        Map<String, GeneSet> available = dataset.getSetOfGeneSets().getGeneSets();
        Map<String, GeneSet> selected = dataset.getGeneSetsOfInterest().getGeneSets();

        // A data set without its own gene sets falls back to the map-wide
        // collection (the GMT file is assumed to apply to all data sets).
        if (available == null || available.isEmpty()) {
            available = map.getAllGeneSets();
        }

        // Nothing to filter for this data set: give up.
        if (enrichments == null || enrichments.isEmpty()) {
            return false;
        }

        for (Map.Entry<String, EnrichmentResult> enrichmentEntry : enrichments.entrySet()) {
            String genesetName = enrichmentEntry.getKey();
            EnrichmentResult result = enrichmentEntry.getValue();

            // Update rank at max for the leading edge calculation.
            if (dataset.getMethod() == Method.GSEA) {
                Ranking ranks = dataset.getExpressionSets().getRanksByName(datasetName);
                updateRankAtMax((GSEAResult) result, ranks);
            }

            if (!result.geneSetOfInterest(map.getParams())) {
                continue;
            }

            GeneSet geneset = available.get(genesetName);
            if (geneset == null) {
                if (throwIfMissing) {
                    throw new IllegalThreadStateException("The Geneset: " + genesetName + " is not found in the GMT file.");
                }
                continue;
            }

            // While we are here, record the gene set size based on the filtered data.
            result.setGsSize(geneset.getGenes().size());
            if (occurrences != null) {
                occurrences.merge(genesetName, 1, Integer::sum);
            }
            selected.put(genesetName, geneset);
        }
    }

    // Drop gene sets that were selected in too few data sets.
    if (occurrences != null) {
        for (EMDataSet dataset : datasets.values()) {
            dataset.getGeneSetsOfInterest().getGeneSets().keySet()
                    .removeIf(name -> occurrences.getOrDefault(name, 0) < minExperiments.get());
        }
    }
    return true;
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ParseGREATEnrichmentResults method parseLines.
/**
 * Parses the lines of a GREAT enrichment export into gene sets and
 * enrichment results on the given data set.
 *
 * <p>The header line is located by scanning for a 24-column line whose
 * fourth field is "BinomRank" (binomial and hypergeometric data) or a
 * 20-column line whose fourth field is "Rank" (background format, which is
 * read as hypergeometric data only). Rows after the header that do not have
 * the same column count are ignored. If no header line is found, or
 * {@code lines} is empty, no rows are parsed.
 *
 * <p>Which p-value / FDR pair is stored depends on the map's
 * {@code GreatFilter}: HYPER, BINOM, BOTH (maximum of the two) or EITHER
 * (minimum of the two). When the same gene set name occurs more than once,
 * the result with the lower p-value is kept (ticket #149).
 *
 * @param lines       the lines of the GREAT results file
 * @param dataset     the data set whose gene sets and enrichments are populated
 * @param taskMonitor monitor for progress reporting; a {@link NullTaskMonitor}
 *                    is substituted when this is null
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Enrichment Result file");

    boolean hasBackground = false;
    EMCreationParameters params = dataset.getMap().getParams();
    GreatFilter filterType = dataset.getMap().getParams().getGreatFilter();
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Great Results file - " + maxValue + " rows");

    // GREAT files always carry an FDR.
    dataset.getMap().getParams().setFDR(true);

    // Scan for the header line. The list is only indexed inside the loop so
    // that an empty file no longer fails with IndexOutOfBoundsException.
    int headerIndex = 0;
    for (; headerIndex < lines.size(); headerIndex++) {
        String[] header = lines.get(headerIndex).split("\t");
        if (header.length == 24 && header[3].equalsIgnoreCase("BinomRank")) {
            break;
        }
        // There is no binom rank and no binomial data.
        if (header.length == 20 && header[3].equalsIgnoreCase("Rank")) {
            hasBackground = true;
            break;
        }
    }

    // Column layout depends on which of the two formats was detected.
    final int expectedColumns = hasBackground ? 20 : 24;
    final int geneColumn = hasBackground ? 18 : 23;
    final int sizeColumn = hasBackground ? 15 : 19;

    // Go through the rest of the lines.
    for (int i = headerIndex + 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");

        // There are extra lines at the end of the file that should be ignored.
        if (tokens.length != expectedColumns)
            continue;

        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        int gs_size = 0;

        // Details of the export file:
        // http://bejerano.stanford.edu/help/display/GREAT/Export
        // The gene set name is built from the second and third fields;
        // the third field alone is used as the description.
        final String name = tokens[1].trim() + "-" + tokens[2].trim();
        final String description = tokens[2].trim();

        // When there are two different species the gene set may already
        // exist; in that case the genes from this row are added to it.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder = builder.addAll(genesets.get(name).getGenes());

        for (String geneToken : tokens[geneColumn].split(",")) {
            String gene = geneToken.toUpperCase();
            // Reuse the existing hash for known genes, otherwise register the gene.
            if (map.containsGene(gene)) {
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }

        // Finished parsing that gene set; store it under its name.
        GeneSet gs = new GeneSet(name, description, builder.build());
        genesets.put(name, gs);

        // GREAT runs two tests: the binomial over regions and the
        // hypergeometric over genes. Empty fields keep the default of 1.
        double hyper_pvalue = 1;
        double hyper_fdr = 1;
        double binom_pvalue = 1;
        double binom_fdr = 1;
        if (!hasBackground) {
            if (!tokens[4].isEmpty())
                binom_pvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                binom_fdr = Double.parseDouble(tokens[6]);
            if (!tokens[13].isEmpty())
                hyper_pvalue = Double.parseDouble(tokens[13]);
            if (!tokens[15].isEmpty())
                hyper_fdr = Double.parseDouble(tokens[15]);
        } else {
            if (!tokens[4].isEmpty())
                hyper_pvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                hyper_fdr = Double.parseDouble(tokens[6]);
        }

        if (filterType == GreatFilter.HYPER) {
            pvalue = hyper_pvalue;
            FDRqvalue = hyper_fdr;
        } else if (filterType == GreatFilter.BINOM) {
            pvalue = binom_pvalue;
            FDRqvalue = binom_fdr;
        } else if (filterType == GreatFilter.BOTH) {
            pvalue = Math.max(hyper_pvalue, binom_pvalue);
            FDRqvalue = Math.max(hyper_fdr, binom_fdr);
        } else if (filterType == GreatFilter.EITHER) {
            pvalue = Math.min(hyper_pvalue, binom_pvalue);
            FDRqvalue = Math.min(hyper_fdr, binom_fdr);
        } else {
            System.out.println("Invalid attribute setting for GREAT p-value specification");
        }

        // Keep track of the minimum p-value / q-value seen so far.
        if (pvalue < params.getPvalueMin())
            params.setPvalueMin(pvalue);
        if (FDRqvalue < params.getQvalueMin())
            params.setQvalueMin(FDRqvalue);

        if (!tokens[sizeColumn].isEmpty())
            gs_size = Integer.parseInt(tokens[sizeColumn]);

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Report progress as a fraction. Assuming this is
        // org.cytoscape.work.TaskMonitor, setProgress expects a value in
        // [0.0, 1.0], not a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear in both phenotypes; keep the result with
        // the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ParseGSEAEnrichmentResults method parseLines.
/**
 * Parses the lines of a GSEA enrichment results file into
 * {@code GSEAResult} entries on the given data set.
 *
 * <p>The first line is treated as the header and skipped. For every other
 * line the fields at indices 0 and 3 through 9 are read as: name, size, ES,
 * NES, nominal p-value, FDR q-value, FWER q-value and rank at max. Empty
 * numeric fields keep their defaults. Results are stored under the
 * upper-cased, trimmed gene set name; a later line with the same name
 * replaces an earlier one.
 *
 * @param lines       the lines of the GSEA results file
 * @param dataset     the data set whose enrichments are populated
 * @param taskMonitor monitor for progress reporting; a {@link NullTaskMonitor}
 *                    is substituted when this is null
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    // The title previously said "Bingo", copied from the Bingo parser.
    taskMonitor.setTitle("Parsing GSEA Enrichment Result file");

    dataset.getMap().getParams().setFDR(true);

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Enrichment Results file - " + maxValue + " rows");

    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    // Skip the first line, which just has the field names (start at i = 1).
    for (int i = 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");

        int size = 0;
        double ES = 0.0;
        double NES = 0.0;
        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        double FWERqvalue = 1.0;
        int rankAtMax = -1;
        double scoreAtMax = DefaultScoreAtMax;

        // The first column of the file is the name of the gene set.
        String name = tokens[0].toUpperCase().trim();

        // The fourth column is the size of the gene set.
        if (!tokens[3].isEmpty()) {
            size = Integer.parseInt(tokens[3]);
        }
        // The fifth column is the enrichment score (ES).
        if (!tokens[4].isEmpty()) {
            ES = Double.parseDouble(tokens[4]);
        }
        // The sixth column is the normalized enrichment score (NES).
        if (!tokens[5].isEmpty()) {
            NES = Double.parseDouble(tokens[5]);
        }
        // The seventh column is the nominal p-value.
        if (!tokens[6].isEmpty()) {
            pvalue = Double.parseDouble(tokens[6]);
        }
        // The eighth column is the FDR q-value.
        if (!tokens[7].isEmpty()) {
            FDRqvalue = Double.parseDouble(tokens[7]);
        }
        // The ninth column is the FWER q-value.
        if (!tokens[8].isEmpty()) {
            FWERqvalue = Double.parseDouble(tokens[8]);
        }
        // The tenth column is the rank at max.
        if (!tokens[9].isEmpty()) {
            rankAtMax = Integer.parseInt(tokens[9]);
        }

        GSEAResult result = new GSEAResult(name, size, ES, NES, pvalue, FDRqvalue, FWERqvalue, rankAtMax, scoreAtMax);

        // Report progress as a fraction. Assuming this is
        // org.cytoscape.work.TaskMonitor, setProgress expects a value in
        // [0.0, 1.0], not a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        results.put(name, result);
    }
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ParseGenericEnrichmentResults method parseLines.
/**
 * Parses the lines of a generic enrichment results file into
 * {@code GenericResult} entries on the given data set.
 *
 * <p>The first line is treated as the header and skipped. Each remaining
 * line is tab-separated: name, description, p-value, and optionally an FDR
 * q-value (4th field), a phenotype or signed number (5th field) and a
 * comma-separated gene list (6th field). The gene list is only used to
 * build gene sets when the data set has no gene sets of its own. When the
 * same gene set name occurs more than once, the result with the lower
 * p-value is kept (ticket #149). The map's FDR flag is switched on if any
 * line supplied a q-value.
 *
 * @param lines       the lines of the generic results file
 * @param dataset     the data set whose gene sets and enrichments are populated
 * @param taskMonitor monitor for progress reporting; a {@link NullTaskMonitor}
 *                    is substituted when this is null
 * @throws IllegalThreadStateException if the 5th field is neither one of the
 *         data set's phenotypes nor a parsable number
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Generic Result file");

    // Current gene sets, used to look up gene set sizes for the results.
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");
    boolean FDR = false;

    // The header line is never inspected, so it is no longer fetched up
    // front; that fetch failed with IndexOutOfBoundsException on an empty file.
    EnrichmentMap map = dataset.getMap();
    SetOfEnrichmentResults enrichments = dataset.getEnrichments();
    Map<String, EnrichmentResult> results = enrichments.getEnrichments();
    String upPhenotype = enrichments.getPhenotype1();
    String downPhenotype = enrichments.getPhenotype2();

    // If there are no gene sets, populate them from the generic file. This
    // is only possible when the 6th column lists the genes of the gene set.
    boolean populate_gs = false;
    if (genesets == null || genesets.isEmpty())
        populate_gs = true;
    else
        // As this is the default for gprofiler, use the description in the
        // visual style instead of the formatted name, but only if a GMT was
        // supplied; the generic output file alone has no description field.
        dataset.getMap().getParams().setEMgmt(true);

    // Skip the first line, which just has the field names (start at i = 1).
    for (int i = 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");
        // Column count is taken per line because some lines have missing values.
        int length = tokens.length;

        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        GenericResult result;
        int gs_size = 0;
        double NES = 1.0;

        // The first column of the file is the name of the gene set.
        final String name = tokens[0].toUpperCase().trim();
        final String description = tokens[1].toUpperCase();

        if (genesets.containsKey(name)) {
            gs_size = genesets.get(name).getGenes().size();
        }

        // The third column is the nominal p-value (String.split never
        // yields null elements, so only emptiness needs checking).
        if (!tokens[2].isEmpty()) {
            pvalue = Double.parseDouble(tokens[2]);
        }

        if (length > 3) {
            // The fourth column is the FDR q-value.
            if (!tokens[3].isEmpty()) {
                FDRqvalue = Double.parseDouble(tokens[3]);
                FDR = true;
            }

            if (length > 4) {
                // The fifth column is the phenotype; if it is a number,
                // the only important part is the sign.
                if (!tokens[4].isEmpty()) {
                    if (tokens[4].equalsIgnoreCase(upPhenotype)) {
                        NES = 1.0;
                    } else if (tokens[4].equalsIgnoreCase(downPhenotype)) {
                        NES = -1.0;
                    } else {
                        // See if the user has specified the phenotype as a number.
                        try {
                            NES = Double.parseDouble(tokens[4]);
                        } catch (NumberFormatException nfe) {
                            IllegalThreadStateException invalid = new IllegalThreadStateException(tokens[4] + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number.");
                            // Keep the original parse failure as the cause.
                            invalid.initCause(nfe);
                            throw invalid;
                        }
                    }
                }

                if (length > 5 && populate_gs) {
                    // The sixth column holds all the genes of this gene set.
                    String[] gene_tokens = tokens[5].split(",");
                    ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
                    for (String token : gene_tokens) {
                        String gene = token.trim().toUpperCase();
                        // Reuse the existing hash for known genes, otherwise register the gene.
                        if (map.containsGene(gene)) {
                            builder.add(map.getHashFromGene(gene));
                        } else if (!gene.isEmpty()) {
                            Integer hash = map.addGene(gene).get();
                            builder.add(hash);
                        }
                    }
                    GeneSet gs = new GeneSet(name, description, builder.build());
                    gs_size = gs.getGenes().size();
                    // Put the new gene set into the set of gene sets.
                    genesets.put(name, gs);
                }
                result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue, NES);
            } else {
                result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);
            }
        } else {
            result = new GenericResult(name, description, pvalue, gs_size);
        }

        // Report progress as a fraction. Assuming this is
        // org.cytoscape.work.TaskMonitor, setProgress expects a value in
        // [0.0, 1.0], not a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear in both phenotypes; keep the result with
        // the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    if (FDR)
        dataset.getMap().getParams().setFDR(true);
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ParseBingoEnrichmentResults method parseLines.
/**
 * Parses the lines of a BiNGO results file into gene sets and
 * {@code GenericResult} entries on the given data set.
 *
 * <p>BiNGO files start with a block of informational lines. The header is
 * located by scanning for a 9-column line whose first field is "GO-ID" and
 * whose last field is "Genes in test set". Columns of the result rows:
 * <pre>
 * (column 1) GO-ID       - numerical part of the GO term
 * (column 2) p-value
 * (column 3) corr p-value
 * (column 4) x - number of genes in the subset of interest with this annotation
 * (column 5) n - number of genes in the universe with this annotation
 * (column 6) X - number of genes in the subset
 * (column 7) N - number of genes in the universe
 * (column 8) Description - GO term name (used as gene set name and description)
 * (column 9) Genes in test set - '|'-separated genes annotated to this term
 * </pre>
 * Column 2 is stored as the p-value, column 3 as the FDR q-value and
 * column 4 as the gene set size. When the same gene set name occurs more
 * than once, the result with the lower p-value is kept (ticket #149).
 *
 * @param lines       the lines of the BiNGO results file
 * @param dataset     the data set whose gene sets and enrichments are populated
 * @param taskMonitor monitor for progress reporting; a {@link NullTaskMonitor}
 *                    is substituted when this is null
 * @throws IllegalThreadStateException if no header line is found, including
 *         when {@code lines} is empty
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Bingo Enrichment Result file");

    // With BiNGO results there are no gene sets defined up front; they are
    // built from the result rows themselves.
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    // The message previously said "Generic", copied from the generic parser.
    taskMonitor.setStatusMessage("Parsing Bingo Results file - " + maxValue + " rows");

    // Scan for the header line. The list is only indexed inside the loop so
    // that an empty file reaches the "missing data" error below instead of
    // failing with IndexOutOfBoundsException.
    int headerIndex = 0;
    for (; headerIndex < lines.size(); headerIndex++) {
        String[] header = lines.get(headerIndex).split("\t");
        if ((header.length == 9) && header[0].equalsIgnoreCase("GO-ID") && header[8].equalsIgnoreCase("Genes in test set")) {
            break;
        }
    }
    if (headerIndex == lines.size())
        throw new IllegalThreadStateException("Bingo results file is missing data.");

    for (int i = headerIndex + 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");

        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        int gs_size = 0;

        // The 8th column provides both the gene set name and its description.
        final String name = tokens[7].toUpperCase().trim();
        final String description = tokens[7].toUpperCase();

        // When there are two different species the gene set may already
        // exist; in that case the genes from this row are added to it.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder = builder.addAll(genesets.get(name).getGenes());

        // The 9th column is the '|'-separated gene list.
        for (String geneToken : tokens[8].split("\\|")) {
            String gene = geneToken.toUpperCase();
            // Reuse the existing hash for known genes, otherwise register the gene.
            if (map.containsGene(gene)) {
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }

        // Finished parsing that gene set; store it under its name.
        GeneSet gs = new GeneSet(name, description, builder.build());
        genesets.put(name, gs);

        // The 2nd column is the nominal p-value.
        if (!tokens[1].isEmpty()) {
            pvalue = Double.parseDouble(tokens[1]);
        }
        // The 4th column (x) is the size of the gene set, restricted by the gene list.
        if (!tokens[3].isEmpty()) {
            gs_size = Integer.parseInt(tokens[3]);
        }
        // The 3rd column is the corrected p-value.
        if (!tokens[2].isEmpty()) {
            FDRqvalue = Double.parseDouble(tokens[2]);
        }

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Report progress as a fraction. Assuming this is
        // org.cytoscape.work.TaskMonitor, setProgress expects a value in
        // [0.0, 1.0], not a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear in both phenotypes; keep the result with
        // the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    // BiNGO results always carry a corrected p-value.
    dataset.getMap().getParams().setFDR(true);
}
Aggregations