Use of org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult in project EnrichmentMapApp by BaderLab.
Class ParseGREATEnrichmentResults, method parseLines.
/**
 * Parses the lines of a GREAT results export and stores the gene sets and
 * enrichment results it describes in {@code dataset}.
 *
 * <p>Lines are scanned until a header row is found: either 24 tab-separated
 * columns with {@code BinomRank} in the fourth column, or 20 columns with
 * {@code Rank} in the fourth column (handled as the "background" layout, which
 * carries no binomial columns). Every later row with the matching column count
 * produces one {@code GeneSet} and one {@code GenericResult}; rows with any
 * other column count are ignored. If no header row is found, nothing is parsed.
 *
 * <p>When a result with the same name is already stored, the one with the
 * lower p-value is kept.
 *
 * @param lines       the lines of the GREAT results file
 * @param dataset     data set whose gene sets, enrichments and creation parameters are updated
 * @param taskMonitor receives title, status and progress; a {@code NullTaskMonitor} is used when {@code null}
 * @throws NumberFormatException if a non-empty p-value, FDR or size column is not numeric
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Enrichment Result file");

    EnrichmentMap map = dataset.getMap();
    EMCreationParameters params = map.getParams();
    // Which GREAT test(s) the user wants to filter on (hypergeometric, binomial, both, either).
    GreatFilter filterType = params.getGreatFilter();
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Great Results file - " + maxValue + " rows");
    // GREAT files always carry an FDR column.
    params.setFDR(true);

    // Locate the header row. The loop bound also covers an empty file, which
    // previously failed on an unconditional lines.get(0).
    boolean hasBackground = false;
    int headerIndex = 0;
    for (; headerIndex < maxValue; headerIndex++) {
        String[] header = lines.get(headerIndex).split("\t");
        if (header.length == 24 && header[3].equalsIgnoreCase("BinomRank"))
            break;
        // No binom rank means there is no binomial data in this file.
        if (header.length == 20 && header[3].equalsIgnoreCase("Rank")) {
            hasBackground = true;
            break;
        }
    }

    // Column layout depends on which header was found.
    final int expectedColumns = hasBackground ? 20 : 24;
    final int geneColumn = hasBackground ? 18 : 23;
    final int sizeColumn = hasBackground ? 15 : 19;

    for (int i = headerIndex + 1; i < maxValue; i++) {
        // Cytoscape's TaskMonitor.setProgress takes a fraction in 0.0..1.0, not a percentage.
        taskMonitor.setProgress((double) i / maxValue);

        String[] tokens = lines.get(i).split("\t");
        // There are extra lines at the end of the file that should be ignored.
        if (tokens.length != expectedColumns)
            continue;

        // Details of the export file: http://bejerano.stanford.edu/help/display/GREAT/Export
        // The result name joins the second and third columns; the third column alone is the description.
        final String name = tokens[1].trim() + "-" + tokens[2].trim();
        final String description = tokens[2].trim();

        // If a gene set with this name already exists (e.g. two species), merge the new genes into it.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder.addAll(genesets.get(name).getGenes());

        for (String geneToken : tokens[geneColumn].split(",")) {
            // Trim so "A, B" and "A,B" resolve to the same genes, as the generic parser does.
            String gene = geneToken.trim().toUpperCase();
            if (map.containsGene(gene)) {
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        genesets.put(name, new GeneSet(name, description, builder.build()));

        // GREAT runs two tests: binomial over regions and hypergeometric over genes.
        // Full layout: binomial p-value/FDR in columns 5/7, hypergeometric in columns 14/16 (1-based).
        // Background layout: only hypergeometric values, in columns 5/7.
        double hyperPvalue = 1;
        double hyperFdr = 1;
        double binomPvalue = 1;
        double binomFdr = 1;
        if (!hasBackground) {
            if (!tokens[4].isEmpty())
                binomPvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                binomFdr = Double.parseDouble(tokens[6]);
            if (!tokens[13].isEmpty())
                hyperPvalue = Double.parseDouble(tokens[13]);
            if (!tokens[15].isEmpty())
                hyperFdr = Double.parseDouble(tokens[15]);
        } else {
            if (!tokens[4].isEmpty())
                hyperPvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                hyperFdr = Double.parseDouble(tokens[6]);
        }

        double pvalue = 1.0;
        double fdrQvalue = 1.0;
        if (filterType == GreatFilter.HYPER) {
            pvalue = hyperPvalue;
            fdrQvalue = hyperFdr;
        } else if (filterType == GreatFilter.BINOM) {
            pvalue = binomPvalue;
            fdrQvalue = binomFdr;
        } else if (filterType == GreatFilter.BOTH) {
            // Both tests must pass, so use the weaker (larger) value of each pair.
            pvalue = Math.max(hyperPvalue, binomPvalue);
            fdrQvalue = Math.max(hyperFdr, binomFdr);
        } else if (filterType == GreatFilter.EITHER) {
            // Either test may pass, so use the stronger (smaller) value of each pair.
            pvalue = Math.min(hyperPvalue, binomPvalue);
            fdrQvalue = Math.min(hyperFdr, binomFdr);
        } else {
            System.out.println("Invalid attribute setting for GREAT p-value specification");
        }

        // Keep track of the minimum p-value/q-value to better calibrate the sliders.
        if (pvalue < params.getPvalueMin())
            params.setPvalueMin(pvalue);
        if (fdrQvalue < params.getQvalueMin())
            params.setQvalueMin(fdrQvalue);

        // The gene set size column differs between the two layouts.
        int gsSize = 0;
        if (!tokens[sizeColumn].isEmpty())
            gsSize = Integer.parseInt(tokens[sizeColumn]);

        GenericResult result = new GenericResult(name, description, pvalue, gsSize, fdrQvalue);

        // A gene set may appear more than once (e.g. in both phenotypes); keep the
        // result with the lower p-value. (ticket #149)
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }
}
Use of org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult in project EnrichmentMapApp by BaderLab.
Class ParseGSEAEnrichmentResults, method parseLines.
/**
 * Parses the rows of a GSEA enrichment results table and stores one
 * {@code GSEAResult} per row in the data set's enrichment map, keyed by the
 * upper-cased, trimmed gene set name from the first column.
 *
 * <p>The first line is treated as the header and skipped. Columns read
 * (1-based): 4 size, 5 ES, 6 NES, 7 nominal p-value, 8 FDR q-value,
 * 9 FWER q-value, 10 rank at max. A column that is empty or absent keeps its
 * default value. Blank lines are skipped.
 *
 * @param lines       the lines of the GSEA results file, header first
 * @param dataset     data set whose enrichments and creation parameters are updated
 * @param taskMonitor receives title, status and progress; a {@code NullTaskMonitor} is used when {@code null}
 * @throws NumberFormatException if a non-empty numeric column cannot be parsed
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    // This is the GSEA parser; the title previously said "Bingo" (copy-paste slip).
    taskMonitor.setTitle("Parsing GSEA Enrichment Result file");

    // GSEA results always carry an FDR column.
    dataset.getMap().getParams().setFDR(true);

    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Enrichment Results file - " + maxValue + " rows");
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    // Start at 1: the first line only holds the field names.
    for (int i = 1; i < maxValue; i++) {
        // Cytoscape's TaskMonitor.setProgress takes a fraction in 0.0..1.0, not a percentage.
        taskMonitor.setProgress((double) i / maxValue);

        String line = lines.get(i);
        // A blank line (typically at the end of the file) holds no result.
        if (line.trim().isEmpty())
            continue;

        // String.split drops trailing empty strings, so a row whose last columns
        // are empty yields fewer tokens; every column read below is bounds-checked.
        String[] tokens = line.split("\t");
        int columns = tokens.length;

        int size = 0;
        double es = 0.0;
        double nes = 0.0;
        double pvalue = 1.0;
        double fdrQvalue = 1.0;
        double fwerQvalue = 1.0;
        int rankAtMax = -1;
        double scoreAtMax = DefaultScoreAtMax;

        // The first column of the file is the name of the gene set.
        String name = tokens[0].toUpperCase().trim();

        // The fourth column is the size of the gene set.
        if (columns > 3 && !tokens[3].isEmpty())
            size = Integer.parseInt(tokens[3]);
        // The fifth column is the enrichment score (ES).
        if (columns > 4 && !tokens[4].isEmpty())
            es = Double.parseDouble(tokens[4]);
        // The sixth column is the normalized enrichment score (NES).
        if (columns > 5 && !tokens[5].isEmpty())
            nes = Double.parseDouble(tokens[5]);
        // The seventh column is the nominal p-value.
        if (columns > 6 && !tokens[6].isEmpty())
            pvalue = Double.parseDouble(tokens[6]);
        // The eighth column is the FDR q-value.
        if (columns > 7 && !tokens[7].isEmpty())
            fdrQvalue = Double.parseDouble(tokens[7]);
        // The ninth column is the FWER q-value.
        if (columns > 8 && !tokens[8].isEmpty())
            fwerQvalue = Double.parseDouble(tokens[8]);
        // The tenth column is the rank at max.
        if (columns > 9 && !tokens[9].isEmpty())
            rankAtMax = Integer.parseInt(tokens[9]);

        GSEAResult result = new GSEAResult(name, size, es, nes, pvalue, fdrQvalue, fwerQvalue, rankAtMax, scoreAtMax);
        results.put(name, result);
    }
}
Use of org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult in project EnrichmentMapApp by BaderLab.
Class ParseGenericEnrichmentResults, method parseLines.
/**
 * Parses the rows of a generic enrichment results table and stores one
 * {@code GenericResult} per gene set name in the data set's enrichment map.
 *
 * <p>The first line is treated as the header and skipped. Columns read
 * (1-based): 1 name, 2 description, 3 p-value, 4 FDR q-value (optional),
 * 5 phenotype (optional: either of the data set's two phenotype labels or a
 * number), 6 comma-separated gene list (optional; only used when the data set
 * has no gene sets yet). Missing or empty columns keep their defaults, and
 * blank lines are skipped. When a name occurs more than once, the result with
 * the lower p-value is kept.
 *
 * @param lines       the lines of the generic results file, header first
 * @param dataset     data set whose gene sets, enrichments and creation parameters are updated
 * @param taskMonitor receives title, status and progress; a {@code NullTaskMonitor} is used when {@code null}
 * @throws IllegalThreadStateException if the phenotype column is neither a known phenotype label nor a number
 * @throws NumberFormatException       if a non-empty p-value or FDR column is not numeric
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Generic Result file");

    // Current gene sets: used to look up gene set sizes, or populated from the
    // file itself when none were loaded.
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();

    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");

    EnrichmentMap map = dataset.getMap();
    SetOfEnrichmentResults enrichments = dataset.getEnrichments();
    Map<String, EnrichmentResult> results = enrichments.getEnrichments();
    String upPhenotype = enrichments.getPhenotype1();
    String downPhenotype = enrichments.getPhenotype2();

    // If there are no gene sets, populate them from the sixth column of the file.
    boolean populateGs = (genesets == null || genesets.isEmpty());
    if (!populateGs) {
        // A GMT was supplied, so use the description in the visual style instead of the
        // formatted name (the default for g:Profiler). Without a GMT the generic file
        // has no usable description field.
        dataset.getMap().getParams().setEMgmt(true);
    }

    boolean fdr = false;

    // Start at 1: the first line only holds the field names.
    for (int i = 1; i < maxValue; i++) {
        // Cytoscape's TaskMonitor.setProgress takes a fraction in 0.0..1.0, not a percentage.
        taskMonitor.setProgress((double) i / maxValue);

        String line = lines.get(i);
        // A blank line (typically at the end of the file) holds no result.
        if (line.trim().isEmpty())
            continue;

        // String.split drops trailing empty strings, so the column count varies per
        // row; every column read below is bounds-checked.
        String[] tokens = line.split("\t");
        int length = tokens.length;

        double pvalue = 1.0;
        double fdrQvalue = 1.0;
        int gsSize = 0;
        double nes = 1.0;
        GenericResult result;

        // The first column is the gene set name, the second its description.
        final String name = tokens[0].toUpperCase().trim();
        final String description = length > 1 ? tokens[1].toUpperCase() : "";

        // genesets was null-checked above, so it must be guarded here as well.
        if (genesets != null && genesets.containsKey(name)) {
            gsSize = genesets.get(name).getGenes().size();
        }

        // The third column is the nominal p-value.
        if (length > 2 && !tokens[2].isEmpty()) {
            pvalue = Double.parseDouble(tokens[2]);
        }

        if (length > 3) {
            // The fourth column is the FDR q-value.
            if (!tokens[3].isEmpty()) {
                fdrQvalue = Double.parseDouble(tokens[3]);
                fdr = true;
            }

            if (length > 4) {
                // The fifth column is the phenotype: a label matching one of the two
                // phenotypes, or a number of which only the sign matters.
                if (!tokens[4].isEmpty()) {
                    if (tokens[4].equalsIgnoreCase(upPhenotype)) {
                        nes = 1.0;
                    } else if (tokens[4].equalsIgnoreCase(downPhenotype)) {
                        nes = -1.0;
                    } else {
                        try {
                            nes = Double.parseDouble(tokens[4]);
                        } catch (NumberFormatException nfe) {
                            // Exception type kept for existing callers; the cause is attached for diagnosis.
                            IllegalThreadStateException invalid = new IllegalThreadStateException(tokens[4] + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number.");
                            invalid.initCause(nfe);
                            throw invalid;
                        }
                    }
                }

                // The sixth column lists the genes of this gene set.
                if (length > 5 && populateGs) {
                    ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
                    for (String token : tokens[5].split(",")) {
                        String gene = token.trim().toUpperCase();
                        // Reuse the key of a known gene, otherwise register the gene first.
                        if (map.containsGene(gene)) {
                            builder.add(map.getHashFromGene(gene));
                        } else if (!gene.isEmpty()) {
                            Integer hash = map.addGene(gene).get();
                            builder.add(hash);
                        }
                    }
                    GeneSet gs = new GeneSet(name, description, builder.build());
                    gsSize = gs.getGenes().size();
                    if (genesets != null)
                        genesets.put(name, gs);
                }

                result = new GenericResult(name, description, pvalue, gsSize, fdrQvalue, nes);
            } else {
                result = new GenericResult(name, description, pvalue, gsSize, fdrQvalue);
            }
        } else {
            result = new GenericResult(name, description, pvalue, gsSize);
        }

        // A gene set may appear in both phenotypes; keep the result with the lower
        // p-value. (ticket #149)
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    // Only flag FDR support when at least one row actually carried a q-value.
    if (fdr)
        dataset.getMap().getParams().setFDR(fdr);
}
Use of org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult in project EnrichmentMapApp by BaderLab.
Class FileReaderTest, method testGenericFileReader_5columns.
/**
 * Reads the five-column generic enrichment fixture and checks the p-value,
 * FDR q-value and phenotype sign (NES) parsed for each of its four gene sets.
 */
@Test
public void testGenericFileReader_5columns(Provider<EnrichmentMapParameters> empFactory) throws Exception {
    // Fixture: generic enrichment results with name, description, p-value, q-value and phenotype.
    final String enrichmentFile = "src/test/resources/org/baderlab/csplugins/enrichmentmap/generic_enr_5col.txt";

    // Fresh parameters pointing data set 1 at the fixture.
    EnrichmentMapParameters params = empFactory.get();
    params.getFiles().get(LegacySupport.DATASET1).setEnrichmentFileName1(enrichmentFile);

    // Build the map and its default data set.
    EnrichmentMap map = new EnrichmentMap(params.getCreationParameters(), serviceRegistrar);
    Method method = EnrichmentMapParameters.stringToMethod(params.getMethod());
    DataSetFiles files = params.getFiles().get(LegacySupport.DATASET1);
    EMDataSet dataset = map.createDataSet(LegacySupport.DATASET1, method, files);

    // Nothing has been parsed yet.
    int sizeBeforeParsing = map.getDataSet(LegacySupport.DATASET1).getEnrichments().getEnrichments().size();
    assertEquals(0, sizeBeforeParsing);

    // Parse the fixture.
    ParseGenericEnrichmentResults task = new ParseGenericEnrichmentResults(dataset);
    task.run(taskMonitor);

    Map<String, EnrichmentResult> results = map.getDataSet(LegacySupport.DATASET1).getEnrichments().getEnrichments();
    assertEquals(4, results.size());

    // Expected values per gene set, in the order the checks are made.
    final String[] ids = { "GO:0000346", "GO:0030904", "GO:0008623", "GO:0046540" };
    final double[] expectedPvalues = { 0.01, 0.05, 0.05, 5.60E-42 };
    final double[] expectedFdrQvalues = { 0.02, 0.10, 0.12, 0.03 };
    final double[] expectedNes = { 1.0, 1.0, -1.0, -1.0 };

    // p-values
    for (int idx = 0; idx < ids.length; idx++) {
        assertEquals(expectedPvalues[idx], ((GenericResult) results.get(ids[idx])).getPvalue(), 0.0);
    }
    // FDR q-values
    for (int idx = 0; idx < ids.length; idx++) {
        assertEquals(expectedFdrQvalues[idx], ((GenericResult) results.get(ids[idx])).getFdrqvalue(), 0.0);
    }
    // phenotypes, encoded as the sign of NES
    for (int idx = 0; idx < ids.length; idx++) {
        assertEquals(expectedNes[idx], ((GenericResult) results.get(ids[idx])).getNES(), 0.0);
    }
}
Use of org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult in project EnrichmentMapApp by BaderLab.
Class FileReaderTest, method testGSEAEDBEnrichmentsReader.
/**
 * Reads a GSEA {@code results.edb} fixture through the EDB enrichment parser
 * and checks the number of results plus the statistics of two gene sets.
 */
@Test
public void testGSEAEDBEnrichmentsReader(Provider<EnrichmentMapParameters> empFactory) throws Exception {
    // Fixture: the edb file produced by a GSEA run.
    final String edbFile = "src/test/resources/org/baderlab/csplugins/enrichmentmap/task/LoadDataset/GSEA_example_results/edb/results.edb";
    final String activatorKey = "PROTEASOME ACTIVATOR COMPLEX%GO%GO:0008537";
    final String complexKey = "PROTEASOME COMPLEX%GO%GO:0000502";

    // Fresh parameters pointing data set 1 at the fixture.
    EnrichmentMapParameters params = empFactory.get();
    params.getFiles().get(LegacySupport.DATASET1).setEnrichmentFileName1(edbFile);

    // Build the map and its default data set.
    EnrichmentMap map = new EnrichmentMap(params.getCreationParameters(), serviceRegistrar);
    Method method = EnrichmentMapParameters.stringToMethod(params.getMethod());
    DataSetFiles files = params.getFiles().get(LegacySupport.DATASET1);
    EMDataSet dataset = map.createDataSet(LegacySupport.DATASET1, method, files);

    // Parse the fixture.
    ParseEDBEnrichmentResults task = new ParseEDBEnrichmentResults(dataset);
    task.run(taskMonitor);

    Map<String, EnrichmentResult> enrichments = map.getDataSet(LegacySupport.DATASET1).getEnrichments().getEnrichments();
    assertEquals(14, enrichments.size());

    // First gene set: p-value, FDR, ES, NES, rank at max, size.
    GSEAResult activator = (GSEAResult) enrichments.get(activatorKey);
    assertEquals(0.2271, activator.getPvalue(), 0.0);
    assertEquals(0.2447, activator.getFdrqvalue(), 0.0);
    assertEquals(0.7852, activator.getES(), 0.0);
    assertEquals(1.1793, activator.getNES(), 0.0);
    assertEquals(6, activator.getRankAtMax());
    assertEquals(2, activator.getGsSize());

    // Second gene set: p-value, FDR, ES, NES.
    GSEAResult complex = (GSEAResult) enrichments.get(complexKey);
    assertEquals(0.4545, complex.getPvalue(), 0.0);
    assertEquals(0.8650, complex.getFdrqvalue(), 0.0);
    assertEquals(-0.4707, complex.getES(), 0.0);
    assertEquals(-0.9696, complex.getNES(), 0.0);
    // Rank at max in the edb file differs from the excel files: the excel export
    // counts from the bottom of the ranked list, while the edb file counts from
    // the top (going from positive to negative ES scores).
    assertEquals(15, complex.getRankAtMax());
    // Size of the second gene set.
    assertEquals(39, complex.getGsSize());
}
Aggregations