use of org.baderlab.csplugins.enrichmentmap.model.GenericResult in project EnrichmentMapApp by BaderLab.
the class ParseGREATEnrichmentResults method parseLines.
/**
 * Parses the lines of a GREAT export file and stores, for every term row, a gene set and a
 * {@link GenericResult} in the given data set.
 *
 * <p>Two layouts are recognised by their header row (4th column name and column count):
 * <ul>
 *   <li>24 columns, 4th column "BinomRank": binomial and hypergeometric statistics are present;</li>
 *   <li>20 columns, 4th column "Rank": background-set export with hypergeometric statistics only.</li>
 * </ul>
 * Rows before the header, and rows whose column count does not match the detected layout, are
 * ignored. If no header is found (including an empty {@code lines} list) nothing is parsed.
 *
 * <p>Which statistic ends up as the result's p-value / q-value depends on the
 * {@code GreatFilter} configured on the map parameters: HYPER, BINOM, BOTH (the larger of the
 * two) or EITHER (the smaller of the two).
 *
 * @param lines       the raw lines of the GREAT file, tab separated
 * @param dataset     the data set whose gene sets and enrichment results are populated
 * @param taskMonitor progress sink; may be {@code null}, in which case a {@code NullTaskMonitor} is used
 * @throws NumberFormatException if a non-empty statistic or size column is not numeric
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Enrichment Result file");

    EnrichmentMap map = dataset.getMap();
    EMCreationParameters params = map.getParams();
    GreatFilter filterType = params.getGreatFilter();
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Great Results file - " + maxValue + " rows");

    // GREAT files always provide an FDR.
    params.setFDR(true);

    // Scan forward until the header row is found. The loop itself handles an empty list,
    // so no line is read before the bounds check.
    boolean hasBackground = false;
    int headerIndex = 0;
    for (; headerIndex < lines.size(); headerIndex++) {
        String[] header = lines.get(headerIndex).split("\t");
        if (header.length == 24 && header[3].equalsIgnoreCase("BinomRank"))
            break;
        // No binom rank: background export without binomial data.
        if (header.length == 20 && header[3].equalsIgnoreCase("Rank")) {
            hasBackground = true;
            break;
        }
    }

    // Column positions (0-based) that differ between the two layouts.
    final int expectedColumns = hasBackground ? 20 : 24;
    final int geneColumn = hasBackground ? 18 : 23;
    final int sizeColumn = hasBackground ? 15 : 19;

    for (int i = headerIndex + 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");
        // There are extra lines at the end of the file that should be ignored.
        if (tokens.length != expectedColumns)
            continue;

        // Details of the export format: http://bejerano.stanford.edu/help/display/GREAT/Export
        // The gene set name is built from the 2nd and 3rd columns; the 3rd is also the description.
        final String name = tokens[1].trim() + "-" + tokens[2].trim();
        final String description = tokens[2].trim();

        // The gene set may already exist (e.g. two different species); if so its genes are
        // merged with the genes listed on this row.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder.addAll(genesets.get(name).getGenes());

        for (String geneToken : tokens[geneColumn].split(",")) {
            // Trim so that whitespace around a symbol does not create a distinct gene
            // (same normalisation the generic result parser applies).
            String gene = geneToken.trim().toUpperCase();
            if (map.containsGene(gene)) {
                // Already known: reuse its hash key.
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        genesets.put(name, new GeneSet(name, description, builder.build()));

        // GREAT runs two tests: binomial over regions and hypergeometric over genes.
        // Layout with binomial data: binomial p-value/FDR in columns 5/7, hypergeometric in 14/16.
        // Background layout: only hypergeometric values, in columns 5/7.
        double hyper_pvalue = 1;
        double hyper_fdr = 1;
        double binom_pvalue = 1;
        double binom_fdr = 1;
        if (!hasBackground) {
            if (!tokens[4].isEmpty())
                binom_pvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                binom_fdr = Double.parseDouble(tokens[6]);
            if (!tokens[13].isEmpty())
                hyper_pvalue = Double.parseDouble(tokens[13]);
            if (!tokens[15].isEmpty())
                hyper_fdr = Double.parseDouble(tokens[15]);
        } else {
            // NOTE(review): binomial values stay at 1 here, so BINOM and BOTH filters always
            // yield 1.0 for background exports - confirm this is intended.
            if (!tokens[4].isEmpty())
                hyper_pvalue = Double.parseDouble(tokens[4]);
            if (!tokens[6].isEmpty())
                hyper_fdr = Double.parseDouble(tokens[6]);
        }

        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        if (filterType == GreatFilter.HYPER) {
            pvalue = hyper_pvalue;
            FDRqvalue = hyper_fdr;
        } else if (filterType == GreatFilter.BINOM) {
            pvalue = binom_pvalue;
            FDRqvalue = binom_fdr;
        } else if (filterType == GreatFilter.BOTH) {
            pvalue = Math.max(hyper_pvalue, binom_pvalue);
            FDRqvalue = Math.max(hyper_fdr, binom_fdr);
        } else if (filterType == GreatFilter.EITHER) {
            pvalue = Math.min(hyper_pvalue, binom_pvalue);
            FDRqvalue = Math.min(hyper_fdr, binom_fdr);
        } else {
            System.out.println("Invalid attribute setting for GREAT p-value specification");
        }

        // Keep track of the minimum p-value/q-value so the sliders can be scaled properly.
        if (pvalue < params.getPvalueMin())
            params.setPvalueMin(pvalue);
        if (FDRqvalue < params.getQvalueMin())
            params.setQvalueMin(FDRqvalue);

        // Gene set size: 20th column normally, 16th column for background exports.
        int gs_size = 0;
        if (!tokens[sizeColumn].isEmpty())
            gs_size = Integer.parseInt(tokens[sizeColumn]);

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Cytoscape's TaskMonitor.setProgress takes a fraction in [0.0, 1.0], not a percentage.
        // maxValue is > 0 here because the loop only runs when there are lines.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear more than once (e.g. in both phenotypes); keep the result
        // with the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }
}
use of org.baderlab.csplugins.enrichmentmap.model.GenericResult in project EnrichmentMapApp by BaderLab.
the class ParseGenericEnrichmentResults method parseLines.
/**
 * Parses a generic, tab separated enrichment result file and stores one {@link GenericResult}
 * per gene set in the given data set.
 *
 * <p>The first line is treated as the header and skipped. Columns of each data row:
 * <ol>
 *   <li>gene set name (upper-cased and trimmed)</li>
 *   <li>description (upper-cased)</li>
 *   <li>nominal p-value (optional, defaults to 1.0)</li>
 *   <li>FDR q-value (optional, defaults to 1.0; a non-empty value turns on FDR for the map)</li>
 *   <li>phenotype: either one of the two phenotype names of the data set, or a number whose
 *       sign gives the direction (optional)</li>
 *   <li>comma separated gene list, used only when the data set has no gene sets yet (optional)</li>
 * </ol>
 * Blank lines are skipped, and trailing columns that are missing from a row are treated the
 * same way as empty ones.
 *
 * @param lines       the raw lines of the result file
 * @param dataset     the data set whose enrichment results (and possibly gene sets) are populated
 * @param taskMonitor progress sink; may be {@code null}, in which case a {@code NullTaskMonitor} is used
 * @throws IllegalThreadStateException if the phenotype column is neither a known phenotype nor a number
 * @throws NumberFormatException       if a non-empty p-value or q-value is not numeric
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Generic Result file");

    // Current gene sets: used to look up gene set sizes for the results.
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");

    boolean FDR = false;

    EnrichmentMap map = dataset.getMap();
    SetOfEnrichmentResults enrichments = dataset.getEnrichments();
    Map<String, EnrichmentResult> results = enrichments.getEnrichments();
    String upPhenotype = enrichments.getPhenotype1();
    String downPhenotype = enrichments.getPhenotype2();

    // If there are no gene sets, populate them from the 6th column of this file.
    // NOTE(review): a null gene set map is treated as "empty" here but is still dereferenced
    // inside the loop below - confirm getGeneSets() can never return null.
    boolean populate_gs = genesets == null || genesets.isEmpty();
    if (!populate_gs) {
        // A gmt was supplied: use the description in the visual style instead of the
        // formatted name (default for gprofiler). Without a gmt there is no description field.
        dataset.getMap().getParams().setEMgmt(true);
    }

    // Start at 1: the first line only holds the field names.
    for (int i = 1; i < lines.size(); i++) {
        String line = lines.get(i);
        // Blank lines (e.g. at the end of the file) carry no result.
        if (line.trim().isEmpty())
            continue;

        String[] tokens = line.split("\t");
        // Re-evaluated per row: String.split drops trailing empty columns, so rows with
        // missing trailing values are shorter than the header.
        int length = tokens.length;

        double pvalue = 1.0;
        double FDRqvalue = 1.0;
        int gs_size = 0;
        double NES = 1.0;
        GenericResult result;

        final String name = tokens[0].toUpperCase().trim();
        final String description = length > 1 ? tokens[1].toUpperCase() : "";

        if (genesets.containsKey(name))
            gs_size = genesets.get(name).getGenes().size();

        // The third column is the nominal p-value.
        if (length > 2 && !tokens[2].isEmpty())
            pvalue = Double.parseDouble(tokens[2]);

        if (length > 3) {
            // The fourth column is the FDR q-value.
            if (!tokens[3].isEmpty()) {
                FDRqvalue = Double.parseDouble(tokens[3]);
                FDR = true;
            }

            if (length > 4) {
                // The fifth column is the phenotype; if it is a number only its sign matters.
                String phenotype = tokens[4];
                if (!phenotype.isEmpty()) {
                    if (phenotype.equalsIgnoreCase(upPhenotype)) {
                        NES = 1.0;
                    } else if (phenotype.equalsIgnoreCase(downPhenotype)) {
                        NES = -1.0;
                    } else {
                        // The user may have given the phenotype as a number.
                        try {
                            NES = Double.parseDouble(phenotype);
                        } catch (NumberFormatException nfe) {
                            IllegalThreadStateException invalid = new IllegalThreadStateException(tokens[4] + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number.");
                            // Keep the parse failure as the cause; the exception type stays the same for callers.
                            invalid.initCause(nfe);
                            throw invalid;
                        }
                    }
                }

                // The sixth column is the gene list of this gene set.
                if (length > 5 && populate_gs) {
                    ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
                    for (String token : tokens[5].split(",")) {
                        String gene = token.trim().toUpperCase();
                        if (map.containsGene(gene)) {
                            // Already known: reuse its hash key.
                            builder.add(map.getHashFromGene(gene));
                        } else if (!gene.isEmpty()) {
                            Integer hash = map.addGene(gene).get();
                            builder.add(hash);
                        }
                    }
                    GeneSet gs = new GeneSet(name, description, builder.build());
                    gs_size = gs.getGenes().size();
                    // Put the new gene set into the set of gene sets.
                    genesets.put(name, gs);
                }
                result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue, NES);
            } else {
                result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);
            }
        } else {
            result = new GenericResult(name, description, pvalue, gs_size);
        }

        // Cytoscape's TaskMonitor.setProgress takes a fraction in [0.0, 1.0], not a percentage.
        // maxValue is > 0 here because the loop only runs when there are lines.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear more than once (e.g. in both phenotypes); keep the result
        // with the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    if (FDR)
        dataset.getMap().getParams().setFDR(true);
}
use of org.baderlab.csplugins.enrichmentmap.model.GenericResult in project EnrichmentMapApp by BaderLab.
the class ParseBingoEnrichmentResults method parseLines.
/**
 * Parses a BiNGO result file and stores, for every GO term row, a gene set and a
 * {@link GenericResult} in the given data set.
 *
 * <p>BiNGO files define no separate gene sets, so they are built from the file itself.
 * The file starts with a block of informational lines; parsing begins after the header row,
 * which is recognised by having 9 columns, "GO-ID" in the first and "Genes in test set" in
 * the last. Columns of a data row:
 * <ol>
 *   <li>GO-ID - numerical part of the GO term</li>
 *   <li>p-value</li>
 *   <li>corrected p-value (stored as the FDR q-value)</li>
 *   <li>x - number of genes in the subset of interest with this annotation (stored as gene set size)</li>
 *   <li>n - number of genes in the universe with this annotation</li>
 *   <li>X - number of genes in the subset</li>
 *   <li>N - number of genes in the universe</li>
 *   <li>Description - GO term name (used as both gene set name and description)</li>
 *   <li>Genes in test set - '|' separated genes of the subset annotated to this term</li>
 * </ol>
 * Rows with fewer than 8 columns (e.g. blank lines) are skipped; a row whose trailing gene
 * column is empty is kept with no additional genes.
 *
 * @param lines       the raw lines of the BiNGO file
 * @param dataset     the data set whose gene sets and enrichment results are populated
 * @param taskMonitor progress sink; may be {@code null}, in which case a {@code NullTaskMonitor} is used
 * @throws IllegalThreadStateException if no header row is found (including an empty {@code lines} list)
 * @throws NumberFormatException       if a non-empty numeric column cannot be parsed
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing Bingo Enrichment Result file");

    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    // Message previously said "Generic" (copied from the generic parser).
    taskMonitor.setStatusMessage("Parsing Bingo Results file - " + maxValue + " rows");

    // Scan forward until the header row is found. The loop itself handles an empty list,
    // so no line is read before the bounds check.
    int headerIndex = 0;
    for (; headerIndex < lines.size(); headerIndex++) {
        String[] header = lines.get(headerIndex).split("\t");
        if (header.length == 9 && header[0].equalsIgnoreCase("GO-ID") && header[8].equalsIgnoreCase("Genes in test set"))
            break;
    }
    if (headerIndex == lines.size())
        throw new IllegalThreadStateException("Bingo results file is missing data.");

    for (int i = headerIndex + 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");
        // String.split drops trailing empty columns, so a row may be shorter than the header.
        // Without the description column (index 7) the row cannot be a result row.
        if (tokens.length < 8)
            continue;

        // The 8th column is used both as gene set name and as description.
        final String name = tokens[7].toUpperCase().trim();
        final String description = tokens[7].toUpperCase();

        // The gene set may already exist (e.g. two different species); if so its genes are
        // merged with the genes listed on this row.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder.addAll(genesets.get(name).getGenes());

        String[] gene_tokens = tokens.length > 8 ? tokens[8].split("\\|") : new String[0];
        for (String geneToken : gene_tokens) {
            // Trim so that whitespace around a symbol does not create a distinct gene
            // (same normalisation the generic result parser applies).
            String gene = geneToken.trim().toUpperCase();
            if (map.containsGene(gene)) {
                // Already known: reuse its hash key.
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        genesets.put(name, new GeneSet(name, description, builder.build()));

        // 2nd column: nominal p-value.
        double pvalue = 1.0;
        if (!tokens[1].isEmpty())
            pvalue = Double.parseDouble(tokens[1]);

        // 4th column: size of the gene set (restricted by the gene list).
        int gs_size = 0;
        if (!tokens[3].isEmpty())
            gs_size = Integer.parseInt(tokens[3]);

        // 3rd column: corrected p-value.
        double FDRqvalue = 1.0;
        if (!tokens[2].isEmpty())
            FDRqvalue = Double.parseDouble(tokens[2]);

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Cytoscape's TaskMonitor.setProgress takes a fraction in [0.0, 1.0], not a percentage.
        // maxValue is > 0 here because the loop only runs when there are lines.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear more than once (e.g. in both phenotypes); keep the result
        // with the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    // BiNGO results always carry a corrected p-value.
    dataset.getMap().getParams().setFDR(true);
}
use of org.baderlab.csplugins.enrichmentmap.model.GenericResult in project EnrichmentMapApp by BaderLab.
the class ParseDavidEnrichmentResults method parseLines.
/**
 * Parses a DAVID enrichment result file and stores, for every term row, a gene set and a
 * {@link GenericResult} in the given data set.
 *
 * <p>DAVID files define no separate gene sets, so they are built from the file itself.
 * The first line must be a 13-column header:
 * Category, Term, Count, %, PValue, Genes, List Total, Pop Hits, Pop Total,
 * Fold Enrichment, Bonferroni, Benjamini, FDR. Of these the parser uses:
 * <ul>
 *   <li>column 1 (Category) as the description</li>
 *   <li>column 2 (Term) as the gene set name</li>
 *   <li>column 3 (Count) as the gene set size</li>
 *   <li>column 5 (PValue) as the nominal p-value</li>
 *   <li>column 6 (Genes) as the ", " separated gene list</li>
 *   <li>column 12 (Benjamini) as the FDR q-value</li>
 * </ul>
 * Rows with fewer than 6 columns (e.g. blank lines) are skipped; a missing or empty
 * Benjamini column leaves the q-value at 1.0.
 *
 * @param lines       the raw lines of the DAVID file
 * @param dataset     the data set whose gene sets and enrichment results are populated
 * @param taskMonitor progress sink; may be {@code null}, in which case a {@code NullTaskMonitor} is used
 * @throws IllegalThreadStateException if {@code lines} is empty or the header does not have 13 columns
 * @throws NumberFormatException       if a non-empty numeric column cannot be parsed
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing David Enrichment Result file");

    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    // Message previously said "Generic" (copied from the generic parser).
    taskMonitor.setStatusMessage("Parsing David Results file - " + maxValue + " rows");

    // The first line holds the field names; an empty file is reported the same way as a
    // file with the wrong number of columns instead of failing on lines.get(0).
    if (lines.isEmpty() || lines.get(0).split("\t").length != 13)
        throw new IllegalThreadStateException("David results file is missing data.");

    for (int i = 1; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");
        // String.split drops trailing empty columns, so a row may be shorter than the header.
        // Without the gene column (index 5) the row cannot be a result row.
        if (tokens.length < 6)
            continue;

        // The second column is the gene set name, the first column the description.
        final String name = tokens[1].toUpperCase().trim();
        final String description = tokens[0].toUpperCase();

        // The gene set may already exist (e.g. two different species); if so its genes are
        // merged with the genes listed on this row.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder.addAll(genesets.get(name).getGenes());

        for (String geneToken : tokens[5].split(", ")) {
            // Trim so that whitespace around a symbol does not create a distinct gene
            // (same normalisation the generic result parser applies).
            String gene = geneToken.trim().toUpperCase();
            if (map.containsGene(gene)) {
                // Already known: reuse its hash key.
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        genesets.put(name, new GeneSet(name, description, builder.build()));

        // 5th column: nominal p-value.
        double pvalue = 1.0;
        if (!tokens[4].isEmpty())
            pvalue = Double.parseDouble(tokens[4]);

        // 3rd column: Count, the size of the gene set (restricted by the gene list).
        int gs_size = 0;
        if (!tokens[2].isEmpty())
            gs_size = Integer.parseInt(tokens[2]);

        // 12th column: Benjamini value, used as the FDR.
        double FDRqvalue = 1.0;
        if (tokens.length > 11 && !tokens[11].isEmpty())
            FDRqvalue = Double.parseDouble(tokens[11]);

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Cytoscape's TaskMonitor.setProgress takes a fraction in [0.0, 1.0], not a percentage.
        // maxValue is > 0 here because the loop only runs when there are lines.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set can appear more than once (e.g. in both phenotypes); keep the result
        // with the lower p-value (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    // DAVID results always carry a Benjamini-corrected value.
    dataset.getMap().getParams().setFDR(true);
}
use of org.baderlab.csplugins.enrichmentmap.model.GenericResult in project EnrichmentMapApp by BaderLab.
the class CreateEMNetworkTask method createNodes.
/**
 * Adds one node to the network for every gene set returned by
 * {@code map.unionAllGeneSetsOfInterest()} and fills in its attributes.
 *
 * <p>For each node the common columns (name, formatted name, description, type, gene list and
 * gene set size) are written first. Then, for every data set of the map, the node SUID is
 * registered with the data set if that data set lists the gene set among its gene sets of
 * interest, and the data set's enrichment result for the gene set (if any) is written to the
 * row.
 *
 * @param network the network to add the nodes to
 * @return the created nodes keyed by gene set name
 */
private Map<String, CyNode> createNodes(CyNetwork network) {
    Map<String, CyNode> nodesByName = new HashMap<>();
    Map<String, Set<Integer>> genesByGeneSet = map.unionAllGeneSetsOfInterest();

    for (Map.Entry<String, Set<Integer>> entry : genesByGeneSet.entrySet()) {
        final String genesetName = entry.getKey();

        CyNode node = network.addNode();
        nodesByName.put(genesetName, node);

        // Attributes shared by all data sets.
        CyRow row = network.getRow(node);
        row.set(CyNetwork.NAME, genesetName);
        Columns.NODE_FORMATTED_NAME.set(row, prefix, null, formatLabel(genesetName));
        // MKTODO why is this column needed?
        Columns.NODE_NAME.set(row, prefix, null, genesetName);
        Columns.NODE_GS_DESCR.set(row, prefix, null, map.findGeneSetDescription(genesetName));
        Columns.NODE_GS_TYPE.set(row, prefix, null, Columns.NODE_GS_TYPE_ENRICHMENT);

        // Translate the gene hash keys back to gene names.
        List<String> geneNames = entry.getValue().stream()
                .map(map::getGeneFromHashKey)
                .collect(Collectors.toList());
        Columns.NODE_GENES.set(row, prefix, null, geneNames);
        Columns.NODE_GS_SIZE.set(row, prefix, null, geneNames.size());

        // Attributes that depend on the individual data set.
        for (EMDataSet ds : map.getDataSetList()) {
            boolean ofInterest = ds.getGeneSetsOfInterest().getGeneSets().containsKey(genesetName);
            if (ofInterest) {
                ds.addNodeSuid(node.getSUID());
            }

            EnrichmentResult result = ds.getEnrichments().getEnrichments().get(genesetName);
            // A null result fails both instanceof checks, so nothing is written for it.
            if (result instanceof GSEAResult) {
                setGSEAResultNodeAttributes(row, ds.getName(), (GSEAResult) result);
            } else if (result instanceof GenericResult) {
                setGenericResultNodeAttributes(row, ds.getName(), (GenericResult) result);
            }
        }
    }
    return nodesByName;
}
Aggregations