use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ParseDavidEnrichmentResults method parseLines.
/**
 * Parses a DAVID enrichment results file and stores its gene sets and enrichment results
 * in the given data set.
 *
 * <p>DAVID files carry no separate gene set definitions, so every data row yields both a
 * {@code GeneSet} (built from the row's gene list) and a {@code GenericResult}. The columns
 * read are (0-based):
 * <ul>
 * <li>0 - Category, used as the gene set description</li>
 * <li>1 - Term, used as the gene set name</li>
 * <li>2 - Count, used as the gene set size</li>
 * <li>4 - PValue, the nominal p-value</li>
 * <li>5 - Genes, a {@code ", "}-separated gene list</li>
 * <li>11 - Benjamini, used as the FDR q-value</li>
 * </ul>
 * Blank numeric cells keep their defaults (p-value 1.0, size 0, FDR q-value 1.0). When a term
 * occurs more than once its genes are merged into a single gene set and only the result with
 * the lowest p-value is kept.
 *
 * @param lines all lines of the file; the first line must be the 13-column header
 * @param dataset data set whose gene sets, enrichment results and gene map are updated
 * @param taskMonitor receives title, status and progress updates; may be {@code null}
 * @throws IllegalThreadStateException if the file is empty, the header does not have exactly
 *         13 tab-separated columns, or a data row has fewer than 12 columns
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing David Enrichment Result file");

    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();

    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Generic Results file - " + maxValue + " rows");

    // An empty file has no header row at all, so report it the same way as a short header.
    if (lines.isEmpty())
        throw new IllegalThreadStateException("David results file is missing data.");
    // The header (first line) only supplies the field names; it must list all 13 columns.
    if (lines.get(0).split("\t").length != 13)
        throw new IllegalThreadStateException("David results file is missing data.");

    for (int i = 1; i < lines.size(); i++) {
        String line = lines.get(i);
        // Blank lines (for example a trailing newline) carry no record.
        if (line.trim().isEmpty())
            continue;

        // A limit of -1 keeps trailing empty cells. The default split would drop them, which
        // shortens rows whose last columns are blank and makes the index reads below fail.
        String[] tokens = line.split("\t", -1);
        if (tokens.length < 12)
            throw new IllegalThreadStateException("David results file is missing data on line " + (i + 1) + ".");

        final String name = tokens[1].toUpperCase().trim();
        final String description = tokens[0].toUpperCase();

        // The same term can appear more than once (e.g. for two species); in that case the
        // genes of the existing gene set are carried over and extended with this row's genes.
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        GeneSet existingSet = genesets.get(name);
        if (existingSet != null)
            builder.addAll(existingSet.getGenes());

        for (String geneToken : tokens[5].split(", ")) {
            String gene = geneToken.toUpperCase();
            if (map.containsGene(gene)) {
                // Gene already registered: reuse its key.
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        genesets.put(name, new GeneSet(name, description, builder.build()));

        double pvalue = 1.0;
        if (!tokens[4].isEmpty())
            pvalue = Double.parseDouble(tokens[4]);

        // Count is the size of the gene set restricted to the submitted gene list.
        int gs_size = 0;
        if (!tokens[2].isEmpty())
            gs_size = Integer.parseInt(tokens[2]);

        // The Benjamini column is used as the FDR q-value.
        double FDRqvalue = 1.0;
        if (!tokens[11].isEmpty())
            FDRqvalue = Double.parseDouble(tokens[11]);

        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // NOTE(review): assumes taskMonitor is org.cytoscape.work.TaskMonitor, whose
        // setProgress expects a fraction in [0.0, 1.0] rather than a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;

        // A gene set may be listed more than once; keep the result with the lower p-value
        // (ticket #149).
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }

    // David results always provide an FDR column.
    dataset.getMap().getParams().setFDR(true);
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class RanksFileReaderTask method parse.
/**
 * Parses the rank file and registers the resulting ranking on the data set's expression sets
 * under {@code ranks_name}.
 *
 * <p>Two layouts are accepted per line: five tab-separated columns (the score is the fifth
 * column) or two columns (the score is the second column). Lines starting with {@code #} are
 * comments; a comment of the form {@code # Ranks Name : My Ranks} sets {@code ranks_name}.
 * Lines with any other column count are reported on standard output and skipped.
 *
 * <p>The first line whose score is not a number is treated as a header row and skipped; any
 * later such line raises an exception. Only genes already known to the enrichment map receive
 * a rank. For five-column files, and for GSEA data sets not loaded from the heat map, the rank
 * is the position of the score in file order. Otherwise ranks are derived from the sorted
 * scores: descending, or ascending when every score lies within [-1, 1].
 *
 * @param taskMonitor receives status and progress updates; may be {@code null}
 * @throws IOException if the rank file cannot be read
 * @throws IllegalThreadStateException if a score after the header row is not a number, or if
 *         none of the genes in the rank file is known to the enrichment map
 */
public void parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    List<String> lines = DatasetLineParser.readLines(RankFileName);
    // Becomes true once one unparseable score has been skipped as the presumed header row.
    boolean headerSkipped = false;
    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Rank file - " + maxValue + " rows");
    EnrichmentMap map = dataset.getMap();
    // The number of scores is not known yet, but it cannot exceed the number of lines.
    Double[] score_collector = new Double[lines.size()];
    boolean gseaDefinedRanks = false;
    Map<Integer, Rank> ranks = new HashMap<>();
    // Number of scores parsed so far.
    int nScores = 0;

    for (String line : lines) {
        if (line.startsWith("#")) {
            // Look for the ranks name in the comment, e.g. "# Ranks Name : My Ranks".
            if (Pattern.matches("^# *Ranks[ _-]?Name *:.+", line)) {
                this.ranks_name = line.split(":", 2)[1];
                while (this.ranks_name.startsWith(" "))
                    this.ranks_name = this.ranks_name.substring(1);
            }
            continue;
        }

        String[] tokens = line.split("\t");
        // Check the column count before touching tokens[0]: a line made only of tabs splits
        // into an empty array.
        final int scoreColumn;
        if (tokens.length == 5) {
            scoreColumn = 4;
        } else if (tokens.length == 2) {
            scoreColumn = 1;
        } else {
            System.out.println("Invalid number of tokens line of Rank File (should be 5 or 2)");
            continue;
        }

        String name = tokens[0].toUpperCase();
        double score;
        try {
            score = Double.parseDouble(tokens[scoreColumn]);
        } catch (NumberFormatException nfe) {
            if (!headerSkipped) {
                headerSkipped = true;
                continue;
            }
            IllegalThreadStateException invalid =
                    new IllegalThreadStateException("rank value for " + tokens[0] + " is not a valid number");
            invalid.initCause(nfe);
            throw invalid;
        }
        nScores++;

        // True when the order of the rows in the file defines the rank.
        boolean rankFromFileOrder = (tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap);
        if (rankFromFileOrder)
            gseaDefinedRanks = true;

        score_collector[nScores - 1] = score;

        // Only genes that are already part of the enrichment map are ranked.
        Integer genekey = map.getHashFromGene(name);
        if (genekey != null) {
            Rank current_ranking = rankFromFileOrder ? new Rank(name, score, nScores) : new Rank(name, score);
            ranks.put(genekey, current_ranking);
        }

        // NOTE(review): assumes taskMonitor is org.cytoscape.work.TaskMonitor, whose
        // setProgress expects a fraction in [0.0, 1.0] rather than a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;
    }

    if (ranks.isEmpty()) {
        throw new IllegalThreadStateException("None of the genes in the rank file are found in the expression file. Make sure the identifiers of the two files match.");
    }

    // Trim the collector to the scores actually read (drops the trailing null slots).
    Double[] sort_scores = Arrays.copyOf(score_collector, nScores);

    // Sort descending by default ...
    Arrays.sort(sort_scores, Collections.reverseOrder());
    // ... but ascending when every score lies within [-1, 1].
    // NOTE(review): presumably such scores are p-value-like, where smaller is better - confirm.
    if (sort_scores[0] <= 1 && sort_scores[sort_scores.length - 1] >= -1)
        Arrays.sort(sort_scores);

    // Map each distinct score to the index of its first occurrence, so ties share a rank.
    Map<Double, Integer> score2ranks = new HashMap<>();
    for (int j = 0; j < sort_scores.length; j++) {
        score2ranks.putIfAbsent(sort_scores[j], j);
    }

    // Only derive ranks from the scores if they were not already defined by file order.
    if (!gseaDefinedRanks) {
        for (Rank current_ranking : ranks.values()) {
            Integer rank = score2ranks.get(current_ranking.getScore());
            current_ranking.setRank(rank);
        }
    }

    Ranking new_ranking = new Ranking();
    ranks.forEach(new_ranking::addRank);
    dataset.getExpressionSets().addRanks(ranks_name, new_ranking);
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class ExpressionFileReaderTask method parse.
/**
 * Parses the data set's expression file (which may in fact be a rank file) into the data
 * set's expression matrix and returns that matrix.
 *
 * <p>When the matrix has no rows and no column names yet, the header row is located first:
 * line 3 for files whose first cell is {@code #1.2}, otherwise the first line that does not
 * start with {@code #}. A two-column "header" whose second cell is numeric is taken to be a
 * data row of a header-less rank file; the column names then default to {@code Name} and
 * {@code Rank/Score} and that row is parsed as data.
 *
 * <p>Only rows whose gene is already known to the enrichment map are stored; every data row
 * read counts towards the expression universe. The matrix is row-normalized before it is
 * returned.
 *
 * @param taskMonitor receives status and progress updates; may be {@code null}
 * @return the data set's expression matrix, populated from the file
 * @throws IOException if the file cannot be read or contains no header row
 */
public GeneExpressionMatrix parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();

    // True when the file has exactly two columns, i.e. it may be a rank file whose second
    // column holds the score rather than a description.
    boolean twoColumns = false;
    Set<Integer> datasetGenes = dataset.getDataSetGenes();
    EnrichmentMap map = dataset.getMap();
    String expressionFileName = dataset.getExpressionSets().getFilename();
    List<String> lines = DatasetLineParser.readLines(expressionFileName);
    int currentProgress = 0;
    int maxValue = lines.size();
    int expressionUniverse = 0;
    taskMonitor.setStatusMessage("Parsing GCT file - " + maxValue + " rows");
    GeneExpressionMatrix expressionMatrix = dataset.getExpressionSets();
    Map<Integer, GeneExpression> expression = expressionMatrix.getExpressionMatrix();

    int firstDataRow = 0;
    // Column names are only read the first time the matrix is filled.
    if (!lines.isEmpty() && expressionMatrix.getExpressionMatrix().isEmpty() && expressionMatrix.getColumnNames() == null) {
        String[] firstTokens = lines.get(0).split("\t");
        String firstCell = firstTokens.length > 0 ? firstTokens[0].trim() : "";

        int headerIndex;
        if (firstCell.equalsIgnoreCase("#1.2")) {
            // Files starting with "#1.2" carry two preamble lines before the header.
            headerIndex = 2;
        } else {
            // Otherwise the header is the first line that is not a comment.
            headerIndex = 0;
            while (headerIndex < lines.size() && lines.get(headerIndex).startsWith("#")) {
                headerIndex++;
            }
        }
        if (headerIndex >= lines.size())
            throw new IOException("Expression file " + expressionFileName + " does not contain a header row");

        String[] header = lines.get(headerIndex).split("\t");
        firstDataRow = headerIndex + 1;
        if (header.length == 2) {
            twoColumns = true;
            // A rank file (e.g. from a GSEA edb) might have no column names. If the second
            // cell is numeric this line is data, so use default names and parse it again as
            // a data row instead of skipping it.
            try {
                Double.parseDouble(header[1]);
                header[0] = "Name";
                header[1] = "Rank/Score";
                firstDataRow = headerIndex;
            } catch (NumberFormatException ignored) {
                // Not a number, so this really is a title line.
            }
        }
        expressionMatrix.setColumnNames(header);
        expressionMatrix.setNumConditions(expressionMatrix.getColumnNames().length);
        expressionMatrix.setExpressionMatrix(expression);
    }

    for (int i = firstDataRow; i < lines.size(); i++) {
        String[] tokens = lines.get(i).split("\t");
        // A line made only of tabs splits into an empty array and names no gene.
        if (tokens.length == 0)
            continue;
        // The first column of the file is the gene name.
        String geneName = tokens[0].toUpperCase().trim();

        // Expression data is only loaded for genes already in the gene list (i.e. listed in
        // at least one gene set).
        //TODO:is there the possibility that we need all the expression genes? Currently this great decreases space when saving sessions
        Integer genekey = map.getHashFromGene(geneName);
        if (genekey != null) {
            // Keep the data set genes keyed the same way as the map's genes.
            datasetGenes.add(genekey);

            String description = "";
            if (twoColumns) {
                // In a two-column file the second cell is a description only if it is not
                // a number.
                try {
                    Double.parseDouble(tokens[1]);
                } catch (NumberFormatException e) {
                    description = tokens[1];
                }
            } else {
                description = tokens[1];
            }

            GeneExpression expres = new GeneExpression(geneName, description);
            expres.setExpression(tokens);

            // NOTE(review): -100 appears to be the "no change" sentinel of newMax/newMin/
            // newclosesttoZero - confirm against GeneExpression.
            double newMax = expres.newMax(expressionMatrix.getMaxExpression());
            if (newMax != -100)
                expressionMatrix.setMaxExpression(newMax);
            double newMin = expres.newMin(expressionMatrix.getMinExpression());
            if (newMin != -100)
                expressionMatrix.setMinExpression(newMin);
            double newClosest = expres.newclosesttoZero(expressionMatrix.getClosesttoZero());
            if (newClosest != -100)
                expressionMatrix.setClosesttoZero(newClosest);

            expression.put(genekey, expres);
        }
        expressionUniverse++;

        // NOTE(review): assumes taskMonitor is org.cytoscape.work.TaskMonitor, whose
        // setProgress expects a fraction in [0.0, 1.0] rather than a 0..100 percentage.
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;
    }

    // Record the number of data rows read and row-normalize the expression set.
    expressionMatrix.setExpressionUniverse(expressionUniverse);
    expressionMatrix.rowNormalizeMatrix();
    //TODO: intialize phenotypes associated with class files from expression file load
    return expressionMatrix;
}
use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.
the class HierarchicalClusterTask method cluster.
/**
 * Orders the genes by average-linkage hierarchical clustering of their expression values.
 *
 * <p>For every gene one row is built by concatenating its expression values from each data
 * set of the map; data sets without a value for the gene leave their slots at 0.0. Genes with
 * no expression in any data set are left out. The resulting leaf order of the clustering is
 * turned into ranks.
 *
 * @param tm receives title and status updates; may be {@code null}
 * @return for each clustered gene id, a {@code RankValue} created with the gene's 1-based
 *         position in the leaf order
 */
public Map<Integer, RankValue> cluster(TaskMonitor tm) {
    if (tm == null)
        tm = new NullTaskMonitor();
    tm.setTitle("Hierarchical Cluster");
    tm.setStatusMessage("Loading expression data");

    List<double[]> clusteringExpressionSet = new ArrayList<>(genes.size());
    ArrayList<Integer> labels = new ArrayList<>(genes.size());
    List<EMDataSet> dataSets = map.getDataSetList();
    final int expressionCount = getTotalExpressionCount(dataSets);

    for (int geneId : genes) {
        // Values all default to 0.0.
        double[] vals = new double[expressionCount];
        int valsIndex = 0;
        boolean found = false;
        for (EMDataSet dataSet : dataSets) {
            GeneExpressionMatrix expressionSets = dataSet.getExpressionSets();
            // NOTE(review): presumably the two subtracted columns are name and description,
            // leaving the number of expression values per gene - confirm.
            int numConditions = expressionSets.getNumConditions() - 2;
            GeneExpression geneExpression = expressionSets.getExpressionMatrix().get(geneId);
            if (geneExpression != null) {
                found = true;
                double[] expression = geneExpression.getExpression();
                System.arraycopy(expression, 0, vals, valsIndex, expression.length);
            }
            // Advance past this data set's slots even if the gene had no values there.
            valsIndex += numConditions;
        }
        if (found) {
            clusteringExpressionSet.add(vals);
            labels.add(geneId);
        }
    }

    tm.setStatusMessage("Calculating Distance");
    // NOTE(review): sized by genes.size() although clusteringExpressionSet may hold fewer
    // rows when some genes have no expression data - confirm DistanceMatrix tolerates this.
    DistanceMatrix distanceMatrix = new DistanceMatrix(genes.size());
    distanceMatrix.calcDistances(clusteringExpressionSet, distanceMetric);
    distanceMatrix.setLabels(labels);

    tm.setStatusMessage("Clustering");
    AvgLinkHierarchicalClustering clusterResult = new AvgLinkHierarchicalClustering(distanceMatrix);
    // Optimal leaf ordering is only enabled for up to 1000 genes.
    clusterResult.setOptimalLeafOrdering(genes.size() <= 1000);
    clusterResult.run();

    tm.setStatusMessage("Ranking");
    Map<Integer, RankValue> ranks = new HashMap<>();
    int[] order = clusterResult.getLeafOrder();
    for (int i = 0; i < order.length; i++) {
        Integer geneId = labels.get(order[i]);
        ranks.put(geneId, new RankValue(i + 1, null, false));
    }
    tm.setStatusMessage("");
    return ranks;
}
Aggregations