use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.
the class ExpressionFileReaderTask method parse.
/**
 * Parse expression/rank file.
 * <p>
 * Reads the file named by the data set's expression matrix, determines the column
 * names from the header line (or substitutes default names when a two-column rank
 * file has no header), and stores one {@code GeneExpression} per row whose gene is
 * already known to the enrichment map. The matrix min/max/closest-to-zero values are
 * updated as rows are added and the matrix is row-normalized before it is returned.
 *
 * @param taskMonitor receives status and progress updates; a {@code NullTaskMonitor}
 *                    is substituted when {@code null}
 * @return the data set's own expression matrix, populated from the file
 * @throws IOException if the file contains only comment/preamble lines and therefore
 *                     has no header or data row (presumably also when the file cannot
 *                     be read by {@code DatasetLineParser.readLines})
 */
public GeneExpressionMatrix parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();

    //Need to check if the file specified as an expression file is actually a rank file
    //If it is a rank file it can either be 5 or 2 columns but it is important that the rank
    //value is extracted from the right column and placed in the expression matrix as if it
    //was an expression value in order for other features to work.
    //Also a problem with old session files that imported a rank file so it also
    //important to check if the file only has two columns. If it only has two columns,
    //check to see if the second column is a double. If it is then consider that column
    //expression
    boolean twoColumns = false;

    Set<Integer> datasetGenes = dataset.getDataSetGenes();
    EnrichmentMap map = dataset.getMap();
    String expressionFileName = dataset.getExpressionSets().getFilename();
    List<String> lines = DatasetLineParser.readLines(expressionFileName);

    int currentProgress = 0;
    int maxValue = lines.size();
    int expressionUniverse = 0;
    taskMonitor.setStatusMessage("Parsing GCT file - " + maxValue + " rows");

    GeneExpressionMatrix expressionMatrix = dataset.getExpressionSets();
    Map<Integer, GeneExpression> expression = expressionMatrix.getExpressionMatrix();

    for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        String[] tokens = line.split("\t");

        //The first column of the file is the name of the gene.
        //split() yields an empty array for a line made only of tabs, so guard the index.
        String geneName = tokens.length > 0 ? tokens[0].toUpperCase().trim() : "";

        //The first time through (nothing loaded yet, no column names) locate the header line.
        if (i == 0 && expressionMatrix.getExpressionMatrix().isEmpty() && expressionMatrix.getColumnNames() == null) {
            int headerIndex;
            if (geneName.equalsIgnoreCase("#1.2")) {
                //GCT format: version line, dimension line, then the header line
                headerIndex = 2;
            } else {
                //ignore all comment lines
                headerIndex = 0;
                while (headerIndex < lines.size() && lines.get(headerIndex).startsWith("#"))
                    headerIndex++;
            }
            if (headerIndex >= lines.size())
                throw new IOException("Expression file " + expressionFileName + " does not contain a header or any data rows");

            tokens = lines.get(headerIndex).split("\t");
            //by default resume with the line that follows the header
            i = headerIndex;

            //check to see if the second column contains expression values.
            if (tokens.length == 2) {
                twoColumns = true;
                //if we are loading a GSEA edb rnk file then there might not be column names
                if (isParsableAsDouble(tokens[1])) {
                    //The candidate header is really a data row: use default headings and
                    //re-read that same line as data (without re-reading any comment lines before it).
                    tokens[0] = "Name";
                    tokens[1] = "Rank/Score";
                    i = headerIndex - 1;
                }
                //if it isn't a number then we have a title line.
            }

            expressionMatrix.setColumnNames(tokens);
            expressionMatrix.setNumConditions(expressionMatrix.getColumnNames().length);
            expressionMatrix.setExpressionMatrix(expression);
            continue;
        }

        //Check to see if this gene is in the genes list
        //Currently we only load gene expression data for genes that are already in the gene list (i.e. are listed in at least one geneset)
        //TODO:is there the possibility that we need all the expression genes? Currently this great decreases space when saving sessions
        Integer genekey = map.getHashFromGene(geneName);

        //A row with fewer than two columns carries no description/value column, so it cannot be stored.
        if (genekey != null && tokens.length >= 2) {
            //we want the genes hashmap and dataset genes hashmap to have the same keys so it is easier to compare.
            datasetGenes.add(genekey);

            //In a two column file the second column is the value unless it is not numeric,
            //in which case (like in every wider file) it is the description.
            String description = "";
            if (!twoColumns || !isParsableAsDouble(tokens[1]))
                description = tokens[1];

            GeneExpression expres = new GeneExpression(geneName, description);
            expres.setExpression(tokens);

            //-100 is the value the GeneExpression helpers return when the current extreme should be kept
            double newMax = expres.newMax(expressionMatrix.getMaxExpression());
            if (newMax != -100)
                expressionMatrix.setMaxExpression(newMax);
            double newMin = expres.newMin(expressionMatrix.getMinExpression());
            if (newMin != -100)
                expressionMatrix.setMinExpression(newMin);
            double newClosest = expres.newclosesttoZero(expressionMatrix.getClosesttoZero());
            if (newClosest != -100)
                expressionMatrix.setClosesttoZero(newClosest);

            expression.put(genekey, expres);
        }
        expressionUniverse++;

        // Report progress as a fraction between 0.0 and 1.0, which is the scale
        // used by the Cytoscape TaskMonitor (not a 0..100 percentage).
        taskMonitor.setProgress((double) currentProgress / maxValue);
        currentProgress++;
    }

    //set the number of genes
    expressionMatrix.setExpressionUniverse(expressionUniverse);
    //row Normalize expressionset
    expressionMatrix.rowNormalizeMatrix();

    //TODO: intialize phenotypes associated with class files from expression file load
    return expressionMatrix;
}

/**
 * Tells whether the given text can be parsed by {@link Double#parseDouble(String)}.
 */
private static boolean isParsableAsDouble(String text) {
    try {
        Double.parseDouble(text);
        return true;
    } catch (NumberFormatException e) {
        return false;
    }
}
use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.
the class HierarchicalClusterTask method cluster.
/**
 * Ranks the genes by the leaf order of a hierarchical clustering of their expression values.
 * <p>
 * For every gene the expression values of all data sets are concatenated into one row
 * (data sets that do not contain the gene contribute zeros). Genes that appear in no
 * data set are left out. The rows are turned into a distance matrix using the task's
 * distance metric, clustered, and each gene is assigned a 1-based rank equal to its
 * position in the resulting leaf order.
 *
 * @param tm receives title and status updates; a {@code NullTaskMonitor} is substituted
 *           when {@code null}
 * @return map from gene id to its rank in the clustering leaf order
 */
public Map<Integer, RankValue> cluster(TaskMonitor tm) {
    if (tm == null)
        tm = new NullTaskMonitor();

    tm.setTitle("Hierarchical Cluster");
    tm.setStatusMessage("Loading expression data");

    List<double[]> clusteringExpressionSet = new ArrayList<>(genes.size());
    ArrayList<Integer> labels = new ArrayList<>(genes.size());

    List<EMDataSet> dataSets = map.getDataSetList();
    final int expressionCount = getTotalExpressionCount(dataSets);

    for (int geneId : genes) {
        // values all default to 0.0
        double[] vals = new double[expressionCount];
        int valsIndex = 0;
        boolean found = false;

        for (EMDataSet dataSet : dataSets) {
            GeneExpressionMatrix expressionSets = dataSet.getExpressionSets();
            // presumably the two leading columns are name and description - TODO confirm
            int numConditions = expressionSets.getNumConditions() - 2;

            GeneExpression geneExpression = expressionSets.getExpressionMatrix().get(geneId);
            if (geneExpression != null) {
                found = true;
                double[] expression = geneExpression.getExpression();
                System.arraycopy(expression, 0, vals, valsIndex, expression.length);
            }
            // advance to this gene's slot for the next data set even when the gene was absent
            valsIndex += numConditions;
        }

        if (found) {
            clusteringExpressionSet.add(vals);
            labels.add(geneId);
        }
    }

    tm.setStatusMessage("Calculating Distance");
    // NOTE(review): sized by genes.size() although only genes with expression data are
    // added to clusteringExpressionSet - confirm DistanceMatrix copes with fewer rows.
    DistanceMatrix distanceMatrix = new DistanceMatrix(genes.size());
    distanceMatrix.calcDistances(clusteringExpressionSet, distanceMetric);
    distanceMatrix.setLabels(labels);

    tm.setStatusMessage("Clustering");
    AvgLinkHierarchicalClustering clusterResult = new AvgLinkHierarchicalClustering(distanceMatrix);
    //check to see if there more than 1000 genes, if there are use eisen ordering otherwise use bar-joseph
    clusterResult.setOptimalLeafOrdering(genes.size() <= 1000);
    clusterResult.run();

    tm.setStatusMessage("Ranking");
    Map<Integer, RankValue> ranks = new HashMap<>();
    int[] order = clusterResult.getLeafOrder();
    for (int i = 0; i < order.length; i++) {
        Integer geneId = labels.get(order[i]);
        // ranks are 1-based
        ranks.put(geneId, new RankValue(i + 1, null, false));
    }

    tm.setStatusMessage("");
    return ranks;
}
use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.
the class DataSetColorRange method create.
/**
 * Builds the colour range and theme for a data set under the given data transformation.
 * <p>
 * A lower/upper bound pair is derived from the expression matrix according to the
 * transform. When the lower bound is not negative a one-colour gradient running from
 * 0 to the upper bound is produced; otherwise a two-colour gradient symmetric around
 * 0 (from -upper to +upper) is produced.
 *
 * @param ds        data set whose expression matrix supplies the value range
 * @param transform transformation currently applied to the expression values
 * @return the theme and gradient range to use for the data set
 */
public static DataSetColorRange create(EMDataSet ds, Transform transform) {
    GeneExpressionMatrix matrix = ds.getExpressionSets();
    double rawMin = matrix.getMinExpression();
    double rawMax = matrix.getMaxExpression();

    double lower;
    double upper;

    switch (transform) {
        case ROW_NORMALIZE: {
            lower = matrix.getMinExpression(matrix.getExpressionMatrix_rowNormalized());
            double normalizedMax = matrix.getMaxExpression(matrix.getExpressionMatrix_rowNormalized());
            // Both bounds are 0 when there is only one data column in the dataset (or if it
            // is a rank file); row normalization is meaningless then, but no warning is issued.
            upper = Math.max(Math.abs(lower), normalizedMax);
            break;
        }
        case LOG_TRANSFORM: {
            boolean minNotPositive = rawMin <= 0;
            boolean maxNotPositive = rawMax <= 0;
            if (minNotPositive) {
                if (maxNotPositive) {
                    // both the max and min are probably negative values; the log of a
                    // negative number is not valid, so collapse the range
                    lower = 0;
                    upper = 0;
                } else {
                    // only the min is not positive: use the value closest to zero instead
                    double logClosest = Math.log(matrix.getClosesttoZero());
                    double logMax = Math.log1p(rawMax);
                    lower = Math.min(logClosest, logMax);
                    upper = Math.max(logClosest, logMax);
                }
            } else if (maxNotPositive) {
                // the max is not positive but the min is (should never happen!)
                lower = 0;
                upper = Math.log1p(rawMin);
            } else {
                lower = Math.log1p(rawMin);
                upper = Math.max(Math.abs(lower), Math.log1p(rawMax));
            }
            break;
        }
        case AS_IS:
        default: {
            lower = rawMin;
            upper = Math.max(Math.abs(rawMin), rawMax);
            break;
        }
    }

    if (lower >= 0) {
        double midpoint = upper / 2;
        ColorGradientRange oneSided = ColorGradientRange.getInstance(0, midpoint, midpoint, upper, 0, midpoint, midpoint, upper);
        return new DataSetColorRange(ColorGradientTheme.GREEN_ONECOLOR_GRADIENT_THEME, oneSided);
    }

    ColorGradientRange twoSided = ColorGradientRange.getInstance(-upper, 0, 0, upper, -upper, 0, 0, upper);
    return new DataSetColorRange(ColorGradientTheme.PR_GN_GRADIENT_THEME, twoSided);
}
Aggregations