Search in sources :

Example 6 with GeneExpressionMatrix

use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.

the class ExpressionFileReaderTask method parse.

/**
 * Parses the expression (or rank) file of the current dataset into the dataset's
 * {@link GeneExpressionMatrix}.
 *
 * <p>The file named by the dataset's expression matrix is read line by line and split on
 * tabs. The first pass locates the header row: for a GCT file (first line {@code #1.2}) it
 * is the third line, otherwise it is the first line that does not start with {@code #}.
 * If the header row has exactly two columns and the second one parses as a number, the
 * file is treated as a header-less rank file: default column names ("Name", "Rank/Score")
 * are installed and that row is then parsed again as data, so that the rank value ends up
 * in the matrix as if it were an expression value.
 *
 * <p>Only rows whose gene name is known to the enrichment map are stored; every data row
 * is still counted towards the expression universe. Rows with no value column are not
 * stored.
 *
 * @param taskMonitor monitor used for status and progress reporting; may be {@code null},
 *                    in which case a {@link NullTaskMonitor} is used
 * @return the dataset's expression matrix, filled in and row-normalized
 * @throws IOException if the file cannot be read or does not contain a header row
 */
public GeneExpressionMatrix parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();

    // Set once the header row turns out to have exactly two columns (typical for rank files).
    boolean twoColumns = false;
    Set<Integer> datasetGenes = dataset.getDataSetGenes();
    EnrichmentMap map = dataset.getMap();
    String expressionFileName = dataset.getExpressionSets().getFilename();
    List<String> lines = DatasetLineParser.readLines(expressionFileName);
    int currentProgress = 0;
    int maxValue = lines.size();
    int expressionUniverse = 0;
    taskMonitor.setStatusMessage("Parsing GCT file - " + maxValue + " rows");
    GeneExpressionMatrix expressionMatrix = dataset.getExpressionSets();
    Map<Integer, GeneExpression> expression = expressionMatrix.getExpressionMatrix();

    for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        String[] tokens = line.split("\t");
        // String.split drops trailing empty strings, so a line made only of tabs yields an
        // empty array; treat it as a row with an empty name.
        String name = tokens.length > 0 ? tokens[0].toUpperCase().trim() : "";

        // Header handling: only on the very first line, and only when the matrix has
        // neither data nor column names yet.
        if (i == 0 && expressionMatrix.getExpressionMatrix().isEmpty() && expressionMatrix.getColumnNames() == null) {
            int headerIndex;
            if (name.equalsIgnoreCase("#1.2")) {
                // GCT layout: version line, dimensions line, then the header row.
                headerIndex = 2;
            } else {
                // Ignore all leading comment lines.
                headerIndex = 0;
                while (headerIndex < lines.size() && lines.get(headerIndex).startsWith("#"))
                    headerIndex++;
            }
            if (headerIndex >= lines.size())
                throw new IOException("Expression file '" + expressionFileName + "' does not contain a header row");

            line = lines.get(headerIndex);
            i = headerIndex;
            tokens = line.split("\t");

            if (tokens.length == 2) {
                twoColumns = true;
                // A GSEA edb rnk file might not have column names. If the second column is
                // numeric this row is data, not a title line.
                boolean secondColumnNumeric;
                try {
                    Integer.parseInt(tokens[1]);
                    secondColumnNumeric = true;
                } catch (NumberFormatException notAnInt) {
                    try {
                        Double.parseDouble(tokens[1]);
                        secondColumnNumeric = true;
                    } catch (NumberFormatException notADouble) {
                        // Neither an int nor a double, so this really is a title line.
                        secondColumnNumeric = false;
                    }
                }
                if (secondColumnNumeric) {
                    // Rewind so the next iteration parses this same row as data, without
                    // re-reading any comment lines that precede it.
                    i = headerIndex - 1;
                    tokens[0] = "Name";
                    tokens[1] = "Rank/Score";
                }
            }

            expressionMatrix.setColumnNames(tokens);
            expressionMatrix.setNumConditions(expressionMatrix.getColumnNames().length);
            expressionMatrix.setExpressionMatrix(expression);
            continue;
        }

        // Only load expression data for genes that are already in the gene list (i.e. are
        // listed in at least one gene set).
        // TODO: is there the possibility that we need all the expression genes? Currently
        // this greatly decreases space when saving sessions.
        Integer genekey = map.getHashFromGene(name);
        if (genekey != null && tokens.length >= 2) {
            // Keep the genes map and the dataset genes keyed identically so they are easy
            // to compare.
            datasetGenes.add(genekey);

            // In a two-column file the second column is a description only when it is not
            // a number; otherwise the second column is always the description.
            String description = "";
            if (twoColumns) {
                try {
                    Double.parseDouble(tokens[1]);
                } catch (NumberFormatException notANumber) {
                    description = tokens[1];
                }
            } else {
                description = tokens[1];
            }

            GeneExpression expres = new GeneExpression(name, description);
            expres.setExpression(tokens);

            // NOTE(review): -100 looks like a "no change" sentinel returned by
            // GeneExpression.newMax/newMin/newclosesttoZero - confirm in GeneExpression.
            double newMax = expres.newMax(expressionMatrix.getMaxExpression());
            if (newMax != -100)
                expressionMatrix.setMaxExpression(newMax);
            double newMin = expres.newMin(expressionMatrix.getMinExpression());
            if (newMin != -100)
                expressionMatrix.setMinExpression(newMin);
            double newClosest = expres.newclosesttoZero(expressionMatrix.getClosesttoZero());
            if (newClosest != -100)
                expressionMatrix.setClosesttoZero(newClosest);

            expression.put(genekey, expres);
        }
        expressionUniverse++;

        // Report progress as a fraction in [0.0, 1.0]; this assumes the Cytoscape 3
        // TaskMonitor contract (confirm that taskMonitor is org.cytoscape.work.TaskMonitor).
        currentProgress++;
        taskMonitor.setProgress((double) currentProgress / maxValue);
    }

    // Set the number of genes seen in the file.
    expressionMatrix.setExpressionUniverse(expressionUniverse);
    // Row normalize the expression set.
    expressionMatrix.rowNormalizeMatrix();
    return expressionMatrix;
    // TODO: initialize phenotypes associated with class files from expression file load
    // (an earlier version set them from the params' class files / temp classes per dataset).
}
Also used : EnrichmentMap(org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap) GeneExpressionMatrix(org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix) GeneExpression(org.baderlab.csplugins.enrichmentmap.model.GeneExpression) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Example 7 with GeneExpressionMatrix

use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.

the class HierarchicalClusterTask method cluster.

/**
 * Ranks the genes by the leaf order of an average-linkage hierarchical clustering of
 * their expression values.
 *
 * <p>For every gene one vector is built by placing its expression values from each data
 * set side by side; data sets without a value for the gene leave their slots at 0.0.
 * Genes that have no expression in any data set are left out of the clustering and of
 * the returned map.
 *
 * @param tm monitor used for title and status messages; may be {@code null}, in which
 *           case a {@link NullTaskMonitor} is used
 * @return map from gene id to a {@link RankValue} created with the gene's 1-based
 *         position in the clustering's leaf order
 */
public Map<Integer, RankValue> cluster(TaskMonitor tm) {
    if (tm == null)
        tm = new NullTaskMonitor();
    tm.setTitle("Hierarchical Cluster");
    tm.setStatusMessage("Loading expression data");

    List<double[]> clusteringExpressionSet = new ArrayList<>(genes.size());
    // Gene id of each row in clusteringExpressionSet, in the same order.
    ArrayList<Integer> labels = new ArrayList<>(genes.size());
    List<EMDataSet> dataSets = map.getDataSetList();
    final int expressionCount = getTotalExpressionCount(dataSets);

    for (int geneId : genes) {
        // values all default to 0.0
        double[] vals = new double[expressionCount];
        int valsIndex = 0;
        boolean found = false;
        for (EMDataSet dataSet : dataSets) {
            GeneExpressionMatrix expressionSets = dataSet.getExpressionSets();
            // NOTE(review): the "- 2" presumably excludes the name and description
            // columns from the condition count - confirm against GeneExpressionMatrix.
            int numConditions = expressionSets.getNumConditions() - 2;
            GeneExpression geneExpression = expressionSets.getExpressionMatrix().get(geneId);
            if (geneExpression != null) {
                found = true;
                double[] expression = geneExpression.getExpression();
                System.arraycopy(expression, 0, vals, valsIndex, expression.length);
            }
            // Advance past this data set's slots whether or not the gene was present.
            valsIndex += numConditions;
        }
        if (found) {
            clusteringExpressionSet.add(vals);
            labels.add(geneId);
        }
    }

    tm.setStatusMessage("Calculating Distance");
    // Size the matrix by the rows actually being clustered: genes without expression were
    // skipped above, so genes.size() could exceed the number of rows and labels.
    DistanceMatrix distanceMatrix = new DistanceMatrix(clusteringExpressionSet.size());
    distanceMatrix.calcDistances(clusteringExpressionSet, distanceMetric);
    distanceMatrix.setLabels(labels);

    tm.setStatusMessage("Clustering");
    AvgLinkHierarchicalClustering clusterResult = new AvgLinkHierarchicalClustering(distanceMatrix);
    // With more than 1000 genes use eisen ordering, otherwise use bar-joseph (optimal leaf ordering).
    clusterResult.setOptimalLeafOrdering(genes.size() <= 1000);
    clusterResult.run();

    tm.setStatusMessage("Ranking");
    Map<Integer, RankValue> ranks = new HashMap<>();
    int[] order = clusterResult.getLeafOrder();
    for (int i = 0; i < order.length; i++) {
        Integer geneId = labels.get(order[i]);
        ranks.put(geneId, new RankValue(i + 1, null, false));
    }
    tm.setStatusMessage("");
    return ranks;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AvgLinkHierarchicalClustering(org.baderlab.csplugins.brainlib.AvgLinkHierarchicalClustering) RankValue(org.baderlab.csplugins.enrichmentmap.view.heatmap.table.RankValue) GeneExpressionMatrix(org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix) EMDataSet(org.baderlab.csplugins.enrichmentmap.model.EMDataSet) DistanceMatrix(org.baderlab.csplugins.brainlib.DistanceMatrix) GeneExpression(org.baderlab.csplugins.enrichmentmap.model.GeneExpression) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Example 8 with GeneExpressionMatrix

use of org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix in project EnrichmentMapApp by BaderLab.

the class DataSetColorRange method create.

/**
 * Builds the color range (gradient theme plus value range) for a data set under the
 * given data transformation.
 *
 * <p>The lower and upper bounds are derived from the data set's expression matrix
 * according to {@code transform}. When the lower bound is non-negative a one-color
 * gradient over {@code [0, upper]} is returned; otherwise a two-color gradient that is
 * symmetric around zero, {@code [-upper, upper]}, is returned.
 *
 * @param ds        data set whose expression matrix supplies the min/max values
 * @param transform transformation applied to the expression values
 * @return the color range for the transformed data
 */
public static DataSetColorRange create(EMDataSet ds, Transform transform) {
    GeneExpressionMatrix matrix = ds.getExpressionSets();
    double rawMin = matrix.getMinExpression();
    double rawMax = matrix.getMaxExpression();

    double lower;
    double upper;
    switch (transform) {
        case ROW_NORMALIZE: {
            lower = matrix.getMinExpression(matrix.getExpressionMatrix_rowNormalized());
            double normalizedMax = matrix.getMaxExpression(matrix.getExpressionMatrix_rowNormalized());
            // Both bounds are 0 when there is only one data column in the data set (or
            // when it is a rank file); row normalization is meaningless in that case.
            upper = Math.max(Math.abs(lower), normalizedMax);
            break;
        }
        case LOG_TRANSFORM:
            if (rawMin <= 0 && rawMax <= 0) {
                // Min and max are probably both negative; the log of a negative number
                // is not valid, so collapse the range.
                lower = 0;
                upper = 0;
            } else if (rawMin <= 0) {
                // Min is not positive: bound the range by the logs of the value closest
                // to zero and of the max.
                double logClosest = Math.log(matrix.getClosesttoZero());
                double logMax = Math.log1p(rawMax);
                lower = Math.min(logClosest, logMax);
                upper = Math.max(logClosest, logMax);
            } else if (rawMax <= 0) {
                // Max is not positive while min is positive (should never happen).
                lower = 0;
                upper = Math.log1p(rawMin);
            } else {
                lower = Math.log1p(rawMin);
                upper = Math.max(Math.abs(lower), Math.log1p(rawMax));
            }
            break;
        case AS_IS:
        default:
            lower = rawMin;
            upper = Math.max(Math.abs(rawMin), rawMax);
            break;
    }

    if (lower >= 0) {
        double midpoint = upper / 2;
        ColorGradientRange oneSided = ColorGradientRange.getInstance(0, midpoint, midpoint, upper, 0, midpoint, midpoint, upper);
        ColorGradientTheme oneColor = ColorGradientTheme.GREEN_ONECOLOR_GRADIENT_THEME;
        return new DataSetColorRange(oneColor, oneSided);
    }

    ColorGradientRange symmetric = ColorGradientRange.getInstance(-upper, 0, 0, upper, -upper, 0, 0, upper);
    ColorGradientTheme twoColor = ColorGradientTheme.PR_GN_GRADIENT_THEME;
    return new DataSetColorRange(twoColor, symmetric);
}
Also used : GeneExpressionMatrix(org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix) ColorGradientRange(org.mskcc.colorgradient.ColorGradientRange) ColorGradientTheme(org.mskcc.colorgradient.ColorGradientTheme)

Aggregations

GeneExpressionMatrix (org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix)8 GeneExpression (org.baderlab.csplugins.enrichmentmap.model.GeneExpression)5 EMDataSet (org.baderlab.csplugins.enrichmentmap.model.EMDataSet)3 EnrichmentMap (org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap)3 HashMap (java.util.HashMap)2 DataSetFiles (org.baderlab.csplugins.enrichmentmap.model.DataSetFiles)2 GeneSet (org.baderlab.csplugins.enrichmentmap.model.GeneSet)2 Rank (org.baderlab.csplugins.enrichmentmap.model.Rank)2 Ranking (org.baderlab.csplugins.enrichmentmap.model.Ranking)2 SetOfEnrichmentResults (org.baderlab.csplugins.enrichmentmap.model.SetOfEnrichmentResults)2 NullTaskMonitor (org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)2 CyNetwork (org.cytoscape.model.CyNetwork)2 Inject (com.google.inject.Inject)1 File (java.io.File)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 Collections (java.util.Collections)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 List (java.util.List)1