Search in sources :

Example 6 with NullTaskMonitor

use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.

the class ParseDavidEnrichmentResults method parseLines.

/**
 * Parses a DAVID enrichment results file.
 *
 * DAVID result files do not come with separate gene set definitions, so each
 * row is used both to build (or extend) a {@link GeneSet} and to create a
 * {@link GenericResult} for it. When the same gene set name occurs more than
 * once, the gene lists are merged and the result with the lower p-value is
 * retained (ticket #149).
 *
 * @param lines       the lines of the file; the first line is the header row
 * @param dataset     the data set whose gene sets, genes and enrichment
 *                    results are updated in place
 * @param taskMonitor progress monitor; may be {@code null}
 * @throws IllegalThreadStateException if the file is empty, the header does
 *         not have 13 columns, or a data row has fewer than 12 columns
 */
@Override
public void parseLines(List<String> lines, EMDataSet dataset, TaskMonitor taskMonitor) {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    taskMonitor.setTitle("Parsing David Enrichment Result file");
    // Columns of a David file:
    // Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
    // Count      = number of genes from the input list mapping to a specific term
    // List Total = number of genes in the gene list mapping to the category (i.e. GO Cellular Component)
    // Pop Hits   = number of genes in the background gene list mapping to a specific term
    // Pop Total  = number of genes in the background gene list mapping to the category
    // Column 1 is the category (used as the description)
    // Column 2 is the gene set name
    // Column 6 is the list of genes (from the loaded list) in this gene set -- therefore pre-filtered.
    Map<String, GeneSet> genesets = dataset.getSetOfGeneSets().getGeneSets();
    EnrichmentMap map = dataset.getMap();
    Map<String, EnrichmentResult> results = dataset.getEnrichments().getEnrichments();
    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing David Results file - " + maxValue + " rows");

    // The first line only holds the field names; use it to validate the column count.
    if (lines.isEmpty())
        throw new IllegalThreadStateException("David results file is missing data.");
    if (lines.get(0).split("\t").length != 13)
        throw new IllegalThreadStateException("David results file is missing data.");

    for (int i = 1; i < maxValue; i++) {
        String line = lines.get(i);
        // tolerate blank lines (e.g. a trailing newline at the end of the file)
        if (line.trim().isEmpty())
            continue;
        // A negative limit keeps trailing empty fields; without it a row whose last
        // columns are blank would be shorter than expected and indexing would fail.
        String[] tokens = line.split("\t", -1);
        if (tokens.length < 12)
            throw new IllegalThreadStateException("David results file is missing data (line " + (i + 1) + " has "
                    + tokens.length + " columns, expected at least 12).");

        //The second column of the file is the name of the geneset
        final String name = tokens[1].toUpperCase().trim();
        //the first column of the file is the description
        final String description = tokens[0].toUpperCase();

        //when there are two different species it is possible that the gene set could
        //already exist in the set of genesets.  if it does exist then add the genes
        //in this row to the existing geneset
        ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
        if (genesets.containsKey(name))
            builder.addAll(genesets.get(name).getGenes());

        for (String geneToken : tokens[5].split(", ")) {
            String gene = geneToken.toUpperCase();
            if (map.containsGene(gene)) {
                //already known: reuse its key
                builder.add(map.getHashFromGene(gene));
            } else if (!gene.isEmpty()) {
                Integer hash = map.addGene(gene).get();
                builder.add(hash);
            }
        }
        //finished parsing that geneset; add it to the map of genesets
        genesets.put(name, new GeneSet(name, description, builder.build()));

        //Empty cells keep their defaults.
        //The 5th column is the nominal p-value
        double pvalue = tokens[4].isEmpty() ? 1.0 : Double.parseDouble(tokens[4]);
        //the Count is the size of the geneset (restricted by the gene list)
        int gs_size = tokens[2].isEmpty() ? 0 : Integer.parseInt(tokens[2]);
        //Use the Benjamini value (12th column) for the fdr
        double FDRqvalue = tokens[11].isEmpty() ? 1.0 : Double.parseDouble(tokens[11]);
        GenericResult result = new GenericResult(name, description, pvalue, gs_size, FDRqvalue);

        // Calculate Percentage as a value between 0..100.
        // NOTE(review): confirm the scale expected by this TaskMonitor's setProgress.
        int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
        taskMonitor.setProgress(percentComplete);
        currentProgress++;

        //it is possible that one geneset will be in both phenotypes.
        //if it already exists then retain the result with the lower p-value.
        //ticket #149
        GenericResult existing = (GenericResult) results.get(name);
        if (existing == null || result.getPvalue() < existing.getPvalue())
            results.put(name, result);
    }
    // David files always provide an FDR column.
    map.getParams().setFDR(true);
}
Also used : EnrichmentResult(org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult) EnrichmentMap(org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap) GenericResult(org.baderlab.csplugins.enrichmentmap.model.GenericResult) ImmutableSet(com.google.common.collect.ImmutableSet) GeneSet(org.baderlab.csplugins.enrichmentmap.model.GeneSet) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Example 7 with NullTaskMonitor

use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.

the class RanksFileReaderTask method parse.

/**
 * Parses the rank file and registers the resulting ranking on the data set's
 * expression sets under {@code ranks_name}.
 *
 * Two layouts are accepted: five columns (name, description, empty, empty,
 * score) as generated by GSEA, or two columns (name, score). Lines starting
 * with {@code #} are comments; a comment of the form
 * {@code # Ranks Name : My Ranks} sets the name of the ranking. A single row
 * with a non-numeric score is tolerated and skipped as the header row. Rows
 * with any other number of columns are reported on standard output and
 * skipped.
 *
 * @param taskMonitor progress monitor; may be {@code null}
 * @throws IOException if the rank file cannot be read
 * @throws IllegalThreadStateException if a second row has a non-numeric
 *         score, or if none of the genes in the rank file are known to the map
 */
public void parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    List<String> lines = DatasetLineParser.readLines(RankFileName);
    // becomes true once the one permitted non-numeric (header) row has been skipped
    boolean headerSkipped = false;
    int currentProgress = 0;
    int maxValue = lines.size();
    taskMonitor.setStatusMessage("Parsing Rank file - " + maxValue + " rows");
    EnrichmentMap map = dataset.getMap();
    // we don't know the number of scores in the rank file yet, but it can't be more than the number of lines.
    Double[] score_collector = new Double[maxValue];
    boolean gseaDefinedRanks = false;
    Map<Integer, Rank> ranks = new HashMap<>();
    //number of found scores
    int nScores = 0;
    for (String line : lines) {
        //check to see if the line is commented out and should be ignored.
        if (line.startsWith("#")) {
            // look for ranks_name in comment line e.g.: "# Ranks Name : My Ranks"
            if (Pattern.matches("^# *Ranks[ _-]?Name *:.+", line)) {
                String rawName = line.split(":", 2)[1];
                // strip leading spaces only
                while (rawName.startsWith(" "))
                    rawName = rawName.substring(1);
                this.ranks_name = rawName;
            }
            continue;
        }
        String[] tokens = line.split("\t");
        // the score is the last column in both supported layouts
        final int scoreColumn;
        if (tokens.length == 5) {
            scoreColumn = 4;
        } else if (tokens.length == 2) {
            scoreColumn = 1;
        } else {
            System.out.println("Invalid number of tokens line of Rank File (should be 5 or 2)");
            //skip invalid line
            continue;
        }
        String name = tokens[0].toUpperCase();
        double score;
        try {
            score = Double.parseDouble(tokens[scoreColumn]);
        } catch (NumberFormatException nfe) {
            if (!headerSkipped) {
                headerSkipped = true;
                continue;
            }
            throw new IllegalThreadStateException("rank value for " + tokens[0] + " is not a valid number");
        }
        nScores++;
        // In these cases the position of the row in the file defines the rank.
        boolean rankFromFileOrder = (tokens.length == 5) || (dataset.getMethod() == Method.GSEA && !loadFromHeatmap);
        if (rankFromFileOrder)
            gseaDefinedRanks = true;
        //add score to array of scores
        score_collector[nScores - 1] = score;
        //only keep ranks for genes that are already known to the map
        Integer genekey = map.getHashFromGene(name);
        if (genekey != null) {
            Rank current_ranking = rankFromFileOrder ? new Rank(name, score, nScores) : new Rank(name, score);
            ranks.put(genekey, current_ranking);
        }
        // Calculate Percentage as a value between 0..100.
        // NOTE(review): confirm the scale expected by this TaskMonitor's setProgress.
        int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
        taskMonitor.setProgress(percentComplete);
        currentProgress++;
    }
    //none of the genes are in the gene list
    if (ranks.isEmpty()) {
        throw new IllegalThreadStateException("None of the genes in the rank file are found in the expression file.  Make sure the identifiers of the two files match.");
    }
    //drop the unused (null) tail of the collector
    Double[] sort_scores = Arrays.copyOf(score_collector, nScores);
    //sort the scores in descending order to compute ranks
    Arrays.sort(sort_scores, Collections.reverseOrder());
    //if every score lies within [-1, 1] the scores are sorted in ascending order instead
    // NOTE(review): presumably this targets p-value-like scores where smaller is better -- confirm.
    if (sort_scores[0] <= 1 && sort_scores[sort_scores.length - 1] >= -1)
        Arrays.sort(sort_scores);
    //map each score to the first position at which it occurs in the sorted array
    Map<Double, Integer> score2ranks = new HashMap<>();
    for (int j = 0; j < sort_scores.length; j++)
        score2ranks.putIfAbsent(sort_scores[j], j);
    //only update the ranks if we haven't already defined them using order of scores in file
    if (!gseaDefinedRanks) {
        for (Rank current_ranking : ranks.values()) {
            Integer rank = score2ranks.get(current_ranking.getScore());
            current_ranking.setRank(rank);
        }
    }
    // TODO: dataset genes that have no rank in this file are currently not reported to the user.
    //create a new Ranking
    Ranking new_ranking = new Ranking();
    ranks.forEach(new_ranking::addRank);
    //add the Ranks to the expression file ranking
    dataset.getExpressionSets().addRanks(ranks_name, new_ranking);
}
Also used : HashMap(java.util.HashMap) Rank(org.baderlab.csplugins.enrichmentmap.model.Rank) EnrichmentMap(org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap) Ranking(org.baderlab.csplugins.enrichmentmap.model.Ranking) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor) HashSet(java.util.HashSet)

Example 8 with NullTaskMonitor

use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.

the class ExpressionFileReaderTask method parse.

/**
 * Parses the expression file of the data set into its
 * {@link GeneExpressionMatrix}.
 *
 * The file named by the data set's expression sets is read line by line.
 * The header row supplies the column names; for a file starting with
 * {@code #1.2} the header is the third line, otherwise it is the first line
 * that does not start with {@code #}. A two-column file whose first
 * non-comment row already holds a numeric second column (a rank file without
 * a title row) gets the default column names "Name" and "Rank/Score" and
 * that row is then read as data. Expression values are only stored for genes
 * already known to the map. Finally the matrix is row normalized.
 *
 * @param taskMonitor progress monitor; may be {@code null}
 * @return the populated expression matrix of the data set
 * @throws IOException if the file cannot be read or contains no header line
 */
public GeneExpressionMatrix parse(TaskMonitor taskMonitor) throws IOException {
    if (taskMonitor == null)
        taskMonitor = new NullTaskMonitor();
    //The file specified as an expression file may actually be a rank file (5 or 2 columns).
    //The rank value then has to be placed in the expression matrix as if it was an
    //expression value in order for other features to work.  Old session files may also
    //have imported a rank file, so a two column file whose second column is a double
    //is treated as expression.
    boolean twoColumns = false;
    Set<Integer> datasetGenes = dataset.getDataSetGenes();
    EnrichmentMap map = dataset.getMap();
    String expressionFileName = dataset.getExpressionSets().getFilename();
    List<String> lines = DatasetLineParser.readLines(expressionFileName);
    int currentProgress = 0;
    int maxValue = lines.size();
    int expressionUniverse = 0;
    taskMonitor.setStatusMessage("Parsing GCT file - " + maxValue + " rows");
    GeneExpressionMatrix expressionMatrix = dataset.getExpressionSets();
    Map<Integer, GeneExpression> expression = expressionMatrix.getExpressionMatrix();
    for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        String[] tokens = line.split("\t");
        //The first column of the file is the name of the gene
        String name = tokens[0].toUpperCase().trim();
        //the matrix has neither data nor column names yet: locate and read the header
        if (i == 0 && expressionMatrix.getExpressionMatrix().isEmpty() && expressionMatrix.getColumnNames() == null) {
            int headerIndex;
            if (name.equalsIgnoreCase("#1.2")) {
                //the header is the third line
                headerIndex = 2;
            } else {
                //ignore all leading comment lines
                headerIndex = 0;
                while (headerIndex < lines.size() && lines.get(headerIndex).startsWith("#"))
                    headerIndex++;
            }
            if (headerIndex >= lines.size())
                throw new IOException("Expression file " + expressionFileName + " does not contain a header line.");
            line = lines.get(headerIndex);
            i = headerIndex;
            tokens = line.split("\t");
            //check to see if the second column contains expression values.
            if (tokens.length == 2) {
                twoColumns = true;
                //if we are loading a GSEA edb rnk file then there might not be column names
                boolean numeric;
                try {
                    Integer.parseInt(tokens[1]);
                    numeric = true;
                } catch (NumberFormatException notAnInt) {
                    try {
                        Double.parseDouble(tokens[1]);
                        numeric = true;
                    } catch (NumberFormatException notADouble) {
                        //if it isn't a double or int then we have a title line.
                        numeric = false;
                    }
                }
                if (numeric) {
                    //no title line: use default headings and restart so this row is read as data.
                    // NOTE(review): restarting at the first line also re-reads any leading
                    // comment lines as data rows (they count towards the expression universe).
                    i = -1;
                    tokens[0] = "Name";
                    tokens[1] = "Rank/Score";
                }
            }
            expressionMatrix.setColumnNames(tokens);
            expressionMatrix.setNumConditions(expressionMatrix.getColumnNames().length);
            expressionMatrix.setExpressionMatrix(expression);
            continue;
        }
        //Currently we only load gene expression data for genes that are already in the gene list (i.e. are listed in at least one geneset)
        //TODO:is there the possibility that we need all the expression genes?  Currently this greatly decreases space when saving sessions
        Integer genekey = map.getHashFromGene(name);
        if (genekey != null) {
            //we want the genes hashmap and dataset genes hashmap to have the same keys so it is easier to compare.
            datasetGenes.add(genekey);
            String description = "";
            //a row without a second column has no description
            if (tokens.length > 1) {
                if (twoColumns) {
                    //in a two column file the second column is only a description if it is not a number
                    try {
                        Double.parseDouble(tokens[1]);
                    } catch (NumberFormatException e) {
                        description = tokens[1];
                    }
                } else {
                    description = tokens[1];
                }
            }
            GeneExpression expres = new GeneExpression(name, description);
            expres.setExpression(tokens);
            // the matrix-wide values are only updated when the helper does not return -100
            double newMax = expres.newMax(expressionMatrix.getMaxExpression());
            if (newMax != -100)
                expressionMatrix.setMaxExpression(newMax);
            double newMin = expres.newMin(expressionMatrix.getMinExpression());
            if (newMin != -100)
                expressionMatrix.setMinExpression(newMin);
            double newClosest = expres.newclosesttoZero(expressionMatrix.getClosesttoZero());
            if (newClosest != -100)
                expressionMatrix.setClosesttoZero(newClosest);
            expression.put(genekey, expres);
        }
        expressionUniverse++;
        // Calculate Percentage as a value between 0..100.
        // NOTE(review): confirm the scale expected by this TaskMonitor's setProgress.
        int percentComplete = (int) (((double) currentProgress / maxValue) * 100);
        taskMonitor.setProgress(percentComplete);
        currentProgress++;
    }
    //set the number of genes
    expressionMatrix.setExpressionUniverse(expressionUniverse);
    //row Normalize expressionset
    expressionMatrix.rowNormalizeMatrix();
    return expressionMatrix;
    //TODO: initialize phenotypes associated with class files from expression file load
}
Also used : EnrichmentMap(org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap) GeneExpressionMatrix(org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix) GeneExpression(org.baderlab.csplugins.enrichmentmap.model.GeneExpression) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Example 9 with NullTaskMonitor

use of org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor in project EnrichmentMapApp by BaderLab.

the class HierarchicalClusterTask method cluster.

/**
 * Hierarchically clusters the genes by their expression values and ranks
 * them by their position in the resulting leaf order.
 *
 * For every gene one row is built that concatenates its expression values
 * from all data sets of the map; data sets that have no expression for the
 * gene contribute zeros. Genes without expression in any data set are left
 * out of the clustering and of the returned map.
 *
 * @param tm progress monitor; may be {@code null}
 * @return a map from gene id to its rank, where ranks start at 1 and follow
 *         the leaf order of the clustering
 */
public Map<Integer, RankValue> cluster(TaskMonitor tm) {
    if (tm == null)
        tm = new NullTaskMonitor();
    tm.setTitle("Hierarchical Cluster");
    tm.setStatusMessage("Loading expression data");
    List<double[]> clusteringExpressionSet = new ArrayList<>(genes.size());
    // labels.get(k) is the gene id of row k of clusteringExpressionSet
    ArrayList<Integer> labels = new ArrayList<>(genes.size());
    List<EMDataSet> dataSets = map.getDataSetList();
    final int expressionCount = getTotalExpressionCount(dataSets);
    for (int geneId : genes) {
        // values all default to 0.0
        double[] vals = new double[expressionCount];
        int valsIndex = 0;
        boolean found = false;
        for (EMDataSet dataSet : dataSets) {
            GeneExpressionMatrix expressionSets = dataSet.getExpressionSets();
            // NOTE(review): presumably the two leading columns (name, description) are not conditions -- confirm.
            int numConditions = expressionSets.getNumConditions() - 2;
            GeneExpression geneExpression = expressionSets.getExpressionMatrix().get(geneId);
            if (geneExpression != null) {
                found = true;
                double[] expression = geneExpression.getExpression();
                System.arraycopy(expression, 0, vals, valsIndex, expression.length);
            }
            // always advance so that each data set keeps its own segment of the row
            valsIndex += numConditions;
        }
        if (found) {
            clusteringExpressionSet.add(vals);
            labels.add(geneId);
        }
    }
    tm.setStatusMessage("Calculating Distance");
    // NOTE(review): the matrix is sized by genes.size() although only genes with expression
    // data were added above -- confirm DistanceMatrix tolerates the difference.
    DistanceMatrix distanceMatrix = new DistanceMatrix(genes.size());
    distanceMatrix.calcDistances(clusteringExpressionSet, distanceMetric);
    distanceMatrix.setLabels(labels);
    tm.setStatusMessage("Clustering");
    AvgLinkHierarchicalClustering clusterResult = new AvgLinkHierarchicalClustering(distanceMatrix);
    //optimal leaf ordering is only enabled for up to 1000 genes
    clusterResult.setOptimalLeafOrdering(genes.size() <= 1000);
    clusterResult.run();
    tm.setStatusMessage("Ranking");
    Map<Integer, RankValue> ranks = new HashMap<>();
    int[] order = clusterResult.getLeafOrder();
    for (int i = 0; i < order.length; i++) {
        Integer geneId = labels.get(order[i]);
        // ranks are 1-based positions in the leaf order
        ranks.put(geneId, new RankValue(i + 1, null, false));
    }
    tm.setStatusMessage("");
    return ranks;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AvgLinkHierarchicalClustering(org.baderlab.csplugins.brainlib.AvgLinkHierarchicalClustering) RankValue(org.baderlab.csplugins.enrichmentmap.view.heatmap.table.RankValue) GeneExpressionMatrix(org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix) EMDataSet(org.baderlab.csplugins.enrichmentmap.model.EMDataSet) DistanceMatrix(org.baderlab.csplugins.brainlib.DistanceMatrix) GeneExpression(org.baderlab.csplugins.enrichmentmap.model.GeneExpression) NullTaskMonitor(org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)

Aggregations

NullTaskMonitor (org.baderlab.csplugins.enrichmentmap.util.NullTaskMonitor)9 EnrichmentMap (org.baderlab.csplugins.enrichmentmap.model.EnrichmentMap)6 EnrichmentResult (org.baderlab.csplugins.enrichmentmap.model.EnrichmentResult)6 GeneSet (org.baderlab.csplugins.enrichmentmap.model.GeneSet)5 ImmutableSet (com.google.common.collect.ImmutableSet)4 GenericResult (org.baderlab.csplugins.enrichmentmap.model.GenericResult)4 HashMap (java.util.HashMap)2 EMDataSet (org.baderlab.csplugins.enrichmentmap.model.EMDataSet)2 GeneExpression (org.baderlab.csplugins.enrichmentmap.model.GeneExpression)2 GeneExpressionMatrix (org.baderlab.csplugins.enrichmentmap.model.GeneExpressionMatrix)2 Ranking (org.baderlab.csplugins.enrichmentmap.model.Ranking)2 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 AvgLinkHierarchicalClustering (org.baderlab.csplugins.brainlib.AvgLinkHierarchicalClustering)1 DistanceMatrix (org.baderlab.csplugins.brainlib.DistanceMatrix)1 EMCreationParameters (org.baderlab.csplugins.enrichmentmap.model.EMCreationParameters)1 GreatFilter (org.baderlab.csplugins.enrichmentmap.model.EMCreationParameters.GreatFilter)1 GSEAResult (org.baderlab.csplugins.enrichmentmap.model.GSEAResult)1 Rank (org.baderlab.csplugins.enrichmentmap.model.Rank)1 SetOfEnrichmentResults (org.baderlab.csplugins.enrichmentmap.model.SetOfEnrichmentResults)1