Search in sources :

Example 91 with Gene

use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.

the class DifferentialExpressionSearchController method getDifferentialExpressionMetaAnalysis.

/**
 * Returns the results of the meta-analysis.
 */
private DifferentialExpressionMetaAnalysisValueObject getDifferentialExpressionMetaAnalysis(Long geneId, Collection<DiffExpressionSelectedFactorCommand> selectedFactors, double threshold) {
    Gene g = geneService.load(geneId);
    if (g == null) {
        log.warn("No Gene with id=" + geneId);
        return null;
    }
    /* find experiments that have had the diff cli run on it and have the gene g (analyzed) - security filtered. */
    Collection<BioAssaySet> experimentsAnalyzed = differentialExpressionAnalysisService.findExperimentsWithAnalyses(g);
    if (experimentsAnalyzed.size() == 0) {
        throw new EntityNotFoundException("No results were found: no experiment analyzed those genes");
    }
    /* the 'chosen' factors (and their associated experiments) */
    Map<Long, Long> eeFactorsMap = new HashMap<>();
    for (DiffExpressionSelectedFactorCommand selectedFactor : selectedFactors) {
        Long eeId = selectedFactor.getEeId();
        eeFactorsMap.put(eeId, selectedFactor.getEfId());
        if (log.isDebugEnabled())
            log.debug(eeId + " --> " + selectedFactor.getEfId());
    }
    /*
         * filter experiments that had the diff cli run on it and are in the scope of eeFactorsMap eeIds
         * (active/available to the user).
         */
    Collection<BioAssaySet> activeExperiments;
    if (eeFactorsMap.keySet() == null || eeFactorsMap.isEmpty()) {
        activeExperiments = experimentsAnalyzed;
    } else {
        activeExperiments = new ArrayList<>();
        for (BioAssaySet ee : experimentsAnalyzed) {
            if (eeFactorsMap.keySet().contains(ee.getId())) {
                activeExperiments.add(ee);
            }
        }
    }
    if (activeExperiments.isEmpty()) {
        throw new EntityNotFoundException("No results were found: none of the experiments selected analyzed those genes");
    }
    return geneDifferentialExpressionService.getDifferentialExpressionMetaAnalysis(threshold, g, eeFactorsMap, activeExperiments);
}
Also used : DiffExpressionSelectedFactorCommand(ubic.gemma.core.analysis.expression.diff.DiffExpressionSelectedFactorCommand) Gene(ubic.gemma.model.genome.Gene) EntityNotFoundException(ubic.gemma.web.util.EntityNotFoundException)

Example 92 with Gene

use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.

the class BlatAssociationScorer method clusterGenes.

/**
 * @param associations assocs.
 * @return map of physical locations for the alignments, and which genes are found there.
 */
private static Map<PhysicalLocation, Collection<Gene>> clusterGenes(Map<Gene, Collection<BlatAssociation>> associations) {
    Map<PhysicalLocation, Collection<Gene>> clusters = new HashMap<>();
    for (Gene gene : associations.keySet()) {
        Collection<BlatAssociation> geneAssoc = associations.get(gene);
        for (BlatAssociation ba : geneAssoc) {
            PhysicalLocation pl = ba.getBlatResult().getTargetAlignedRegion();
            if (!clusters.containsKey(pl)) {
                clusters.put(pl, new HashSet<Gene>());
            }
            clusters.get(pl).add(gene);
        }
    }
    // debugging information about clusters.
    if (BlatAssociationScorer.log.isDebugEnabled()) {
        for (PhysicalLocation pl : clusters.keySet()) {
            if (clusters.get(pl).size() > 1) {
                BlatAssociationScorer.log.debug("Cluster at " + pl + " with " + clusters.get(pl).size() + " members:\n" + StringUtils.join(clusters.get(pl).iterator(), "\n"));
            }
        }
    }
    return clusters;
}
Also used : Gene(ubic.gemma.model.genome.Gene) HashMap(java.util.HashMap) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 93 with Gene

use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.

the class BlatAssociationScorer method scoreResults.

/**
 * From a collection of BlatAssociations from a single BioSequence, reduce redundancy, fill in the specificity and
 * score and pick the one with the best scoring statistics.
 * This is a little complicated because a single sequence can yield many BlatResults to the same gene and/or gene
 * product. We reduce the results down to a single (best) result for any given gene product. We also score
 * specificity by the gene: if a sequence 'hits' multiple genes, then the specificity of the generated associations
 * will be less than 1.
 *
 * @param blatAssociations for a single sequence.
 * @return the highest-scoring result (if there are ties this will be a random one). Note that this return value is
 * not all that useful because it assumes there is a "clear winner". The passed-in blatAssociations will be
 * pruned to remove redundant entries, and will have score information filled in as well. It is intended
 * that these 'refined' BlatAssociations will be used in further analysis.
 * @throws IllegalArgumentException if the blatAssociations are from multiple biosequences.
 */
public static BlatAssociation scoreResults(Collection<BlatAssociation> blatAssociations) {
    Map<GeneProduct, Collection<BlatAssociation>> geneProducts2Associations = BlatAssociationScorer.organizeBlatAssociationsByGeneProductAndInitializeScores(blatAssociations);
    BlatAssociation globalBest = BlatAssociationScorer.removeExtraHitsPerGeneProduct(blatAssociations, geneProducts2Associations);
    assert blatAssociations.size() > 0;
    Map<Gene, Collection<BlatAssociation>> genes2Associations = BlatAssociationScorer.organizeBlatAssociationsByGene(blatAssociations);
    assert genes2Associations.size() > 0;
    /*
         * At this point there should be just one blatAssociation per gene product. However, all of these really might
         * be for the same gene. It is only in the case of truly multiple genes that we flag a lower specificity.
         */
    if (genes2Associations.size() == 1) {
        return globalBest;
    }
    Map<PhysicalLocation, Collection<Gene>> geneClusters = BlatAssociationScorer.clusterGenes(genes2Associations);
    // compute specificity at the level of genes. First, get the best score for each gene cluster.
    Map<PhysicalLocation, Double> scores = new HashMap<>();
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double geneScore = 0.0;
        for (Gene cgene : geneClusters.get(pl)) {
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                Double alignScore = blatAssociation.getScore();
                if (alignScore > geneScore) {
                    geneScore = alignScore;
                }
            }
        }
        scores.put(pl, geneScore);
    }
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double alignScore = scores.get(pl);
        for (Gene cgene : geneClusters.get(pl)) {
            // All members of the cluster get the same specificity.
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                blatAssociation.setSpecificity(BlatAssociationScorer.computeSpecificity(scores.values(), alignScore));
            }
        }
    }
    return globalBest;
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashMap(java.util.HashMap) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 94 with Gene

use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.

the class LinearModelAnalyzer method computeHitListSizes.

@Override
public Collection<HitListSize> computeHitListSizes(Collection<DifferentialExpressionAnalysisResult> results, Map<CompositeSequence, Collection<Gene>> probeToGeneMap) {
    Collection<HitListSize> hitListSizes = new HashSet<>();
    StopWatch timer = new StopWatch();
    timer.start();
    double maxThreshold = MathUtil.max(LinearModelAnalyzer.qValueThresholdsForHitLists);
    assert probeToGeneMap != null;
    Collection<Gene> allGenes = new HashSet<>();
    for (Collection<Gene> genes : probeToGeneMap.values()) {
        allGenes.addAll(genes);
    }
    // maps from Doubles are a bit dodgy...
    Map<Double, Integer> upCounts = new HashMap<>();
    Map<Double, Integer> downCounts = new HashMap<>();
    Map<Double, Integer> eitherCounts = new HashMap<>();
    Map<Double, Integer> upCountGenes = new HashMap<>();
    Map<Double, Integer> downCountGenes = new HashMap<>();
    Map<Double, Integer> eitherCountGenes = new HashMap<>();
    Collection<Gene> seenGenes = new HashSet<>();
    for (DifferentialExpressionAnalysisResult r : results) {
        Double corrP = r.getCorrectedPvalue();
        if (corrP == null || corrP > maxThreshold) {
            continue;
        }
        CompositeSequence probe = r.getProbe();
        Collection<Gene> genesForProbe = probeToGeneMap.get(probe);
        int numGenes = 0;
        if (genesForProbe != null) {
            numGenes = this.countNumberOfGenesNotSeenAlready(genesForProbe, seenGenes);
        }
        // if ( numGenes == 0 ) // This is okay; it might mean we already counted it as differentially expressed.
        Collection<ContrastResult> crs = r.getContrasts();
        boolean up = false;
        boolean down = false;
        /*
             * We set up and down to be true (either or both) if at least on contrast is shown.
             */
        for (ContrastResult cr : crs) {
            Double lf = cr.getLogFoldChange();
            // noinspection StatementWithEmptyBody // Better readability
            if (lf == null) {
            /*
                     * A contrast which is actually not valid, so it won't be counted in the hit list.
                     */
            } else if (lf < 0) {
                down = true;
            } else if (lf > 0) {
                up = true;
            }
        }
        for (double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists) {
            if (!upCounts.containsKey(thresh)) {
                upCounts.put(thresh, 0);
                upCountGenes.put(thresh, 0);
            }
            if (!downCounts.containsKey(thresh)) {
                downCounts.put(thresh, 0);
                downCountGenes.put(thresh, 0);
            }
            if (!eitherCounts.containsKey(thresh)) {
                eitherCounts.put(thresh, 0);
                eitherCountGenes.put(thresh, 0);
            }
            if (corrP < thresh) {
                if (up) {
                    upCounts.put(thresh, upCounts.get(thresh) + 1);
                    upCountGenes.put(thresh, upCountGenes.get(thresh) + numGenes);
                }
                if (down) {
                    downCounts.put(thresh, downCounts.get(thresh) + 1);
                    downCountGenes.put(thresh, downCountGenes.get(thresh) + numGenes);
                }
                eitherCounts.put(thresh, eitherCounts.get(thresh) + 1);
                eitherCountGenes.put(thresh, eitherCountGenes.get(thresh) + numGenes);
            }
        }
    }
    for (double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists) {
        // Ensure we don't set values to null.
        Integer up = upCounts.get(thresh) == null ? 0 : upCounts.get(thresh);
        Integer down = downCounts.get(thresh) == null ? 0 : downCounts.get(thresh);
        Integer either = eitherCounts.get(thresh) == null ? 0 : eitherCounts.get(thresh);
        Integer upGenes = upCountGenes.get(thresh) == null ? 0 : upCountGenes.get(thresh);
        Integer downGenes = downCountGenes.get(thresh) == null ? 0 : downCountGenes.get(thresh);
        Integer eitherGenes = eitherCountGenes.get(thresh) == null ? 0 : eitherCountGenes.get(thresh);
        assert !(allGenes.size() < upGenes || allGenes.size() < downGenes || allGenes.size() < eitherGenes) : "There were more genes differentially expressed than exist in the experiment";
        HitListSize upS = HitListSize.Factory.newInstance(thresh, up, Direction.UP, upGenes);
        HitListSize downS = HitListSize.Factory.newInstance(thresh, down, Direction.DOWN, downGenes);
        HitListSize eitherS = HitListSize.Factory.newInstance(thresh, either, Direction.EITHER, eitherGenes);
        hitListSizes.add(upS);
        hitListSizes.add(downS);
        hitListSizes.add(eitherS);
        assert upGenes <= eitherGenes : "More genes upregulated than upregulated+downregulated";
        assert downGenes <= eitherGenes : "More genes downregulated than upregulated+downregulated";
    }
    if (timer.getTime() > 1000) {
        LinearModelAnalyzer.log.info("Hitlist computation: " + timer.getTime() + "ms");
    }
    return hitListSizes;
}
Also used : CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) StopWatch(org.apache.commons.lang3.time.StopWatch) Gene(ubic.gemma.model.genome.Gene)

Example 95 with Gene

use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.

the class ArrayDesignAnnotationServiceImpl method parseAnnotationFile.

private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results, InputStream is, Map<String, Long> probeNameToId) {
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String line;
        while ((line = br.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith(ArrayDesignAnnotationServiceImpl.COMMENT_CHARACTER)) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length < 3)
                // means there are no gene annotations.
                continue;
            String probeName = fields[0];
            if (!probeNameToId.containsKey(probeName))
                continue;
            Long probeId = probeNameToId.get(probeName);
            List<String> geneSymbols = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[1], '|'));
            List<String> geneNames = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[2], '|'));
            if (geneSymbols.size() != geneNames.size()) {
                ArrayDesignAnnotationServiceImpl.log.warn("Annotation file format error: Unequal number of gene symbols and names for probe=" + probeName + ", skipping row");
                continue;
            }
            List<String> gemmaGeneIds = null;
            List<String> ncbiIds = null;
            if (fields.length > 4) {
                // new style. fields[3] is the GO annotations.
                gemmaGeneIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[4], '|'));
            }
            if (fields.length > 5) {
                ncbiIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[5], '|'));
            }
            for (int i = 0; i < geneSymbols.size(); i++) {
                String symbol = geneSymbols.get(i);
                String name = geneNames.get(i);
                if (StringUtils.isBlank(symbol)) {
                    continue;
                }
                String[] symbolsB = StringUtils.split(symbol, ',');
                String[] namesB = StringUtils.split(name, '$');
                for (int j = 0; j < symbolsB.length; j++) {
                    String s = symbolsB[j];
                    Gene g = Gene.Factory.newInstance();
                    g.setOfficialSymbol(s);
                    try {
                        if (gemmaGeneIds != null) {
                            g.setId(Long.parseLong(gemmaGeneIds.get(j)));
                        }
                        if (ncbiIds != null) {
                            g.setNcbiGeneId(Integer.parseInt(ncbiIds.get(j)));
                        }
                    } catch (NumberFormatException e) {
                    // oh well, couldn't populate extra info.
                    }
                    if (namesB.length >= j + 1) {
                        String n = namesB[j];
                        g.setName(n);
                    }
                    results.get(probeId).add(g);
                }
            }
        }
        return results;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : Gene(ubic.gemma.model.genome.Gene)

Aggregations

Gene (ubic.gemma.model.genome.Gene)186 Taxon (ubic.gemma.model.genome.Taxon)34 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)32 StopWatch (org.apache.commons.lang3.time.StopWatch)31 Test (org.junit.Test)24 HashSet (java.util.HashSet)23 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)20 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)18 Element (org.w3c.dom.Element)16 ArrayList (java.util.ArrayList)13 Transactional (org.springframework.transaction.annotation.Transactional)12 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)12 Collection (java.util.Collection)11 OntologyTerm (ubic.basecode.ontology.model.OntologyTerm)11 CharacteristicValueObject (ubic.gemma.model.genome.gene.phenotype.valueObject.CharacteristicValueObject)10 HashMap (java.util.HashMap)8 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)8 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)7 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)7 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)7