Search in sources :

Example 21 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class BlatAssociationScorer method scoreResults.

/**
 * From a collection of BlatAssociations from a single BioSequence, reduce redundancy, fill in the specificity and
 * score and pick the one with the best scoring statistics.
 * This is a little complicated because a single sequence can yield many BlatResults to the same gene and/or gene
 * product. We reduce the results down to a single (best) result for any given gene product. We also score
 * specificity by the gene: if a sequence 'hits' multiple genes, then the specificity of the generated associations
 * will be less than 1.
 *
 * @param blatAssociations for a single sequence.
 * @return the highest-scoring result (if there are ties this will be a random one). Note that this return value is
 * not all that useful because it assumes there is a "clear winner". The passed-in blatAssociations will be
 * pruned to remove redundant entries, and will have score information filled in as well. It is intended
 * that these 'refined' BlatAssociations will be used in further analysis.
 * @throws IllegalArgumentException if the blatAssociations are from multiple biosequences.
 */
public static BlatAssociation scoreResults(Collection<BlatAssociation> blatAssociations) {
    Map<GeneProduct, Collection<BlatAssociation>> geneProducts2Associations = BlatAssociationScorer.organizeBlatAssociationsByGeneProductAndInitializeScores(blatAssociations);
    BlatAssociation globalBest = BlatAssociationScorer.removeExtraHitsPerGeneProduct(blatAssociations, geneProducts2Associations);
    assert blatAssociations.size() > 0;
    Map<Gene, Collection<BlatAssociation>> genes2Associations = BlatAssociationScorer.organizeBlatAssociationsByGene(blatAssociations);
    assert genes2Associations.size() > 0;
    /*
         * At this point there should be just one blatAssociation per gene product. However, all of these really might
         * be for the same gene. It is only in the case of truly multiple genes that we flag a lower specificity.
         */
    if (genes2Associations.size() == 1) {
        return globalBest;
    }
    Map<PhysicalLocation, Collection<Gene>> geneClusters = BlatAssociationScorer.clusterGenes(genes2Associations);
    // compute specificity at the level of genes. First, get the best score for each gene cluster.
    Map<PhysicalLocation, Double> scores = new HashMap<>();
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double geneScore = 0.0;
        for (Gene cgene : geneClusters.get(pl)) {
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                Double alignScore = blatAssociation.getScore();
                if (alignScore > geneScore) {
                    geneScore = alignScore;
                }
            }
        }
        scores.put(pl, geneScore);
    }
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double alignScore = scores.get(pl);
        for (Gene cgene : geneClusters.get(pl)) {
            // All members of the cluster get the same specificity.
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                blatAssociation.setSpecificity(BlatAssociationScorer.computeSpecificity(scores.values(), alignScore));
            }
        }
    }
    return globalBest;
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashMap(java.util.HashMap) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 22 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class BlatAssociationScorer method removeExtraHitsPerGeneProduct.

/**
 * Compute scores and find the best one, for each gene product, removing all other hits (so there is just one per
 * gene product
 *
 * @param blatAssociations             blat assocs.
 * @param geneProduct2BlatAssociations gene prods 2 blat assocs
 * @return blat accoc
 */
private static BlatAssociation removeExtraHitsPerGeneProduct(Collection<BlatAssociation> blatAssociations, Map<GeneProduct, Collection<BlatAssociation>> geneProduct2BlatAssociations) {
    double globalMaxScore = 0.0;
    BlatAssociation globalBest = null;
    Collection<BlatAssociation> keepers = new HashSet<>();
    for (GeneProduct geneProduct : geneProduct2BlatAssociations.keySet()) {
        Collection<BlatAssociation> geneProductBlatAssociations = geneProduct2BlatAssociations.get(geneProduct);
        if (geneProductBlatAssociations.isEmpty())
            continue;
        BlatAssociation ba = geneProductBlatAssociations.iterator().next();
        // Find the best one. If there are ties it's arbitrary which one we pick.
        double maxScore = ba.getScore();
        BlatAssociation best = ba;
        for (BlatAssociation blatAssociation : geneProductBlatAssociations) {
            double score = blatAssociation.getScore();
            if (score >= maxScore) {
                maxScore = score;
                best = blatAssociation;
            }
        }
        // Remove the lower-scoring ones for this gene product
        Collection<BlatAssociation> toKeep = new HashSet<>();
        toKeep.add(best);
        keepers.add(best);
        geneProduct2BlatAssociations.put(geneProduct, toKeep);
        if (best.getScore() > globalMaxScore) {
            globalMaxScore = best.getScore();
            globalBest = best;
        }
    }
    blatAssociations.retainAll(keepers);
    return globalBest;
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) HashSet(java.util.HashSet)

Example 23 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class ProbeMapperTest method testLocateGene.

/*
     * Test based on U83843, should bring up CCT7 (NM_006429 and NM_001009570). Valid locations as of 2/2011 for hg19.
     * <a href="http://genome.ucsc.edu/cgi-bin/hgTracks?hgsid=79741184&hgt.out1=1.5x&position=chr2%3A73320308-73331929">here</a>
     * 73,461,405-73,480,144)
     */
public void testLocateGene() {
    Collection<GeneProduct> products = humangp.findRefGenesByLocation("2", 73461505L, 73462405L, "+");
    TestCase.assertEquals(6, products.size());
    GeneProduct gprod = products.iterator().next();
    // okay as of 1/2008.
    TestCase.assertEquals("CCT7", gprod.getGene().getOfficialSymbol());
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct)

Example 24 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class NCBIGeneLoadingTest method testGeneLoader.

@Test
public void testGeneLoader() throws Exception {
    NcbiGeneLoader loader = new NcbiGeneLoader(persisterHelper);
    loader.setTaxonService(taxonService);
    String geneInfoTestFile = "/data/loader/genome/gene/gene_info.human.sample";
    String gene2AccTestFile = "/data/loader/genome/gene/gene2accession.human.sample";
    String geneHistoryFile = "/data/loader/genome/gene/gene_history.human.sample";
    // threaded load
    Taxon ta = taxonService.findByCommonName("human");
    assertNotNull(ta);
    loader.load(FileTools.resourceToPath(geneInfoTestFile), FileTools.resourceToPath(gene2AccTestFile), FileTools.resourceToPath(geneHistoryFile), null, ta);
    // wait until the loader is done.
    while (!loader.isLoaderDone()) {
        Thread.sleep(100);
    }
    // loader is done.
    // check if it loaded elements to the database
    log.debug("Loader done with number of elements: " + loader.getLoadedGeneCount());
    assertEquals(51, loader.getLoadedGeneCount());
    // grab one gene and check its information
    // (depends on information in gene_info and gene2accession file
    // gene_info
    Collection<Gene> geneCollection = geneService.findByOfficialSymbol("A2M");
    assertEquals(1, geneCollection.size());
    g = geneCollection.iterator().next();
    g = geneService.thaw(g);
    Collection<GeneProduct> products = g.getProducts();
    Collection<String> expectedAccessions = new ArrayList<>();
    Collection<String> hasAccessions = new ArrayList<>();
    expectedAccessions.add("AB209614.2");
    expectedAccessions.add("AK307832.1");
    for (GeneProduct product : products) {
        Collection<DatabaseEntry> accessions = product.getAccessions();
        for (DatabaseEntry de : accessions) {
            String accession = de.getAccession();
            String accVersion = de.getAccessionVersion();
            hasAccessions.add(accession + "." + accVersion);
            log.debug(accession + "." + accVersion);
        }
    }
    assertEquals(12, hasAccessions.size());
    assertTrue(hasAccessions.containsAll(expectedAccessions));
    Taxon t = g.getTaxon();
    assertEquals(9606, t.getNcbiId().intValue());
    assertEquals(new Integer(2), g.getNcbiGeneId());
    /*
         * Test history change. One gene has been updated, from 7003 to 44444 (fake), and mimic adding ensembl
         */
    geneInfoTestFile = "/data/loader/genome/gene/gene_info.human.changed.sample";
    gene2AccTestFile = "/data/loader/genome/gene/gene2accession.human.changed.sample";
    String updatedHistory = "/data/loader/genome/gene/gene_history.human.changed.sample";
    String geneEnsemblFile = "/data/loader/genome/gene/gene2ensembl.human.sample";
    loader.load(FileTools.resourceToPath(geneInfoTestFile), FileTools.resourceToPath(gene2AccTestFile), FileTools.resourceToPath(updatedHistory), FileTools.resourceToPath(geneEnsemblFile), ta);
    // wait until the loader is done.
    while (!loader.isLoaderDone()) {
        Thread.sleep(100);
    }
    Collection<Gene> updatedTestGene = geneService.findByOfficialSymbol("TEAD1");
    assertEquals(1, updatedTestGene.size());
    g = updatedTestGene.iterator().next();
    assertEquals("7003", g.getPreviousNcbiId());
    assertEquals(new Integer(44444), g.getNcbiGeneId());
    g = geneService.findByNCBIId(1);
    assertEquals("ENSG00000121410", g.getEnsemblId());
    // test remove...
    geneProductService.remove(products);
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) Taxon(ubic.gemma.model.genome.Taxon) ArrayList(java.util.ArrayList) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 25 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class AnnotationAssociationDaoImpl method find.

@Override
public Collection<AnnotationAssociation> find(Gene gene) {
    if (gene.getProducts().size() == 0) {
        throw new IllegalArgumentException("Gene has no products");
    }
    Collection<AnnotationAssociation> result = new HashSet<>();
    for (GeneProduct geneProduct : gene.getProducts()) {
        BusinessKey.checkValidKey(geneProduct);
        Criteria queryObject = this.getSessionFactory().getCurrentSession().createCriteria(AnnotationAssociation.class);
        Criteria innerQuery = queryObject.createCriteria("geneProduct");
        if (StringUtils.isNotBlank(geneProduct.getNcbiGi())) {
            innerQuery.add(Restrictions.eq("ncbiGi", geneProduct.getNcbiGi()));
        }
        if (StringUtils.isNotBlank(geneProduct.getName())) {
            innerQuery.add(Restrictions.eq("name", geneProduct.getName()));
        }
        // noinspection unchecked
        result.addAll(queryObject.list());
    }
    return result;
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) Criteria(org.hibernate.Criteria) HashSet(java.util.HashSet)

Aggregations

GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)41 Gene (ubic.gemma.model.genome.Gene)20 HashSet (java.util.HashSet)16 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)12 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)8 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)8 Test (org.junit.Test)6 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)5 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)5 AnnotationAssociation (ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation)5 HashMap (java.util.HashMap)4 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)4 Criteria (org.hibernate.Criteria)3 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Collection (java.util.Collection)2 GeneProductValueObject (ubic.gemma.model.genome.gene.GeneProductValueObject)2 BufferedReader (java.io.BufferedReader)1 FileReader (java.io.FileReader)1