use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.
the class BlatAssociationScorer method scoreResults.
/**
* From a collection of BlatAssociations from a single BioSequence, reduce redundancy, fill in the specificity and
* score and pick the one with the best scoring statistics.
* This is a little complicated because a single sequence can yield many BlatResults to the same gene and/or gene
* product. We reduce the results down to a single (best) result for any given gene product. We also score
* specificity by the gene: if a sequence 'hits' multiple genes, then the specificity of the generated associations
* will be less than 1.
*
* @param blatAssociations for a single sequence.
* @return the highest-scoring result (if there are ties this will be a random one). Note that this return value is
* not all that useful because it assumes there is a "clear winner". The passed-in blatAssociations will be
* pruned to remove redundant entries, and will have score information filled in as well. It is intended
* that these 'refined' BlatAssociations will be used in further analysis.
* @throws IllegalArgumentException if the blatAssociations are from multiple biosequences.
*/
public static BlatAssociation scoreResults(Collection<BlatAssociation> blatAssociations) {
Map<GeneProduct, Collection<BlatAssociation>> geneProducts2Associations = BlatAssociationScorer.organizeBlatAssociationsByGeneProductAndInitializeScores(blatAssociations);
BlatAssociation globalBest = BlatAssociationScorer.removeExtraHitsPerGeneProduct(blatAssociations, geneProducts2Associations);
assert blatAssociations.size() > 0;
Map<Gene, Collection<BlatAssociation>> genes2Associations = BlatAssociationScorer.organizeBlatAssociationsByGene(blatAssociations);
assert genes2Associations.size() > 0;
/*
* At this point there should be just one blatAssociation per gene product. However, all of these really might
* be for the same gene. It is only in the case of truly multiple genes that we flag a lower specificity.
*/
if (genes2Associations.size() == 1) {
return globalBest;
}
Map<PhysicalLocation, Collection<Gene>> geneClusters = BlatAssociationScorer.clusterGenes(genes2Associations);
// compute specificity at the level of genes. First, get the best score for each gene cluster.
Map<PhysicalLocation, Double> scores = new HashMap<>();
for (PhysicalLocation pl : geneClusters.keySet()) {
Double geneScore = 0.0;
for (Gene cgene : geneClusters.get(pl)) {
for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
Double alignScore = blatAssociation.getScore();
if (alignScore > geneScore) {
geneScore = alignScore;
}
}
}
scores.put(pl, geneScore);
}
for (PhysicalLocation pl : geneClusters.keySet()) {
Double alignScore = scores.get(pl);
for (Gene cgene : geneClusters.get(pl)) {
// All members of the cluster get the same specificity.
for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
blatAssociation.setSpecificity(BlatAssociationScorer.computeSpecificity(scores.values(), alignScore));
}
}
}
return globalBest;
}
use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.
the class BlatAssociationScorer method removeExtraHitsPerGeneProduct.
/**
* Compute scores and find the best one, for each gene product, removing all other hits (so there is just one per
* gene product
*
* @param blatAssociations blat assocs.
* @param geneProduct2BlatAssociations gene prods 2 blat assocs
* @return blat accoc
*/
private static BlatAssociation removeExtraHitsPerGeneProduct(Collection<BlatAssociation> blatAssociations, Map<GeneProduct, Collection<BlatAssociation>> geneProduct2BlatAssociations) {
double globalMaxScore = 0.0;
BlatAssociation globalBest = null;
Collection<BlatAssociation> keepers = new HashSet<>();
for (GeneProduct geneProduct : geneProduct2BlatAssociations.keySet()) {
Collection<BlatAssociation> geneProductBlatAssociations = geneProduct2BlatAssociations.get(geneProduct);
if (geneProductBlatAssociations.isEmpty())
continue;
BlatAssociation ba = geneProductBlatAssociations.iterator().next();
// Find the best one. If there are ties it's arbitrary which one we pick.
double maxScore = ba.getScore();
BlatAssociation best = ba;
for (BlatAssociation blatAssociation : geneProductBlatAssociations) {
double score = blatAssociation.getScore();
if (score >= maxScore) {
maxScore = score;
best = blatAssociation;
}
}
// Remove the lower-scoring ones for this gene product
Collection<BlatAssociation> toKeep = new HashSet<>();
toKeep.add(best);
keepers.add(best);
geneProduct2BlatAssociations.put(geneProduct, toKeep);
if (best.getScore() > globalMaxScore) {
globalMaxScore = best.getScore();
globalBest = best;
}
}
blatAssociations.retainAll(keepers);
return globalBest;
}
use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.
the class ProbeMapperTest method testLocateGene.
/*
* Test based on U83843, should bring up CCT7 (NM_006429 and NM_001009570). Valid locations as of 2/2011 for hg19.
* <a href="http://genome.ucsc.edu/cgi-bin/hgTracks?hgsid=79741184&hgt.out1=1.5x&position=chr2%3A73320308-73331929">here</a>
* 73,461,405-73,480,144)
*/
public void testLocateGene() {
Collection<GeneProduct> products = humangp.findRefGenesByLocation("2", 73461505L, 73462405L, "+");
TestCase.assertEquals(6, products.size());
GeneProduct gprod = products.iterator().next();
// okay as of 1/2008.
TestCase.assertEquals("CCT7", gprod.getGene().getOfficialSymbol());
}
use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.
the class NCBIGeneLoadingTest method testGeneLoader.
@Test
public void testGeneLoader() throws Exception {
NcbiGeneLoader loader = new NcbiGeneLoader(persisterHelper);
loader.setTaxonService(taxonService);
String geneInfoTestFile = "/data/loader/genome/gene/gene_info.human.sample";
String gene2AccTestFile = "/data/loader/genome/gene/gene2accession.human.sample";
String geneHistoryFile = "/data/loader/genome/gene/gene_history.human.sample";
// threaded load
Taxon ta = taxonService.findByCommonName("human");
assertNotNull(ta);
loader.load(FileTools.resourceToPath(geneInfoTestFile), FileTools.resourceToPath(gene2AccTestFile), FileTools.resourceToPath(geneHistoryFile), null, ta);
// wait until the loader is done.
while (!loader.isLoaderDone()) {
Thread.sleep(100);
}
// loader is done.
// check if it loaded elements to the database
log.debug("Loader done with number of elements: " + loader.getLoadedGeneCount());
assertEquals(51, loader.getLoadedGeneCount());
// grab one gene and check its information
// (depends on information in gene_info and gene2accession file
// gene_info
Collection<Gene> geneCollection = geneService.findByOfficialSymbol("A2M");
assertEquals(1, geneCollection.size());
g = geneCollection.iterator().next();
g = geneService.thaw(g);
Collection<GeneProduct> products = g.getProducts();
Collection<String> expectedAccessions = new ArrayList<>();
Collection<String> hasAccessions = new ArrayList<>();
expectedAccessions.add("AB209614.2");
expectedAccessions.add("AK307832.1");
for (GeneProduct product : products) {
Collection<DatabaseEntry> accessions = product.getAccessions();
for (DatabaseEntry de : accessions) {
String accession = de.getAccession();
String accVersion = de.getAccessionVersion();
hasAccessions.add(accession + "." + accVersion);
log.debug(accession + "." + accVersion);
}
}
assertEquals(12, hasAccessions.size());
assertTrue(hasAccessions.containsAll(expectedAccessions));
Taxon t = g.getTaxon();
assertEquals(9606, t.getNcbiId().intValue());
assertEquals(new Integer(2), g.getNcbiGeneId());
/*
* Test history change. One gene has been updated, from 7003 to 44444 (fake), and mimic adding ensembl
*/
geneInfoTestFile = "/data/loader/genome/gene/gene_info.human.changed.sample";
gene2AccTestFile = "/data/loader/genome/gene/gene2accession.human.changed.sample";
String updatedHistory = "/data/loader/genome/gene/gene_history.human.changed.sample";
String geneEnsemblFile = "/data/loader/genome/gene/gene2ensembl.human.sample";
loader.load(FileTools.resourceToPath(geneInfoTestFile), FileTools.resourceToPath(gene2AccTestFile), FileTools.resourceToPath(updatedHistory), FileTools.resourceToPath(geneEnsemblFile), ta);
// wait until the loader is done.
while (!loader.isLoaderDone()) {
Thread.sleep(100);
}
Collection<Gene> updatedTestGene = geneService.findByOfficialSymbol("TEAD1");
assertEquals(1, updatedTestGene.size());
g = updatedTestGene.iterator().next();
assertEquals("7003", g.getPreviousNcbiId());
assertEquals(new Integer(44444), g.getNcbiGeneId());
g = geneService.findByNCBIId(1);
assertEquals("ENSG00000121410", g.getEnsemblId());
// test remove...
geneProductService.remove(products);
}
use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.
the class AnnotationAssociationDaoImpl method find.
@Override
public Collection<AnnotationAssociation> find(Gene gene) {
if (gene.getProducts().size() == 0) {
throw new IllegalArgumentException("Gene has no products");
}
Collection<AnnotationAssociation> result = new HashSet<>();
for (GeneProduct geneProduct : gene.getProducts()) {
BusinessKey.checkValidKey(geneProduct);
Criteria queryObject = this.getSessionFactory().getCurrentSession().createCriteria(AnnotationAssociation.class);
Criteria innerQuery = queryObject.createCriteria("geneProduct");
if (StringUtils.isNotBlank(geneProduct.getNcbiGi())) {
innerQuery.add(Restrictions.eq("ncbiGi", geneProduct.getNcbiGi()));
}
if (StringUtils.isNotBlank(geneProduct.getName())) {
innerQuery.add(Restrictions.eq("name", geneProduct.getName()));
}
// noinspection unchecked
result.addAll(queryObject.list());
}
return result;
}
Aggregations