use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.
the class BlatAssociationScorer method scoreResults.
/**
* From a collection of BlatAssociations from a single BioSequence, reduce redundancy, fill in the specificity and
* score and pick the one with the best scoring statistics.
* This is a little complicated because a single sequence can yield many BlatResults to the same gene and/or gene
* product. We reduce the results down to a single (best) result for any given gene product. We also score
* specificity by the gene: if a sequence 'hits' multiple genes, then the specificity of the generated associations
* will be less than 1.
*
* @param blatAssociations for a single sequence.
* @return the highest-scoring result (if there are ties this will be a random one). Note that this return value is
* not all that useful because it assumes there is a "clear winner". The passed-in blatAssociations will be
* pruned to remove redundant entries, and will have score information filled in as well. It is intended
* that these 'refined' BlatAssociations will be used in further analysis.
* @throws IllegalArgumentException if the blatAssociations are from multiple biosequences.
*/
public static BlatAssociation scoreResults(Collection<BlatAssociation> blatAssociations) {
Map<GeneProduct, Collection<BlatAssociation>> geneProducts2Associations = BlatAssociationScorer.organizeBlatAssociationsByGeneProductAndInitializeScores(blatAssociations);
BlatAssociation globalBest = BlatAssociationScorer.removeExtraHitsPerGeneProduct(blatAssociations, geneProducts2Associations);
assert blatAssociations.size() > 0;
Map<Gene, Collection<BlatAssociation>> genes2Associations = BlatAssociationScorer.organizeBlatAssociationsByGene(blatAssociations);
assert genes2Associations.size() > 0;
/*
* At this point there should be just one blatAssociation per gene product. However, all of these really might
* be for the same gene. It is only in the case of truly multiple genes that we flag a lower specificity.
*/
if (genes2Associations.size() == 1) {
return globalBest;
}
Map<PhysicalLocation, Collection<Gene>> geneClusters = BlatAssociationScorer.clusterGenes(genes2Associations);
// compute specificity at the level of genes. First, get the best score for each gene cluster.
Map<PhysicalLocation, Double> scores = new HashMap<>();
for (PhysicalLocation pl : geneClusters.keySet()) {
Double geneScore = 0.0;
for (Gene cgene : geneClusters.get(pl)) {
for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
Double alignScore = blatAssociation.getScore();
if (alignScore > geneScore) {
geneScore = alignScore;
}
}
}
scores.put(pl, geneScore);
}
for (PhysicalLocation pl : geneClusters.keySet()) {
Double alignScore = scores.get(pl);
for (Gene cgene : geneClusters.get(pl)) {
// All members of the cluster get the same specificity.
for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
blatAssociation.setSpecificity(BlatAssociationScorer.computeSpecificity(scores.values(), alignScore));
}
}
}
return globalBest;
}
use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.
the class SequenceManipulation method getGeneProductExonOverlap.
/**
* Compute the overlap of a physical location with a transcript (gene product). This assumes that the chromosome is
* already matched.
*
* @param starts of the locations we are testing (in the target, so on the same coordinates as the geneProduct
* location is scored)
* @param sizes of the locations we are testing.
* @param strand the strand to look on. If null, strand is ignored.
* @param geneProduct GeneProduct we are testing. If strand of PhysicalLocation is null, we ignore strand.
* @return Total number of bases which overlap with exons of the transcript. A value of zero indicates that the
* location is entirely within an intron, or the strand is wrong.
*/
public static int getGeneProductExonOverlap(String starts, String sizes, String strand, GeneProduct geneProduct) {
if (starts == null || sizes == null || geneProduct == null)
throw new IllegalArgumentException("Null data");
// If strand is null we don't bother looking at it; if the strands don't match we return 0
PhysicalLocation gpPhysicalLocation = geneProduct.getPhysicalLocation();
if (strand != null && gpPhysicalLocation != null && gpPhysicalLocation.getStrand() != null && !gpPhysicalLocation.getStrand().equals(strand)) {
return 0;
}
// These are transient instances
Collection<PhysicalLocation> exons = geneProduct.getExons();
int[] startArray = SequenceManipulation.blatLocationsToIntArray(starts);
int[] sizesArray = SequenceManipulation.blatLocationsToIntArray(sizes);
if (exons.size() == 0) {
/*
* simply use the gene product location itself. This was the case if we have ProbeAlignedRegion...otherwise
* it's not expected
*/
SequenceManipulation.log.warn("No exons for " + geneProduct);
return 0;
}
// this was happening when data was truncated by the database!
assert startArray.length == sizesArray.length : startArray.length + " starts and " + sizesArray.length + " sizes (expected equal numbers)";
int totalOverlap = 0;
int totalLength = 0;
for (int i = 0; i < sizesArray.length; i++) {
int start = startArray[i];
int end = start + sizesArray[i];
for (PhysicalLocation exonLocation : exons) {
int exonStart = exonLocation.getNucleotide().intValue();
int exonEnd = exonLocation.getNucleotide().intValue() + exonLocation.getNucleotideLength();
totalOverlap += PhysicalLocation.computeOverlap(start, end, exonStart, exonEnd);
}
totalLength += end - start;
}
if (totalOverlap > totalLength)
SequenceManipulation.log.warn("More overlap than length of sequence, trimming because " + totalOverlap + " > " + totalLength);
return Math.min(totalOverlap, totalLength);
}
use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method persistBlatResults.
/**
* @param brs, assumed to be from alignments to the genome for the array design (that is, we don't consider aligning
* mouse to human)
*/
@SuppressWarnings("unchecked")
private Collection<BlatResult> persistBlatResults(Collection<BlatResult> brs) {
Collection<Integer> seen = new HashSet<>();
int duplicates = 0;
for (BlatResult br : brs) {
Integer hash = ProbeMapUtils.hashBlatResult(br);
if (seen.contains(hash)) {
duplicates++;
continue;
}
seen.add(hash);
assert br.getQuerySequence() != null;
assert br.getQuerySequence().getName() != null;
Taxon taxon = br.getQuerySequence().getTaxon();
assert taxon != null;
try {
FieldUtils.writeField(br.getTargetChromosome(), "taxon", taxon, true);
} catch (IllegalAccessException e) {
e.printStackTrace();
}
br.getTargetChromosome().getSequence().setTaxon(taxon);
PhysicalLocation pl = br.getTargetAlignedRegion();
if (pl == null) {
pl = PhysicalLocation.Factory.newInstance();
pl.setChromosome(br.getTargetChromosome());
pl.setNucleotide(br.getTargetStart());
assert br.getTargetEnd() != null && br.getTargetStart() != null;
pl.setNucleotideLength(br.getTargetEnd().intValue() - br.getTargetStart().intValue());
pl.setStrand(br.getStrand());
br.setTargetAlignedRegion(pl);
pl.setBin(SequenceBinUtils.binFromRange(br.getTargetStart().intValue(), br.getTargetEnd().intValue()));
}
}
if (duplicates > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.info(duplicates + " duplicate BLAT hits skipped");
}
return (Collection<BlatResult>) persisterHelper.persist(brs);
}
use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.
the class MockBlat method blatQuery.
@Override
public Collection<BlatResult> blatQuery(BioSequence b) {
Collection<BlatResult> result = new HashSet<>();
BioSequence chromseq = PersistentDummyObjectHelper.getTestNonPersistentBioSequence(taxon);
chromseq.setLength((long) 1e7);
BlatResult br = BlatResult.Factory.newInstance();
Chromosome chromosome = new Chromosome("XXX", null, chromseq, taxon);
br.setTargetChromosome(chromosome);
assert br.getTargetChromosome().getSequence() != null;
long targetStart = MockBlat.RANDOM.nextInt(chromseq.getLength().intValue());
br.setQuerySequence(b);
br.setTargetStart(targetStart);
br.setTargetEnd(targetStart + b.getLength());
br.setMatches((int) (b.getLength() - 1));
br.setMismatches(1);
br.setRepMatches(0);
br.setQueryGapCount(0);
br.setQueryGapBases(0);
br.setQueryStart(0);
br.setQueryEnd(b.getLength().intValue());
br.setTargetGapBases(0);
br.setTargetGapCount(0);
PhysicalLocation targetAlignedRegion = PhysicalLocation.Factory.newInstance();
targetAlignedRegion.setChromosome(br.getTargetChromosome());
targetAlignedRegion.setNucleotide(targetStart);
targetAlignedRegion.setNucleotideLength(b.getLength().intValue());
targetAlignedRegion.setStrand("+");
result.add(br);
return result;
}
use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.
the class BlatResultParser method makePhysicalLocation.
private PhysicalLocation makePhysicalLocation(BlatResult blatResult) {
PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
pl.setChromosome(blatResult.getTargetChromosome());
pl.setNucleotide(blatResult.getTargetStart());
pl.setNucleotideLength((int) (blatResult.getTargetEnd() - pl.getNucleotide()));
pl.setStrand(blatResult.getStrand());
return pl;
}
Aggregations