Search in sources :

Example 11 with PhysicalLocation

use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.

the class BlatAssociationScorer method scoreResults.

/**
 * From a collection of BlatAssociations from a single BioSequence, reduce redundancy, fill in the specificity and
 * score and pick the one with the best scoring statistics.
 * This is a little complicated because a single sequence can yield many BlatResults to the same gene and/or gene
 * product. We reduce the results down to a single (best) result for any given gene product. We also score
 * specificity by the gene: if a sequence 'hits' multiple genes, then the specificity of the generated associations
 * will be less than 1.
 *
 * @param blatAssociations for a single sequence.
 * @return the highest-scoring result (if there are ties this will be a random one). Note that this return value is
 * not all that useful because it assumes there is a "clear winner". The passed-in blatAssociations will be
 * pruned to remove redundant entries, and will have score information filled in as well. It is intended
 * that these 'refined' BlatAssociations will be used in further analysis.
 * @throws IllegalArgumentException if the blatAssociations are from multiple biosequences.
 */
public static BlatAssociation scoreResults(Collection<BlatAssociation> blatAssociations) {
    Map<GeneProduct, Collection<BlatAssociation>> geneProducts2Associations = BlatAssociationScorer.organizeBlatAssociationsByGeneProductAndInitializeScores(blatAssociations);
    BlatAssociation globalBest = BlatAssociationScorer.removeExtraHitsPerGeneProduct(blatAssociations, geneProducts2Associations);
    assert blatAssociations.size() > 0;
    Map<Gene, Collection<BlatAssociation>> genes2Associations = BlatAssociationScorer.organizeBlatAssociationsByGene(blatAssociations);
    assert genes2Associations.size() > 0;
    /*
         * At this point there should be just one blatAssociation per gene product. However, all of these really might
         * be for the same gene. It is only in the case of truly multiple genes that we flag a lower specificity.
         */
    if (genes2Associations.size() == 1) {
        return globalBest;
    }
    Map<PhysicalLocation, Collection<Gene>> geneClusters = BlatAssociationScorer.clusterGenes(genes2Associations);
    // compute specificity at the level of genes. First, get the best score for each gene cluster.
    Map<PhysicalLocation, Double> scores = new HashMap<>();
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double geneScore = 0.0;
        for (Gene cgene : geneClusters.get(pl)) {
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                Double alignScore = blatAssociation.getScore();
                if (alignScore > geneScore) {
                    geneScore = alignScore;
                }
            }
        }
        scores.put(pl, geneScore);
    }
    for (PhysicalLocation pl : geneClusters.keySet()) {
        Double alignScore = scores.get(pl);
        for (Gene cgene : geneClusters.get(pl)) {
            // All members of the cluster get the same specificity.
            for (BlatAssociation blatAssociation : genes2Associations.get(cgene)) {
                blatAssociation.setSpecificity(BlatAssociationScorer.computeSpecificity(scores.values(), alignScore));
            }
        }
    }
    return globalBest;
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashMap(java.util.HashMap) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 12 with PhysicalLocation

use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.

the class SequenceManipulation method getGeneProductExonOverlap.

/**
 * Compute the overlap of a physical location with a transcript (gene product). This assumes that the chromosome is
 * already matched.
 *
 * @param starts      of the locations we are testing (in the target, so on the same coordinates as the geneProduct
 *                    location is scored)
 * @param sizes       of the locations we are testing.
 * @param strand      the strand to look on. If null, strand is ignored.
 * @param geneProduct GeneProduct we are testing. If strand of PhysicalLocation is null, we ignore strand.
 * @return Total number of bases which overlap with exons of the transcript. A value of zero indicates that the
 * location is entirely within an intron, or the strand is wrong.
 */
public static int getGeneProductExonOverlap(String starts, String sizes, String strand, GeneProduct geneProduct) {
    if (starts == null || sizes == null || geneProduct == null)
        throw new IllegalArgumentException("Null data");
    // If strand is null we don't bother looking at it; if the strands don't match we return 0
    PhysicalLocation gpPhysicalLocation = geneProduct.getPhysicalLocation();
    if (strand != null && gpPhysicalLocation != null && gpPhysicalLocation.getStrand() != null && !gpPhysicalLocation.getStrand().equals(strand)) {
        return 0;
    }
    // These are transient instances
    Collection<PhysicalLocation> exons = geneProduct.getExons();
    int[] startArray = SequenceManipulation.blatLocationsToIntArray(starts);
    int[] sizesArray = SequenceManipulation.blatLocationsToIntArray(sizes);
    if (exons.size() == 0) {
        /*
             * simply use the gene product location itself. This was the case if we have ProbeAlignedRegion...otherwise
             * it's not expected
             */
        SequenceManipulation.log.warn("No exons for " + geneProduct);
        return 0;
    }
    // this was happening when data was truncated by the database!
    assert startArray.length == sizesArray.length : startArray.length + " starts and " + sizesArray.length + " sizes (expected equal numbers)";
    int totalOverlap = 0;
    int totalLength = 0;
    for (int i = 0; i < sizesArray.length; i++) {
        int start = startArray[i];
        int end = start + sizesArray[i];
        for (PhysicalLocation exonLocation : exons) {
            int exonStart = exonLocation.getNucleotide().intValue();
            int exonEnd = exonLocation.getNucleotide().intValue() + exonLocation.getNucleotideLength();
            totalOverlap += PhysicalLocation.computeOverlap(start, end, exonStart, exonEnd);
        }
        totalLength += end - start;
    }
    if (totalOverlap > totalLength)
        SequenceManipulation.log.warn("More overlap than length of sequence, trimming because " + totalOverlap + " > " + totalLength);
    return Math.min(totalOverlap, totalLength);
}
Also used : PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 13 with PhysicalLocation

use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method persistBlatResults.

/**
 * @param brs, assumed to be from alignments to the genome for the array design (that is, we don't consider aligning
 *             mouse to human)
 */
@SuppressWarnings("unchecked")
private Collection<BlatResult> persistBlatResults(Collection<BlatResult> brs) {
    Collection<Integer> seen = new HashSet<>();
    int duplicates = 0;
    for (BlatResult br : brs) {
        Integer hash = ProbeMapUtils.hashBlatResult(br);
        if (seen.contains(hash)) {
            duplicates++;
            continue;
        }
        seen.add(hash);
        assert br.getQuerySequence() != null;
        assert br.getQuerySequence().getName() != null;
        Taxon taxon = br.getQuerySequence().getTaxon();
        assert taxon != null;
        try {
            FieldUtils.writeField(br.getTargetChromosome(), "taxon", taxon, true);
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        br.getTargetChromosome().getSequence().setTaxon(taxon);
        PhysicalLocation pl = br.getTargetAlignedRegion();
        if (pl == null) {
            pl = PhysicalLocation.Factory.newInstance();
            pl.setChromosome(br.getTargetChromosome());
            pl.setNucleotide(br.getTargetStart());
            assert br.getTargetEnd() != null && br.getTargetStart() != null;
            pl.setNucleotideLength(br.getTargetEnd().intValue() - br.getTargetStart().intValue());
            pl.setStrand(br.getStrand());
            br.setTargetAlignedRegion(pl);
            pl.setBin(SequenceBinUtils.binFromRange(br.getTargetStart().intValue(), br.getTargetEnd().intValue()));
        }
    }
    if (duplicates > 0) {
        ArrayDesignSequenceAlignmentServiceImpl.log.info(duplicates + " duplicate BLAT hits skipped");
    }
    return (Collection<BlatResult>) persisterHelper.persist(brs);
}
Also used : Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) HashSet(java.util.HashSet) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 14 with PhysicalLocation

use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.

the class MockBlat method blatQuery.

@Override
public Collection<BlatResult> blatQuery(BioSequence b) {
    Collection<BlatResult> result = new HashSet<>();
    BioSequence chromseq = PersistentDummyObjectHelper.getTestNonPersistentBioSequence(taxon);
    chromseq.setLength((long) 1e7);
    BlatResult br = BlatResult.Factory.newInstance();
    Chromosome chromosome = new Chromosome("XXX", null, chromseq, taxon);
    br.setTargetChromosome(chromosome);
    assert br.getTargetChromosome().getSequence() != null;
    long targetStart = MockBlat.RANDOM.nextInt(chromseq.getLength().intValue());
    br.setQuerySequence(b);
    br.setTargetStart(targetStart);
    br.setTargetEnd(targetStart + b.getLength());
    br.setMatches((int) (b.getLength() - 1));
    br.setMismatches(1);
    br.setRepMatches(0);
    br.setQueryGapCount(0);
    br.setQueryGapBases(0);
    br.setQueryStart(0);
    br.setQueryEnd(b.getLength().intValue());
    br.setTargetGapBases(0);
    br.setTargetGapCount(0);
    PhysicalLocation targetAlignedRegion = PhysicalLocation.Factory.newInstance();
    targetAlignedRegion.setChromosome(br.getTargetChromosome());
    targetAlignedRegion.setNucleotide(targetStart);
    targetAlignedRegion.setNucleotideLength(b.getLength().intValue());
    targetAlignedRegion.setStrand("+");
    result.add(br);
    return result;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Chromosome(ubic.gemma.model.genome.Chromosome) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 15 with PhysicalLocation

use of ubic.gemma.model.genome.PhysicalLocation in project Gemma by PavlidisLab.

the class BlatResultParser method makePhysicalLocation.

private PhysicalLocation makePhysicalLocation(BlatResult blatResult) {
    PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
    pl.setChromosome(blatResult.getTargetChromosome());
    pl.setNucleotide(blatResult.getTargetStart());
    pl.setNucleotideLength((int) (blatResult.getTargetEnd() - pl.getNucleotide()));
    pl.setStrand(blatResult.getStrand());
    return pl;
}
Also used : PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Aggregations

PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)22 Chromosome (ubic.gemma.model.genome.Chromosome)9 Gene (ubic.gemma.model.genome.Gene)7 Taxon (ubic.gemma.model.genome.Taxon)7 HashSet (java.util.HashSet)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)4 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)4 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)4 Collection (java.util.Collection)3 Test (org.junit.Test)3 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)3 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)3 HashMap (java.util.HashMap)2 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)2 ArrayList (java.util.ArrayList)1 Before (org.junit.Before)1 Transactional (org.springframework.transaction.annotation.Transactional)1 GeneServiceImpl (ubic.gemma.core.genome.gene.service.GeneServiceImpl)1 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)1 CompositeSequenceValueObject (ubic.gemma.model.expression.designElement.CompositeSequenceValueObject)1