Search in sources :

Example 6 with BlatAssociation

use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processBlatResults.

@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
    if (log.isDebugEnabled()) {
        log.debug(blatResults.size() + " Blat results to map ");
    }
    assert goldenPathDb != null;
    Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
    int count = 0;
    Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
    assert !biosequenceToBlatResults.isEmpty();
    // Do them one sequence at a time.
    for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
        Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
        if (log.isDebugEnabled()) {
            log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
        }
        assert !blatResultsForSequence.isEmpty();
        Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
        /*
             * Screen out likely repeats, where cross-hybridization is a problem.
             */
        Double fractionRepeats = sequence.getFractionRepeats();
        if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
             * just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
             * is to the site of a known gene.
             */
        if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Sanity check to make sure user is paying attention to what they are putting in.
             */
        if (trimmedBlatResultsForSequence.size() > 25) {
            log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
        }
        Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
        int skipped = 0;
        // map each blat result.
        for (BlatResult blatResult : trimmedBlatResultsForSequence) {
            if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
                log.warn("Blat result had no sequence: " + blatResult);
                continue;
            }
            assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
            assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
            if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
                if (log.isDebugEnabled())
                    log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
                skipped++;
                continue;
            }
            if (blatResult.getQuerySequence().getTaxon() == null) {
                Taxon taxon = goldenPathDb.getTaxon();
                assert taxon != null;
                blatResult.getQuerySequence().setTaxon(taxon);
            }
            // here's the key line! Find gene products that map to the given blat result.
            Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
            if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
                // would be very unlikely.
                if (resultsForOneBlatResult.size() > 100) {
                    log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
                }
                blatAssociationsForSequence.addAll(resultsForOneBlatResult);
            }
            // there are rarely this many, but it does happen.
            if (++count % 100 == 0 && log.isDebugEnabled())
                log.debug("Annotations computed for " + count + " blat results for " + sequence);
        }
        // Another important step: fill in the specificity, remove duplicates
        if (!blatAssociationsForSequence.isEmpty()) {
            BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
            // Remove hits not meeting criteria
            blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
        }
        // it might be empty now...
        if (blatAssociationsForSequence.isEmpty()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
            }
            config.incrementWarnings();
            continue;
        } else if (log.isDebugEnabled()) {
            log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
        }
        if (skipped > 0) {
            if (log.isDebugEnabled())
                log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
        }
        String queryName = sequence.getName();
        assert StringUtils.isNotBlank(queryName);
        if (!allRes.containsKey(queryName)) {
            allRes.put(queryName, blatAssociationsForSequence);
        }
        allRes.get(queryName).addAll(blatAssociationsForSequence);
    // if ( log.isDebugEnabled() ) {
    // log.info( blatAssociationsForSequence.size() + " associations for " + sequence
    // + " after redundancy reduction and score filtering; "
    // + ( scored - blatAssociationsForSequence.size() ) + " not used" );
    // }
    }
    return allRes;
}
Also used : HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Example 7 with BlatAssociation

use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processBlatResult.

/**
 * Process a single BlatResult, identifying gene products it maps to.
 *
 * @return BlatAssociations between the queried biosequence and one or more gene products.
 */
private Collection<BlatAssociation> processBlatResult(GoldenPathSequenceAnalysis goldenPathDb, BlatResult blatResult, ProbeMapperConfig config) {
    assert blatResult.getTargetChromosome() != null : "Chromosome not filled in for blat result";
    boolean ignoreStrand = this.determineStrandTreatment(blatResult);
    String strand = ignoreStrand ? null : blatResult.getStrand();
    Collection<BlatAssociation> blatAssociations = goldenPathDb.findAssociations(blatResult.getTargetChromosome().getName(), blatResult.getTargetStart(), blatResult.getTargetEnd(), blatResult.getTargetStarts(), blatResult.getBlockSizes(), strand, threePrimeMethod, config);
    if (blatAssociations != null && blatAssociations.size() > 0) {
        for (BlatAssociation association : blatAssociations) {
            association.setBlatResult(blatResult);
            association.setBioSequence(blatResult.getQuerySequence());
        }
        return blatAssociations;
    }
    return new HashSet<>();
}
Also used : BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) HashSet(java.util.HashSet)

Example 8 with BlatAssociation

use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.

the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.

@Override
public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {
    assert config != null;
    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
    }
    Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());
    Taxon taxon = arrayDesign.getPrimaryTaxon();
    if (taxa.size() > 1 && taxon == null) {
        throw new IllegalArgumentException("Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
    }
    GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);
    BlockingQueue<BACS> persistingQueue = new ArrayBlockingQueue<>(ArrayDesignProbeMapperServiceImpl.QUEUE_SIZE);
    AtomicBoolean generatorDone = new AtomicBoolean(false);
    AtomicBoolean loaderDone = new AtomicBoolean(false);
    this.load(persistingQueue, generatorDone, loaderDone, useDB);
    if (useDB) {
        ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
    }
    int count = 0;
    int hits = 0;
    ArrayDesignProbeMapperServiceImpl.log.info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        if (compositeSequence.getName().equals("1431126_a_at")) {
            ArrayDesignProbeMapperServiceImpl.log.debug("HERE");
        }
        Map<String, Collection<BlatAssociation>> results = this.processCompositeSequence(config, taxon, goldenPathDb, compositeSequence);
        if (results == null)
            continue;
        for (Collection<BlatAssociation> col : results.values()) {
            for (BlatAssociation association : col) {
                if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                    ArrayDesignProbeMapperServiceImpl.log.debug(association);
                persistingQueue.add(new BACS(compositeSequence, association));
            }
            ++hits;
        }
        if (++count % 200 == 0) {
            ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences" + " with blat results; " + hits + " mappings found.");
        }
    }
    generatorDone.set(true);
    ArrayDesignProbeMapperServiceImpl.log.info("Waiting for loading to complete ...");
    while (!loaderDone.get()) {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }
    ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    this.deleteOldFiles(arrayDesign);
}
Also used : GoldenPathSequenceAnalysis(ubic.gemma.core.externalDb.GoldenPathSequenceAnalysis) Taxon(ubic.gemma.model.genome.Taxon) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)

Example 9 with BlatAssociation

use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.

the class CompositeSequenceDaoImpl method batchGetGenesWithSpecificity.

/**
 * @param batch   of composite sequences to process
 * @param results - adding to this
 */
private void batchGetGenesWithSpecificity(Collection<CompositeSequence> batch, Map<CompositeSequence, Collection<BioSequence2GeneProduct>> results) {
    if (batch.size() == 0) {
        return;
    }
    // language=HQL
    final String queryString = "select cs,bas from CompositeSequence cs, BioSequence2GeneProduct bas inner join cs.biologicalCharacteristic bs " + "inner join fetch bas.geneProduct gp inner join fetch gp.gene gene " + "where bas.bioSequence=bs and cs in (:cs)";
    List<?> qr = this.getHibernateTemplate().findByNamedParam(queryString, "cs", batch);
    for (Object o : qr) {
        Object[] oa = (Object[]) o;
        CompositeSequence csa = (CompositeSequence) oa[0];
        BioSequence2GeneProduct ba = (BioSequence2GeneProduct) oa[1];
        if (ba instanceof BlatAssociation) {
            BlatResult blatResult = ((BlatAssociation) ba).getBlatResult();
            PhysicalLocation pl = blatResult.getTargetAlignedRegion();
            /*
                 * We didn't always used to fill in the targetAlignedRegion ... this is just in case.
                 */
            if (pl == null) {
                pl = PhysicalLocation.Factory.newInstance();
                pl.setChromosome(blatResult.getTargetChromosome());
                pl.setNucleotide(blatResult.getTargetStart());
                pl.setNucleotideLength(blatResult.getTargetEnd().intValue() - blatResult.getTargetStart().intValue());
                pl.setStrand(blatResult.getStrand());
            // Note: not bothering to fill in the bin.
            }
        }
        if (!results.containsKey(csa)) {
            results.put(csa, new HashSet<BioSequence2GeneProduct>());
        }
        results.get(csa).add(ba);
    }
    /*
         * This is kind of important. We ensure we return an empty map for probes that do not have a mapping.
         */
    for (CompositeSequence cs : batch) {
        if (!results.containsKey(cs)) {
            results.put(cs, new HashSet<BioSequence2GeneProduct>());
        }
    }
}
Also used : BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) CompositeSequenceValueObject(ubic.gemma.model.expression.designElement.CompositeSequenceValueObject) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 10 with BlatAssociation

use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.

the class CompositeSequenceServiceImpl method getGeneMappingSummary.

@Override
public Collection<GeneMappingSummary> getGeneMappingSummary(CompositeSequence cs) {
    BioSequence biologicalCharacteristic = cs.getBiologicalCharacteristic();
    biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
    Map<Integer, GeneMappingSummary> results = new HashMap<>();
    if (biologicalCharacteristic == null || biologicalCharacteristic.getBioSequence2GeneProduct() == null) {
        return results.values();
    }
    Collection<BioSequence2GeneProduct> bs2gps = biologicalCharacteristic.getBioSequence2GeneProduct();
    for (BioSequence2GeneProduct bs2gp : bs2gps) {
        GeneProductValueObject geneProduct = new GeneProductValueObject(geneProductService.thaw(bs2gp.getGeneProduct()));
        GeneValueObject gene = new GeneValueObject(bs2gp.getGeneProduct().getGene());
        BlatResultValueObject blatResult = null;
        if ((bs2gp instanceof BlatAssociation)) {
            BlatAssociation blatAssociation = (BlatAssociation) bs2gp;
            blatResult = new BlatResultValueObject(blatResultService.thaw(blatAssociation.getBlatResult()));
        } else if (bs2gp instanceof AnnotationAssociation) {
            /*
                 * Make a dummy blat result
                 */
            blatResult = new BlatResultValueObject();
            blatResult.setQuerySequence(BioSequenceValueObject.fromEntity(biologicalCharacteristic));
            blatResult.setId(biologicalCharacteristic.getId());
        }
        if (blatResult == null) {
            continue;
        }
        if (results.containsKey(ProbeMapUtils.hashBlatResult(blatResult))) {
            results.get(ProbeMapUtils.hashBlatResult(blatResult)).addGene(geneProduct, gene);
        } else {
            GeneMappingSummary summary = new GeneMappingSummary();
            summary.addGene(geneProduct, gene);
            summary.setBlatResult(blatResult);
            summary.setCompositeSequence(this.loadValueObject(cs));
            results.put(ProbeMapUtils.hashBlatResult(blatResult), summary);
        }
    }
    this.addBlatResultsLackingGenes(cs, results);
    if (results.size() == 0) {
        // add a 'dummy' that at least contains the information about the CS. This is a bit of a hack...
        GeneMappingSummary summary = new GeneMappingSummary();
        summary.setCompositeSequence(this.loadValueObject(cs));
        BlatResultValueObject newInstance = new BlatResultValueObject(-1L);
        newInstance.setQuerySequence(BioSequenceValueObject.fromEntity(biologicalCharacteristic));
        summary.setBlatResult(newInstance);
        results.put(ProbeMapUtils.hashBlatResult(newInstance), summary);
    }
    return results.values();
}
Also used : GeneValueObject(ubic.gemma.model.genome.gene.GeneValueObject) AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) GeneProductValueObject(ubic.gemma.model.genome.gene.GeneProductValueObject) GeneMappingSummary(ubic.gemma.core.analysis.sequence.GeneMappingSummary) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) BlatResultValueObject(ubic.gemma.model.genome.sequenceAnalysis.BlatResultValueObject) BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)

Aggregations

BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)24 HashSet (java.util.HashSet)10 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)8 Collection (java.util.Collection)7 HashMap (java.util.HashMap)5 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)5 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)5 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)5 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)5 Gene (ubic.gemma.model.genome.Gene)4 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)4 Taxon (ubic.gemma.model.genome.Taxon)3 AnnotationAssociation (ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation)3 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)2 ArrayList (java.util.ArrayList)1 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 Criteria (org.hibernate.Criteria)1 Test (org.junit.Test)1 HibernateTemplate (org.springframework.orm.hibernate3.HibernateTemplate)1