Search in sources :

Example 1 with BlatResult

use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.

the class GoldenPathQueryTest method testQueryMrna.

public final void testQueryMrna() {
    if (!hasDb) {
        GoldenPathQueryTest.log.warn("Skipping test because hg could not be configured");
        return;
    }
    Collection<BlatResult> actualValue = queryer.findAlignments("AK095183");
    // assertEquals( 3, actualValue.size() );
    // value used to be 3, now 2; this should be safer.
    TestCase.assertTrue(actualValue.size() > 0);
    BlatResult r = actualValue.iterator().next();
    TestCase.assertEquals("AK095183", (r.getQuerySequence().getName()));
}
Also used : BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 2 with BlatResult

use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.

the class ArrayDesignMapResultServiceImpl method summarizeMapResults.

@Override
public Collection<CompositeSequenceMapSummary> summarizeMapResults(Collection<CompositeSequence> compositeSequences) {
    Collection<CompositeSequenceMapSummary> result = new HashSet<>();
    int count = 0;
    for (CompositeSequence cs : compositeSequences) {
        CompositeSequenceMapSummary summary = new CompositeSequenceMapSummary(cs);
        BioSequence bioSequence = cs.getBiologicalCharacteristic();
        if (bioSequence == null) {
            result.add(summary);
            continue;
        }
        Collection<BlatResult> blats = blatResultService.findByBioSequence(bioSequence);
        summary.setBlatResults(blats);
        Collection<BlatAssociation> maps = blatAssociationService.find(bioSequence);
        blatAssociationService.thaw(maps);
        for (BlatAssociation association : maps) {
            summary.getGeneProducts().add(association.getGeneProduct());
            summary.getGenes().add(association.getGeneProduct().getGene());
        }
        result.add(summary);
        if (++count % 1000 == 0) {
            ArrayDesignMapResultServiceImpl.log.info("Processed " + count + " elements...");
        }
    }
    ArrayDesignMapResultServiceImpl.log.info("Done, processed " + count + " elements");
    return result;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 3 with BlatResult

use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.

the class ProbeMapUtils method removeDuplicates.

/**
 * Prune a set of results that have the same coordinates and query. See bug 4037
 *
 * @param blatResults blat results
 */
public static void removeDuplicates(Collection<BlatResult> blatResults) {
    int init = blatResults.size();
    Set<Integer> hashes = new HashSet<>();
    for (Iterator<BlatResult> it = blatResults.iterator(); it.hasNext(); ) {
        BlatResult br = it.next();
        Integer hash = ProbeMapUtils.hashBlatResult(br);
        if (hashes.contains(hash)) {
            it.remove();
        }
        hashes.add(hash);
    }
    if (blatResults.size() < init && ProbeMapUtils.log.isDebugEnabled()) {
        ProbeMapUtils.log.debug("Pruned " + (init - blatResults.size()) + "/" + init + " duplicates");
    }
}
Also used : HashSet(java.util.HashSet) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 4 with BlatResult

use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.

the class ArrayDesignBlatCli method doWork.

@Override
protected Exception doWork(String[] args) {
    Exception err = this.processCommandLine(args);
    if (err != null)
        return err;
    final Date skipIfLastRunLaterThan = this.getLimitingDate();
    if (!this.arrayDesignsToProcess.isEmpty()) {
        if (this.blatResultFile != null && this.arrayDesignsToProcess.size() > 1) {
            throw new IllegalArgumentException("Cannot provide a blat result file when multiple arrays are being analyzed");
        }
        for (ArrayDesign arrayDesign : this.arrayDesignsToProcess) {
            if (!this.needToRun(skipIfLastRunLaterThan, arrayDesign, ArrayDesignSequenceAnalysisEvent.class)) {
                AbstractCLI.log.warn(arrayDesign + " was last run more recently than " + skipIfLastRunLaterThan);
                return null;
            }
            arrayDesign = this.thaw(arrayDesign);
            Collection<BlatResult> persistedResults;
            try {
                if (this.blatResultFile != null) {
                    Collection<BlatResult> blatResults = this.getBlatResultsFromFile(arrayDesign);
                    if (blatResults == null || blatResults.size() == 0) {
                        throw new IllegalStateException("No blat results in file!");
                    }
                    AbstractCLI.log.info("Got " + blatResults.size() + " blat records");
                    persistedResults = arrayDesignSequenceAlignmentService.processArrayDesign(arrayDesign, taxon, blatResults);
                    this.audit(arrayDesign, "BLAT results read from file: " + blatResultFile);
                } else {
                    // Run blat from scratch.
                    persistedResults = arrayDesignSequenceAlignmentService.processArrayDesign(arrayDesign, this.sensitive);
                    this.audit(arrayDesign, "Based on a fresh alignment analysis; BLAT score threshold was " + this.blatScoreThreshold + "; sensitive mode was " + this.sensitive);
                }
                AbstractCLI.log.info("Persisted " + persistedResults.size() + " results");
            } catch (IOException e) {
                this.errorObjects.add(e);
            }
        }
    } else if (taxon != null) {
        Collection<ArrayDesign> allArrayDesigns = arrayDesignService.findByTaxon(taxon);
        AbstractCLI.log.warn("*** Running BLAT for all " + taxon.getCommonName() + " Array designs *** [" + allArrayDesigns.size() + " items]");
        final SecurityContext context = SecurityContextHolder.getContext();
        /*
             * Here is our task runner.
             */
        class BlatCliConsumer extends Consumer {

            private BlatCliConsumer(BlockingQueue<ArrayDesign> q) {
                super(q, context);
            }

            @Override
            void consume(ArrayDesign x) {
                x = arrayDesignService.thaw(x);
                ArrayDesignBlatCli.this.processArrayDesign(skipIfLastRunLaterThan, x);
            }
        }
        BlockingQueue<ArrayDesign> arrayDesigns = new ArrayBlockingQueue<>(allArrayDesigns.size());
        arrayDesigns.addAll(allArrayDesigns);
        Collection<Thread> threads = new ArrayList<>();
        for (int i = 0; i < this.numThreads; i++) {
            Consumer c1 = new BlatCliConsumer(arrayDesigns);
            Thread k = new Thread(c1);
            threads.add(k);
            k.start();
        }
        this.waitForThreadPoolCompletion(threads);
        /*
             * All done
             */
        this.summarizeProcessing();
    } else {
        this.bail(ErrorCode.MISSING_ARGUMENT);
    }
    return null;
}
Also used : BlockingQueue(java.util.concurrent.BlockingQueue) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) IOException(java.io.IOException) IOException(java.io.IOException) Date(java.util.Date) ArrayDesignSequenceAnalysisEvent(ubic.gemma.model.common.auditAndSecurity.eventType.ArrayDesignSequenceAnalysisEvent) SecurityContext(org.springframework.security.core.context.SecurityContext) Collection(java.util.Collection) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 5 with BlatResult

use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processBlatResults.

@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
    if (log.isDebugEnabled()) {
        log.debug(blatResults.size() + " Blat results to map ");
    }
    assert goldenPathDb != null;
    Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
    int count = 0;
    Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
    assert !biosequenceToBlatResults.isEmpty();
    // Do them one sequence at a time.
    for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
        Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
        if (log.isDebugEnabled()) {
            log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
        }
        assert !blatResultsForSequence.isEmpty();
        Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
        /*
             * Screen out likely repeats, where cross-hybridization is a problem.
             */
        Double fractionRepeats = sequence.getFractionRepeats();
        if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
             * just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
             * is to the site of a known gene.
             */
        if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Sanity check to make sure user is paying attention to what they are putting in.
             */
        if (trimmedBlatResultsForSequence.size() > 25) {
            log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
        }
        Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
        int skipped = 0;
        // map each blat result.
        for (BlatResult blatResult : trimmedBlatResultsForSequence) {
            if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
                log.warn("Blat result had no sequence: " + blatResult);
                continue;
            }
            assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
            assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
            if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
                if (log.isDebugEnabled())
                    log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
                skipped++;
                continue;
            }
            if (blatResult.getQuerySequence().getTaxon() == null) {
                Taxon taxon = goldenPathDb.getTaxon();
                assert taxon != null;
                blatResult.getQuerySequence().setTaxon(taxon);
            }
            // here's the key line! Find gene products that map to the given blat result.
            Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
            if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
                // would be very unlikely.
                if (resultsForOneBlatResult.size() > 100) {
                    log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
                }
                blatAssociationsForSequence.addAll(resultsForOneBlatResult);
            }
            // there are rarely this many, but it does happen.
            if (++count % 100 == 0 && log.isDebugEnabled())
                log.debug("Annotations computed for " + count + " blat results for " + sequence);
        }
        // Another important step: fill in the specificity, remove duplicates
        if (!blatAssociationsForSequence.isEmpty()) {
            BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
            // Remove hits not meeting criteria
            blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
        }
        // it might be empty now...
        if (blatAssociationsForSequence.isEmpty()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
            }
            config.incrementWarnings();
            continue;
        } else if (log.isDebugEnabled()) {
            log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
        }
        if (skipped > 0) {
            if (log.isDebugEnabled())
                log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
        }
        String queryName = sequence.getName();
        assert StringUtils.isNotBlank(queryName);
        if (!allRes.containsKey(queryName)) {
            allRes.put(queryName, blatAssociationsForSequence);
        }
        allRes.get(queryName).addAll(blatAssociationsForSequence);
    // if ( log.isDebugEnabled() ) {
    // log.info( blatAssociationsForSequence.size() + " associations for " + sequence
    // + " after redundancy reduction and score filtering; "
    // + ( scored - blatAssociationsForSequence.size() ) + " not used" );
    // }
    }
    return allRes;
}
Also used : HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Aggregations

BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)33 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)12 HashSet (java.util.HashSet)10 Collection (java.util.Collection)9 Taxon (ubic.gemma.model.genome.Taxon)6 Chromosome (ubic.gemma.model.genome.Chromosome)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 HashMap (java.util.HashMap)4 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)4 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)4 IOException (java.io.IOException)3 InputStream (java.io.InputStream)3 Blat (ubic.gemma.core.apps.Blat)3 ShellDelegatingBlat (ubic.gemma.core.apps.ShellDelegatingBlat)3 ExternalDatabase (ubic.gemma.model.common.description.ExternalDatabase)3 Test (org.junit.Test)2 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)2 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)2 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1