use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.
the class GoldenPathQueryTest method testQueryMrna.
public final void testQueryMrna() {
if (!hasDb) {
GoldenPathQueryTest.log.warn("Skipping test because hg could not be configured");
return;
}
Collection<BlatResult> actualValue = queryer.findAlignments("AK095183");
// assertEquals( 3, actualValue.size() );
// value used to be 3, now 2; this should be safer.
TestCase.assertTrue(actualValue.size() > 0);
BlatResult r = actualValue.iterator().next();
TestCase.assertEquals("AK095183", (r.getQuerySequence().getName()));
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.
the class ArrayDesignMapResultServiceImpl method summarizeMapResults.
@Override
public Collection<CompositeSequenceMapSummary> summarizeMapResults(Collection<CompositeSequence> compositeSequences) {
Collection<CompositeSequenceMapSummary> result = new HashSet<>();
int count = 0;
for (CompositeSequence cs : compositeSequences) {
CompositeSequenceMapSummary summary = new CompositeSequenceMapSummary(cs);
BioSequence bioSequence = cs.getBiologicalCharacteristic();
if (bioSequence == null) {
result.add(summary);
continue;
}
Collection<BlatResult> blats = blatResultService.findByBioSequence(bioSequence);
summary.setBlatResults(blats);
Collection<BlatAssociation> maps = blatAssociationService.find(bioSequence);
blatAssociationService.thaw(maps);
for (BlatAssociation association : maps) {
summary.getGeneProducts().add(association.getGeneProduct());
summary.getGenes().add(association.getGeneProduct().getGene());
}
result.add(summary);
if (++count % 1000 == 0) {
ArrayDesignMapResultServiceImpl.log.info("Processed " + count + " elements...");
}
}
ArrayDesignMapResultServiceImpl.log.info("Done, processed " + count + " elements");
return result;
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.
the class ProbeMapUtils method removeDuplicates.
/**
* Prune a set of results that have the same coordinates and query. See bug 4037
*
* @param blatResults blat results
*/
public static void removeDuplicates(Collection<BlatResult> blatResults) {
int init = blatResults.size();
Set<Integer> hashes = new HashSet<>();
for (Iterator<BlatResult> it = blatResults.iterator(); it.hasNext(); ) {
BlatResult br = it.next();
Integer hash = ProbeMapUtils.hashBlatResult(br);
if (hashes.contains(hash)) {
it.remove();
}
hashes.add(hash);
}
if (blatResults.size() < init && ProbeMapUtils.log.isDebugEnabled()) {
ProbeMapUtils.log.debug("Pruned " + (init - blatResults.size()) + "/" + init + " duplicates");
}
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.
the class ArrayDesignBlatCli method doWork.
@Override
protected Exception doWork(String[] args) {
Exception err = this.processCommandLine(args);
if (err != null)
return err;
final Date skipIfLastRunLaterThan = this.getLimitingDate();
if (!this.arrayDesignsToProcess.isEmpty()) {
if (this.blatResultFile != null && this.arrayDesignsToProcess.size() > 1) {
throw new IllegalArgumentException("Cannot provide a blat result file when multiple arrays are being analyzed");
}
for (ArrayDesign arrayDesign : this.arrayDesignsToProcess) {
if (!this.needToRun(skipIfLastRunLaterThan, arrayDesign, ArrayDesignSequenceAnalysisEvent.class)) {
AbstractCLI.log.warn(arrayDesign + " was last run more recently than " + skipIfLastRunLaterThan);
return null;
}
arrayDesign = this.thaw(arrayDesign);
Collection<BlatResult> persistedResults;
try {
if (this.blatResultFile != null) {
Collection<BlatResult> blatResults = this.getBlatResultsFromFile(arrayDesign);
if (blatResults == null || blatResults.size() == 0) {
throw new IllegalStateException("No blat results in file!");
}
AbstractCLI.log.info("Got " + blatResults.size() + " blat records");
persistedResults = arrayDesignSequenceAlignmentService.processArrayDesign(arrayDesign, taxon, blatResults);
this.audit(arrayDesign, "BLAT results read from file: " + blatResultFile);
} else {
// Run blat from scratch.
persistedResults = arrayDesignSequenceAlignmentService.processArrayDesign(arrayDesign, this.sensitive);
this.audit(arrayDesign, "Based on a fresh alignment analysis; BLAT score threshold was " + this.blatScoreThreshold + "; sensitive mode was " + this.sensitive);
}
AbstractCLI.log.info("Persisted " + persistedResults.size() + " results");
} catch (IOException e) {
this.errorObjects.add(e);
}
}
} else if (taxon != null) {
Collection<ArrayDesign> allArrayDesigns = arrayDesignService.findByTaxon(taxon);
AbstractCLI.log.warn("*** Running BLAT for all " + taxon.getCommonName() + " Array designs *** [" + allArrayDesigns.size() + " items]");
final SecurityContext context = SecurityContextHolder.getContext();
/*
* Here is our task runner.
*/
class BlatCliConsumer extends Consumer {
private BlatCliConsumer(BlockingQueue<ArrayDesign> q) {
super(q, context);
}
@Override
void consume(ArrayDesign x) {
x = arrayDesignService.thaw(x);
ArrayDesignBlatCli.this.processArrayDesign(skipIfLastRunLaterThan, x);
}
}
BlockingQueue<ArrayDesign> arrayDesigns = new ArrayBlockingQueue<>(allArrayDesigns.size());
arrayDesigns.addAll(allArrayDesigns);
Collection<Thread> threads = new ArrayList<>();
for (int i = 0; i < this.numThreads; i++) {
Consumer c1 = new BlatCliConsumer(arrayDesigns);
Thread k = new Thread(c1);
threads.add(k);
k.start();
}
this.waitForThreadPoolCompletion(threads);
/*
* All done
*/
this.summarizeProcessing();
} else {
this.bail(ErrorCode.MISSING_ARGUMENT);
}
return null;
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatResult in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processBlatResults.
@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
if (log.isDebugEnabled()) {
log.debug(blatResults.size() + " Blat results to map ");
}
assert goldenPathDb != null;
Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
int count = 0;
Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
assert !biosequenceToBlatResults.isEmpty();
// Do them one sequence at a time.
for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
if (log.isDebugEnabled()) {
log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
}
assert !blatResultsForSequence.isEmpty();
Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
/*
* Screen out likely repeats, where cross-hybridization is a problem.
*/
Double fractionRepeats = sequence.getFractionRepeats();
if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
* just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
* is to the site of a known gene.
*/
if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Sanity check to make sure user is paying attention to what they are putting in.
*/
if (trimmedBlatResultsForSequence.size() > 25) {
log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
}
Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
int skipped = 0;
// map each blat result.
for (BlatResult blatResult : trimmedBlatResultsForSequence) {
if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
log.warn("Blat result had no sequence: " + blatResult);
continue;
}
assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
if (log.isDebugEnabled())
log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
skipped++;
continue;
}
if (blatResult.getQuerySequence().getTaxon() == null) {
Taxon taxon = goldenPathDb.getTaxon();
assert taxon != null;
blatResult.getQuerySequence().setTaxon(taxon);
}
// here's the key line! Find gene products that map to the given blat result.
Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
// would be very unlikely.
if (resultsForOneBlatResult.size() > 100) {
log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
}
blatAssociationsForSequence.addAll(resultsForOneBlatResult);
}
// there are rarely this many, but it does happen.
if (++count % 100 == 0 && log.isDebugEnabled())
log.debug("Annotations computed for " + count + " blat results for " + sequence);
}
// Another important step: fill in the specificity, remove duplicates
if (!blatAssociationsForSequence.isEmpty()) {
BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
// Remove hits not meeting criteria
blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
}
// it might be empty now...
if (blatAssociationsForSequence.isEmpty()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
}
config.incrementWarnings();
continue;
} else if (log.isDebugEnabled()) {
log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
}
if (skipped > 0) {
if (log.isDebugEnabled())
log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
}
String queryName = sequence.getName();
assert StringUtils.isNotBlank(queryName);
if (!allRes.containsKey(queryName)) {
allRes.put(queryName, blatAssociationsForSequence);
}
allRes.get(queryName).addAll(blatAssociationsForSequence);
// if ( log.isDebugEnabled() ) {
// log.info( blatAssociationsForSequence.size() + " associations for " + sequence
// + " after redundancy reduction and score filtering; "
// + ( scored - blatAssociationsForSequence.size() ) + " not used" );
// }
}
return allRes;
}
Aggregations