use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processBlatResults.
@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
if (log.isDebugEnabled()) {
log.debug(blatResults.size() + " Blat results to map ");
}
assert goldenPathDb != null;
Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
int count = 0;
Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
assert !biosequenceToBlatResults.isEmpty();
// Do them one sequence at a time.
for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
if (log.isDebugEnabled()) {
log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
}
assert !blatResultsForSequence.isEmpty();
Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
/*
* Screen out likely repeats, where cross-hybridization is a problem.
*/
Double fractionRepeats = sequence.getFractionRepeats();
if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
* just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
* is to the site of a known gene.
*/
if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Sanity check to make sure user is paying attention to what they are putting in.
*/
if (trimmedBlatResultsForSequence.size() > 25) {
log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
}
Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
int skipped = 0;
// map each blat result.
for (BlatResult blatResult : trimmedBlatResultsForSequence) {
if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
log.warn("Blat result had no sequence: " + blatResult);
continue;
}
assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
if (log.isDebugEnabled())
log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
skipped++;
continue;
}
if (blatResult.getQuerySequence().getTaxon() == null) {
Taxon taxon = goldenPathDb.getTaxon();
assert taxon != null;
blatResult.getQuerySequence().setTaxon(taxon);
}
// here's the key line! Find gene products that map to the given blat result.
Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
// would be very unlikely.
if (resultsForOneBlatResult.size() > 100) {
log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
}
blatAssociationsForSequence.addAll(resultsForOneBlatResult);
}
// there are rarely this many, but it does happen.
if (++count % 100 == 0 && log.isDebugEnabled())
log.debug("Annotations computed for " + count + " blat results for " + sequence);
}
// Another important step: fill in the specificity, remove duplicates
if (!blatAssociationsForSequence.isEmpty()) {
BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
// Remove hits not meeting criteria
blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
}
// it might be empty now...
if (blatAssociationsForSequence.isEmpty()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
}
config.incrementWarnings();
continue;
} else if (log.isDebugEnabled()) {
log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
}
if (skipped > 0) {
if (log.isDebugEnabled())
log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
}
String queryName = sequence.getName();
assert StringUtils.isNotBlank(queryName);
if (!allRes.containsKey(queryName)) {
allRes.put(queryName, blatAssociationsForSequence);
}
allRes.get(queryName).addAll(blatAssociationsForSequence);
// if ( log.isDebugEnabled() ) {
// log.info( blatAssociationsForSequence.size() + " associations for " + sequence
// + " after redundancy reduction and score filtering; "
// + ( scored - blatAssociationsForSequence.size() ) + " not used" );
// }
}
return allRes;
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processBlatResult.
/**
* Process a single BlatResult, identifying gene products it maps to.
*
* @return BlatAssociations between the queried biosequence and one or more gene products.
*/
private Collection<BlatAssociation> processBlatResult(GoldenPathSequenceAnalysis goldenPathDb, BlatResult blatResult, ProbeMapperConfig config) {
assert blatResult.getTargetChromosome() != null : "Chromosome not filled in for blat result";
boolean ignoreStrand = this.determineStrandTreatment(blatResult);
String strand = ignoreStrand ? null : blatResult.getStrand();
Collection<BlatAssociation> blatAssociations = goldenPathDb.findAssociations(blatResult.getTargetChromosome().getName(), blatResult.getTargetStart(), blatResult.getTargetEnd(), blatResult.getTargetStarts(), blatResult.getBlockSizes(), strand, threePrimeMethod, config);
if (blatAssociations != null && blatAssociations.size() > 0) {
for (BlatAssociation association : blatAssociations) {
association.setBlatResult(blatResult);
association.setBioSequence(blatResult.getQuerySequence());
}
return blatAssociations;
}
return new HashSet<>();
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.
@Override
public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {
assert config != null;
if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
}
Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());
Taxon taxon = arrayDesign.getPrimaryTaxon();
if (taxa.size() > 1 && taxon == null) {
throw new IllegalArgumentException("Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
}
GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);
BlockingQueue<BACS> persistingQueue = new ArrayBlockingQueue<>(ArrayDesignProbeMapperServiceImpl.QUEUE_SIZE);
AtomicBoolean generatorDone = new AtomicBoolean(false);
AtomicBoolean loaderDone = new AtomicBoolean(false);
this.load(persistingQueue, generatorDone, loaderDone, useDB);
if (useDB) {
ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
arrayDesignService.deleteGeneProductAssociations(arrayDesign);
}
int count = 0;
int hits = 0;
ArrayDesignProbeMapperServiceImpl.log.info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
if (compositeSequence.getName().equals("1431126_a_at")) {
ArrayDesignProbeMapperServiceImpl.log.debug("HERE");
}
Map<String, Collection<BlatAssociation>> results = this.processCompositeSequence(config, taxon, goldenPathDb, compositeSequence);
if (results == null)
continue;
for (Collection<BlatAssociation> col : results.values()) {
for (BlatAssociation association : col) {
if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
ArrayDesignProbeMapperServiceImpl.log.debug(association);
persistingQueue.add(new BACS(compositeSequence, association));
}
++hits;
}
if (++count % 200 == 0) {
ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences" + " with blat results; " + hits + " mappings found.");
}
}
generatorDone.set(true);
ArrayDesignProbeMapperServiceImpl.log.info("Waiting for loading to complete ...");
while (!loaderDone.get()) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
this.deleteOldFiles(arrayDesign);
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.
the class CompositeSequenceDaoImpl method batchGetGenesWithSpecificity.
/**
* @param batch of composite sequences to process
* @param results - adding to this
*/
private void batchGetGenesWithSpecificity(Collection<CompositeSequence> batch, Map<CompositeSequence, Collection<BioSequence2GeneProduct>> results) {
if (batch.size() == 0) {
return;
}
// language=HQL
final String queryString = "select cs,bas from CompositeSequence cs, BioSequence2GeneProduct bas inner join cs.biologicalCharacteristic bs " + "inner join fetch bas.geneProduct gp inner join fetch gp.gene gene " + "where bas.bioSequence=bs and cs in (:cs)";
List<?> qr = this.getHibernateTemplate().findByNamedParam(queryString, "cs", batch);
for (Object o : qr) {
Object[] oa = (Object[]) o;
CompositeSequence csa = (CompositeSequence) oa[0];
BioSequence2GeneProduct ba = (BioSequence2GeneProduct) oa[1];
if (ba instanceof BlatAssociation) {
BlatResult blatResult = ((BlatAssociation) ba).getBlatResult();
PhysicalLocation pl = blatResult.getTargetAlignedRegion();
/*
* We didn't always used to fill in the targetAlignedRegion ... this is just in case.
*/
if (pl == null) {
pl = PhysicalLocation.Factory.newInstance();
pl.setChromosome(blatResult.getTargetChromosome());
pl.setNucleotide(blatResult.getTargetStart());
pl.setNucleotideLength(blatResult.getTargetEnd().intValue() - blatResult.getTargetStart().intValue());
pl.setStrand(blatResult.getStrand());
// Note: not bothering to fill in the bin.
}
}
if (!results.containsKey(csa)) {
results.put(csa, new HashSet<BioSequence2GeneProduct>());
}
results.get(csa).add(ba);
}
/*
* This is kind of important. We ensure we return an empty map for probes that do not have a mapping.
*/
for (CompositeSequence cs : batch) {
if (!results.containsKey(cs)) {
results.put(cs, new HashSet<BioSequence2GeneProduct>());
}
}
}
use of ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation in project Gemma by PavlidisLab.
the class CompositeSequenceServiceImpl method getGeneMappingSummary.
@Override
public Collection<GeneMappingSummary> getGeneMappingSummary(CompositeSequence cs) {
BioSequence biologicalCharacteristic = cs.getBiologicalCharacteristic();
biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
Map<Integer, GeneMappingSummary> results = new HashMap<>();
if (biologicalCharacteristic == null || biologicalCharacteristic.getBioSequence2GeneProduct() == null) {
return results.values();
}
Collection<BioSequence2GeneProduct> bs2gps = biologicalCharacteristic.getBioSequence2GeneProduct();
for (BioSequence2GeneProduct bs2gp : bs2gps) {
GeneProductValueObject geneProduct = new GeneProductValueObject(geneProductService.thaw(bs2gp.getGeneProduct()));
GeneValueObject gene = new GeneValueObject(bs2gp.getGeneProduct().getGene());
BlatResultValueObject blatResult = null;
if ((bs2gp instanceof BlatAssociation)) {
BlatAssociation blatAssociation = (BlatAssociation) bs2gp;
blatResult = new BlatResultValueObject(blatResultService.thaw(blatAssociation.getBlatResult()));
} else if (bs2gp instanceof AnnotationAssociation) {
/*
* Make a dummy blat result
*/
blatResult = new BlatResultValueObject();
blatResult.setQuerySequence(BioSequenceValueObject.fromEntity(biologicalCharacteristic));
blatResult.setId(biologicalCharacteristic.getId());
}
if (blatResult == null) {
continue;
}
if (results.containsKey(ProbeMapUtils.hashBlatResult(blatResult))) {
results.get(ProbeMapUtils.hashBlatResult(blatResult)).addGene(geneProduct, gene);
} else {
GeneMappingSummary summary = new GeneMappingSummary();
summary.addGene(geneProduct, gene);
summary.setBlatResult(blatResult);
summary.setCompositeSequence(this.loadValueObject(cs));
results.put(ProbeMapUtils.hashBlatResult(blatResult), summary);
}
}
this.addBlatResultsLackingGenes(cs, results);
if (results.size() == 0) {
// add a 'dummy' that at least contains the information about the CS. This is a bit of a hack...
GeneMappingSummary summary = new GeneMappingSummary();
summary.setCompositeSequence(this.loadValueObject(cs));
BlatResultValueObject newInstance = new BlatResultValueObject(-1L);
newInstance.setQuerySequence(BioSequenceValueObject.fromEntity(biologicalCharacteristic));
summary.setBlatResult(newInstance);
results.put(ProbeMapUtils.hashBlatResult(newInstance), summary);
}
return results.values();
}
Aggregations