use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method processArrayDesign.
@Override
public Collection<BlatResult> processArrayDesign(ArrayDesign ad, Taxon taxon, Collection<BlatResult> rawBlatResults) {
ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");
ad = arrayDesignService.thaw(ad);
arrayDesignService.deleteAlignmentData(ad);
// Blat file processing can only be run on one taxon at a time
taxon = this.validateTaxaForBlatFile(ad, taxon);
Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad);
sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);
// if the blat results were loaded from a file, we have to replace the
// query sequences with the actual ones
// attached to the array design. We have to do this by name because the
// sequence name is what the files contain.
// Note that if there is ambiguity there will be problems!
Map<String, BioSequence> seqMap = new HashMap<>();
for (BioSequence bioSequence : sequencesToBlat) {
seqMap.put(bioSequence.getName(), bioSequence);
}
ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);
Collection<BlatResult> toSkip = new HashSet<>();
for (BlatResult result : rawBlatResults) {
/*
* If the sequences don't have ids, replace them with the actual sequences associated with the array design.
*/
if (result.getQuerySequence().getId() == null) {
String querySeqName = result.getQuerySequence().getName();
BioSequence actualSequence = seqMap.get(querySeqName);
if (actualSequence == null) {
ArrayDesignSequenceAlignmentServiceImpl.log.debug("Array design does not contain a sequence with name " + querySeqName);
toSkip.add(result);
continue;
}
result.setQuerySequence(actualSequence);
} else {
result.getQuerySequence().setTaxon(taxon);
}
result.setSearchedDatabase(searchedDatabase);
try {
FieldUtils.writeField(result.getTargetChromosome(), "taxon", taxon, true);
} catch (IllegalAccessException e) {
e.printStackTrace();
}
result.getTargetChromosome().getSequence().setTaxon(taxon);
}
if (toSkip.size() > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(toSkip.size() + " blat results were for sequences not on " + ad + "; they will be ignored.");
rawBlatResults.removeAll(toSkip);
}
Map<BioSequence, Collection<BlatResult>> goldenPathAlignments = new HashMap<>();
this.getGoldenPathAlignments(sequencesToBlat, taxon, goldenPathAlignments);
for (BioSequence sequence : goldenPathAlignments.keySet()) {
rawBlatResults.addAll(goldenPathAlignments.get(sequence));
}
Collection<BlatResult> results = this.persistBlatResults(rawBlatResults);
arrayDesignReportService.generateArrayDesignReport(ad.getId());
return results;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method getSequences.
/**
* @param taxon (specified in case array has multiple taxa)
*/
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {
Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
Collection<BioSequence> sequencesToBlat = new HashSet<>();
int numWithNoBioSequence = 0;
int numWithNoSequenceData = 0;
boolean warned = false;
for (CompositeSequence cs : compositeSequences) {
BioSequence bs = cs.getBiologicalCharacteristic();
if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
}
if (bs == null) {
++numWithNoBioSequence;
if (!warned) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had no associated biosequence object");
}
continue;
}
if (bs.getTaxon() == null) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("There is no taxon defined for this biosequence ");
continue;
}
// if the taxon is null that means we want this run for all taxa for that array
if (taxon != null && !bs.getTaxon().equals(taxon)) {
continue;
}
// noinspection ConstantConditions // Better readability
if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
warned = true;
ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
}
if (StringUtils.isBlank(bs.getSequence())) {
++numWithNoSequenceData;
if (!warned) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had " + bs + " but no sequence, skipping");
}
continue;
}
sequencesToBlat.add(bs);
}
if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn(numWithNoBioSequence + " composite sequences lacked biosequence associations; " + numWithNoSequenceData + " lacked sequence data ( out of " + compositeSequences.size() + " total).");
}
return sequencesToBlat;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method getGoldenPathAlignments.
/**
* Check if there are alignment results in the goldenpath database, in which case we do not reanalyze the sequences.
*
* @param sequencesToBlat The full set of sequences that need analysis.
* @param results Will be stored here.
* @return the sequences which ARE NOT found in goldenpath and which therefore DO need blat.
*/
private Collection<BioSequence> getGoldenPathAlignments(Collection<BioSequence> sequencesToBlat, Taxon taxon, Map<BioSequence, Collection<BlatResult>> results) {
GoldenPathQuery gpq = new GoldenPathQuery(taxon);
Collection<BioSequence> needBlat = new HashSet<>();
int count = 0;
int totalFound = 0;
for (BioSequence sequence : sequencesToBlat) {
boolean found = false;
if (sequence.getSequenceDatabaseEntry() != null) {
Collection<BlatResult> brs = gpq.findAlignments(sequence.getSequenceDatabaseEntry().getAccession());
if (brs != null && brs.size() > 0) {
for (BlatResult result : brs) {
this.copyLengthInformation(sequence, result);
result.setQuerySequence(sequence);
}
results.put(sequence, brs);
found = true;
totalFound++;
}
}
if (++count % 1000 == 0 && totalFound > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.info("Alignments in Golden Path database for " + totalFound + "/" + count + " checked so far.");
}
if (!found) {
needBlat.add(sequence);
}
}
if (totalFound > 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.info("Found " + totalFound + "/" + count + " alignments in Golden Path database");
}
return needBlat;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method assignSequencesToDesignElements.
@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, Collection<BioSequence> sequences) {
Map<String, BioSequence> nameMap = new HashMap<>();
for (BioSequence sequence : sequences) {
nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);
}
int numNotFound = 0;
for (CompositeSequence designElement : designElements) {
if (!nameMap.containsKey(designElement.getName())) {
ArrayDesignSequenceProcessingServiceImpl.log.debug("No sequence matches " + designElement.getName());
numNotFound++;
continue;
}
designElement.setBiologicalCharacteristic(nameMap.get(designElement.getName()));
}
ArrayDesignSequenceProcessingServiceImpl.log.info(sequences.size() + " sequences processed for " + designElements.size() + " design elements");
if (numNotFound > 0) {
ArrayDesignSequenceProcessingServiceImpl.log.warn(numNotFound + " probes had no matching sequence");
}
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method findLocalSequences.
private Map<String, BioSequence> findLocalSequences(Collection<String> identifiersToSearch, Taxon taxon) {
Map<String, BioSequence> found = new HashMap<>();
for (String id : identifiersToSearch) {
BioSequence template = BioSequence.Factory.newInstance();
template.setTaxon(taxon);
template.setName(id);
BioSequence seq = bioSequenceService.find(template);
if (seq != null) {
found.put(id, seq);
}
}
return found;
}
Aggregations