use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAlignmentServiceImpl method processArrayDesign.
private Collection<BlatResult> processArrayDesign(ArrayDesign ad, boolean sensitive, Blat blat) {
if (blat == null)
blat = new ShellDelegatingBlat();
Collection<BlatResult> allResults = new HashSet<>();
if (sensitive)
ArrayDesignSequenceAlignmentServiceImpl.log.info("Running in 'sensitive' mode if possible");
Collection<Taxon> taxa = arrayDesignService.getTaxa(ad.getId());
boolean first = true;
for (Taxon taxon : taxa) {
Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad, taxon);
Map<BioSequence, Collection<BlatResult>> results = this.getAlignments(sequencesToBlat, sensitive, taxon, blat);
ArrayDesignSequenceAlignmentServiceImpl.log.info("Got BLAT results for " + results.keySet().size() + " query sequences");
Map<String, BioSequence> nameMap = new HashMap<>();
for (BioSequence bs : results.keySet()) {
if (nameMap.containsKey(bs.getName())) {
throw new IllegalStateException("All distinct sequences on the array must have unique names; found " + bs.getName() + " more than once.");
}
nameMap.put(bs.getName(), bs);
}
int noResults = 0;
int count = 0;
// We only remove the results here, after we have at least one set of blat results.
if (first) {
ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");
arrayDesignService.deleteAlignmentData(ad);
}
for (BioSequence sequence : sequencesToBlat) {
if (sequence == null) {
ArrayDesignSequenceAlignmentServiceImpl.log.warn("Null sequence!");
continue;
}
Collection<BlatResult> brs = results.get(nameMap.get(sequence.getName()));
if (brs == null) {
++noResults;
continue;
}
for (BlatResult result : brs) {
// must do this to replace
result.setQuerySequence(sequence);
// placeholder instance.
}
allResults.addAll(this.persistBlatResults(brs));
if (++count % 2000 == 0) {
ArrayDesignSequenceAlignmentServiceImpl.log.info("Checked results for " + count + " queries, " + allResults.size() + " blat results so far.");
}
}
ArrayDesignSequenceAlignmentServiceImpl.log.info(noResults + "/" + sequencesToBlat.size() + " sequences had no blat results");
first = false;
}
arrayDesignReportService.generateArrayDesignReport(ad.getId());
return allResults;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method assignSequencesToDesignElements.
/**
* Associate sequences with an array design. It is assumed that the name of the sequences can be matched to the name
* of a design element. Provided for testing purposes.
*/
@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile) throws IOException {
FastaParser fp = new FastaParser();
fp.parse(fastaFile);
Collection<BioSequence> sequences = fp.getResults();
ArrayDesignSequenceProcessingServiceImpl.log.debug("Parsed " + sequences.size() + " sequences");
this.assignSequencesToDesignElements(designElements, sequences);
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method findOrUpdateSequences.
/**
* Copy sequences into the original versions, or create new sequences in the DB, as needed.
*
* @param accessionsToFetch accessions that we need to fill in
* @param retrievedSequences candidate sequence information for copying into the database.
* @param force If true, if an existing BioSequence that matches if found in the system, any existing sequence
* information in the BioSequence will be overwritten.
* @param taxa Representing taxa on array
* @return Items that were found.
*/
private Map<String, BioSequence> findOrUpdateSequences(Map<String, BioSequence> accessionsToFetch, Collection<BioSequence> retrievedSequences, Collection<Taxon> taxa, boolean force) {
Map<String, BioSequence> found = new HashMap<>();
for (Taxon taxon : taxa) {
if (taxon == null) {
ArrayDesignSequenceProcessingServiceImpl.log.warn(// probably should be an exception
"Null taxon ...");
}
assert taxon != null;
boolean warned = false;
for (BioSequence sequence : retrievedSequences) {
if (sequence.getTaxon() == null) {
if (!warned) {
ArrayDesignSequenceProcessingServiceImpl.log.warn("Sequence taxon is null [" + sequence + "], copying array taxon " + taxon + " ; further warnings for this array taxon are suppressed.");
}
warned = true;
} else if (!sequence.getTaxon().equals(taxon)) {
continue;
}
sequence.setTaxon(taxon);
if (sequence.getSequenceDatabaseEntry() == null) {
ArrayDesignSequenceProcessingServiceImpl.log.warn("Sequence from BLAST db lacks database entry: " + sequence + "; skipping");
continue;
}
sequence = this.createOrUpdateGenbankSequence(sequence, force);
sequence = this.bioSequenceService.thaw(sequence);
String accession = sequence.getSequenceDatabaseEntry().getAccession();
found.put(accession, sequence);
accessionsToFetch.remove(accession);
}
}
return found;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method processArrayDesign.
@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames, String blastDbHome, boolean force, FastaCmd fc) {
Map<String, BioSequence> accessionsToFetch = this.initializeFetchList(arrayDesign, force);
if (accessionsToFetch.size() == 0) {
ArrayDesignSequenceProcessingServiceImpl.log.info("No accessions to fetch, no processing will be done");
return null;
}
Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
// not taxon found
if (taxaOnArray.size() == 0) {
throw new IllegalArgumentException(taxaOnArray.size() + " taxon found for " + arrayDesign + "please specify which taxon to run");
}
Collection<String> notFound = accessionsToFetch.keySet();
Collection<BioSequence> finalResult = new HashSet<>();
int versionNumber = 1;
if (fc == null)
fc = new SimpleFastaCmd();
while (versionNumber < ArrayDesignSequenceProcessingService.MAX_VERSION_NUMBER) {
Collection<BioSequence> retrievedSequences = this.searchBlastDbs(databaseNames, blastDbHome, notFound, fc);
// we can loop through the taxa as we can ignore sequence when retrieved and arraydesign taxon not match.
Map<String, BioSequence> found = this.findOrUpdateSequences(accessionsToFetch, retrievedSequences, taxaOnArray, force);
finalResult.addAll(found.values());
notFound = this.getUnFound(notFound, found);
if (notFound.isEmpty()) {
break;
}
for (String accession : notFound) {
if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
ArrayDesignSequenceProcessingServiceImpl.log.trace(accession + " not found, increasing version number to " + versionNumber);
// remove the version number and increase it
BioSequence bs = accessionsToFetch.get(accession);
accessionsToFetch.remove(accession);
// add or increase the version number.
accession = accession.replaceFirst("\\.\\d+$", "");
accession = accession + "." + Integer.toString(versionNumber);
accessionsToFetch.put(accession, bs);
}
notFound = accessionsToFetch.keySet();
++versionNumber;
}
if (!notFound.isEmpty()) {
this.logMissingSequences(arrayDesign, notFound);
}
ArrayDesignSequenceProcessingServiceImpl.log.info(finalResult.size() + " sequences found");
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
return finalResult;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method processAffymetrixDesign.
@Override
public Collection<BioSequence> processAffymetrixDesign(ArrayDesign arrayDesign, InputStream probeSequenceFile, Taxon taxon, boolean force) throws IOException {
ArrayDesignSequenceProcessingServiceImpl.log.info("Processing Affymetrix design");
// arrayDesignService.thawRawAndProcessed( arrayDesign );
boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0;
taxon = this.validateTaxon(taxon, arrayDesign);
Collection<BioSequence> bioSequences = new HashSet<>();
int done = 0;
int percent = 0;
AffyProbeReader apr = new AffyProbeReader();
apr.parse(probeSequenceFile);
Collection<CompositeSequence> compositeSequencesFromProbes = apr.getKeySet();
int total = compositeSequencesFromProbes.size();
Map<String, CompositeSequence> quickFindMap = new HashMap<>();
List<BioSequence> sequenceBuffer = new ArrayList<>();
Map<String, CompositeSequence> csBuffer = new HashMap<>();
for (CompositeSequence newCompositeSequence : compositeSequencesFromProbes) {
// these composite sequences are just use
newCompositeSequence.setArrayDesign(arrayDesign);
BioSequence collapsed = SequenceManipulation.collapse(apr.get(newCompositeSequence));
String sequenceName = newCompositeSequence.getName() + "_collapsed";
collapsed.setName(sequenceName);
collapsed.setType(SequenceType.AFFY_COLLAPSED);
collapsed.setPolymerType(PolymerType.DNA);
collapsed.setTaxon(taxon);
sequenceBuffer.add(collapsed);
if (csBuffer.containsKey(sequenceName))
throw new IllegalArgumentException("All probes must have unique names");
csBuffer.put(sequenceName, newCompositeSequence);
if (sequenceBuffer.size() == ArrayDesignSequenceProcessingServiceImpl.BATCH_SIZE) {
this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
}
if (wasOriginallyLackingCompositeSequences) {
arrayDesign.getCompositeSequences().add(newCompositeSequence);
} else {
if (force) {
collapsed = this.persistSequence(collapsed);
assert collapsed.getTaxon().equals(taxon);
}
quickFindMap.put(newCompositeSequence.getName(), newCompositeSequence);
}
if (++done % 1000 == 0) {
percent = this.updateProgress(total, done, percent);
}
}
this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
this.updateProgress(total, done, percent);
if (!wasOriginallyLackingCompositeSequences) {
percent = 0;
done = 0;
int numWithNoSequence = 0;
for (CompositeSequence originalCompositeSequence : arrayDesign.getCompositeSequences()) {
// go back and fill this information into the composite sequences, namely the database entry
// information.
CompositeSequence compositeSequenceFromParse = quickFindMap.get(originalCompositeSequence.getName());
if (compositeSequenceFromParse == null) {
numWithNoSequence++;
this.notifyAboutMissingSequences(numWithNoSequence, originalCompositeSequence);
continue;
}
ArrayDesignSequenceProcessingServiceImpl.log.debug(originalCompositeSequence + " matches " + compositeSequenceFromParse + " seq is " + compositeSequenceFromParse.getBiologicalCharacteristic());
originalCompositeSequence.setBiologicalCharacteristic(compositeSequenceFromParse.getBiologicalCharacteristic());
assert originalCompositeSequence.getBiologicalCharacteristic().getId() != null;
originalCompositeSequence.setArrayDesign(compositeSequenceFromParse.getArrayDesign());
if (++done % 1000 == 0) {
percent = this.updateProgress(total, done, percent);
}
}
ArrayDesignSequenceProcessingServiceImpl.log.info(numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size() + " probes could not be matched to a sequence");
}
arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequencesFromProbes.size());
ArrayDesignSequenceProcessingServiceImpl.log.info("Updating " + arrayDesign);
arrayDesignService.update(arrayDesign);
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
ArrayDesignSequenceProcessingServiceImpl.log.info("Done adding sequence information!");
return bioSequences;
}
Aggregations