use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method flushBuffer.
/**
 * Hands the buffered sequences to {@code bioSequenceService.findOrCreate}, records the returned sequences in
 * {@code bioSequences}, and attaches each one to the composite sequence registered under the same name in
 * {@code csBuffer}. Both buffers are emptied afterwards so they can be reused by the caller.
 *
 * @param bioSequences   accumulator that receives every sequence returned by the service
 * @param sequenceBuffer sequences waiting to be looked up or created; cleared on return
 * @param csBuffer       composite sequences keyed by sequence name; cleared on return
 */
private void flushBuffer(Collection<BioSequence> bioSequences, Collection<BioSequence> sequenceBuffer, Map<String, CompositeSequence> csBuffer) {
    Collection<BioSequence> fromService = bioSequenceService.findOrCreate(sequenceBuffer);
    bioSequences.addAll(fromService);

    for (BioSequence returned : fromService) {
        String key = returned.getName();
        CompositeSequence owner = csBuffer.get(key);
        // every returned sequence is expected to have a buffered composite sequence under its name
        assert owner != null;
        owner.setBiologicalCharacteristic(returned);
    }

    csBuffer.clear();
    sequenceBuffer.clear();
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method processArrayDesign.
/**
 * Associates sequences read from {@code sequenceFile} with the composite sequences of {@code arrayDesign}.
 * <p>
 * {@code AFFY_PROBE} and {@code OLIGO} inputs are delegated to {@code processAffymetrixDesign} and
 * {@code processOligoDesign} respectively. Every other sequence type is parsed with a {@code FastaParser}; each
 * parsed sequence is typed, given the validated taxon, passed to {@code persistSequence} and indexed by
 * {@code addToMaps}. Each composite sequence is then matched, in this order:
 * <ol>
 * <li>by its name in the name map;</li>
 * <li>by its name with the duplicate-probe munge suffix removed;</li>
 * <li>by the accession of the database entry of its current biological characteristic.</li>
 * </ol>
 * A composite sequence that reaches step 3 without finding a match has its biological characteristic set to
 * {@code null}. Finally the array design is updated and its report regenerated.
 *
 * @param arrayDesign  the platform whose composite sequences are updated
 * @param sequenceFile stream holding the sequences, in the format implied by {@code sequenceType}
 * @param sequenceType the kind of sequence in the file; selects the processing strategy
 * @param taxon        taxon for the sequences; passed through {@code validateTaxon} before use
 * @return the sequences produced by the parser (or the result of the delegate for Affymetrix/oligo input)
 * @throws IOException if reading the sequence file fails
 */
@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile, SequenceType sequenceType, Taxon taxon) throws IOException {
    if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
        return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon, true);
    } else if (sequenceType.equals(SequenceType.OLIGO)) {
        return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
    }
    taxon = this.validateTaxon(taxon, arrayDesign);
    this.checkForCompositeSequences(arrayDesign);
    FastaParser fastaParser = new FastaParser();
    fastaParser.parse(sequenceFile);
    Collection<BioSequence> bioSequences = fastaParser.getResults();
    // make two maps: one for genbank ids, one for the sequence name.
    Map<String, BioSequence> gbIdMap = new HashMap<>();
    Map<String, BioSequence> nameMap = new HashMap<>();
    // progress is measured over both phases: persisting sequences, then updating composite sequences
    int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
    int done = 0;
    int percent = 0;
    for (BioSequence sequence : bioSequences) {
        sequence.setType(sequenceType);
        sequence.setPolymerType(PolymerType.DNA);
        sequence.setTaxon(taxon);
        sequence = this.persistSequence(sequence);
        this.addToMaps(gbIdMap, nameMap, sequence);
        // 'total' includes the sequences, so they have to be counted as done as well
        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");
    int numWithNoSequence = 0;
    int numMatchedByAccession = 0;
    int numMatchedByProbeName = 0;
    String mungeRegex = ArrayDesignSequenceProcessingServiceImpl.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$";
    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        String probeName = compositeSequence.getName();
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + probeName);
        BioSequence match = null;
        if (nameMap.containsKey(probeName)) {
            match = nameMap.get(probeName);
            numMatchedByProbeName++;
        } else if (probeName.matches(mungeRegex)) {
            // NOTE(review): String.matches requires the WHOLE name to match the regex, so this branch only fires
            // when the name starts with the separator (unless the separator constant itself allows a prefix).
            // Confirm against DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR whether a find()-style match was intended.
            String unMungedName = probeName.replaceFirst(mungeRegex, "");
            if (nameMap.containsKey(unMungedName)) {
                // Previously this was counted as a match but the sequence was never assigned, and a 'continue'
                // skipped the progress counter. Assign the match so the count reflects what was done.
                match = nameMap.get(unMungedName);
                numMatchedByProbeName++;
            }
        } else {
            BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
            if (biologicalCharacteristic != null) {
                biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap.containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                    match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                    numMatchedByAccession++;
                } else {
                    // the existing characteristic matches nothing in the file: detach it
                    compositeSequence.setBiologicalCharacteristic(null);
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            } else {
                numWithNoSequence++;
                this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
            }
        }
        if (match != null) {
            // overwrite the existing characteristic if necessary.
            compositeSequence.setBiologicalCharacteristic(match);
            compositeSequence.setArrayDesign(arrayDesign);
        }
        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByAccession + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by Genbank accession");
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByProbeName + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by probe name");
    if (numWithNoSequence > 0)
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences with no associated biological characteristic");
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    return bioSequences;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method processOligoDesign.
/**
 * Attaches oligo sequences parsed from a file to the composite sequences of an array design, matching them by
 * composite sequence name. Matched sequences are typed as {@code OLIGO} DNA for the validated taxon and passed to
 * {@code persistSequence} before being attached; unmatched composite sequences are reported through
 * {@code notifyAboutMissingSequences}. The array design is updated at the end.
 *
 * @param arrayDesign  the platform whose composite sequences are updated
 * @param sequenceFile the expected format is described in {@link ProbeSequenceParser}
 * @param taxon        taxon for the sequences; passed through {@code validateTaxon} before use
 * @return the sequences that were attached to a composite sequence
 * @throws IOException if reading the sequence file fails
 * @see ProbeSequenceParser
 */
private Collection<BioSequence> processOligoDesign(ArrayDesign arrayDesign, InputStream sequenceFile, Taxon taxon) throws IOException {
    this.checkForCompositeSequences(arrayDesign);

    ProbeSequenceParser probeParser = new ProbeSequenceParser();
    probeParser.parse(sequenceFile);

    final int probeCount = arrayDesign.getCompositeSequences().size();
    int processed = 0;
    int lastPercent = 0;

    taxon = this.validateTaxon(taxon, arrayDesign);
    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");

    int missing = 0;
    Collection<BioSequence> attached = new HashSet<>();
    for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + probe.getName());
        }

        BioSequence parsed = probeParser.get(probe.getName());
        if (parsed == null) {
            missing++;
            this.notifyAboutMissingSequences(missing, probe);
        } else {
            // replaces whatever characteristic the probe had before
            assert parsed.getSequence() != null;
            parsed.setType(SequenceType.OLIGO);
            parsed.setPolymerType(PolymerType.DNA);
            parsed.setTaxon(taxon);
            BioSequence stored = this.persistSequence(parsed);
            probe.setBiologicalCharacteristic(stored);
            probe.setArrayDesign(arrayDesign);
            attached.add(stored);
        }

        processed++;
        if (processed % 1000 == 0) {
            lastPercent = this.updateProgress(probeCount, processed, lastPercent);
        }
    }

    if (missing > 0) {
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + missing + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences with no associated biological characteristic");
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    return attached;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method getAccession.
/**
 * Returns the accession of the sequence database entry attached to the biological characteristic of the given
 * composite sequence.
 * <p>
 * NOTE(review): this method only reads the accession; any IMAGE-clone substitution or update of the composite
 * sequence presumably happens in the caller -- confirm.
 *
 * @param cs the composite sequence to inspect
 * @return the accession, or {@code null} if the composite sequence has no biological characteristic or the
 *         characteristic has no sequence database entry
 */
private String getAccession(CompositeSequence cs) {
    BioSequence bs = cs.getBiologicalCharacteristic();
    // a composite sequence may have no biological characteristic at all; treat that as "no accession"
    if (bs == null || bs.getSequenceDatabaseEntry() == null) {
        return null;
    }
    // thaw before reading the entry's fields
    bs = this.bioSequenceService.thaw(bs);
    return bs.getSequenceDatabaseEntry().getAccession();
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method createOrUpdateGenbankSequence.
/**
 * Resolves a sequence obtained from Genbank to a BioSequence held by {@code bioSequenceService}, creating one if
 * neither an accession lookup nor a general {@code find} locates it, and then brings the stored record up to date.
 *
 * @param found a new (non-persistent) biosequence that can be used to create a new entry or update an existing
 *              one with the sequence. The sequence would have come from Genbank.
 * @param force if true, the resolved BioSequence is always updated from {@code found} via
 *              {@code updateExistingWithSequenceData}. Otherwise that update only happens when the resolved
 *              BioSequence has a blank sequence and {@code found} has a non-blank one; in all other cases only
 *              {@code fillInDatabaseEntry} is applied.
 * @return persistent BioSequence.
 */
private BioSequence createOrUpdateGenbankSequence(BioSequence found, boolean force) {
    assert found != null;

    DatabaseEntry accessionEntry = found.getSequenceDatabaseEntry();
    // both of these should always hold because the sequences come from genbank (blastDb)
    assert accessionEntry != null;
    assert accessionEntry.getExternalDatabase() != null;

    BioSequence resolved = bioSequenceService.findByAccession(accessionEntry);
    if (resolved == null) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.debug("Find (or creating) new sequence " + found);
        }
        // no hit by accession, but there still might be a match.
        resolved = bioSequenceService.find(found);
        if (resolved == null) {
            resolved = bioSequenceService.create(found);
        }
    }
    assert resolved != null;

    boolean copySequence = force || (StringUtils.isBlank(resolved.getSequence()) && !StringUtils.isBlank(found.getSequence()));
    if (!copySequence) {
        // note that no matter what we make sure the database entry is filled in.
        this.fillInDatabaseEntry(found, resolved);
        return resolved;
    }
    return this.updateExistingWithSequenceData(found, resolved);
}
Aggregations