use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class SequenceManipulation method collapse.
/**
* Convert a CompositeSequence's immobilizedCharacteristics into a single sequence, using a simple merge-join
* strategy.
*
* @param sequences sequences
* @return BioSequence. Not all fields are filled in and must be set by the caller.
*/
public static BioSequence collapse(Collection<Reporter> sequences) {
Collection<Reporter> copyOfSequences = SequenceManipulation.copyReporters(sequences);
BioSequence collapsed = BioSequence.Factory.newInstance();
collapsed.setSequence("");
if (SequenceManipulation.log.isDebugEnabled())
SequenceManipulation.log.debug("Collapsing " + sequences.size() + " sequences");
while (!copyOfSequences.isEmpty()) {
Reporter next = SequenceManipulation.findLeftMostProbe(copyOfSequences);
int ol = SequenceManipulation.rightHandOverlap(collapsed, next.getImmobilizedCharacteristic());
String nextSeqStr = next.getImmobilizedCharacteristic().getSequence();
collapsed.setSequence(collapsed.getSequence() + nextSeqStr.substring(ol));
if (SequenceManipulation.log.isDebugEnabled()) {
SequenceManipulation.log.debug("New Seq to add: " + nextSeqStr + " Overlap=" + ol + " Result=" + collapsed.getSequence());
}
copyOfSequences.remove(next);
}
collapsed.setIsCircular(false);
collapsed.setIsApproximateLength(false);
collapsed.setLength((long) collapsed.getSequence().length());
collapsed.setDescription("Collapsed from " + sequences.size() + " reporter sequences");
return collapsed;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class BlatAssociationServiceTest method testFindBioSequence.
@Test
public final void testFindBioSequence() {
BioSequence bs = BioSequence.Factory.newInstance();
Taxon t = Taxon.Factory.newInstance();
// has to match what the testpersistent object is.
t.setScientificName("Mus musculus");
t.setIsSpecies(true);
t.setIsGenesUsable(true);
bs.setSequence(testSequence);
bs.setTaxon(t);
bs.setName(testSequenceName);
BioSequence bsIn = this.bioSequenceService.find(bs);
assertNotNull("Did not find " + bs, bsIn);
Collection<BlatAssociation> res = this.blatAssociationService.find(bs);
assertEquals("Was seeking blatresults for sequence " + testSequenceName, 1, res.size());
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ExpressionExperimentPlatformSwitchService method processVector.
/**
* @param designElementMap Mapping of sequences to probes for the platform that is being switch from. This is used
* to identify new candidates.
* @param usedDesignElements probes from the new design that have already been assigned to probes from the old
* design. If things are done correctly (the old design was merged into the new) then there should be enough.
* Map is of the new design probe to the old design probe it was used for (this is debugging information)
* @param vector vector
* @param bad BioAssayDimension to use, if necessary. If this is null or already the one used, it's igored.
* Otherwise the vector data will be rewritten to match it.
* @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence
*/
private void processVector(Map<BioSequence, Collection<CompositeSequence>> designElementMap, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, DesignElementDataVector vector, BioAssayDimension bad) {
CompositeSequence oldDe = vector.getDesignElement();
Collection<CompositeSequence> newElCandidates;
BioSequence seq = oldDe.getBiologicalCharacteristic();
if (seq == null) {
newElCandidates = designElementMap.get(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE);
} else {
newElCandidates = designElementMap.get(seq);
}
if (newElCandidates == null || newElCandidates.isEmpty()) {
throw new IllegalStateException("There are no candidates probes for sequence: " + seq + "('null' should be okay)");
}
for (CompositeSequence newEl : newElCandidates) {
if (!usedDesignElements.containsKey(newEl)) {
vector.setDesignElement(newEl);
usedDesignElements.put(newEl, new HashSet<BioAssayDimension>());
usedDesignElements.get(newEl).add(vector.getBioAssayDimension());
break;
}
if (!usedDesignElements.get(newEl).contains(vector.getBioAssayDimension())) {
/*
* Then it's okay to use it.
*/
vector.setDesignElement(newEl);
usedDesignElements.get(newEl).add(vector.getBioAssayDimension());
break;
}
}
if (bad != null && !vector.getBioAssayDimension().equals(bad)) {
/*
* 1. Check if they are already the same; then just switch it to the desired BAD
* 2. If not, then the vector data has to be rewritten.
*/
this.vectorReWrite(vector, bad);
}
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ExpressionExperimentPlatformSwitchService method switchExperimentToArrayDesign.
/**
* If you know the arraydesigns are already in a merged state, you should use switchExperimentToMergedPlatform
*
* @param ee ee
* @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
* changed for them.
*/
public ExpressionExperiment switchExperimentToArrayDesign(ExpressionExperiment ee, ArrayDesign arrayDesign) {
assert arrayDesign != null;
// remove stuff that will be in the way.
processedExpressionDataVectorService.removeProcessedDataVectors(ee);
sampleCoexpressionMatrixService.delete(ee);
for (ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets(ee)) {
subsetService.remove(subset);
}
// get relation between sequence and designelements.
Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
this.populateCSeq(arrayDesign, designElementMap, elsWithNoSeq);
ee = expressionExperimentService.thaw(ee);
ExpressionExperimentPlatformSwitchService.log.info(elsWithNoSeq.size() + " elements on the new platform have no associated sequence.");
designElementMap.put(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq);
boolean multiPlatformPerSample = this.checkMultiPerSample(ee, arrayDesign);
/*
* For a multiplatform-per-sample case: (note that some samples might just be on one platform...)
* 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
* 2. Switch vectors to use it - may require adding NaNs and reordering the vectors
* 3. Delete the Bioassays that are using other BADs
*/
/*
* Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
* includes all BioMaterials.
*/
Collection<BioAssayDimension> unusedBADs = new HashSet<>();
BioAssayDimension maxBAD = null;
int maxSize = 0;
if (multiPlatformPerSample) {
maxBAD = this.doMultiSample(ee, unusedBADs, maxSize);
}
Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
for (ArrayDesign oldAd : oldArrayDesigns) {
this.runOldAd(ee, arrayDesign, designElementMap, maxBAD, usedDesignElements, oldAd);
}
ee.setDescription(ee.getDescription() + " [Switched to use " + arrayDesign.getShortName() + " by Gemma]");
helperService.persist(ee, arrayDesign);
/*
* This might need to be done inside the transaction we're using to make the switch.
*/
if (maxBAD != null && !unusedBADs.isEmpty()) {
this.checkUnused(unusedBADs, maxBAD);
}
return ee;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.
@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException {
if (taxon == null && !ncbiIds) {
throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
}
if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
}
try (BufferedReader b = new BufferedReader(new FileReader(source))) {
String line;
int numSkipped = 0;
ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
arrayDesignService.deleteGeneProductAssociations(arrayDesign);
while ((line = b.readLine()) != null) {
if (StringUtils.isBlank(line)) {
continue;
}
if (line.startsWith("#")) {
continue;
}
String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
if (fields.length != 3) {
throw new IOException("Illegal format, expected three columns, got " + fields.length);
}
String probeId = fields[0];
String seqName = fields[1];
/*
* FIXME. We have to allow NCBI gene ids here.
*/
String geneSymbol = fields[2];
if (StringUtils.isBlank(geneSymbol)) {
numSkipped++;
continue;
}
CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);
if (c == null) {
if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
ArrayDesignProbeMapperServiceImpl.log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
numSkipped++;
continue;
}
// a probe can have more than one gene associated with it if so they are piped |
Collection<Gene> geneListProbe = new HashSet<>();
// indicate multiple genes
Gene geneDetails;
StringTokenizer st = new StringTokenizer(geneSymbol, "|");
while (st.hasMoreTokens()) {
String geneToken = st.nextToken().trim();
if (ncbiIds) {
geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
} else {
geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
}
if (geneDetails != null) {
geneListProbe.add(geneDetails);
}
}
if (geneListProbe.size() == 0) {
ArrayDesignProbeMapperServiceImpl.log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
numSkipped++;
continue;
} else if (geneListProbe.size() > 1) {
// this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
// make a
// lot of fuss.
ArrayDesignProbeMapperServiceImpl.log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
}
BioSequence bs = c.getBiologicalCharacteristic();
if (bs != null) {
if (StringUtils.isNotBlank(seqName)) {
bs = bioSequenceService.thaw(bs);
if (!bs.getName().equals(seqName)) {
ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping");
numSkipped++;
continue;
}
}
// otherwise we assume everything is okay.
} else {
// create one based on the text provided.
if (StringUtils.isBlank(seqName)) {
ArrayDesignProbeMapperServiceImpl.log.warn("You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping");
numSkipped++;
continue;
}
bs = BioSequence.Factory.newInstance();
bs.setName(seqName);
bs.setTaxon(taxon);
bs.setDescription("Imported from annotation file");
// Placeholder.
bs.setType(SequenceType.OTHER);
bs = bioSequenceService.create(bs);
c.setBiologicalCharacteristic(bs);
compositeSequenceService.update(c);
}
assert bs != null;
assert bs.getId() != null;
for (Gene gene : geneListProbe) {
gene = geneService.thaw(gene);
if (gene.getProducts().size() == 0) {
ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
numSkipped++;
continue;
}
for (GeneProduct gp : gene.getProducts()) {
AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
association.setBioSequence(bs);
association.setGeneProduct(gp);
association.setSource(sourceDB);
annotationAssociationService.create(association);
}
}
}
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
this.deleteOldFiles(arrayDesign);
ArrayDesignProbeMapperServiceImpl.log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
}
}
Aggregations