Search in sources :

Example 21 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class SequenceManipulation method collapse.

/**
 * Convert a CompositeSequence's immobilizedCharacteristics into a single sequence, using a simple merge-join
 * strategy.
 *
 * @param sequences sequences
 * @return BioSequence. Not all fields are filled in and must be set by the caller.
 */
public static BioSequence collapse(Collection<Reporter> sequences) {
    Collection<Reporter> copyOfSequences = SequenceManipulation.copyReporters(sequences);
    BioSequence collapsed = BioSequence.Factory.newInstance();
    collapsed.setSequence("");
    if (SequenceManipulation.log.isDebugEnabled())
        SequenceManipulation.log.debug("Collapsing " + sequences.size() + " sequences");
    while (!copyOfSequences.isEmpty()) {
        Reporter next = SequenceManipulation.findLeftMostProbe(copyOfSequences);
        int ol = SequenceManipulation.rightHandOverlap(collapsed, next.getImmobilizedCharacteristic());
        String nextSeqStr = next.getImmobilizedCharacteristic().getSequence();
        collapsed.setSequence(collapsed.getSequence() + nextSeqStr.substring(ol));
        if (SequenceManipulation.log.isDebugEnabled()) {
            SequenceManipulation.log.debug("New Seq to add: " + nextSeqStr + " Overlap=" + ol + " Result=" + collapsed.getSequence());
        }
        copyOfSequences.remove(next);
    }
    collapsed.setIsCircular(false);
    collapsed.setIsApproximateLength(false);
    collapsed.setLength((long) collapsed.getSequence().length());
    collapsed.setDescription("Collapsed from " + sequences.size() + " reporter sequences");
    return collapsed;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Reporter(ubic.gemma.core.loader.expression.arrayDesign.Reporter)

Example 22 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class BlatAssociationServiceTest method testFindBioSequence.

@Test
public final void testFindBioSequence() {
    BioSequence bs = BioSequence.Factory.newInstance();
    Taxon t = Taxon.Factory.newInstance();
    // has to match what the testpersistent object is.
    t.setScientificName("Mus musculus");
    t.setIsSpecies(true);
    t.setIsGenesUsable(true);
    bs.setSequence(testSequence);
    bs.setTaxon(t);
    bs.setName(testSequenceName);
    BioSequence bsIn = this.bioSequenceService.find(bs);
    assertNotNull("Did not find " + bs, bsIn);
    Collection<BlatAssociation> res = this.blatAssociationService.find(bs);
    assertEquals("Was seeking blatresults for sequence " + testSequenceName, 1, res.size());
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 23 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ExpressionExperimentPlatformSwitchService method processVector.

/**
 * @param designElementMap   Mapping of sequences to probes for the platform that is being switch from. This is used
 *                           to identify new candidates.
 * @param usedDesignElements probes from the new design that have already been assigned to probes from the old
 *                           design. If things are done correctly (the old design was merged into the new) then there should be enough.
 *                           Map is of the new design probe to the old design probe it was used for (this is debugging information)
 * @param vector             vector
 * @param bad                BioAssayDimension to use, if necessary. If this is null or already the one used, it's igored.
 *                           Otherwise the vector data will be rewritten to match it.
 * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence
 */
private void processVector(Map<BioSequence, Collection<CompositeSequence>> designElementMap, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, DesignElementDataVector vector, BioAssayDimension bad) {
    CompositeSequence oldDe = vector.getDesignElement();
    Collection<CompositeSequence> newElCandidates;
    BioSequence seq = oldDe.getBiologicalCharacteristic();
    if (seq == null) {
        newElCandidates = designElementMap.get(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE);
    } else {
        newElCandidates = designElementMap.get(seq);
    }
    if (newElCandidates == null || newElCandidates.isEmpty()) {
        throw new IllegalStateException("There are no candidates probes for sequence: " + seq + "('null' should be okay)");
    }
    for (CompositeSequence newEl : newElCandidates) {
        if (!usedDesignElements.containsKey(newEl)) {
            vector.setDesignElement(newEl);
            usedDesignElements.put(newEl, new HashSet<BioAssayDimension>());
            usedDesignElements.get(newEl).add(vector.getBioAssayDimension());
            break;
        }
        if (!usedDesignElements.get(newEl).contains(vector.getBioAssayDimension())) {
            /*
                 * Then it's okay to use it.
                 */
            vector.setDesignElement(newEl);
            usedDesignElements.get(newEl).add(vector.getBioAssayDimension());
            break;
        }
    }
    if (bad != null && !vector.getBioAssayDimension().equals(bad)) {
        /*
             * 1. Check if they are already the same; then just switch it to the desired BAD
             * 2. If not, then the vector data has to be rewritten.
             */
        this.vectorReWrite(vector, bad);
    }
}
Also used : BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 24 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ExpressionExperimentPlatformSwitchService method switchExperimentToArrayDesign.

/**
 * If you know the arraydesigns are already in a merged state, you should use switchExperimentToMergedPlatform
 *
 * @param ee          ee
 * @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
 *                    changed for them.
 */
public ExpressionExperiment switchExperimentToArrayDesign(ExpressionExperiment ee, ArrayDesign arrayDesign) {
    assert arrayDesign != null;
    // remove stuff that will be in the way.
    processedExpressionDataVectorService.removeProcessedDataVectors(ee);
    sampleCoexpressionMatrixService.delete(ee);
    for (ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets(ee)) {
        subsetService.remove(subset);
    }
    // get relation between sequence and designelements.
    Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
    Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
    this.populateCSeq(arrayDesign, designElementMap, elsWithNoSeq);
    ee = expressionExperimentService.thaw(ee);
    ExpressionExperimentPlatformSwitchService.log.info(elsWithNoSeq.size() + " elements on the new platform have no associated sequence.");
    designElementMap.put(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq);
    boolean multiPlatformPerSample = this.checkMultiPerSample(ee, arrayDesign);
    /*
         * For a multiplatform-per-sample case: (note that some samples might just be on one platform...)
         * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
         * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors
         * 3. Delete the Bioassays that are using other BADs
         */
    /*
         * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
         * includes all BioMaterials.
         */
    Collection<BioAssayDimension> unusedBADs = new HashSet<>();
    BioAssayDimension maxBAD = null;
    int maxSize = 0;
    if (multiPlatformPerSample) {
        maxBAD = this.doMultiSample(ee, unusedBADs, maxSize);
    }
    Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
    Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
    for (ArrayDesign oldAd : oldArrayDesigns) {
        this.runOldAd(ee, arrayDesign, designElementMap, maxBAD, usedDesignElements, oldAd);
    }
    ee.setDescription(ee.getDescription() + " [Switched to use " + arrayDesign.getShortName() + " by Gemma]");
    helperService.persist(ee, arrayDesign);
    /*
         * This might need to be done inside the transaction we're using to make the switch.
         */
    if (maxBAD != null && !unusedBADs.isEmpty()) {
        this.checkUnused(unusedBADs, maxBAD);
    }
    return ee;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) ExpressionExperimentSubSet(ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension)

Example 25 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.

@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException {
    if (taxon == null && !ncbiIds) {
        throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
    }
    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
    }
    try (BufferedReader b = new BufferedReader(new FileReader(source))) {
        String line;
        int numSkipped = 0;
        ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        while ((line = b.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }
            String probeId = fields[0];
            String seqName = fields[1];
            /*
                 * FIXME. We have to allow NCBI gene ids here.
                 */
            String geneSymbol = fields[2];
            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }
            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);
            if (c == null) {
                if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                    ArrayDesignProbeMapperServiceImpl.log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                numSkipped++;
                continue;
            }
            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = new HashSet<>();
            // indicate multiple genes
            Gene geneDetails;
            StringTokenizer st = new StringTokenizer(geneSymbol, "|");
            while (st.hasMoreTokens()) {
                String geneToken = st.nextToken().trim();
                if (ncbiIds) {
                    geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
                } else {
                    geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
                }
                if (geneDetails != null) {
                    geneListProbe.add(geneDetails);
                }
            }
            if (geneListProbe.size() == 0) {
                ArrayDesignProbeMapperServiceImpl.log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
                // make a
                // lot of fuss.
                ArrayDesignProbeMapperServiceImpl.log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }
            BioSequence bs = c.getBiologicalCharacteristic();
            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }
                }
            // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }
                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");
                // Placeholder.
                bs.setType(SequenceType.OTHER);
                bs = bioSequenceService.create(bs);
                c.setBiologicalCharacteristic(bs);
                compositeSequenceService.update(c);
            }
            assert bs != null;
            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().size() == 0) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }
            }
        }
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        this.deleteOldFiles(arrayDesign);
        ArrayDesignProbeMapperServiceImpl.log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
    }
}
Also used : AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) IOException(java.io.IOException) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) StringTokenizer(java.util.StringTokenizer) Gene(ubic.gemma.model.genome.Gene) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) HashSet(java.util.HashSet)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4