Search in sources :

Example 56 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ExpressionExperimentPlatformSwitchService method switchExperimentToArrayDesign.

/**
 * If you know the arraydesigns are already in a merged state, you should use switchExperimentToMergedPlatform
 *
 * @param ee          ee
 * @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be
 *                    changed for them.
 */
public ExpressionExperiment switchExperimentToArrayDesign(ExpressionExperiment ee, ArrayDesign arrayDesign) {
    assert arrayDesign != null;
    // remove stuff that will be in the way.
    processedExpressionDataVectorService.removeProcessedDataVectors(ee);
    sampleCoexpressionMatrixService.delete(ee);
    for (ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets(ee)) {
        subsetService.remove(subset);
    }
    // get relation between sequence and designelements.
    Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>();
    Collection<CompositeSequence> elsWithNoSeq = new HashSet<>();
    this.populateCSeq(arrayDesign, designElementMap, elsWithNoSeq);
    ee = expressionExperimentService.thaw(ee);
    ExpressionExperimentPlatformSwitchService.log.info(elsWithNoSeq.size() + " elements on the new platform have no associated sequence.");
    designElementMap.put(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq);
    boolean multiPlatformPerSample = this.checkMultiPerSample(ee, arrayDesign);
    /*
         * For a multiplatform-per-sample case: (note that some samples might just be on one platform...)
         * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it).
         * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors
         * 3. Delete the Bioassays that are using other BADs
         */
    /*
         * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that
         * includes all BioMaterials.
         */
    Collection<BioAssayDimension> unusedBADs = new HashSet<>();
    BioAssayDimension maxBAD = null;
    int maxSize = 0;
    if (multiPlatformPerSample) {
        maxBAD = this.doMultiSample(ee, unusedBADs, maxSize);
    }
    Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
    Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>();
    for (ArrayDesign oldAd : oldArrayDesigns) {
        this.runOldAd(ee, arrayDesign, designElementMap, maxBAD, usedDesignElements, oldAd);
    }
    ee.setDescription(ee.getDescription() + " [Switched to use " + arrayDesign.getShortName() + " by Gemma]");
    helperService.persist(ee, arrayDesign);
    /*
         * This might need to be done inside the transaction we're using to make the switch.
         */
    if (maxBAD != null && !unusedBADs.isEmpty()) {
        this.checkUnused(unusedBADs, maxBAD);
    }
    return ee;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) ExpressionExperimentSubSet(ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension)

Example 57 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ExpressionExperimentPlatformSwitchService method runOldAd.

private void runOldAd(ExpressionExperiment ee, ArrayDesign arrayDesign, Map<BioSequence, Collection<CompositeSequence>> designElementMap, BioAssayDimension maxBAD, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, ArrayDesign oldAd) {
    if (oldAd.equals(arrayDesign))
        return;
    oldAd = arrayDesignService.thaw(oldAd);
    if (oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals(TechnologyType.NONE)) {
        /*
             * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload.
             */
        throw new IllegalStateException(oldAd + " has no elements");
    }
    Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee, oldAd);
    ExpressionExperimentPlatformSwitchService.log.info("Processing " + qts.size() + " quantitation types for vectors on " + oldAd);
    for (QuantitationType type : qts) {
        // use each design element only once per quantitation type + bioassaydimension per array design
        usedDesignElements.clear();
        Collection<RawExpressionDataVector> rawForQt = this.getRawVectorsForOneQuantitationType(oldAd, type);
        Collection<ProcessedExpressionDataVector> processedForQt = this.getProcessedVectorsForOneQuantitationType(oldAd, type);
        if (// 
        (rawForQt == null || rawForQt.size() == 0) && (processedForQt == null || processedForQt.size() == 0)) {
            /*
                 * This can happen when the quantitation types vary for the array designs.
                 */
            ExpressionExperimentPlatformSwitchService.log.debug("No vectors for " + type + " on " + oldAd);
            continue;
        }
        // This check assures we do not mix raw and processed vectors further down the line
        if ((rawForQt != null && rawForQt.size() > 0) && (processedForQt != null && processedForQt.size() > 0)) {
            throw new IllegalStateException("Two types of vector for quantitationType " + type);
        }
        Collection<DesignElementDataVector> vectors = new HashSet<>();
        if (rawForQt != null) {
            vectors.addAll(rawForQt);
        }
        if (processedForQt != null) {
            vectors.addAll(processedForQt);
        }
        ExpressionExperimentPlatformSwitchService.log.info("Switching " + vectors.size() + " vectors for " + type + " from " + oldAd.getShortName() + " to " + arrayDesign.getShortName());
        int count = 0;
        // noinspection MismatchedQueryAndUpdateOfCollection // Only used for logging
        Collection<DesignElementDataVector> unMatched = new HashSet<>();
        for (DesignElementDataVector vector : vectors) {
            assert RawExpressionDataVector.class.isAssignableFrom(vector.getClass()) : "Unexpected class: " + vector.getClass().getName();
            CompositeSequence oldDe = vector.getDesignElement();
            if (oldDe.getArrayDesign().equals(arrayDesign)) {
                continue;
            }
            this.processVector(designElementMap, usedDesignElements, vector, maxBAD);
            if (++count % 20000 == 0) {
                ExpressionExperimentPlatformSwitchService.log.info("Found matches for " + count + " vectors for " + type);
            }
        }
        /*
             * This is bad.
             */
        if (unMatched.size() > 0) {
            throw new IllegalStateException("There were " + unMatched.size() + " vectors that couldn't be matched to the new design for: " + type + ", example: " + unMatched.iterator().next());
        }
        // Force collection update
        if (rawForQt != null && rawForQt.size() > 0) {
            int s = ee.getRawExpressionDataVectors().size();
            ee.getRawExpressionDataVectors().removeAll(rawForQt);
            assert s > ee.getRawExpressionDataVectors().size();
            ee.getRawExpressionDataVectors().addAll(rawForQt);
            assert s == ee.getRawExpressionDataVectors().size();
        } else if (processedForQt != null && processedForQt.size() > 0) {
            int s = ee.getProcessedExpressionDataVectors().size();
            ee.getProcessedExpressionDataVectors().removeAll(processedForQt);
            assert s > ee.getProcessedExpressionDataVectors().size();
            ee.getProcessedExpressionDataVectors().addAll(processedForQt);
            assert s == ee.getProcessedExpressionDataVectors().size();
        }
    }
}
Also used : RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) ProcessedExpressionDataVector(ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector) DesignElementDataVector(ubic.gemma.model.expression.bioAssayData.DesignElementDataVector) QuantitationType(ubic.gemma.model.common.quantitationtype.QuantitationType) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 58 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.

@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException {
    if (taxon == null && !ncbiIds) {
        throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
    }
    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
    }
    try (BufferedReader b = new BufferedReader(new FileReader(source))) {
        String line;
        int numSkipped = 0;
        ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        while ((line = b.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }
            String probeId = fields[0];
            String seqName = fields[1];
            /*
                 * FIXME. We have to allow NCBI gene ids here.
                 */
            String geneSymbol = fields[2];
            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }
            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);
            if (c == null) {
                if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                    ArrayDesignProbeMapperServiceImpl.log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                numSkipped++;
                continue;
            }
            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = new HashSet<>();
            // indicate multiple genes
            Gene geneDetails;
            StringTokenizer st = new StringTokenizer(geneSymbol, "|");
            while (st.hasMoreTokens()) {
                String geneToken = st.nextToken().trim();
                if (ncbiIds) {
                    geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
                } else {
                    geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
                }
                if (geneDetails != null) {
                    geneListProbe.add(geneDetails);
                }
            }
            if (geneListProbe.size() == 0) {
                ArrayDesignProbeMapperServiceImpl.log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
                // make a
                // lot of fuss.
                ArrayDesignProbeMapperServiceImpl.log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }
            BioSequence bs = c.getBiologicalCharacteristic();
            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }
                }
            // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }
                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");
                // Placeholder.
                bs.setType(SequenceType.OTHER);
                bs = bioSequenceService.create(bs);
                c.setBiologicalCharacteristic(bs);
                compositeSequenceService.update(c);
            }
            assert bs != null;
            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().size() == 0) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }
            }
        }
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        this.deleteOldFiles(arrayDesign);
        ArrayDesignProbeMapperServiceImpl.log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
    }
}
Also used : AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) IOException(java.io.IOException) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) StringTokenizer(java.util.StringTokenizer) Gene(ubic.gemma.model.genome.Gene) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) HashSet(java.util.HashSet)

Example 59 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.

@Override
public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {
    assert config != null;
    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
    }
    Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());
    Taxon taxon = arrayDesign.getPrimaryTaxon();
    if (taxa.size() > 1 && taxon == null) {
        throw new IllegalArgumentException("Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
    }
    GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);
    BlockingQueue<BACS> persistingQueue = new ArrayBlockingQueue<>(ArrayDesignProbeMapperServiceImpl.QUEUE_SIZE);
    AtomicBoolean generatorDone = new AtomicBoolean(false);
    AtomicBoolean loaderDone = new AtomicBoolean(false);
    this.load(persistingQueue, generatorDone, loaderDone, useDB);
    if (useDB) {
        ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
    }
    int count = 0;
    int hits = 0;
    ArrayDesignProbeMapperServiceImpl.log.info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        if (compositeSequence.getName().equals("1431126_a_at")) {
            ArrayDesignProbeMapperServiceImpl.log.debug("HERE");
        }
        Map<String, Collection<BlatAssociation>> results = this.processCompositeSequence(config, taxon, goldenPathDb, compositeSequence);
        if (results == null)
            continue;
        for (Collection<BlatAssociation> col : results.values()) {
            for (BlatAssociation association : col) {
                if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                    ArrayDesignProbeMapperServiceImpl.log.debug(association);
                persistingQueue.add(new BACS(compositeSequence, association));
            }
            ++hits;
        }
        if (++count % 200 == 0) {
            ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences" + " with blat results; " + hits + " mappings found.");
        }
    }
    generatorDone.set(true);
    ArrayDesignProbeMapperServiceImpl.log.info("Waiting for loading to complete ...");
    while (!loaderDone.get()) {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }
    ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    this.deleteOldFiles(arrayDesign);
}
Also used : GoldenPathSequenceAnalysis(ubic.gemma.core.externalDb.GoldenPathSequenceAnalysis) Taxon(ubic.gemma.model.genome.Taxon) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)

Example 60 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method getSequences.

/**
 * @param taxon (specified in case array has multiple taxa)
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {
    Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
    Collection<BioSequence> sequencesToBlat = new HashSet<>();
    int numWithNoBioSequence = 0;
    int numWithNoSequenceData = 0;
    boolean warned = false;
    for (CompositeSequence cs : compositeSequences) {
        BioSequence bs = cs.getBiologicalCharacteristic();
        if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
        }
        if (bs == null) {
            ++numWithNoBioSequence;
            if (!warned) {
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had no associated biosequence object");
            }
            continue;
        }
        if (bs.getTaxon() == null) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("There is no taxon defined for this biosequence ");
            continue;
        }
        // if the taxon is null that means we want this run for all taxa for that array
        if (taxon != null && !bs.getTaxon().equals(taxon)) {
            continue;
        }
        // noinspection ConstantConditions // Better readability
        if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
        }
        if (StringUtils.isBlank(bs.getSequence())) {
            ++numWithNoSequenceData;
            if (!warned) {
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had " + bs + " but no sequence, skipping");
            }
            continue;
        }
        sequencesToBlat.add(bs);
    }
    if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
        ArrayDesignSequenceAlignmentServiceImpl.log.warn(numWithNoBioSequence + " composite sequences lacked biosequence associations; " + numWithNoSequenceData + " lacked sequence data ( out of " + compositeSequences.size() + " total).");
    }
    return sequencesToBlat;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) HashSet(java.util.HashSet)

Aggregations

CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)206 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)43 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)40 Gene (ubic.gemma.model.genome.Gene)32 Test (org.junit.Test)30 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)19 ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)18 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)18 DesignElementDataVector (ubic.gemma.model.expression.bioAssayData.DesignElementDataVector)18 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)18 StopWatch (org.apache.commons.lang3.time.StopWatch)17 HashSet (java.util.HashSet)15 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)15 CompositeSequenceValueObject (ubic.gemma.model.expression.designElement.CompositeSequenceValueObject)15 ArrayList (java.util.ArrayList)14 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)14 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)13 Taxon (ubic.gemma.model.genome.Taxon)12 Collection (java.util.Collection)11 ByteArrayConverter (ubic.basecode.io.ByteArrayConverter)11