Search in sources :

Example 16 with BioAssaySet

use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method handleGetProcessedExpressionDataArrays.

/**
 * This is an important method for fetching vectors. The cache is consulted first (via {@code checkCache});
 * only the experiments and genes the cache could not satisfy are then queried. Newly fetched vectors are
 * passed to {@code cacheResults}, sliced with {@code sliceSubsets}, and merged with the cached ones.
 *
 * @param ees   experiments and/or experiment subsets to fetch vectors for
 * @param genes identifiers of the genes of interest
 * @return vectors, possibly subsetted.
 */
private Collection<DoubleVectorValueObject> handleGetProcessedExpressionDataArrays(Collection<? extends BioAssaySet> ees, Collection<Long> genes) {
    // ees must be thawed first as currently implemented (?)
    Collection<DoubleVectorValueObject> results = new HashSet<>();

    /*
     * Check the cache.
     */
    Collection<ExpressionExperiment> needToSearch = new HashSet<>();
    Collection<Long> genesToSearch = new HashSet<>();
    this.checkCache(ees, genes, results, needToSearch, genesToSearch);
    AbstractDao.log.info("Using " + results.size() + " DoubleVectorValueObject(s) from cache");
    if (needToSearch.isEmpty()) {
        return results;
    }

    /*
     * Get items not in the cache. The count logged is the number of genes actually queried below
     * (genesToSearch), not the number originally requested.
     */
    AbstractDao.log.info("Searching for vectors for " + genesToSearch.size() + " genes from " + needToSearch.size() + " experiments not in cache");
    Collection<ArrayDesign> arrays = CommonQueries.getArrayDesignsUsed(EntityUtils.getIds(this.getExperiments(ees)), this.getSessionFactory().getCurrentSession()).keySet();
    assert !arrays.isEmpty();
    Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneIdMap(genesToSearch, EntityUtils.getIds(arrays), this.getSessionFactory().getCurrentSession());
    if (cs2gene.isEmpty()) {
        // Nothing to query for; whatever came from the cache (possibly nothing) is the answer.
        if (results.isEmpty()) {
            AbstractDao.log.warn("No composite sequences found for genes");
        }
        return results;
    }

    /*
     * Fill in the map, because we want to track information on the specificity of the probes used in the data
     * vectors.
     */
    cs2gene = CommonQueries.getCs2GeneMapForProbes(cs2gene.keySet(), this.getSessionFactory().getCurrentSession());
    Map<ProcessedExpressionDataVector, Collection<Long>> processedDataVectors = this.getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene);
    Map<BioAssaySet, Collection<BioAssayDimension>> bioAssayDimensions = this.getBioAssayDimensions(needToSearch);
    Collection<DoubleVectorValueObject> newResults = new HashSet<>();

    /*
     * This loop is to ensure that we don't get misaligned vectors for experiments that use more than one array
     * design. See bug 1704. This isn't that common, so we try to break out as soon as possible.
     */
    for (BioAssaySet bas : needToSearch) {
        Collection<BioAssayDimension> dims = bioAssayDimensions.get(bas);
        if (dims == null || dims.isEmpty()) {
            AbstractDao.log.warn("BioAssayDimensions were null/empty unexpectedly.");
            continue;
        }

        /*
         * Get the vectors for just this experiment. This is made more efficient by removing things from the map
         * each time through.
         */
        Map<ProcessedExpressionDataVector, Collection<Long>> vecsForBas;
        if (needToSearch.size() == 1) {
            // Only one experiment was searched, so every fetched vector is used for it.
            vecsForBas = processedDataVectors;
        } else {
            // isolate the vectors for the current experiment.
            vecsForBas = new HashMap<>();
            Iterator<Map.Entry<ProcessedExpressionDataVector, Collection<Long>>> it = processedDataVectors.entrySet().iterator();
            while (it.hasNext()) {
                Map.Entry<ProcessedExpressionDataVector, Collection<Long>> entry = it.next();
                if (entry.getKey().getExpressionExperiment().equals(bas)) {
                    vecsForBas.put(entry.getKey(), entry.getValue());
                    // since we're done with it.
                    it.remove();
                }
            }
        }

        /*
         * Now see if anything is 'ragged' (fewer bioassays per biomaterial than in some other vector). With a
         * single dimension there is nothing to compare, so checkRagged is skipped.
         */
        BioAssayDimension longestBad = dims.size() == 1 ? null : this.checkRagged(dims);
        if (longestBad == null) {
            newResults.addAll(this.unpack(vecsForBas));
        } else {
            newResults.addAll(this.unpack(vecsForBas, longestBad));
        }
    }

    if (!newResults.isEmpty()) {
        // Cache the unsliced vectors first; slicing to the requested subsets happens afterwards.
        this.cacheResults(newResults);
        newResults = this.sliceSubsets(ees, newResults);
        results.addAll(newResults);
    }
    return results;
}
Also used : ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet)

Example 17 with BioAssaySet

use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method checkCache.

/**
 * We cache vectors at the experiment level. If we need subsets, we have to slice them out.
 * <p>
 * For every requested experiment/subset and every gene, the cache is queried. Hits are added to
 * {@code results} (sliced first when the request is for a subset); misses are recorded in
 * {@code genesToSearch}. An experiment is added to {@code needToSearch} only when at least one of the genes
 * was missing from the cache <em>for that experiment</em>, so fully cached experiments are not searched again
 * just because some other experiment had a miss.
 *
 * @param bioAssaySets  that we exactly need the data for.
 * @param genes         that might have cached results
 * @param results       from the cache will be put here
 * @param needToSearch  experiments that need to be searched (not fully cached); this will be populated
 * @param genesToSearch that still need to be searched (not in cache); accumulated over all experiments
 */
private void checkCache(Collection<? extends BioAssaySet> bioAssaySets, Collection<Long> genes, Collection<DoubleVectorValueObject> results, Collection<ExpressionExperiment> needToSearch, Collection<Long> genesToSearch) {
    for (BioAssaySet ee : bioAssaySets) {
        ExpressionExperiment experiment = null;
        boolean needSubSet = false;
        if (ee instanceof ExpressionExperiment) {
            experiment = (ExpressionExperiment) ee;
        } else if (ee instanceof ExpressionExperimentSubSet) {
            // Subsets are searched through their source experiment.
            experiment = ((ExpressionExperimentSubSet) ee).getSourceExperiment();
            needSubSet = true;
        }
        assert experiment != null;

        // Tracks misses for this experiment only; genesToSearch is shared across all experiments and
        // therefore cannot be used to decide whether this particular experiment is fully cached.
        boolean fullyCached = true;
        for (Long g : genes) {
            // NOTE(review): the lookup is keyed on the requested set (which may be a subset) rather than on
            // the source experiment - confirm this matches how the cache keys its entries.
            Collection<DoubleVectorValueObject> obs = processedDataVectorCache.get(ee, g);
            if (obs == null) {
                genesToSearch.add(g);
                fullyCached = false;
                continue;
            }
            if (needSubSet) {
                obs = this.sliceSubSet((ExpressionExperimentSubSet) ee, obs);
            }
            results.addAll(obs);
        }

        /*
         * This experiment is not fully cached for the genes in question.
         */
        if (!fullyCached) {
            needToSearch.add(experiment);
        }
    }
}
Also used : BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet) ExpressionExperimentSubSet(ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment)

Example 18 with BioAssaySet

use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method getProcessedDataArraysByProbeIds.

/**
 * Fetches processed expression data vectors for the given probes in the given experiments (or subsets).
 * Probes that map to genes are looked up in the cache through those genes; probes with no gene are always
 * queried. Newly fetched vectors are unpacked per experiment, passed to {@code cacheResults}, sliced with
 * {@code sliceSubsets} and merged with the cached ones.
 *
 * @param ees      experiments and/or experiment subsets to fetch vectors for
 * @param probeIds identifiers of the probes of interest
 * @return vectors from the cache plus newly fetched ones, sliced for any requested subsets
 */
private Collection<DoubleVectorValueObject> getProcessedDataArraysByProbeIds(Collection<? extends BioAssaySet> ees, Collection<Long> probeIds) {
    Collection<DoubleVectorValueObject> results = new HashSet<>();
    Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneMapForProbes(probeIds, this.getSessionFactory().getCurrentSession());

    // Split off probes that have no gene: they get an empty gene collection and are dropped from cs2gene.
    Map<Long, Collection<Long>> noGeneProbes = new HashMap<>();
    for (Long pid : probeIds) {
        if (!cs2gene.containsKey(pid) || cs2gene.get(pid).isEmpty()) {
            noGeneProbes.put(pid, new HashSet<Long>());
            cs2gene.remove(pid);
        }
    }
    AbstractDao.log.info(cs2gene.size() + " probes associated with a gene; " + noGeneProbes.size() + " not");

    /*
     * To Check the cache we need the list of genes 1st. Get from CS2Gene list then check the cache.
     */
    Collection<Long> genes = new HashSet<>();
    for (Collection<Long> genesForProbe : cs2gene.values()) {
        genes.addAll(genesForProbe);
    }
    Collection<ExpressionExperiment> needToSearch = new HashSet<>();
    Collection<Long> genesToSearch = new HashSet<>();
    this.checkCache(ees, genes, results, needToSearch, genesToSearch);
    if (!results.isEmpty()) {
        AbstractDao.log.info(results.size() + " vectors fetched from cache");
    }
    Map<ProcessedExpressionDataVector, Collection<Long>> rawResults = new HashMap<>();

    /*
     * Small problem: noGeneProbes are never really cached since we use the gene as part of that.
     */
    if (!noGeneProbes.isEmpty()) {
        Collection<ExpressionExperiment> eesForNoGeneProbes = new HashSet<>();
        for (BioAssaySet ee : ees) {
            if (ee instanceof ExpressionExperiment) {
                eesForNoGeneProbes.add((ExpressionExperiment) ee);
            } else {
                eesForNoGeneProbes.add(((ExpressionExperimentSubSet) ee).getSourceExperiment());
            }
        }
        needToSearch.addAll(eesForNoGeneProbes);
        rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(eesForNoGeneProbes), noGeneProbes));
    }
    if (!rawResults.isEmpty()) {
        AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, for noGeneProbes");
    }

    /*
     * Non-cached items.
     */
    if (!needToSearch.isEmpty()) {
        rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene));
    }
    if (!rawResults.isEmpty()) {
        AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, after fetching non-cached.");
    }

    /*
     * Deal with possibility of 'gaps' and unpack the vectors. Each experiment's vectors are unpacked against
     * that experiment's own dimensions, mirroring handleGetProcessedExpressionDataArrays (see bug 1704).
     */
    Collection<DoubleVectorValueObject> newResults = new HashSet<>();
    for (ExpressionExperiment ee : needToSearch) {
        Map<ProcessedExpressionDataVector, Collection<Long>> vecsForEe;
        if (needToSearch.size() == 1) {
            // Only one experiment was searched, so every fetched vector is used for it.
            vecsForEe = rawResults;
        } else {
            // isolate the vectors for the current experiment.
            vecsForEe = new HashMap<>();
            for (Map.Entry<ProcessedExpressionDataVector, Collection<Long>> entry : rawResults.entrySet()) {
                if (entry.getKey().getExpressionExperiment().equals(ee)) {
                    vecsForEe.put(entry.getKey(), entry.getValue());
                }
            }
        }

        Collection<BioAssayDimension> bioAssayDimensions = this.getBioAssayDimensions(ee);
        if (bioAssayDimensions.size() == 1) {
            newResults.addAll(this.unpack(vecsForEe));
        } else {
            /*
             * See handleGetProcessedExpressionDataArrays(Collection<? extends BioAssaySet>, Collection<Gene>,
             * boolean) and bug 1704.
             */
            BioAssayDimension longestBad = this.checkRagged(bioAssayDimensions);
            assert longestBad != null;
            newResults.addAll(this.unpack(vecsForEe, longestBad));
        }
    }

    // Cache and slice exactly once, after all experiments are unpacked, so that vectors already sliced for a
    // subset are never handed back to cacheResults or sliced a second time.
    if (!newResults.isEmpty()) {
        this.cacheResults(newResults);
        results.addAll(this.sliceSubsets(ees, newResults));
    }
    return results;
}
Also used : ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet)

Example 19 with BioAssaySet

use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method sliceSubsets.

/**
 * Selects, for each requested experiment or subset, the matching vectors out of {@code vecs}.
 * <p>
 * A vector matches a plain experiment when the ids are equal, and is then returned unchanged. A vector matches
 * a subset when its experiment id equals the id of the subset's source experiment; it is then passed through
 * {@code sliceSubSet} and the sliced output is returned instead.
 *
 * @param ees  Experiments and/or subsets required
 * @param vecs vectors to select from and, if necessary, slice; may be null or empty
 * @return the selected (and, for subsets, sliced) vectors; empty if {@code vecs} is null or empty
 */
private Collection<DoubleVectorValueObject> sliceSubsets(Collection<? extends BioAssaySet> ees, Collection<DoubleVectorValueObject> vecs) {
    Collection<DoubleVectorValueObject> selected = new HashSet<>();
    if (vecs != null && !vecs.isEmpty()) {
        for (BioAssaySet requested : ees) {
            if (!(requested instanceof ExpressionExperimentSubSet)) {
                // Whole experiment: keep its vectors exactly as they are.
                for (DoubleVectorValueObject vec : vecs) {
                    if (vec.getExpressionExperiment().getId().equals(requested.getId())) {
                        selected.add(vec);
                    }
                }
                continue;
            }

            // Subset: match on the source experiment, then slice each matching vector individually.
            ExpressionExperimentSubSet subSet = (ExpressionExperimentSubSet) requested;
            for (DoubleVectorValueObject vec : vecs) {
                if (!vec.getExpressionExperiment().getId().equals(subSet.getSourceExperiment().getId())) {
                    continue;
                }
                // sliceSubSet takes a collection, so wrap the single vector.
                Collection<DoubleVectorValueObject> single = new HashSet<>();
                single.add(vec);
                selected.addAll(this.sliceSubSet(subSet, single));
            }
        }
    }
    return selected;
}
Also used : BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet) ExpressionExperimentSubSet(ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet)

Example 20 with BioAssaySet

use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method getBioAssayDimensions.

/**
 * Looks up the bioassay dimensions associated with each of the given experiments.
 * <p>
 * A single experiment is delegated to the one-experiment overload; several experiments are resolved with one
 * query. Experiments for which the query returns no rows have no entry in the returned map.
 *
 * @param ees experiments to look up; an empty collection yields an empty map without querying
 * @return map of experiment to its bioassay dimensions
 */
private Map<BioAssaySet, Collection<BioAssayDimension>> getBioAssayDimensions(Collection<ExpressionExperiment> ees) {
    Map<BioAssaySet, Collection<BioAssayDimension>> result = new HashMap<>();
    if (ees.isEmpty()) {
        // Nothing to look up; this also avoids binding an empty list to the "in (:ees)" clause below.
        return result;
    }
    if (ees.size() == 1) {
        ExpressionExperiment ee = ees.iterator().next();
        result.put(ee, this.getBioAssayDimensions(ee));
        return result;
    }
    StopWatch timer = new StopWatch();
    timer.start();
    // noinspection unchecked
    List<Object> r = this.getSessionFactory().getCurrentSession().createQuery("select distinct e, bad from ExpressionExperiment e, BioAssayDimension bad" + " inner join e.bioAssays b inner join bad.bioAssays badba where e in (:ees) and b in (badba) ").setParameterList("ees", ees).list();
    for (Object o : r) {
        // Each row is an (experiment, dimension) pair, per the select clause.
        Object[] tup = (Object[]) o;
        BioAssaySet bas = (BioAssaySet) tup[0];
        Collection<BioAssayDimension> dims = result.get(bas);
        if (dims == null) {
            dims = new HashSet<>();
            result.put(bas, dims);
        }
        dims.add((BioAssayDimension) tup[1]);
    }
    // Only report the fetch when it was slow enough to be interesting.
    if (timer.getTime() > 100) {
        AbstractDao.log.info("Fetch " + r.size() + " bioAssayDimensions for " + ees.size() + " experiment(s): " + timer.getTime() + "ms");
    }
    return result;
}
Also used : BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet) BioAssayValueObject(ubic.gemma.model.expression.bioAssay.BioAssayValueObject) ExpressionExperimentValueObject(ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject) CompositeSequenceValueObject(ubic.gemma.model.expression.designElement.CompositeSequenceValueObject) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) StopWatch(org.apache.commons.lang3.time.StopWatch)

Aggregations

BioAssaySet (ubic.gemma.model.expression.experiment.BioAssaySet)39 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)25 HashSet (java.util.HashSet)6 ExpressionExperimentSet (ubic.gemma.model.analysis.expression.ExpressionExperimentSet)6 Taxon (ubic.gemma.model.genome.Taxon)6 IOException (java.io.IOException)4 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)4 ArrayList (java.util.ArrayList)3 StopWatch (org.apache.commons.lang3.time.StopWatch)3 Transactional (org.springframework.transaction.annotation.Transactional)3 SecurityService (gemma.gsec.SecurityService)2 Element (org.w3c.dom.Element)2 PreprocessingException (ubic.gemma.core.analysis.preprocess.PreprocessingException)2 DataUpdater (ubic.gemma.core.loader.expression.geo.DataUpdater)2 DifferentialExpressionValueObject (ubic.gemma.model.analysis.expression.diff.DifferentialExpressionValueObject)2 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)2 ExpressionExperimentSubSet (ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet)2 ExpressionExperimentValueObject (ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject)2 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1