use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method handleGetProcessedExpressionDataArrays.
/**
 * Fetches processed expression data vectors for the given genes in the given experiments and/or subsets. Cached
 * vectors are used where available; anything else is queried, unpacked, cached and then sliced to the requested
 * subsets.
 *
 * @param ees   experiments and/or experiment subsets to get vectors for
 * @param genes ids of the genes of interest
 * @return the cached vectors plus any newly fetched ones, the latter possibly subsetted.
 */
private Collection<DoubleVectorValueObject> handleGetProcessedExpressionDataArrays(Collection<? extends BioAssaySet> ees, Collection<Long> genes) {
    // ees must be thawed first as currently implemented (?)
    Collection<DoubleVectorValueObject> results = new HashSet<>();

    // Step 1: take whatever the cache can give us; checkCache fills in what is still missing.
    Collection<ExpressionExperiment> needToSearch = new HashSet<>();
    Collection<Long> genesToSearch = new HashSet<>();
    this.checkCache(ees, genes, results, needToSearch, genesToSearch);
    AbstractDao.log.info("Using " + results.size() + " DoubleVectorValueObject(s) from cache");
    if (needToSearch.isEmpty()) {
        return results;
    }

    // Step 2: map the uncached genes to probes on the platforms used by the requested experiments.
    AbstractDao.log.info("Searching for vectors for " + genes.size() + " genes from " + needToSearch.size() + " experiments not in cache");
    Collection<ArrayDesign> arrays = CommonQueries.getArrayDesignsUsed(EntityUtils.getIds(this.getExperiments(ees)), this.getSessionFactory().getCurrentSession()).keySet();
    assert !arrays.isEmpty();
    Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneIdMap(genesToSearch, EntityUtils.getIds(arrays), this.getSessionFactory().getCurrentSession());
    if (cs2gene.isEmpty()) {
        if (!results.isEmpty()) {
            return results;
        }
        AbstractDao.log.warn("No composite sequences found for genes");
        return new HashSet<>();
    }

    // Re-query by probe so the map carries every gene of each probe; this is what lets us track the
    // specificity of the probes used in the data vectors.
    cs2gene = CommonQueries.getCs2GeneMapForProbes(cs2gene.keySet(), this.getSessionFactory().getCurrentSession());

    // Step 3: fetch the raw vectors and the dimensions needed to unpack them.
    Map<ProcessedExpressionDataVector, Collection<Long>> processedDataVectors = this.getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene);
    Map<BioAssaySet, Collection<BioAssayDimension>> bioAssayDimensions = this.getBioAssayDimensions(needToSearch);
    Collection<DoubleVectorValueObject> newResults = new HashSet<>();
    boolean singleExperiment = needToSearch.size() == 1;

    // Unpack experiment by experiment so that experiments using more than one array design do not yield
    // misaligned vectors (bug 1704). That case is uncommon, so the cheap paths are taken whenever possible.
    for (ExpressionExperiment experiment : needToSearch) {
        Collection<BioAssayDimension> dims = bioAssayDimensions.get(experiment);
        if (dims == null || dims.isEmpty()) {
            AbstractDao.log.warn("BioAssayDimensions were null/empty unexpectedly.");
            continue;
        }

        // Vectors belonging to this experiment only. Claimed vectors are removed from the shared map so
        // later iterations have less to scan.
        Map<ProcessedExpressionDataVector, Collection<Long>> vecsForExperiment;
        if (singleExperiment) {
            vecsForExperiment = processedDataVectors;
        } else {
            vecsForExperiment = new HashMap<>();
            Iterator<Map.Entry<ProcessedExpressionDataVector, Collection<Long>>> remaining = processedDataVectors.entrySet().iterator();
            while (remaining.hasNext()) {
                Map.Entry<ProcessedExpressionDataVector, Collection<Long>> candidate = remaining.next();
                if (candidate.getKey().getExpressionExperiment().equals(experiment)) {
                    vecsForExperiment.put(candidate.getKey(), candidate.getValue());
                    remaining.remove();
                }
            }
        }

        // With several dimensions the data may be 'ragged' (fewer bioassays per biomaterial than in some
        // other vector); checkRagged then supplies the dimension to unpack against.
        BioAssayDimension longestBad = dims.size() == 1 ? null : this.checkRagged(dims);
        if (longestBad == null) {
            newResults.addAll(this.unpack(vecsForExperiment));
        } else {
            newResults.addAll(this.unpack(vecsForExperiment, longestBad));
        }
    }

    if (newResults.isEmpty()) {
        return results;
    }

    // Cache before slicing, then reduce to the subsets the caller asked for.
    this.cacheResults(newResults);
    newResults = this.sliceSubsets(ees, newResults);
    results.addAll(newResults);
    return results;
}
use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method checkCache.
/**
 * Collects cached vectors for the requested genes and records what still has to be fetched.
 * <p>
 * We cache vectors at the experiment level. If we need subsets, we have to slice them out.
 *
 * @param bioAssaySets  that we exactly need the data for.
 * @param genes         that might have cached results
 * @param results       from the cache will be put here
 * @param needToSearch  experiments that need to be searched (not fully cached); this will be populated. For a
 *                      subset, its source experiment is what gets added.
 * @param genesToSearch that still need to be searched (not in cache); this will be populated with every gene
 *                      that was missing for at least one of the bioAssaySets.
 */
private void checkCache(Collection<? extends BioAssaySet> bioAssaySets, Collection<Long> genes, Collection<DoubleVectorValueObject> results, Collection<ExpressionExperiment> needToSearch, Collection<Long> genesToSearch) {
    for (BioAssaySet ee : bioAssaySets) {
        ExpressionExperiment experiment = null;
        boolean needSubSet = false;
        if (ee instanceof ExpressionExperiment) {
            experiment = (ExpressionExperiment) ee;
        } else if (ee instanceof ExpressionExperimentSubSet) {
            experiment = ((ExpressionExperimentSubSet) ee).getSourceExperiment();
            needSubSet = true;
        }
        assert experiment != null;

        // Tracked per experiment. genesToSearch accumulates over ALL experiments, so testing it for emptiness
        // would flag every experiment visited after the first cache miss, even fully cached ones, and would
        // make the outcome depend on iteration order.
        boolean fullyCached = true;
        for (Long g : genes) {
            // NOTE(review): the lookup is keyed on the requested set itself (possibly a subset), not on its
            // source experiment - confirm this matches how entries are stored by cacheResults.
            Collection<DoubleVectorValueObject> obs = processedDataVectorCache.get(ee, g);
            if (obs == null) {
                genesToSearch.add(g);
                fullyCached = false;
                continue;
            }
            if (needSubSet) {
                obs = this.sliceSubSet((ExpressionExperimentSubSet) ee, obs);
            }
            results.addAll(obs);
        }

        /*
         * This experiment is not fully cached for the genes in question.
         */
        if (!fullyCached) {
            needToSearch.add(experiment);
        }
    }
}
use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method getProcessedDataArraysByProbeIds.
/**
 * Fetches processed data vectors for the given probes in the given experiments and/or subsets.
 * <p>
 * Probes that map to at least one gene are looked up in the cache via their genes; whatever is not cached is
 * queried. Probes with no gene are always queried for every requested experiment, because the cache is keyed
 * by gene. Newly fetched vectors are unpacked per experiment, cached, and then sliced to the requested subsets.
 *
 * @param ees      experiments and/or experiment subsets to get vectors for
 * @param probeIds ids of the probes of interest
 * @return the cached vectors plus any newly fetched ones, the latter possibly subsetted.
 */
private Collection<DoubleVectorValueObject> getProcessedDataArraysByProbeIds(Collection<? extends BioAssaySet> ees, Collection<Long> probeIds) {
    Collection<DoubleVectorValueObject> results = new HashSet<>();
    Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneMapForProbes(probeIds, this.getSessionFactory().getCurrentSession());

    // Split off the probes that have no gene; they get an empty gene set and leave cs2gene.
    Map<Long, Collection<Long>> noGeneProbes = new HashMap<>();
    for (Long pid : probeIds) {
        if (!cs2gene.containsKey(pid) || cs2gene.get(pid).isEmpty()) {
            noGeneProbes.put(pid, new HashSet<Long>());
            cs2gene.remove(pid);
        }
    }
    AbstractDao.log.info(cs2gene.size() + " probes associated with a gene; " + noGeneProbes.size() + " not");

    /*
     * To check the cache we need the list of genes first. Get it from the cs2gene map, then check the cache.
     */
    Collection<Long> genes = new HashSet<>();
    for (Collection<Long> genesForProbe : cs2gene.values()) {
        genes.addAll(genesForProbe);
    }
    Collection<ExpressionExperiment> needToSearch = new HashSet<>();
    Collection<Long> genesToSearch = new HashSet<>();
    this.checkCache(ees, genes, results, needToSearch, genesToSearch);
    if (!results.isEmpty()) {
        AbstractDao.log.info(results.size() + " vectors fetched from cache");
    }

    Map<ProcessedExpressionDataVector, Collection<Long>> rawResults = new HashMap<>();

    /*
     * Small problem: noGeneProbes are never really cached since we use the gene as part of that.
     */
    if (!noGeneProbes.isEmpty()) {
        Collection<ExpressionExperiment> eesForNoGeneProbes = new HashSet<>();
        for (BioAssaySet ee : ees) {
            if (ee instanceof ExpressionExperiment) {
                eesForNoGeneProbes.add((ExpressionExperiment) ee);
            } else {
                eesForNoGeneProbes.add(((ExpressionExperimentSubSet) ee).getSourceExperiment());
            }
        }
        needToSearch.addAll(eesForNoGeneProbes);
        rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(eesForNoGeneProbes), noGeneProbes));
    }
    if (!rawResults.isEmpty()) {
        AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, for noGeneProbes");
    }

    /*
     * Non-cached items.
     */
    if (!needToSearch.isEmpty()) {
        rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene));
    }
    if (!rawResults.isEmpty()) {
        AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, after fetching non-cached.");
    }

    /*
     * Deal with possibility of 'gaps' and unpack the vectors. Each experiment's vectors are unpacked against
     * that experiment's own dimensions only, as in handleGetProcessedExpressionDataArrays (see bug 1704);
     * unpacking the whole map once per experiment would apply one experiment's dimension to another's vectors.
     */
    Collection<DoubleVectorValueObject> newResults = new HashSet<>();
    boolean singleExperiment = needToSearch.size() == 1;
    for (ExpressionExperiment ee : needToSearch) {
        Collection<BioAssayDimension> bioAssayDimensions = this.getBioAssayDimensions(ee);
        if (bioAssayDimensions == null || bioAssayDimensions.isEmpty()) {
            AbstractDao.log.warn("BioAssayDimensions were null/empty unexpectedly.");
            continue;
        }

        // Isolate the vectors for the current experiment (nothing to isolate when there is only one).
        Map<ProcessedExpressionDataVector, Collection<Long>> vecsForEe;
        if (singleExperiment) {
            vecsForEe = rawResults;
        } else {
            vecsForEe = new HashMap<>();
            for (Map.Entry<ProcessedExpressionDataVector, Collection<Long>> raw : rawResults.entrySet()) {
                if (raw.getKey().getExpressionExperiment().equals(ee)) {
                    vecsForEe.put(raw.getKey(), raw.getValue());
                }
            }
        }

        // A null result from checkRagged is treated as "not ragged", matching
        // handleGetProcessedExpressionDataArrays, instead of being asserted away.
        BioAssayDimension longestBad = bioAssayDimensions.size() == 1 ? null : this.checkRagged(bioAssayDimensions);
        if (longestBad == null) {
            newResults.addAll(this.unpack(vecsForEe));
        } else {
            newResults.addAll(this.unpack(vecsForEe, longestBad));
        }
    }

    // Cache and slice exactly once, after everything is unpacked, so that already-sliced vectors are never
    // fed back into the cache or sliced a second time.
    if (!newResults.isEmpty()) {
        this.cacheResults(newResults);
        results.addAll(this.sliceSubsets(ees, newResults));
    }
    return results;
}
use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method sliceSubsets.
/**
 * Picks out, and where needed slices, the vectors belonging to each requested experiment or subset.
 *
 * @param ees  Experiments and/or subsets required
 * @param vecs vectors to select from and if necessary slice, obviously from the given ees.
 * @return vectors that are for the requested sets. For a plain experiment the matching vectors are returned
 *         unchanged; for a subset, each vector of its source experiment is passed through {@code sliceSubSet}.
 *         Empty if {@code vecs} is null or empty.
 */
private Collection<DoubleVectorValueObject> sliceSubsets(Collection<? extends BioAssaySet> ees, Collection<DoubleVectorValueObject> vecs) {
    Collection<DoubleVectorValueObject> selected = new HashSet<>();
    if (vecs == null || vecs.isEmpty()) {
        return selected;
    }

    for (BioAssaySet requested : ees) {
        if (!(requested instanceof ExpressionExperimentSubSet)) {
            // Whole experiment: keep its vectors as they are.
            for (DoubleVectorValueObject vec : vecs) {
                if (vec.getExpressionExperiment().getId().equals(requested.getId())) {
                    selected.add(vec);
                }
            }
            continue;
        }

        // Subset: vectors are matched through the source experiment and sliced one at a time.
        ExpressionExperimentSubSet subSet = (ExpressionExperimentSubSet) requested;
        for (DoubleVectorValueObject vec : vecs) {
            if (!vec.getExpressionExperiment().getId().equals(subSet.getSourceExperiment().getId())) {
                continue;
            }
            Collection<DoubleVectorValueObject> single = new HashSet<>();
            single.add(vec);
            selected.addAll(this.sliceSubSet(subSet, single));
        }
    }
    return selected;
}
use of ubic.gemma.model.expression.experiment.BioAssaySet in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorDaoImpl method getBioAssayDimensions.
/**
 * Finds the BioAssayDimensions associated with each of the given experiments.
 * <p>
 * A single experiment is delegated to the one-experiment overload; several experiments are resolved with one
 * query that pairs each experiment with every dimension sharing a bioassay with it.
 *
 * @param ees experiments to look up
 * @return map of experiment to its dimensions; experiments for which the query returns no rows have no entry.
 *         Empty if {@code ees} is empty.
 */
private Map<BioAssaySet, Collection<BioAssayDimension>> getBioAssayDimensions(Collection<ExpressionExperiment> ees) {
    Map<BioAssaySet, Collection<BioAssayDimension>> result = new HashMap<>();
    if (ees.isEmpty()) {
        // Nothing to look up, and an empty list must not be bound to "in (:ees)".
        return result;
    }
    if (ees.size() == 1) {
        ExpressionExperiment ee = ees.iterator().next();
        result.put(ee, this.getBioAssayDimensions(ee));
        return result;
    }
    StopWatch timer = new StopWatch();
    timer.start();
    // noinspection unchecked
    List<Object> r = this.getSessionFactory().getCurrentSession().createQuery("select distinct e, bad from ExpressionExperiment e, BioAssayDimension bad" + " inner join e.bioAssays b inner join bad.bioAssays badba where e in (:ees) and b in (badba) ").setParameterList("ees", ees).list();
    for (Object o : r) {
        // Each row is an (experiment, dimension) pair.
        Object[] tup = (Object[]) o;
        BioAssaySet bas = (BioAssaySet) tup[0];
        Collection<BioAssayDimension> dims = result.get(bas);
        if (dims == null) {
            dims = new HashSet<>();
            result.put(bas, dims);
        }
        dims.add((BioAssayDimension) tup[1]);
    }
    // Only report fetches that took more than 100 ms.
    if (timer.getTime() > 100) {
        AbstractDao.log.info("Fetch " + r.size() + " bioAssayDimensions for " + ees.size() + " experiment(s): " + timer.getTime() + "ms");
    }
    return result;
}
Aggregations