Use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.
From the class DifferentialExpressionResultDaoImpl, method findDiffExAnalysisResultIdsInResultSets:
@Override
public Map<Long, Map<Long, DiffExprGeneSearchResult>> findDiffExAnalysisResultIdsInResultSets(
        Collection<DiffExResultSetSummaryValueObject> resultSets, Collection<Long> geneIds) {
    Map<Long, Map<Long, DiffExprGeneSearchResult>> results = new HashMap<>();
    Session session = this.getSessionFactory().getCurrentSession();
    Map<Long, DiffExResultSetSummaryValueObject> resultSetIdsMap = EntityUtils.getIdMap(resultSets, "getResultSetId");
    Map<Long, Collection<Long>> foundInCache = this.fillFromCache(results, resultSetIdsMap.keySet(), geneIds);
    if (!foundInCache.isEmpty()) {
        AbstractDao.log.info("Results for " + foundInCache.size() + " resultsets found in cache");
    } else {
        AbstractDao.log.info("No results were in the cache");
    }
    Collection<Long> resultSetsNeeded = this.stripUnneededResultSets(foundInCache, resultSetIdsMap.keySet(), geneIds);
    // Are we finished?
    if (resultSetsNeeded.isEmpty()) {
        AbstractDao.log.info("All results were in the cache.");
        return results;
    }
    AbstractDao.log.info(foundInCache.size() + "/" + resultSetIdsMap.size()
            + " resultSets had at least some cached results; still need to query " + resultSetsNeeded.size());
    assert !resultSetsNeeded.isEmpty();
    org.hibernate.SQLQuery queryObject = session.createSQLQuery(
            DifferentialExpressionResultDaoImpl.fetchBatchDifferentialExpressionAnalysisResultsByResultSetsAndGeneQuery);
    /*
     * These values have been tweaked to probe for performance issues.
     */
    int resultSetBatchSize = 50;
    int geneBatchSize = 100;
    if (resultSetsNeeded.size() > geneIds.size()) {
        resultSetBatchSize = Math.min(500, resultSetsNeeded.size());
        AbstractDao.log.info("Batching by result sets (" + resultSetsNeeded.size() + " resultSets); "
                + geneIds.size() + " genes; batch size=" + resultSetBatchSize);
    } else {
        geneBatchSize = Math.min(200, geneIds.size());
        AbstractDao.log.info("Batching by genes (" + geneIds.size() + " genes); "
                + resultSetsNeeded.size() + " resultSets; batch size=" + geneBatchSize);
    }
    // use floating-point division so the batch count is not truncated
    final int numResultSetBatches = (int) Math.ceil((double) resultSetsNeeded.size() / resultSetBatchSize);
    queryObject.setFlushMode(FlushMode.MANUAL);
    StopWatch timer = new StopWatch();
    timer.start();
    int numResults = 0;
    long timeForFillingNonSig = 0;
    Map<Long, Map<Long, DiffExprGeneSearchResult>> resultsFromDb = new HashMap<>();
    int numResultSetBatchesDone = 0;
    // Iterate over batches of resultSets
    for (Collection<Long> resultSetIdBatch : new BatchIterator<>(resultSetsNeeded, resultSetBatchSize)) {
        if (AbstractDao.log.isDebugEnabled())
            AbstractDao.log.debug("Starting batch of resultsets: "
                    + StringUtils.abbreviate(StringUtils.join(resultSetIdBatch, ","), 100));
        /*
         * Get the probes using the CommonQueries gene2cs. Otherwise we (in effect) end up doing this over and over
         * again.
         */
        Map<Long, Collection<Long>> cs2GeneIdMap = this.getProbesForGenesInResultSetBatch(session, geneIds,
                resultSetIdsMap, resultSetIdBatch);
        queryObject.setParameterList("rs_ids", resultSetIdBatch);
        int numGeneBatchesDone = 0;
        final int numGeneBatches = (int) Math.ceil((double) cs2GeneIdMap.size() / geneBatchSize);
        StopWatch innerQt = new StopWatch();
        // iterate over batches of probes (genes)
        for (Collection<Long> probeBatch : new BatchIterator<>(cs2GeneIdMap.keySet(), geneBatchSize)) {
            if (AbstractDao.log.isDebugEnabled())
                AbstractDao.log.debug("Starting batch of probes: "
                        + StringUtils.abbreviate(StringUtils.join(probeBatch, ","), 100));
            // would it help to sort the probeBatch?
            List<Long> pbL = new ArrayList<>(probeBatch);
            Collections.sort(pbL);
            queryObject.setParameterList("probe_ids", pbL);
            innerQt.start();
            List<?> queryResult = queryObject.list();
            innerQt.stop();
            if (innerQt.getTime() > 2000) {
                // show the actual query with params.
                AbstractDao.log.info("Query time: " + innerQt.getTime() + "ms:\n "
                        + queryObject.getQueryString().replace(":probe_ids", StringUtils.join(probeBatch, ","))
                                .replace(":rs_ids", StringUtils.join(resultSetIdBatch, ",")));
            }
            innerQt.reset();
            /*
             * Each query tuple contains the probe, result, result set, qvalue and pvalue.
             */
            for (Object o : queryResult) {
                numResults += this.processResultTuple(o, resultsFromDb, cs2GeneIdMap);
            }
            if (timer.getTime() > 5000 && AbstractDao.log.isInfoEnabled()) {
                AbstractDao.log.info("Batch time: " + timer.getTime() + "ms; Fetched DiffEx " + numResults
                        + " results so far. " + numResultSetBatchesDone + "/" + numResultSetBatches
                        + " resultset batches completed. " + numGeneBatchesDone + "/" + numGeneBatches
                        + " gene batches done.");
                timer.reset();
                timer.start();
            }
            // Check if task was cancelled.
            if (Thread.currentThread().isInterrupted()) {
                throw new TaskCancelledException("Search was cancelled");
            }
            numGeneBatchesDone++;
            if (DifferentialExpressionResultDaoImpl.CORRECTED_PVALUE_THRESHOLD_TO_BE_CONSIDERED_DIFF_EX < 1.0) {
                timeForFillingNonSig += this.fillNonSignificant(pbL, resultSetIdsMap, resultsFromDb,
                        resultSetIdBatch, cs2GeneIdMap, session);
            }
        }
        // Check if task was cancelled.
        if (Thread.currentThread().isInterrupted()) {
            throw new TaskCancelledException("Search was cancelled");
        }
        numResultSetBatchesDone++;
    }
    if (timer.getTime() > 1000 && AbstractDao.log.isInfoEnabled()) {
        AbstractDao.log.info("Fetching DiffEx from DB took total of " + timer.getTime() + " ms : geneIds="
                + StringUtils.abbreviate(StringUtils.join(geneIds, ","), 50) + " result set="
                + StringUtils.abbreviate(StringUtils.join(resultSetsNeeded, ","), 50));
        if (timeForFillingNonSig > 100) {
            AbstractDao.log.info("Filling in non-significant values: " + timeForFillingNonSig + "ms in total");
        }
    }
    // Add the DB results to the cached results.
    this.addToCache(resultsFromDb, resultSetsNeeded, geneIds);
    for (Long resultSetId : resultsFromDb.keySet()) {
        Map<Long, DiffExprGeneSearchResult> geneResults = resultsFromDb.get(resultSetId);
        if (results.containsKey(resultSetId)) {
            results.get(resultSetId).putAll(geneResults);
        } else {
            results.put(resultSetId, geneResults);
        }
    }
    return results;
}
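The method above nests two levels of batching (result sets, then probes) and uses StringUtils.join together with StringUtils.abbreviate purely to keep the log lines readable. Below is a minimal, self-contained sketch of that logging idiom; the BatchLoggingSketch class, its partition helper and the id values are illustrative stand-ins, not part of Gemma's BatchIterator API.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.StringUtils;

public class BatchLoggingSketch {

    // Hypothetical stand-in for Gemma's BatchIterator: splits a list into fixed-size chunks.
    static List<List<Long>> partition(List<Long> ids, int batchSize) {
        List<List<Long>> batches = new ArrayList<>();
        for (int i = 0; i < ids.size(); i += batchSize) {
            batches.add(ids.subList(i, Math.min(i + batchSize, ids.size())));
        }
        return batches;
    }

    public static void main(String[] args) {
        List<Long> resultSetIds = Arrays.asList(1L, 2L, 3L, 4L, 5L);
        for (List<Long> batch : partition(resultSetIds, 2)) {
            // Same idiom as the DAO: join the ids, then abbreviate so huge batches don't flood the log.
            System.out.println("Starting batch of resultsets: "
                    + StringUtils.abbreviate(StringUtils.join(batch, ","), 100));
        }
    }
}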
Use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.
From the class DifferentialExpressionResultDaoImpl, method loadContrastDetailsForResults:
/**
* Key method for getting contrasts associated with results.
*/
@Override
public Map<Long, ContrastsValueObject> loadContrastDetailsForResults(Collection<Long> ids) {
    // language=SQL
    final String queryString = "SELECT DISTINCT c.ID, c.LOG_FOLD_CHANGE, c.FACTOR_VALUE_FK,"
            + " c.DIFFERENTIAL_EXPRESSION_ANALYSIS_RESULT_FK, c.PVALUE FROM CONTRAST_RESULT c"
            + " WHERE c.DIFFERENTIAL_EXPRESSION_ANALYSIS_RESULT_FK IN (:ids) ";
    Map<Long, ContrastsValueObject> probeResults = new HashMap<>();
    if (ids.isEmpty()) {
        return probeResults;
    }
    SQLQuery query = this.getSessionFactory().getCurrentSession().createSQLQuery(queryString);
    // previously: 500, then 1000. New optimized query is plenty fast.
    int BATCH_SIZE = 2000;
    StopWatch timer = new StopWatch();
    for (Collection<Long> batch : new BatchIterator<>(ids, BATCH_SIZE)) {
        timer.reset();
        timer.start();
        query.setParameterList("ids", batch);
        List<?> batchR = query.list();
        for (Object o : batchR) {
            Object[] ol = (Object[]) o;
            Long resultId = ((BigInteger) ol[3]).longValue();
            if (!probeResults.containsKey(resultId)) {
                probeResults.put(resultId, new ContrastsValueObject(resultId));
            }
            ContrastsValueObject cvo = probeResults.get(resultId);
            Long contrastId = ((BigInteger) ol[0]).longValue();
            Double logFoldChange = ol[1] == null ? null : (Double) ol[1];
            Long factorValueId = ol[2] == null ? null : ((BigInteger) ol[2]).longValue();
            Double pvalue = ol[4] == null ? null : (Double) ol[4];
            cvo.addContrast(contrastId, factorValueId, logFoldChange, pvalue, null);
        }
        if (timer.getTime() > 2000) {
            AbstractDao.log.info("Fetch " + batch.size() + " results with contrasts: " + timer.getTime()
                    + "ms; query was\n " + queryString.replace(":ids", StringUtils.join(batch, ",")));
        }
    }
    return probeResults;
}
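Hibernate native SQL queries return each row as an Object[], with id columns materialized as BigInteger and numeric columns as Double, which is why the loop above casts before converting. A minimal sketch of that unpacking, assuming a hand-built row in the same column order as the query rather than a live session:

import java.math.BigInteger;

public class ContrastRowSketch {
    public static void main(String[] args) {
        // Hypothetical row in the same column order as the query:
        // ID, LOG_FOLD_CHANGE, FACTOR_VALUE_FK, DIFFERENTIAL_EXPRESSION_ANALYSIS_RESULT_FK, PVALUE
        Object[] row = new Object[] { BigInteger.valueOf(11L), 1.7, BigInteger.valueOf(42L),
                BigInteger.valueOf(1001L), 0.003 };

        Long contrastId = ((BigInteger) row[0]).longValue();
        Double logFoldChange = row[1] == null ? null : (Double) row[1];
        Long factorValueId = row[2] == null ? null : ((BigInteger) row[2]).longValue();
        Long resultId = ((BigInteger) row[3]).longValue();
        Double pvalue = row[4] == null ? null : (Double) row[4];

        System.out.println("contrast " + contrastId + " for result " + resultId
                + ": logFC=" + logFoldChange + ", factorValue=" + factorValueId + ", p=" + pvalue);
    }
}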
Use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.
From the class ExperimentalDesignVisualizationServiceImpl, method sortVectorDataByDesign:
@Override
public Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> sortVectorDataByDesign(
        Collection<DoubleVectorValueObject> dedVs) {
    if (dedVs == null) {
        return new HashMap<>(0);
    }
    Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> returnedLayouts = new HashMap<>(dedVs.size());
    StopWatch timer = new StopWatch();
    timer.start();
    /*
     * This is shared across experiments that might show up in the dedVs; this should be okay...saves computation.
     * This is the only slow part.
     */
    this.prepare(dedVs);
    /*
     * This loop is not a performance issue.
     */
    Map<DoubleVectorValueObject, List<BioAssayValueObject>> newOrderingsForBioAssayDimensions = new HashMap<>();
    for (DoubleVectorValueObject vec : dedVs) {
        if (vec.isReorganized()) {
            continue;
        }
        assert !vec.getBioAssays().isEmpty();
        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> layout = null;
        if (cachedLayouts.containsKey(vec.getExpressionExperiment().getId())) {
            layout = cachedLayouts.get(vec.getExpressionExperiment().getId());
        } else if (vec.getExpressionExperiment() instanceof ExpressionExperimentSubsetValueObject) {
            // subset: use the layout of the source experiment.
            layout = cachedLayouts.get(((ExpressionExperimentSubsetValueObject) vec.getExpressionExperiment()).getSourceExperiment());
        }
        if (layout == null || layout.isEmpty()) {
            log.error("Did not find cached layout for " + vec.getId());
            continue;
        }
        List<BioAssayValueObject> newOrdering = new ArrayList<>(layout.keySet());
        newOrdering.retainAll(vec.getBioAssays());
        /*
         * This can happen if the vectors are out of whack with the bioassays - e.g. two platforms were used but
         * merging is not done. See bug 3775. Skipping the ordering is not the right thing to do.
         */
        if (newOrdering.isEmpty()) {
            boolean allNaN = this.allNaN(vec);
            if (allNaN) {
                // reordering will have no effect.
                continue;
            }
            /*
             * Add to the layout.
             */
            layout = this.extendLayout(vec, vec.getExpressionExperiment().getId());
            newOrdering = new ArrayList<>(layout.keySet());
            newOrdering.retainAll(vec.getBioAssays());
            assert !newOrdering.isEmpty();
        }
        newOrderingsForBioAssayDimensions.put(vec, newOrdering);
        Map<BioAssayValueObject, Integer> ordering = this.getOrdering(newOrdering);
        // might be a subset id.
        Long eeId = vec.getExpressionExperiment().getId();
        if (!returnedLayouts.containsKey(eeId)) {
            if (vec.isSliced()) {
                LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> trimmedLayout = new LinkedHashMap<>();
                for (BioAssayValueObject baVo : newOrdering) {
                    trimmedLayout.put(baVo, layout.get(baVo));
                }
                returnedLayouts.put(eeId, trimmedLayout);
            } else {
                returnedLayouts.put(eeId, layout);
            }
        }
        /*
         * Might be a faster way.
         */
        double[] data = vec.getData();
        double[] dol = ArrayUtils.clone(data);
        // assert ordering.size() == data.length : "got " + ordering.size() + " expected " + data.length;
        List<BioAssayValueObject> oldOrdering = vec.getBioAssayDimension().getBioAssays();
        int j = 0;
        if (log.isTraceEnabled())
            log.trace("Old order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
        for (BioAssayValueObject ba : oldOrdering) {
            if (ordering.get(ba) == null) {
                assert Double.isNaN(dol[j]);
                j++;
                continue;
            }
            assert ordering.containsKey(ba);
            assert ordering.get(ba) != null;
            Integer targetIndex = ordering.get(ba);
            data[targetIndex] = dol[j++];
        }
        if (log.isTraceEnabled())
            log.trace("New order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
        vec.setReorganized(true);
    }
    for (DoubleVectorValueObject vec : dedVs) {
        if (vec.getBioAssayDimension().isReordered())
            continue;
        List<BioAssayValueObject> newOrdering = newOrderingsForBioAssayDimensions.get(vec);
        if (newOrdering == null)
            // data was empty, etc.
            continue;
        vec.getBioAssayDimension().reorder(newOrdering);
    }
    if (timer.getTime() > 1500) {
        log.info("Sort vectors by design: " + timer.getTime() + "ms");
    }
    return returnedLayouts;
}
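The core of the reordering above is a permutation: the data array is cloned, and each old value is written to the index given by the ordering map, skipping assays that have no slot in the new layout (those values are expected to be NaN). A standalone sketch of that step, with a hypothetical ordering map keyed by old index instead of BioAssayValueObject:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class ReorderSketch {
    public static void main(String[] args) {
        double[] data = { 1.0, 2.0, Double.NaN, 4.0 };
        // Hypothetical target positions keyed by the old index; the NaN at old index 2 has no target.
        Map<Integer, Integer> ordering = new HashMap<>();
        ordering.put(0, 2);
        ordering.put(1, 0);
        ordering.put(3, 1);

        double[] old = data.clone();
        for (int j = 0; j < old.length; j++) {
            Integer target = ordering.get(j);
            if (target == null) {
                continue; // no slot in the new layout; skip, as in the DAO loop
            }
            data[target] = old[j];
        }
        // Prints [2.0, 4.0, 1.0, 4.0]; index 3 keeps its old value because nothing maps to it.
        System.out.println(Arrays.toString(data));
    }
}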
Use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.
From the class CoexpressionDaoImpl, method populateTestedInDetails:
/**
 * Used when fetching data. Requires database hits, but the tested-in values are cached.
 *
 * @param g2gLinks links
 */
private void populateTestedInDetails(Collection<CoexpressionValueObject> g2gLinks) {
    assert !g2gLinks.isEmpty();
    StopWatch timer = new StopWatch();
    timer.start();
    // GeneCoexpressionTestedIn are one-per-gene so we first gather up all the unique genes we have to look at.
    Map<Long, GeneCoexpressionTestedIn> gcTestedIn = new HashMap<>();
    Set<Long> genes = new HashSet<>();
    for (CoexpressionValueObject gene2GeneCoexpression : g2gLinks) {
        Long queryGeneId = gene2GeneCoexpression.getQueryGeneId();
        GeneCoexpressionTestedIn queryGeneTestedIn = geneTestedInCache.get(queryGeneId);
        if (queryGeneTestedIn == null) {
            genes.add(queryGeneId);
        } else {
            gcTestedIn.put(queryGeneId, queryGeneTestedIn);
        }
        Long coexGeneId = gene2GeneCoexpression.getCoexGeneId();
        GeneCoexpressionTestedIn coexGeneTestedIn = geneTestedInCache.get(coexGeneId);
        if (coexGeneTestedIn == null) {
            genes.add(coexGeneId);
        } else {
            gcTestedIn.put(coexGeneId, coexGeneTestedIn);
        }
    }
    if (!genes.isEmpty()) {
        // fetch the GeneCoexpressionTestedIn information for those genes which were not cached.
        Query q = this.getSessionFactory().getCurrentSession()
                .createQuery("from GeneCoexpressionTestedIn g where geneId in (:genes)");
        int BATCH_SIZE = 512;
        int n = 0;
        for (BatchIterator<Long> it = BatchIterator.batches(genes, BATCH_SIZE); it.hasNext(); ) {
            Collection<Long> g = it.next();
            q.setParameterList("genes", g);
            List<GeneCoexpressionTestedIn> list = q.list();
            Map<Long, GeneCoexpressionTestedIn> idMap = EntityUtils.getIdMap(list, "getGeneId");
            geneTestedInCache.cache(idMap);
            gcTestedIn.putAll(idMap);
            ++n;
        }
        if (timer.getTime() > 1000)
            CoexpressionDaoImpl.log.debug("Query for tested-in details for " + genes.size() + " genes: "
                    + timer.getTime() + " ms (" + n + " batches); values fetched or from cache: " + gcTestedIn.size());
    }
    timer.reset();
    timer.start();
    // copy it into the g2g value objects.
    for (CoexpressionValueObject g2g : g2gLinks) {
        assert g2g.getNumDatasetsSupporting() > 0 : g2g + " has support less than 1";
        Long id1 = g2g.getQueryGeneId();
        Long id2 = g2g.getCoexGeneId();
        GeneCoexpressionTestedIn geneCoexpressionTestedIn1 = gcTestedIn.get(id1);
        GeneCoexpressionTestedIn geneCoexpressionTestedIn2 = gcTestedIn.get(id2);
        if (geneCoexpressionTestedIn1 == null || geneCoexpressionTestedIn2 == null) {
            throw new IllegalStateException("Was missing GeneCoexpressionTestedIn data for genes in " + g2g);
        }
        if (geneCoexpressionTestedIn1.getNumDatasetsTestedIn() == 0 || geneCoexpressionTestedIn2.getNumDatasetsTestedIn() == 0) {
            throw new IllegalStateException(g2g + ": had no data sets tested in: "
                    + StringUtils.join(geneCoexpressionTestedIn1.getIds(), ",") + " :: "
                    + StringUtils.join(geneCoexpressionTestedIn2.getIds(), ","));
        }
        Set<Long> testedIn = geneCoexpressionTestedIn1.andSet(geneCoexpressionTestedIn2);
        if (testedIn.isEmpty()) {
            throw new IllegalStateException(g2g + ": had no data sets tested in: "
                    + StringUtils.join(geneCoexpressionTestedIn1.getIds(), ",") + " :: "
                    + StringUtils.join(geneCoexpressionTestedIn2.getIds(), ","));
        }
        g2g.setTestedInDatasets(testedIn);
    }
    if (timer.getTime() > 100)
        CoexpressionDaoImpl.log.debug("Populate into value objects: " + timer.getTime() + "ms (" + g2gLinks.size() + " links)");
}
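The flow above is a cache-aside pattern: check the per-gene cache, collect the misses, fetch them from the database in batches, then write them back to the cache. The sketch below condenses that flow; the TestedInCacheSketch class, its HashMap cache and loadFromDb loader are hypothetical stand-ins for geneTestedInCache and the Hibernate query.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TestedInCacheSketch {

    static final Map<Long, String> cache = new HashMap<>(); // stand-in for geneTestedInCache

    // Hypothetical loader standing in for the "from GeneCoexpressionTestedIn ..." query.
    static Map<Long, String> loadFromDb(List<Long> geneIds) {
        Map<Long, String> loaded = new HashMap<>();
        for (Long id : geneIds) {
            loaded.put(id, "testedIn(" + id + ")");
        }
        return loaded;
    }

    public static void main(String[] args) {
        List<Long> wanted = Arrays.asList(1L, 2L, 3L);
        Map<Long, String> result = new HashMap<>();
        List<Long> misses = new ArrayList<>();

        for (Long id : wanted) {
            String hit = cache.get(id);
            if (hit == null) {
                misses.add(id);   // fetch later, in one batch
            } else {
                result.put(id, hit);
            }
        }
        if (!misses.isEmpty()) {
            Map<Long, String> loaded = loadFromDb(misses);
            cache.putAll(loaded); // write back so the next call hits the cache
            result.putAll(loaded);
        }
        System.out.println(result);
    }
}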
Use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.
From the class CharacteristicDaoImpl, method getParents:
@Override
public Map<Characteristic, Object> getParents(Class<?> parentClass, Collection<Characteristic> characteristics) {
    Map<Characteristic, Object> charToParent = new HashMap<>();
    if (characteristics == null || characteristics.isEmpty()) {
        return charToParent;
    }
    if (AbstractDao.log.isDebugEnabled()) {
        Collection<String> uris = new HashSet<>();
        for (Characteristic c : characteristics) {
            if (c instanceof VocabCharacteristic) {
                VocabCharacteristic vc = (VocabCharacteristic) c;
                if (vc.getValueUri() == null)
                    continue;
                uris.add(vc.getValueUri());
            }
        }
        AbstractDao.log.debug("For class=" + parentClass.getSimpleName() + ": " + characteristics.size()
                + " Characteristics have URIs:\n" + StringUtils.join(uris, "\n"));
    }
    StopWatch timer = new StopWatch();
    timer.start();
    for (Collection<Characteristic> batch : new BatchIterator<>(characteristics, CharacteristicDaoImpl.BATCH_SIZE)) {
        this.batchGetParents(parentClass, batch, charToParent);
    }
    if (timer.getTime() > 1000) {
        AbstractDao.log.info("Fetch parents of characteristics: " + timer.getTime() + "ms for "
                + characteristics.size() + " elements for class=" + parentClass.getSimpleName());
    }
    return charToParent;
}
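The debug block above just collects the value URIs of the VocabCharacteristic entries and joins them with newlines for a single log statement. A trimmed-down sketch of that collect-and-join step, using plain strings in place of Characteristic objects (the UriLoggingSketch class and its sample URIs are illustrative):

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;

public class UriLoggingSketch {
    public static void main(String[] args) {
        // Hypothetical value URIs; in the DAO these come from VocabCharacteristic.getValueUri().
        List<String> valueUris = Arrays.asList(
                "http://purl.obolibrary.org/obo/CL_0000540",
                null,
                "http://purl.obolibrary.org/obo/UBERON_0000955");

        Collection<String> uris = new HashSet<>();
        for (String uri : valueUris) {
            if (uri == null) {
                continue; // same guard as the DAO: skip characteristics without a URI
            }
            uris.add(uri);
        }
        System.out.println(valueUris.size() + " Characteristics have URIs:\n" + StringUtils.join(uris, "\n"));
    }
}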