Search in sources :

Example 36 with StringUtils.join

use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.

the class DifferentialExpressionResultDaoImpl method findDiffExAnalysisResultIdsInResultSets.

/**
 * Finds differential expression results for the given genes within the given result sets.
 * <p>
 * Results already available from the cache are used first; whatever is still missing is fetched from the
 * database in batches (by result set, then by probe), added to the cache and merged into the returned map.
 *
 * @param resultSets summaries of the result sets to search; they are indexed by their result set id
 * @param geneIds    ids of the genes of interest
 * @return results keyed by result set id; the inner map is presumably keyed by gene id (TODO confirm against
 *         processResultTuple)
 * @throws TaskCancelledException if the current thread is found to be interrupted after a batch completes
 */
@Override
public Map<Long, Map<Long, DiffExprGeneSearchResult>> findDiffExAnalysisResultIdsInResultSets(Collection<DiffExResultSetSummaryValueObject> resultSets, Collection<Long> geneIds) {
    Map<Long, Map<Long, DiffExprGeneSearchResult>> results = new HashMap<>();
    Session session = this.getSessionFactory().getCurrentSession();
    Map<Long, DiffExResultSetSummaryValueObject> resultSetIdsMap = EntityUtils.getIdMap(resultSets, "getResultSetId");
    Map<Long, Collection<Long>> foundInCache = this.fillFromCache(results, resultSetIdsMap.keySet(), geneIds);
    if (!foundInCache.isEmpty()) {
        AbstractDao.log.info("Results for " + foundInCache.size() + " resultsets found in cache");
    } else {
        AbstractDao.log.info("No results were in the cache");
    }
    Collection<Long> resultSetsNeeded = this.stripUnneededResultSets(foundInCache, resultSetIdsMap.keySet(), geneIds);
    // Are we finished?
    if (resultSetsNeeded.isEmpty()) {
        AbstractDao.log.info("All results were in the cache.");
        return results;
    }
    AbstractDao.log.info(foundInCache.size() + "/" + resultSetIdsMap.size() + " resultsSets had at least some cached results; still need to query " + resultSetsNeeded.size());
    assert !resultSetsNeeded.isEmpty();
    org.hibernate.SQLQuery queryObject = session.createSQLQuery(DifferentialExpressionResultDaoImpl.fetchBatchDifferentialExpressionAnalysisResultsByResultSetsAndGeneQuery);
    /*
         * These values have been tweaked to probe for performance issues.
         */
    int resultSetBatchSize = 50;
    int geneBatchSize = 100;
    // Enlarge the batch size along whichever dimension (result sets or genes) is bigger.
    if (resultSetsNeeded.size() > geneIds.size()) {
        resultSetBatchSize = Math.min(500, resultSetsNeeded.size());
        AbstractDao.log.info("Batching by result sets (" + resultSetsNeeded.size() + " resultSets); " + geneIds.size() + " genes; batch size=" + resultSetBatchSize);
    } else {
        geneBatchSize = Math.min(200, geneIds.size());
        AbstractDao.log.info("Batching by genes (" + geneIds.size() + " genes); " + resultSetsNeeded.size() + " resultSets; batch size=" + geneBatchSize);
    }
    // Divide in floating point: integer division would truncate before ceil() could round up,
    // under-reporting the number of batches in the progress log.
    final int numResultSetBatches = (int) Math.ceil(resultSetsNeeded.size() / (double) resultSetBatchSize);
    queryObject.setFlushMode(FlushMode.MANUAL);
    StopWatch timer = new StopWatch();
    timer.start();
    int numResults = 0;
    long timeForFillingNonSig = 0;
    Map<Long, Map<Long, DiffExprGeneSearchResult>> resultsFromDb = new HashMap<>();
    int numResultSetBatchesDone = 0;
    // Iterate over batches of resultSets
    for (Collection<Long> resultSetIdBatch : new BatchIterator<>(resultSetsNeeded, resultSetBatchSize)) {
        if (AbstractDao.log.isDebugEnabled())
            AbstractDao.log.debug("Starting batch of resultsets: " + StringUtils.abbreviate(StringUtils.join(resultSetIdBatch, ","), 100));
        /*
             * Get the probes using the CommonQueries gene2cs. Otherwise we (in effect) end up doing this over and over
             * again.
             */
        Map<Long, Collection<Long>> cs2GeneIdMap = this.getProbesForGenesInResultSetBatch(session, geneIds, resultSetIdsMap, resultSetIdBatch);
        queryObject.setParameterList("rs_ids", resultSetIdBatch);
        int numGeneBatchesDone = 0;
        // Same floating-point division as above so that a partial final batch is counted.
        final int numGeneBatches = (int) Math.ceil(cs2GeneIdMap.size() / (double) geneBatchSize);
        StopWatch innerQt = new StopWatch();
        // iterate over batches of probes (genes)
        for (Collection<Long> probeBatch : new BatchIterator<>(cs2GeneIdMap.keySet(), geneBatchSize)) {
            if (AbstractDao.log.isDebugEnabled())
                AbstractDao.log.debug("Starting batch of probes: " + StringUtils.abbreviate(StringUtils.join(probeBatch, ","), 100));
            // would it help to sort the probeBatch/
            List<Long> pbL = new Vector<>(probeBatch);
            Collections.sort(pbL);
            queryObject.setParameterList("probe_ids", pbL);
            innerQt.start();
            List<?> queryResult = queryObject.list();
            innerQt.stop();
            if (innerQt.getTime() > 2000) {
                // show the actual query with params.
                AbstractDao.log.info("Query time: " + innerQt.getTime() + "ms:\n " + queryObject.getQueryString().replace(":probe_ids", StringUtils.join(probeBatch, ",")).replace(":rs_ids", StringUtils.join(resultSetIdBatch, ",")));
            }
            innerQt.reset();
            /*
                 * Each query tuple are the probe, result, resultsSet, qvalue, pvalue.
                 */
            for (Object o : queryResult) {
                // Long resultSetId = ( ( BigInteger )((Object[])o)[2] ).longValue();
                // if (!resultSetId.equals)
                numResults += this.processResultTuple(o, resultsFromDb, cs2GeneIdMap);
            }
            // Periodic progress report; the timer is restarted so the next report covers a fresh interval.
            if (timer.getTime() > 5000 && AbstractDao.log.isInfoEnabled()) {
                AbstractDao.log.info("Batch time: " + timer.getTime() + "ms; Fetched DiffEx " + numResults + " results so far. " + numResultSetBatchesDone + "/" + numResultSetBatches + " resultset batches completed. " + numGeneBatchesDone + "/" + numGeneBatches + " gene batches done.");
                timer.reset();
                timer.start();
            }
            // Check if task was cancelled.
            if (Thread.currentThread().isInterrupted()) {
                throw new TaskCancelledException("Search was cancelled");
            }
            numGeneBatchesDone++;
            if (DifferentialExpressionResultDaoImpl.CORRECTED_PVALUE_THRESHOLD_TO_BE_CONSIDERED_DIFF_EX < 1.0) {
                timeForFillingNonSig += this.fillNonSignificant(pbL, resultSetIdsMap, resultsFromDb, resultSetIdBatch, cs2GeneIdMap, session);
            }
        }
        // Check if task was cancelled.
        if (Thread.currentThread().isInterrupted()) {
            throw new TaskCancelledException("Search was cancelled");
        }
        numResultSetBatchesDone++;
    }
    if (timer.getTime() > 1000 && AbstractDao.log.isInfoEnabled()) {
        AbstractDao.log.info("Fetching DiffEx from DB took total of " + timer.getTime() + " ms : geneIds=" + StringUtils.abbreviate(StringUtils.join(geneIds, ","), 50) + " result set=" + StringUtils.abbreviate(StringUtils.join(resultSetsNeeded, ","), 50));
        if (timeForFillingNonSig > 100) {
            AbstractDao.log.info("Filling in non-significant values: " + timeForFillingNonSig + "ms in total");
        }
    }
    // Add the DB results to the cached results.
    this.addToCache(resultsFromDb, resultSetsNeeded, geneIds);
    // Merge what came from the database into what was already found in the cache.
    for (Long resultSetId : resultsFromDb.keySet()) {
        Map<Long, DiffExprGeneSearchResult> geneResults = resultsFromDb.get(resultSetId);
        if (results.containsKey(resultSetId)) {
            results.get(resultSetId).putAll(geneResults);
        } else {
            results.put(resultSetId, geneResults);
        }
    }
    return results;
}
Also used : BatchIterator(ubic.basecode.util.BatchIterator) org.hibernate(org.hibernate) StopWatch(org.apache.commons.lang3.time.StopWatch) GeneValueObject(ubic.gemma.model.genome.gene.GeneValueObject) TaskCancelledException(ubic.gemma.persistence.util.TaskCancelledException)

Example 37 with StringUtils.join

use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.

the class DifferentialExpressionResultDaoImpl method loadContrastDetailsForResults.

/**
 * Key method for getting contrasts associated with results.
 * <p>
 * The ids are queried in batches against CONTRAST_RESULT, and every returned row is added as a contrast to
 * the value object of the result it belongs to.
 *
 * @param ids ids of the differential expression analysis results whose contrasts are wanted
 * @return contrasts keyed by result id; empty if {@code ids} is empty
 */
@Override
public Map<Long, ContrastsValueObject> loadContrastDetailsForResults(Collection<Long> ids) {
    Map<Long, ContrastsValueObject> contrastsByResultId = new HashMap<>();
    if (ids.isEmpty()) {
        return contrastsByResultId;
    }
    // language=SQL
    final String sql = "SELECT DISTINCT c.ID, c.LOG_FOLD_CHANGE, c.FACTOR_VALUE_FK," + " c.DIFFERENTIAL_EXPRESSION_ANALYSIS_RESULT_FK, c.PVALUE FROM CONTRAST_RESULT c" + " WHERE c.DIFFERENTIAL_EXPRESSION_ANALYSIS_RESULT_FK IN (:ids)  ";
    SQLQuery contrastQuery = this.getSessionFactory().getCurrentSession().createSQLQuery(sql);
    // previously: 500, then 1000. New optimized query is plenty fast.
    final int batchSize = 2000;
    StopWatch batchTimer = new StopWatch();
    for (Collection<Long> idBatch : new BatchIterator<>(ids, batchSize)) {
        batchTimer.reset();
        batchTimer.start();
        contrastQuery.setParameterList("ids", idBatch);
        List<?> rows = contrastQuery.list();
        for (Object row : rows) {
            // Column order follows the SELECT: id, log fold change, factor value fk, result fk, pvalue.
            Object[] columns = (Object[]) row;
            Long resultId = ((BigInteger) columns[3]).longValue();
            ContrastsValueObject contrasts = contrastsByResultId.get(resultId);
            if (contrasts == null) {
                contrasts = new ContrastsValueObject(resultId);
                contrastsByResultId.put(resultId, contrasts);
            }
            Long contrastId = ((BigInteger) columns[0]).longValue();
            // Casting a null reference yields null, so these columns need no explicit null check.
            Double logFoldChange = (Double) columns[1];
            Long factorValueId = null;
            if (columns[2] != null) {
                factorValueId = ((BigInteger) columns[2]).longValue();
            }
            Double pvalue = (Double) columns[4];
            contrasts.addContrast(contrastId, factorValueId, logFoldChange, pvalue, null);
        }
        long elapsedMs = batchTimer.getTime();
        if (elapsedMs > 2000) {
            AbstractDao.log.info("Fetch " + idBatch.size() + " results with contrasts: " + batchTimer.getTime() + "ms; query was\n " + sql.replace(":ids", StringUtils.join(idBatch, ",")));
        }
    }
    return contrastsByResultId;
}
Also used : BatchIterator(ubic.basecode.util.BatchIterator) StopWatch(org.apache.commons.lang3.time.StopWatch) BigInteger(java.math.BigInteger) GeneValueObject(ubic.gemma.model.genome.gene.GeneValueObject)

Example 38 with StringUtils.join

use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.

the class ExperimentalDesignVisualizationServiceImpl method sortVectorDataByDesign.

/**
 * Reorders the data of each vector (in place) so that it follows the cached experimental-design layout of the
 * vector's experiment, then reorders the vectors' bioassay dimensions to match.
 * <p>
 * Vectors already flagged as reorganized are skipped. Vectors for which no layout can be found are logged and
 * skipped.
 *
 * @param dedVs vectors to reorder; may be null
 * @return the layout used for each experiment id (for sliced vectors, trimmed to the bioassays present);
 *         an empty map if {@code dedVs} is null
 */
@Override
public Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> sortVectorDataByDesign(Collection<DoubleVectorValueObject> dedVs) {
    if (dedVs == null) {
        return new HashMap<>(0);
    }
    Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> returnedLayouts = new HashMap<>(dedVs.size());
    StopWatch timer = new StopWatch();
    timer.start();
    /*
         * This is shared across experiments that might show up in the dedVs; this should be okay...saves computation.
         * This is the only slow part.
         */
    this.prepare(dedVs);
    /*
         * This loop is not a performance issue.
         */
    Map<DoubleVectorValueObject, List<BioAssayValueObject>> newOrderingsForBioAssayDimensions = new HashMap<>();
    for (DoubleVectorValueObject vec : dedVs) {
        if (vec.isReorganized()) {
            continue;
        }
        assert !vec.getBioAssays().isEmpty();
        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> layout = null;
        if (cachedLayouts.containsKey(vec.getExpressionExperiment().getId())) {
            layout = cachedLayouts.get(vec.getExpressionExperiment().getId());
        } else if (vec.getExpressionExperiment() instanceof ExpressionExperimentSubsetValueObject) {
            // subset: fall back to the layout of the source experiment. (This used to test
            // getClass().isInstance(X.class), which asks whether the Class object itself is an instance of
            // the experiment's class and is therefore always false.)
            layout = cachedLayouts.get(((ExpressionExperimentSubsetValueObject) vec.getExpressionExperiment()).getSourceExperiment());
        }
        if (layout == null || layout.isEmpty()) {
            log.error("Did not find cached layout for " + vec.getId());
            continue;
        }
        // Keep the layout's order, restricted to the bioassays this vector actually has.
        List<BioAssayValueObject> newOrdering = new ArrayList<>(layout.keySet());
        newOrdering.retainAll(vec.getBioAssays());
        /*
             * This can happen if the vectors are out of whack with the bioassays - e.g. two platforms were used but
             * merging is not done. See bug 3775. Skipping the ordering is not the right thing to do.
             */
        if (newOrdering.isEmpty()) {
            boolean allNaN = this.allNaN(vec);
            if (allNaN) {
                // reordering will have no effect.
                continue;
            }
            /*
                 * Add to the layout.
                 */
            layout = this.extendLayout(vec, vec.getExpressionExperiment().getId());
            newOrdering = new ArrayList<>(layout.keySet());
            newOrdering.retainAll(vec.getBioAssays());
            assert !newOrdering.isEmpty();
        }
        newOrderingsForBioAssayDimensions.put(vec, newOrdering);
        Map<BioAssayValueObject, Integer> ordering = this.getOrdering(newOrdering);
        // might be subset id.
        Long eeId = vec.getExpressionExperiment().getId();
        // Only the first vector seen for an experiment determines the layout that is returned for it.
        if (!returnedLayouts.containsKey(eeId)) {
            if (vec.isSliced()) {
                LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> trimmedLayout = new LinkedHashMap<>();
                for (BioAssayValueObject baVo : newOrdering) {
                    trimmedLayout.put(baVo, layout.get(baVo));
                }
                returnedLayouts.put(eeId, trimmedLayout);
            } else {
                returnedLayouts.put(eeId, layout);
            }
        }
        /*
             * Might be a faster way.
             */
        double[] data = vec.getData();
        // Work from a copy so values can be written into their new positions in 'data' without clobbering.
        double[] dol = ArrayUtils.clone(data);
        // assert ordering.size() == data.length : "got " + ordering.size() + " expected " + data.length;
        List<BioAssayValueObject> oldOrdering = vec.getBioAssayDimension().getBioAssays();
        int j = 0;
        if (log.isTraceEnabled())
            log.trace("Old order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
        for (BioAssayValueObject ba : oldOrdering) {
            Integer targetIndex = ordering.get(ba);
            if (targetIndex == null) {
                // Bioassay is not part of the new ordering; its value is expected to be missing.
                assert Double.isNaN(dol[j]);
                j++;
                continue;
            }
            data[targetIndex] = dol[j++];
        }
        if (log.isTraceEnabled())
            log.trace("New order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
        vec.setReorganized(true);
    }
    // Second pass: bring the bioassay dimensions in line with the reordered data.
    for (DoubleVectorValueObject vec : dedVs) {
        if (vec.getBioAssayDimension().isReordered())
            continue;
        List<BioAssayValueObject> newOrdering = newOrderingsForBioAssayDimensions.get(vec);
        if (newOrdering == null)
            // data was empty, etc.
            continue;
        vec.getBioAssayDimension().reorder(newOrdering);
    }
    if (timer.getTime() > 1500) {
        log.info("Sort vectors by design: " + timer.getTime() + "ms");
    }
    return returnedLayouts;
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) StopWatch(org.apache.commons.lang3.time.StopWatch) BioAssayValueObject(ubic.gemma.model.expression.bioAssay.BioAssayValueObject) List(java.util.List) DoubleVectorValueObject(ubic.gemma.model.expression.bioAssayData.DoubleVectorValueObject)

Example 39 with StringUtils.join

use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.

the class CoexpressionDaoImpl method populateTestedInDetails.

/**
 * Fills in, for every link, the set of data sets in which both of its genes were tested. Used when fetching
 * data. Requires database hits, but values for testedin are cached.
 *
 * @param g2gLinks links; must not be empty
 * @throws IllegalStateException if tested-in information is missing for a gene of a link, or if the genes of
 *                               a link have no tested-in data sets (individually or in common)
 */
private void populateTestedInDetails(Collection<CoexpressionValueObject> g2gLinks) {
    assert !g2gLinks.isEmpty();
    StopWatch timer = new StopWatch();
    timer.start();
    // GeneCoexpressionTestedIn are one-per-gene so we first gather up all the unique genes we have to look at.
    Map<Long, GeneCoexpressionTestedIn> testedInByGene = new HashMap<>();
    Set<Long> uncachedGenes = new HashSet<>();
    for (CoexpressionValueObject link : g2gLinks) {
        Long queryGeneId = link.getQueryGeneId();
        GeneCoexpressionTestedIn cachedForQuery = geneTestedInCache.get(queryGeneId);
        if (cachedForQuery != null) {
            testedInByGene.put(queryGeneId, cachedForQuery);
        } else {
            uncachedGenes.add(queryGeneId);
        }
        Long coexGeneId = link.getCoexGeneId();
        GeneCoexpressionTestedIn cachedForCoex = geneTestedInCache.get(coexGeneId);
        if (cachedForCoex != null) {
            testedInByGene.put(coexGeneId, cachedForCoex);
        } else {
            uncachedGenes.add(coexGeneId);
        }
    }
    if (!uncachedGenes.isEmpty()) {
        // fetch the GeneCoexpressionTestedIn information for those genes which were not cached.
        Query testedInQuery = this.getSessionFactory().getCurrentSession().createQuery("from GeneCoexpressionTestedIn g where geneId in (:genes)");
        final int batchSize = 512;
        int batchCount = 0;
        BatchIterator<Long> batches = BatchIterator.batches(uncachedGenes, batchSize);
        while (batches.hasNext()) {
            Collection<Long> geneBatch = batches.next();
            testedInQuery.setParameterList("genes", geneBatch);
            List<GeneCoexpressionTestedIn> fetched = testedInQuery.list();
            Map<Long, GeneCoexpressionTestedIn> fetchedByGene = EntityUtils.getIdMap(fetched, "getGeneId");
            // Remember what was fetched so later calls can skip the database.
            geneTestedInCache.cache(fetchedByGene);
            testedInByGene.putAll(fetchedByGene);
            batchCount++;
        }
        if (timer.getTime() > 1000) {
            CoexpressionDaoImpl.log.debug("Query for tested-in details for " + uncachedGenes.size() + " genes: " + timer.getTime() + " ms (" + batchCount + " batches), values fetched or from cache size=" + testedInByGene.size());
        }
    }
    timer.reset();
    timer.start();
    // copy it into the g2g value objects.
    for (CoexpressionValueObject link : g2gLinks) {
        assert link.getNumDatasetsSupporting() > 0 : link + " has support less than 1";
        GeneCoexpressionTestedIn queryTestedIn = testedInByGene.get(link.getQueryGeneId());
        GeneCoexpressionTestedIn coexTestedIn = testedInByGene.get(link.getCoexGeneId());
        if (queryTestedIn == null || coexTestedIn == null) {
            throw new IllegalStateException("Was missing GeneCoexpressionTestedIn data for genes in " + link);
        }
        if (queryTestedIn.getNumDatasetsTestedIn() == 0 || coexTestedIn.getNumDatasetsTestedIn() == 0) {
            throw new IllegalStateException(link + ": had no data sets tested in: " + StringUtils.join(queryTestedIn.getIds(), ",") + " :: " + StringUtils.join(coexTestedIn.getIds(), ","));
        }
        // Data sets in which both genes were tested.
        Set<Long> testedInBoth = queryTestedIn.andSet(coexTestedIn);
        if (testedInBoth.isEmpty()) {
            throw new IllegalStateException(link + ": had no data sets tested in: " + StringUtils.join(queryTestedIn.getIds(), ",") + " :: " + StringUtils.join(coexTestedIn.getIds(), ","));
        }
        link.setTestedInDatasets(testedInBoth);
    }
    if (timer.getTime() > 100) {
        CoexpressionDaoImpl.log.debug("Populate into value obects: " + timer.getTime() + "ms (" + g2gLinks.size() + " links)");
    }
}
Also used : StopWatch(org.apache.commons.lang3.time.StopWatch) GeneCoexpressionTestedIn(ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressionTestedIn)

Example 40 with StringUtils.join

use of org.apache.commons.lang3.StringUtils.join in project Gemma by PavlidisLab.

the class CharacteristicDaoImpl method getParents.

/**
 * Looks up the parents of the given characteristics, processing them in batches.
 *
 * @param parentClass     class of parent to look for; passed through to the batch lookup
 * @param characteristics characteristics whose parents are wanted; may be null or empty
 * @return map from characteristic to parent as filled in by the batch lookup; empty if no characteristics
 *         were given
 */
@Override
public Map<Characteristic, Object> getParents(Class<?> parentClass, Collection<Characteristic> characteristics) {
    Map<Characteristic, Object> parentsByCharacteristic = new HashMap<>();
    if (characteristics == null || characteristics.isEmpty()) {
        return parentsByCharacteristic;
    }
    if (AbstractDao.log.isDebugEnabled()) {
        // Collect the distinct value URIs purely for the debug message.
        Collection<String> valueUris = new HashSet<>();
        for (Characteristic characteristic : characteristics) {
            if (!(characteristic instanceof VocabCharacteristic)) {
                continue;
            }
            String valueUri = ((VocabCharacteristic) characteristic).getValueUri();
            if (valueUri != null) {
                valueUris.add(valueUri);
            }
        }
        AbstractDao.log.debug("For class=" + parentClass.getSimpleName() + ": " + characteristics.size() + " Characteristics have URIS:\n" + StringUtils.join(valueUris, "\n"));
    }
    StopWatch timer = new StopWatch();
    timer.start();
    for (Collection<Characteristic> characteristicBatch : new BatchIterator<>(characteristics, CharacteristicDaoImpl.BATCH_SIZE)) {
        this.batchGetParents(parentClass, characteristicBatch, parentsByCharacteristic);
    }
    long elapsedMs = timer.getTime();
    if (elapsedMs > 1000) {
        AbstractDao.log.info("Fetch parents of characteristics: " + timer.getTime() + "ms for " + characteristics.size() + " elements for class=" + parentClass.getSimpleName());
    }
    return parentsByCharacteristic;
}
Also used : Characteristic(ubic.gemma.model.common.description.Characteristic) VocabCharacteristic(ubic.gemma.model.common.description.VocabCharacteristic) CharacteristicValueObject(ubic.gemma.model.genome.gene.phenotype.valueObject.CharacteristicValueObject) BatchIterator(ubic.basecode.util.BatchIterator) StopWatch(org.apache.commons.lang3.time.StopWatch)

Aggregations

StringUtils (org.apache.commons.lang3.StringUtils)34 List (java.util.List)30 Collectors (java.util.stream.Collectors)23 ArrayList (java.util.ArrayList)21 Map (java.util.Map)17 HashMap (java.util.HashMap)15 Set (java.util.Set)14 Logger (org.slf4j.Logger)14 LoggerFactory (org.slf4j.LoggerFactory)14 IOException (java.io.IOException)13 HashSet (java.util.HashSet)11 Arrays (java.util.Arrays)10 Collections (java.util.Collections)10 Date (java.util.Date)9 File (java.io.File)6 StopWatch (org.apache.commons.lang3.time.StopWatch)6 InputStream (java.io.InputStream)5 java.util (java.util)5 Pair (org.apache.commons.lang3.tuple.Pair)5 Path (java.nio.file.Path)4