use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab.
the class GeneDifferentialExpressionServiceImpl method getDifferentialExpressionMetaAnalysis.
@Override
public DifferentialExpressionMetaAnalysisValueObject getDifferentialExpressionMetaAnalysis(double threshold, Gene g, Map<Long, Long> eeFactorsMap, Collection<BioAssaySet> activeExperiments) {
StopWatch timer = new StopWatch();
timer.start();
/*
* Get results for each active experiment on given gene. Handling the threshold check below since we ignore this
* for the meta analysis. The results returned are for all factors, not just the factors we are seeking.
*/
Map<ExpressionExperimentValueObject, List<DifferentialExpressionValueObject>> resultsMap = differentialExpressionResultService.find(g, EntityUtils.getIds(activeExperiments));
GeneDifferentialExpressionServiceImpl.log.debug(resultsMap.size() + " results for " + g + " in " + activeExperiments);
DifferentialExpressionMetaAnalysisValueObject mavo = new DifferentialExpressionMetaAnalysisValueObject();
DoubleArrayList pvaluesToCombine = new DoubleArrayList();
/* a gene can have multiple probes that map to it, so store one diff value object for each probe */
Collection<DifferentialExpressionValueObject> devos = new ArrayList<>();
Collection<Long> eesThatMetThreshold = new HashSet<>();
for (ExpressionExperimentValueObject ee : resultsMap.keySet()) {
ExpressionExperimentValueObject eevo = this.configExpressionExperimentValueObject(ee);
Collection<DifferentialExpressionValueObject> proberesults = resultsMap.get(ee);
Collection<DifferentialExpressionValueObject> filteredResults = new HashSet<>();
for (DifferentialExpressionValueObject r : proberesults) {
Collection<ExperimentalFactorValueObject> efs = r.getExperimentalFactors();
assert efs.size() > 0;
if (efs.size() > 1) {
// We always ignore interaction effects.
continue;
}
ExperimentalFactorValueObject ef = efs.iterator().next();
/*
* note that we don't care about the reverse: the eefactorsmap can have stuff we don't need. We focus on
* the experiments because they are easy to select & secure. The eefactorsmap provides additional
* details.
*/
assert eeFactorsMap.containsKey(ee.getId()) : "eeFactorsMap does not contain ee=" + ee.getId();
Long sfId = eeFactorsMap.get(ee.getId());
if (!ef.getId().equals(sfId)) {
/*
* Screen out factors we're not using.
*/
continue;
}
/* filtered result with chosen factor */
filteredResults.add(r);
}
if (filteredResults.size() == 0) {
GeneDifferentialExpressionServiceImpl.log.warn("No result for ee=" + ee);
continue;
}
/*
* For the diff expression meta analysis, ignore threshold. Select the 'best' penalized probe if multiple
* probes map to the same gene.
*/
DifferentialExpressionValueObject res = this.findMinPenalizedProbeResult(filteredResults);
if (res == null)
continue;
Double p = res.getP();
if (p == null)
continue;
/*
* Moderate the pvalues by setting all values to be no smaller than PVALUE_CLIP_THRESHOLD
*/
pvaluesToCombine.add(Math.max(p, GeneDifferentialExpressionService.PVALUE_CLIP_THRESHOLD));
for (DifferentialExpressionValueObject r : filteredResults) {
Collection<ExperimentalFactorValueObject> efs = r.getExperimentalFactors();
if (efs == null) {
// This should not happen any more, but just in case.
GeneDifferentialExpressionServiceImpl.log.warn("No experimentalFactor(s) for DifferentialExpressionAnalysisResult: " + r.getId());
continue;
}
Boolean metThreshold = r.getCorrP() != null && (r.getCorrP() <= threshold);
r.setMetThreshold(metThreshold);
if (metThreshold) {
eesThatMetThreshold.add(eevo.getId());
}
Boolean fisherContribution = r.equals(res);
r.setFisherContribution(fisherContribution);
}
}
/*
* Meta-analysis part.
*/
double fisherPval = MetaAnalysis.fisherCombinePvalues(pvaluesToCombine);
mavo.setFisherPValue(fisherPval);
mavo.setGene(new GeneValueObject(g));
mavo.setActiveExperiments(activeExperiments);
mavo.setProbeResults(devos);
mavo.setNumMetThreshold(eesThatMetThreshold.size());
mavo.setSortKey();
timer.stop();
if (timer.getTime() > 1000) {
GeneDifferentialExpressionServiceImpl.log.info("Meta-analysis results: " + timer.getTime() + " ms");
}
return mavo;
}
use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab.
the class AbstractDifferentialExpressionAnalyzer method computeRanks.
/**
* @param pvalues pvalues
* @return normalized ranks of the pvalues, or null if they were invalid/unusable.
*/
double[] computeRanks(double[] pvalues) {
if (pvalues == null) {
log.error("Null pvalues");
return null;
}
if (pvalues.length == 0) {
log.error("Empty pvalues array");
return null;
}
DoubleArrayList ranks = Rank.rankTransform(new DoubleArrayList(pvalues));
if (ranks == null) {
log.error("Pvalue ranks could not be computed");
return null;
}
double[] normalizedRanks = new double[ranks.size()];
for (int i = 0; i < ranks.size(); i++) {
normalizedRanks[i] = ranks.get(i) / ranks.size();
}
return normalizedRanks;
}
use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab.
the class DiffExMetaAnalyzerServiceImpl method doMetaAnalysis.
private GeneDifferentialExpressionMetaAnalysis doMetaAnalysis(Collection<ExpressionAnalysisResultSet> updatedResultSets, Map<Gene, Collection<DifferentialExpressionAnalysisResult>> gene2result) {
DiffExMetaAnalyzerServiceImpl.log.info("Computing pvalues ...");
DoubleArrayList pvaluesUp = new DoubleArrayList();
DoubleArrayList pvaluesDown = new DoubleArrayList();
// third pass: collate to get p-values. First we have to aggregate within result set for genes which have more
// than one probe
List<GeneDifferentialExpressionMetaAnalysisResult> metaAnalysisResultsUp = new ArrayList<>();
List<GeneDifferentialExpressionMetaAnalysisResult> metaAnalysisResultsDown = new ArrayList<>();
for (Gene g : gene2result.keySet()) {
Map<ExpressionAnalysisResultSet, Collection<DifferentialExpressionAnalysisResult>> resultSet2Results4Gene = this.getResults4GenePerResultSet(g, gene2result);
if (g.getOfficialSymbol().equals("GUK1")) {
DiffExMetaAnalyzerServiceImpl.log.info(g);
}
/*
* Compute the pvalues for each resultset.
*/
DoubleArrayList pvalues4geneUp = new DoubleArrayList();
DoubleArrayList pvalues4geneDown = new DoubleArrayList();
DoubleArrayList foldChanges4gene = new DoubleArrayList();
Collection<DifferentialExpressionAnalysisResult> resultsUsed = new HashSet<>();
for (ExpressionAnalysisResultSet rs : resultSet2Results4Gene.keySet()) {
Collection<DifferentialExpressionAnalysisResult> res = resultSet2Results4Gene.get(rs);
if (res.isEmpty()) {
// shouldn't happen?
DiffExMetaAnalyzerServiceImpl.log.warn("Unexpectedly no results in resultSet for gene " + g);
continue;
}
Double foldChange4GeneInOneResultSet = this.aggregateFoldChangeForGeneWithinResultSet(res);
if (foldChange4GeneInOneResultSet == null) {
// we can't go on.
continue;
}
// we use the pvalue for the 'best' direction, and set the other to be 1- that. An alternative would be
// to use _only_ the best one.
Double pvalue4GeneInOneResultSetUp;
Double pvalue4GeneInOneResultSetDown;
if (foldChange4GeneInOneResultSet < 0) {
pvalue4GeneInOneResultSetDown = this.aggregatePvaluesForGeneWithinResultSet(res, false);
assert pvalue4GeneInOneResultSetDown != null;
pvalue4GeneInOneResultSetUp = 1.0 - pvalue4GeneInOneResultSetDown;
} else {
pvalue4GeneInOneResultSetUp = this.aggregatePvaluesForGeneWithinResultSet(res, true);
assert pvalue4GeneInOneResultSetUp != null;
pvalue4GeneInOneResultSetDown = 1.0 - pvalue4GeneInOneResultSetUp;
}
// If we have missing values, skip them. (this should be impossible!)
if (Double.isNaN(pvalue4GeneInOneResultSetUp) || Double.isNaN(pvalue4GeneInOneResultSetDown)) {
continue;
}
/*
* Now when we correct, we have to 1) bonferroni correct for multiple probes and 2) clip really small
* pvalues. We do this now, so that we don't skew the converse pvalues (up vs. down).
*/
pvalue4GeneInOneResultSetUp = this.correctAndClip(res, pvalue4GeneInOneResultSetUp);
pvalue4GeneInOneResultSetDown = this.correctAndClip(res, pvalue4GeneInOneResultSetDown);
// results used for this one gene's meta-analysis.
boolean added = resultsUsed.addAll(res);
assert added;
pvalues4geneUp.add(pvalue4GeneInOneResultSetUp);
pvalues4geneDown.add(pvalue4GeneInOneResultSetDown);
foldChanges4gene.add(foldChange4GeneInOneResultSet);
if (DiffExMetaAnalyzerServiceImpl.log.isDebugEnabled())
DiffExMetaAnalyzerServiceImpl.log.debug(String.format("%s %.4f %.4f %.1f", g.getOfficialSymbol(), pvalue4GeneInOneResultSetUp, pvalue4GeneInOneResultSetDown, foldChange4GeneInOneResultSet));
}
// loop over results for one gene
assert resultsUsed.size() >= pvalues4geneUp.size();
if (pvalues4geneUp.size() < 2) {
continue;
}
/*
* Note that this value can be misleading. It should not be used to determine the change that was
* "looked for". Use the 'upperTail' field instead.
*/
assert !Double.isNaN(Descriptive.mean(foldChanges4gene));
double fisherPvalueUp = MetaAnalysis.fisherCombinePvalues(pvalues4geneUp);
double fisherPvalueDown = MetaAnalysis.fisherCombinePvalues(pvalues4geneDown);
if (Double.isNaN(fisherPvalueUp) || Double.isNaN(fisherPvalueDown)) {
continue;
}
pvaluesUp.add(fisherPvalueUp);
GeneDifferentialExpressionMetaAnalysisResult metaAnalysisResultUp = this.makeMetaAnalysisResult(g, resultsUsed, fisherPvalueUp, Boolean.TRUE);
metaAnalysisResultsUp.add(metaAnalysisResultUp);
pvaluesDown.add(fisherPvalueDown);
GeneDifferentialExpressionMetaAnalysisResult metaAnalysisResultDown = this.makeMetaAnalysisResult(g, resultsUsed, fisherPvalueDown, Boolean.FALSE);
metaAnalysisResultsDown.add(metaAnalysisResultDown);
this.debug(g, fisherPvalueUp, fisherPvalueDown);
}
assert metaAnalysisResultsUp.size() == metaAnalysisResultsDown.size();
if (metaAnalysisResultsDown.isEmpty()) {
// can happen if platforms don't have any genes that match etc.
DiffExMetaAnalyzerServiceImpl.log.warn("No meta-analysis results were obtained");
return null;
}
DiffExMetaAnalyzerServiceImpl.log.info(metaAnalysisResultsUp.size() + " initial meta-analysis results");
DoubleArrayList qvaluesUp = MultipleTestCorrection.benjaminiHochberg(pvaluesUp);
assert qvaluesUp.size() == metaAnalysisResultsUp.size();
DoubleArrayList qvaluesDown = MultipleTestCorrection.benjaminiHochberg(pvaluesDown);
assert qvaluesDown.size() == metaAnalysisResultsDown.size();
return this.makeMetaAnalysisObject(updatedResultSets, metaAnalysisResultsUp, metaAnalysisResultsDown, qvaluesUp, qvaluesDown);
}
use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab.
the class OutlierDetectionServiceImpl method identifyOutliersByMedianCorrelation.
@Override
public Collection<OutlierDetails> identifyOutliersByMedianCorrelation(DoubleMatrix<BioAssay, BioAssay> cormat) {
List<OutlierDetails> allSamples = new ArrayList<>();
OutlierDetails sample;
/* Find the 1st, 2nd, and 3rd quartiles of each sample */
for (int i = 0; i < cormat.rows(); i++) {
DoubleArrayList cors = new DoubleArrayList();
sample = new OutlierDetails(cormat.getRowName(i));
for (int j = 0; j < cormat.columns(); j++) {
if (j != i) {
// get all sample correlations except correlation with self
double d = cormat.get(i, j);
cors.add(d);
}
}
assert (cors.size() == cormat.rows() - 1);
sample.setFirstQuartile(this.findValueAtDesiredQuantile(cors, 25));
sample.setMedianCorrelation(this.findValueAtDesiredQuantile(cors, 50));
sample.setThirdQuartile(this.findValueAtDesiredQuantile(cors, 75));
if (sample.getFirstQuartile() == Double.MIN_VALUE || sample.getMedianCorrelation() == Double.MIN_VALUE || sample.getThirdQuartile() == Double.MIN_VALUE) {
throw new IllegalStateException("Could not determine one or more quartiles for a sample; ");
}
allSamples.add(sample);
}
/* Sort all samples by median correlation */
Collections.sort(allSamples, OutlierDetails.MedianComparator);
int numOutliers = 0;
/* Check for overlap of first quartile and median of consecutive samples */
for (int k = 0; k < allSamples.size() - 1; k++) {
// if ( allSamples.get( k ).getMedianCorrelation() < allSamples.get( k + 1 ).getFirstQuartile() ) {
if (allSamples.get(k).getThirdQuartile() < allSamples.get(k + 1).getFirstQuartile()) {
numOutliers = k + 1;
}
}
List<OutlierDetails> outliers = new ArrayList<>();
for (int m = 0; m < numOutliers; m++) {
outliers.add(allSamples.get(m));
}
/*
* Check that all outliers are legitimate (controls for situations where sorting by median does not give 'true'
* order)
*/
if (numOutliers > 0) {
OutlierDetectionServiceImpl.log.info("Removing false positives; number of outliers before test: " + numOutliers);
outliers = this.removeFalsePositives(allSamples, outliers, numOutliers);
numOutliers = outliers.size();
OutlierDetectionServiceImpl.log.info("Number of outliers after removing false positives: " + numOutliers);
}
OutlierDetectionServiceImpl.log.info("Found " + numOutliers + " outlier(s)");
return outliers;
}
use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab.
the class DataUpdater method addTotalCountInformation.
private void addTotalCountInformation(ExpressionExperiment ee, ExpressionDataDoubleMatrix countEEMatrix, Integer readLength, Boolean isPairedReads) {
for (BioAssay ba : ee.getBioAssays()) {
Double[] col = countEEMatrix.getColumn(ba);
double librarySize = DescriptiveWithMissing.sum(new DoubleArrayList(ArrayUtils.toPrimitive(col)));
DataUpdater.log.info(ba + " total library size=" + librarySize);
ba.setSequenceReadLength(readLength);
ba.setSequencePairedReads(isPairedReads);
ba.setSequenceReadCount((int) Math.floor(librarySize));
bioAssayService.update(ba);
}
}
Aggregations