Use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab: class ComBat, method aPrior.
private DoubleArrayList aPrior(DoubleMatrix2D d) {
    DoubleArrayList result = new DoubleArrayList();
    for (int i = 0; i < d.rows(); i++) {
        DoubleArrayList dd = new DoubleArrayList(d.viewRow(i).toArray());
        double mean = DescriptiveWithMissing.mean(dd);
        double var = DescriptiveWithMissing.sampleVariance(dd, mean);
        // Method-of-moments estimate of the prior's shape parameter from the row mean and variance.
        result.add((2.0 * var + Math.pow(mean, 2)) / var);
    }
    return result;
}
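The expression (2 * var + mean^2) / var matches the aprior function in the original ComBat R implementation: a method-of-moments estimate of the shape hyperparameter of the inverse-gamma prior used in the empirical Bayes step. A minimal standalone sketch of the same per-row computation on a plain array (the name aPriorForRow is illustrative, not Gemma's API; missing-value handling is omitted for brevity, whereas DescriptiveWithMissing skips NaNs):

static double aPriorForRow(double[] values) {
    // Row mean.
    double sum = 0.0;
    for (double v : values) {
        sum += v;
    }
    double mean = sum / values.length;
    // Unbiased sample variance, as in DescriptiveWithMissing.sampleVariance.
    double ss = 0.0;
    for (double v : values) {
        ss += (v - mean) * (v - mean);
    }
    double var = ss / (values.length - 1);
    // Method-of-moments estimate: (2 * var + mean^2) / var.
    return (2.0 * var + mean * mean) / var;
}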
Use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab: class ComBat, method stepSum.
private DoubleMatrix1D stepSum(DoubleMatrix2D matrix, DoubleMatrix1D gnew) {
    Algebra s = new Algebra();
    // Copy gnew into a 1 x n row matrix so it can be used in the matrix product below.
    DoubleMatrix2D g = new DenseDoubleMatrix2D(1, gnew.size());
    for (int i = 0; i < gnew.size(); i++) {
        g.set(0, i, gnew.get(i));
    }
    DoubleMatrix2D a = new DenseDoubleMatrix2D(1, matrix.columns());
    a.assign(1.0);
    /*
     * Subtract gnew from each column of the data (via the outer product of gnew with a row of ones);
     * square; then sum over each row.
     */
    DoubleMatrix2D deltas = matrix.copy().assign(s.mult(s.transpose(g), a), Functions.minus)
            .assign(Functions.square);
    DoubleMatrix1D sumsq = new DenseDoubleMatrix1D(deltas.rows());
    sumsq.assign(0.0);
    for (int i = 0; i < deltas.rows(); i++) {
        sumsq.set(i, DescriptiveWithMissing.sum(new DoubleArrayList(deltas.viewRow(i).toArray())));
    }
    return sumsq;
}
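The product s.mult(s.transpose(g), a) is the outer product of gnew with a row of ones, so deltas(i, j) = (matrix(i, j) - gnew(i))^2 and sumsq(i) is that row's sum. A plain-loop sketch of the same computation, assuming (as the class name suggests) that DescriptiveWithMissing.sum treats NaN entries as missing:

static double[] stepSumPlain(double[][] matrix, double[] gnew) {
    double[] sumsq = new double[matrix.length];
    for (int i = 0; i < matrix.length; i++) {
        double total = 0.0;
        for (int j = 0; j < matrix[i].length; j++) {
            // Deviation of each entry in row i from that row's gnew value.
            double delta = matrix[i][j] - gnew[i];
            if (!Double.isNaN(delta)) { // missing values contribute nothing
                total += delta * delta;
            }
        }
        sumsq[i] = total;
    }
    return sumsq;
}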
Use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab: class RowLevelFilter, method filter.
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    if (lowCut == -Double.MAX_VALUE && highCut == Double.MAX_VALUE) {
        RowLevelFilter.log.info("No filtering requested");
        return data;
    }
    int numRows = data.rows();
    DoubleArrayList criteria = new DoubleArrayList(new double[numRows]);
    int numAllNeg = this.computeCriteria(data, criteria);
    DoubleArrayList sortedCriteria = criteria.copy();
    sortedCriteria.sort();
    int consideredRows = numRows;
    int startIndex = 0;
    if (removeAllNegative) {
        consideredRows = numRows - numAllNeg;
        startIndex = numAllNeg;
    }
    double realHighCut = this.getHighThreshold(sortedCriteria, consideredRows);
    double realLowCut = this.getLowThreshold(numRows, sortedCriteria, consideredRows, startIndex);
    if (Double.isNaN(realHighCut)) {
        throw new IllegalStateException("High threshold cut is NaN");
    }
    RowLevelFilter.log.debug("Low cut = " + realLowCut);
    RowLevelFilter.log.debug("High cut = " + realHighCut);
    if (realHighCut <= realLowCut) {
        throw new RuntimeException("High cut " + realHighCut + " is lower than or equal to low cut " + realLowCut);
    }
    List<CompositeSequence> kept = new ArrayList<>();
    for (int i = 0; i < numRows; i++) {
        // Keep rows whose criterion lies in (realLowCut, realHighCut]; zeros should always be removed.
        if (criteria.get(i) > realLowCut && criteria.get(i) <= realHighCut) {
            kept.add(data.getDesignElementForRow(i));
        }
    }
    this.logInfo(numRows, kept);
    return new ExpressionDataDoubleMatrix(data, kept);
}
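The sorted copy of the criteria is what getLowThreshold and getHighThreshold (not shown here) work from. A hypothetical sketch of reading a quantile-based cut off the sorted list with Colt's Descriptive; the 0.1 fraction is illustrative only, not Gemma's actual setting:

// Hypothetical quantile-based cut on the sorted criteria. Descriptive.quantile
// expects the list sorted ascending, which sort() guarantees.
DoubleArrayList sorted = criteria.copy();
sorted.sort();
double lowCut = cern.jet.stat.Descriptive.quantile(sorted, 0.1); // illustrative 10th-percentile cut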
Use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab: class SVDServiceHelperImpl, method analyzeComponent.
/**
 * Do the factor comparisons for one component.
 *
 * @param bioMaterialFactorMap Map of factors to biomaterials to the value we're going to use. Even for
 *        non-continuous factors the value is a double.
 */
private void analyzeComponent(SVDValueObject svo, int componentNumber, DoubleMatrix<Long, Integer> vMatrix,
        Map<Long, Date> bioMaterialDates, Map<ExperimentalFactor, Map<Long, Double>> bioMaterialFactorMap,
        Long[] svdBioMaterials) {
    DoubleArrayList eigenGene = new DoubleArrayList(vMatrix.getColumn(componentNumber));
    // Since we use rank correlation/ANOVA, the cast ids (two groups) or dates serve directly as the covariate.
    int numWithDates = 0;
    for (Long id : bioMaterialDates.keySet()) {
        if (bioMaterialDates.get(id) != null) {
            numWithDates++;
        }
    }
    if (numWithDates > 2) {
        /*
         * Get the dates in order. The dates stored in the value object are left unrounded; for the
         * correlation and the uniqueness check, dates are rounded to the minute.
         */
        boolean initializingDates = svo.getDates().isEmpty();
        double[] dates = new double[svdBioMaterials.length];
        /*
         * If the dates are all the same, skip the comparison.
         */
        Set<Date> uniqueDate = new HashSet<>();
        for (int j = 0; j < svdBioMaterials.length; j++) {
            Date date = bioMaterialDates.get(svdBioMaterials[j]);
            if (date == null) {
                SVDServiceHelperImpl.log.warn("Incomplete date information, missing for biomaterial " + svdBioMaterials[j]);
                dates[j] = Double.NaN;
            } else {
                Date roundDate = DateUtils.round(date, Calendar.MINUTE);
                uniqueDate.add(roundDate);
                dates[j] = roundDate.getTime(); // rounded to the minute, cast to double
            }
            if (initializingDates) {
                svo.getDates().add(date);
            }
        }
        if (uniqueDate.size() == 1) {
            SVDServiceHelperImpl.log.warn("All scan dates the same, skipping data analysis");
            svo.getDates().clear();
        }
        if (eigenGene.size() != dates.length) {
            SVDServiceHelperImpl.log.warn("Could not compute correlation, dates and eigenGene had different lengths.");
            return;
        }
        double dateCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(dates));
        svo.setPCDateCorrelation(componentNumber, dateCorrelation);
        svo.setPCDateCorrelationPval(componentNumber, CorrelationStats.spearmanPvalue(dateCorrelation, eigenGene.size()));
    }
    /*
     * Compare each factor (including batch information, which is somewhat redundant with the dates) to the
     * eigengenes, using rank statistics.
     */
    for (ExperimentalFactor ef : bioMaterialFactorMap.keySet()) {
        Map<Long, Double> bmToFv = bioMaterialFactorMap.get(ef);
        double[] fvs = new double[svdBioMaterials.length];
        assert fvs.length > 0;
        int numNotMissing = 0;
        boolean initializing = false;
        if (!svo.getFactors().containsKey(ef.getId())) {
            svo.getFactors().put(ef.getId(), new ArrayList<Double>());
            initializing = true;
        }
        for (int j = 0; j < svdBioMaterials.length; j++) {
            fvs[j] = bmToFv.get(svdBioMaterials[j]); // even for categorical factors this is a double value
            if (!Double.isNaN(fvs[j])) {
                numNotMissing++;
            }
            if (initializing) {
                if (SVDServiceHelperImpl.log.isDebugEnabled())
                    SVDServiceHelperImpl.log.debug("EF:" + ef.getId() + " fv=" + bmToFv.get(svdBioMaterials[j]));
                svo.getFactors().get(ef.getId()).add(bmToFv.get(svdBioMaterials[j]));
            }
        }
        if (fvs.length != eigenGene.size()) {
            SVDServiceHelperImpl.log.debug(fvs.length + " factor values (biomaterials) but " + eigenGene.size() + " values in the eigenGene");
            continue;
        }
        if (numNotMissing < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
            SVDServiceHelperImpl.log.debug("Insufficient values to compare " + ef + " to eigenGenes");
            continue;
        }
        if (ExperimentalDesignUtils.isContinuous(ef)) {
            double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
            svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
            svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGene.size()));
        } else {
            Collection<Integer> groups = new HashSet<>();
            IntArrayList groupings = new IntArrayList(fvs.length);
            int k = 0;
            DoubleArrayList eigenGeneWithoutMissing = new DoubleArrayList();
            for (double d : fvs) {
                if (Double.isNaN(d)) {
                    k++;
                    continue;
                }
                groupings.add((int) d);
                groups.add((int) d);
                eigenGeneWithoutMissing.add(eigenGene.get(k));
                k++;
            }
            if (groups.size() < 2) {
                SVDServiceHelperImpl.log.debug("Factor had fewer than two groups: " + ef + ", SVD comparison can't be done.");
                continue;
            }
            if (eigenGeneWithoutMissing.size() < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
                SVDServiceHelperImpl.log.debug("Too few non-missing values for factor to compare to eigenGenes: " + ef);
                continue;
            }
            if (groups.size() == 2) {
                // use the version that still has the missing values, so the lengths match the eigenGene.
                double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
                svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
                svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size()));
            } else {
                // one-way ANOVA on ranks (Kruskal-Wallis).
                double kwPVal = KruskalWallis.test(eigenGeneWithoutMissing, groupings);
                svo.setPCFactorCorrelationPval(componentNumber, ef, kwPVal);
                double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
                double corrPvalue = CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size());
                // sanity check.
                assert Math.abs(factorCorrelation) < 1.0 + 1e-2;
                /*
                 * Avoid storing a pvalue, as it's hard to compare. If the regular linear correlation is strong,
                 * then we should just use that -- basically, it means the order we have the groups in happens
                 * to be a good one. Of course we could just store pvalues, but those aren't easy to use either.
                 */
                if (corrPvalue <= kwPVal) {
                    svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
                } else {
                    // hack: a bit like turning pvalues into a probit.
                    double approxCorr = CorrelationStats.correlationForPvalue(kwPVal, eigenGeneWithoutMissing.size());
                    svo.setPCFactorCorrelation(componentNumber, ef, approxCorr);
                }
            }
        }
    }
}
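A minimal sketch of the two-group comparison path, using the same baseCode utilities (Distance and CorrelationStats from ubic.basecode.math) that the method above calls; the data values are made up for illustration:

// Made-up eigengene and a two-group factor coded 0/1, as in the method above.
DoubleArrayList eigenGene = new DoubleArrayList(new double[] { 0.1, 0.4, -0.2, 0.8, 0.3, -0.5 });
DoubleArrayList factor = new DoubleArrayList(new double[] { 0, 0, 0, 1, 1, 1 });
// Rank correlation between the component and the factor, then its p-value.
double r = Distance.spearmanRankCorrelation(eigenGene, factor);
double p = CorrelationStats.spearmanPvalue(r, eigenGene.size());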
Use of cern.colt.list.DoubleArrayList in project Gemma by PavlidisLab: class SpearmanMetricsTest, method testCorrelB.
/**
 * Tests the same values as testCorrelWithMissing, but via a different method than testCorrelC.
 */
@Test
public void testCorrelB() {
    boolean[] usedA = new boolean[] { true, true, true, true, true, true, true, true, true };
    boolean[] usedB = new boolean[] { true, true, true, true, true, true, true, true, true };
    double[] a = new double[] { 400, 310, 20, 20, 688, 498, 533, 1409, 1500 };
    double[] b = new double[] { 1545, 2072, 1113, 676, 2648, 2478, 2574, 5155, 1624 };
    assertEquals(a.length, b.length);
    assertEquals(a.length, usedA.length);
    assertEquals(b.length, usedB.length);
    DoubleArrayList ranksIA = Rank.rankTransform(new DoubleArrayList(a));
    DoubleArrayList ranksIB = Rank.rankTransform(new DoubleArrayList(b));
    SpearmanMetrics test = new SpearmanMetrics(10);
    double actualValue = test.spearman(ranksIA.elements(), ranksIB.elements(), usedA, usedB, 0, 1);
    double expectedValue = 0.7113033;
    assertEquals(expectedValue, actualValue, 0.0001);
}
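Spearman's correlation is the Pearson correlation of the rank-transformed data, so the expected value can be cross-checked with Colt's cern.jet.stat.Descriptive directly. A sketch of that check, assuming Rank.rankTransform (from baseCode's ubic.basecode.math) assigns tied values their average rank (a contains two 20s):

// Cross-check: Pearson correlation of the ranks should come out near 0.7113033.
DoubleArrayList ra = Rank.rankTransform(new DoubleArrayList(a));
DoubleArrayList rb = Rank.rankTransform(new DoubleArrayList(b));
double meanA = Descriptive.mean(ra);
double meanB = Descriptive.mean(rb);
double sdA = Math.sqrt(Descriptive.sampleVariance(ra, meanA));
double sdB = Math.sqrt(Descriptive.sampleVariance(rb, meanB));
double pearsonOfRanks = Descriptive.correlation(ra, sdA, rb, sdB);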