use of cern.colt.matrix.DoubleMatrix1D in project Gemma by PavlidisLab.
the class LinearModelAnalyzer method regressionResiduals.
/**
* @param matrix on which to perform regression.
* @param config containing configuration of factors to include. Any interactions or subset configuration is
* ignored. Data are <em>NOT</em> log transformed unless they come in that way. (the qValueThreshold will be
* ignored)
* @param retainScale if true, the data retain the global mean (intercept)
* @return residuals from the regression.
*/
@Override
public ExpressionDataDoubleMatrix regressionResiduals(ExpressionDataDoubleMatrix matrix, DifferentialExpressionAnalysisConfig config, boolean retainScale) {
if (config.getFactorsToInclude().isEmpty()) {
LinearModelAnalyzer.log.warn("No factors");
return matrix;
}
/*
* Note that this method relies on similar code to doAnalysis, for the setup stages.
*/
List<ExperimentalFactor> factors = config.getFactorsToInclude();
List<BioMaterial> samplesUsed = ExperimentalDesignUtils.getOrderedSamples(matrix, factors);
Map<ExperimentalFactor, FactorValue> baselineConditions = ExperimentalDesignUtils.getBaselineConditions(samplesUsed, factors);
ObjectMatrix<String, String, Object> designMatrix = ExperimentalDesignUtils.buildDesignMatrix(factors, samplesUsed, baselineConditions);
DesignMatrix properDesignMatrix = new DesignMatrix(designMatrix, true);
ExpressionDataDoubleMatrix dmatrix = new ExpressionDataDoubleMatrix(samplesUsed, matrix);
DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix = dmatrix.getMatrix();
DoubleMatrix<String, String> sNamedMatrix = this.makeDataMatrix(designMatrix, namedMatrix);
// perform weighted least squares regression on COUNT data
QuantitationType quantitationType = dmatrix.getQuantitationTypes().iterator().next();
LeastSquaresFit fit;
if (quantitationType.getScale().equals(ScaleType.COUNT)) {
LinearModelAnalyzer.log.info("Calculating residuals of weighted least squares regression on COUNT data");
// note: data is not log transformed
DoubleMatrix1D librarySize = MatrixStats.colSums(sNamedMatrix);
MeanVarianceEstimator mv = new MeanVarianceEstimator(properDesignMatrix, sNamedMatrix, librarySize);
fit = new LeastSquaresFit(properDesignMatrix, sNamedMatrix, mv.getWeights());
} else {
fit = new LeastSquaresFit(properDesignMatrix, sNamedMatrix);
}
DoubleMatrix2D residuals = fit.getResiduals();
if (retainScale) {
DoubleMatrix1D intercept = fit.getCoefficients().viewRow(0);
for (int i = 0; i < residuals.rows(); i++) {
residuals.viewRow(i).assign(Functions.plus(intercept.get(i)));
}
}
DoubleMatrix<CompositeSequence, BioMaterial> f = new DenseDoubleMatrix<>(residuals.toArray());
f.setRowNames(dmatrix.getMatrix().getRowNames());
f.setColumnNames(dmatrix.getMatrix().getColNames());
return new ExpressionDataDoubleMatrix(dmatrix, f);
}
use of cern.colt.matrix.DoubleMatrix1D in project Gemma by PavlidisLab.
the class LinkAnalysisServiceImpl method diagnoseCorrelationDistribution.
/**
* Check properties of the distribution
*/
// Better readability
@SuppressWarnings("StatementWithEmptyBody")
private void diagnoseCorrelationDistribution(ExpressionExperiment ee, CoexpCorrelationDistribution corrDist) throws UnsuitableForAnalysisException {
/*
* Find the median, etc.
*/
ByteArrayConverter bac = new ByteArrayConverter();
double[] binCounts = bac.byteArrayToDoubles(corrDist.getBinCounts());
int numBins = binCounts.length;
DoubleMatrix1D histogram = new DenseDoubleMatrix1D(binCounts);
// QC parameters; quantile, not correlation
double lowerLimitofMiddle = 0.45;
double upperLimitofMiddle = 0.55;
double tailFraction = 0.1;
// normalize
histogram.assign(Functions.div(histogram.zSum()));
double lowerTailDensity = 0.0;
double upperTailDensity = 0.0;
double median = 0.0;
// cumulative
double s = 0.0;
double middleDensity = 0.0;
for (int bin = 0; bin < histogram.size(); bin++) {
// cumulate
s += histogram.get(bin);
/*
* Perhaps these should be adjusted based on the sample size; for smaller data sets, more of the data is
* going to be above 0.9 etc. But in practice this can't have a very big effect.
*/
if (bin == (int) Math.floor(numBins * tailFraction)) {
lowerTailDensity = s;
} else if (bin == (int) Math.floor(numBins * (1.0 - tailFraction))) {
upperTailDensity = 1.0 - s;
} else if (bin > (int) Math.floor(lowerLimitofMiddle * numBins) && bin < (int) Math.floor(upperLimitofMiddle * numBins)) {
middleDensity += histogram.get(bin);
}
if (s >= 0.2) {
// firstQuintile = binToCorrelation( i, numBins );
} else if (s >= 0.5) {
median = this.binToCorrelation(bin, numBins);
} else if (s >= 0.8) {
// lastQuintile = binToCorrelation( i, numBins );
}
}
String message = "";
boolean bad = false;
if (median > 0.2 || median < -0.2) {
bad = true;
message = "Correlation distribution fails QC: median far from center (" + median + ")";
} else if (lowerTailDensity + upperTailDensity > middleDensity) {
bad = true;
message = "Correlation distribution fails QC: tails too heavy";
}
if (bad) {
throw new UnsuitableForAnalysisException(ee, message);
}
}
use of cern.colt.matrix.DoubleMatrix1D in project Gemma by PavlidisLab.
the class DataUpdater method addCountData.
/**
* Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation type we
* use internally. Counts and FPKM (if provided) are stored in addition.
*
* @param ee ee
* @param targetArrayDesign - this should be one of the "Generic" gene-based platforms. The data set will be
* switched to use it.
* @param countMatrix Representing 'raw' counts (added after rpkm, if provided).
* @param rpkmMatrix Representing per-gene normalized data, optional (RPKM or FPKM)
* @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
*/
public void addCountData(ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength, Boolean isPairedReads, boolean allowMissingSamples) {
if (countMatrix == null)
throw new IllegalArgumentException("You must provide count matrix (rpkm is optional)");
targetArrayDesign = arrayDesignService.thaw(targetArrayDesign);
ee = experimentService.thawLite(ee);
ee = this.dealWithMissingSamples(ee, countMatrix, allowMissingSamples);
DoubleMatrix<CompositeSequence, BioMaterial> properCountMatrix = this.matchElementsToRowNames(targetArrayDesign, countMatrix);
this.matchBioMaterialsToColNames(ee, countMatrix, properCountMatrix);
assert !properCountMatrix.getColNames().isEmpty();
assert !properCountMatrix.getRowNames().isEmpty();
QuantitationType countqt = this.makeCountQt();
ExpressionDataDoubleMatrix countEEMatrix = new ExpressionDataDoubleMatrix(ee, countqt, properCountMatrix);
QuantitationType log2cpmQt = this.makelog2cpmQt();
DoubleMatrix1D librarySize = MatrixStats.colSums(countMatrix);
DoubleMatrix<CompositeSequence, BioMaterial> log2cpmMatrix = MatrixStats.convertToLog2Cpm(properCountMatrix, librarySize);
ExpressionDataDoubleMatrix log2cpmEEMatrix = new ExpressionDataDoubleMatrix(ee, log2cpmQt, log2cpmMatrix);
ee = this.replaceData(ee, targetArrayDesign, log2cpmEEMatrix);
ee = this.addData(ee, targetArrayDesign, countEEMatrix);
this.addTotalCountInformation(ee, countEEMatrix, readLength, isPairedReads);
if (rpkmMatrix != null) {
DoubleMatrix<CompositeSequence, BioMaterial> properRPKMMatrix = this.matchElementsToRowNames(targetArrayDesign, rpkmMatrix);
this.matchBioMaterialsToColNames(ee, rpkmMatrix, properRPKMMatrix);
assert !properRPKMMatrix.getColNames().isEmpty();
assert !properRPKMMatrix.getRowNames().isEmpty();
QuantitationType rpkmqt = this.makeRPKMQt();
ExpressionDataDoubleMatrix rpkmEEMatrix = new ExpressionDataDoubleMatrix(ee, rpkmqt, properRPKMMatrix);
ee = this.addData(ee, targetArrayDesign, rpkmEEMatrix);
}
assert !processedExpressionDataVectorService.getProcessedDataVectors(ee).isEmpty();
}
use of cern.colt.matrix.DoubleMatrix1D in project tdq-studio-se by Talend.
the class Formatter method demo2.
/**
* Demonstrates how to use this class.
*/
public static void demo2() {
// parameters
double[] values = { // 5, 0.0, -0.0, -Double.NaN, Double.NaN, 0.0/0.0, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.MIN_VALUE, Double.MAX_VALUE
5, 0.0, -0.0, -Double.NaN, Double.NaN, 0.0 / 0.0, Double.MIN_VALUE, Double.MAX_VALUE, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY // Double.MIN_VALUE, Double.MAX_VALUE //, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY
};
// String[] formats = {"%G", "%1.10G", "%f", "%1.2f", "%0.2e"};
String[] formats = { "%G", "%1.19G" };
// now the processing
int size = formats.length;
DoubleMatrix1D matrix = new DenseDoubleMatrix1D(values);
String[] strings = new String[size];
for (int i = 0; i < size; i++) {
String format = formats[i];
strings[i] = new Formatter(format).toString(matrix);
for (int j = 0; j < matrix.size(); j++) {
System.out.println(String.valueOf(matrix.get(j)));
}
}
System.out.println("original:\n" + new Formatter().toString(matrix));
for (int i = 0; i < size; i++) {
System.out.println("\nstring(" + formats[i] + "):\n" + strings[i]);
}
}
use of cern.colt.matrix.DoubleMatrix1D in project tdq-studio-se by Talend.
the class Partitioning method partition.
/**
*Same as {@link cern.colt.Partitioning#partition(int[],int,int,int[],int,int,int[])}
*except that it <i>synchronously</i> partitions the rows of the given matrix by the values of the given matrix column;
*This is essentially the same as partitioning a list of composite objects by some instance variable;
*In other words, two entire rows of the matrix are swapped, whenever two column values indicate so.
*<p>
*Let's say, a "row" is an "object" (tuple, d-dimensional point).
*A "column" is the list of "object" values of a given variable (field, dimension).
*A "matrix" is a list of "objects" (tuples, points).
*<p>
*Now, rows (objects, tuples) are partially sorted according to their values in one given variable (dimension).
*Two entire rows of the matrix are swapped, whenever two column values indicate so.
*<p>
*Note that arguments are not checked for validity.
*<p>
*<b>Example:</b>
*<table border="1" cellspacing="0">
* <tr nowrap>
* <td valign="top"><tt>8 x 3 matrix:<br>
* 23, 22, 21<br>
* 20, 19, 18<br>
* 17, 16, 15<br>
* 14, 13, 12<br>
* 11, 10, 9<br>
* 8, 7, 6<br>
* 5, 4, 3<br>
* 2, 1, 0 </tt></td>
* <td align="left" valign="top">
* <p><tt>column = 0;<br>
* rowIndexes = {0,1,2,..,matrix.rows()-1};
* rowFrom = 0;<br>
* rowTo = matrix.rows()-1;<br>
* splitters = {5,10,12}<br>
* c = 0; <br>
* d = splitters.length-1;<br>
* partition(matrix,rowIndexes,rowFrom,rowTo,column,splitters,c,d,splitIndexes);<br>
* ==><br>
* splitIndexes == {0, 2, 3}<br>
* rowIndexes == {7, 6, 5, 4, 0, 1, 2, 3}</tt></p>
* </td>
* <td valign="top">
* The matrix IS NOT REORDERED.<br>
* Here is how it would look<br>
* like, if it would be reordered<br>
* accoring to <tt>rowIndexes</tt>.<br>
* <tt>8 x 3 matrix:<br>
* 2, 1, 0<br>
* 5, 4, 3<br>
* 8, 7, 6<br>
* 11, 10, 9<br>
* 23, 22, 21<br>
* 20, 19, 18<br>
* 17, 16, 15<br>
* 14, 13, 12 </tt></td>
* </tr>
*</table>
*@param matrix the matrix to be partitioned.
*@param rowIndexes the index of the i-th row; is modified by this method to reflect partitioned indexes.
*@param rowFrom the index of the first row (inclusive).
*@param rowTo the index of the last row (inclusive).
*@param column the index of the column to partition on.
*@param splitters the values at which the rows shall be split into intervals.
* Must be sorted ascending and must not contain multiple identical values.
* These preconditions are not checked; be sure that they are met.
*
*@param splitFrom the index of the first splitter element to be considered.
*@param splitTo the index of the last splitter element to be considered.
* The method considers the splitter elements <tt>splitters[splitFrom] .. splitters[splitTo]</tt>.
*
*@param splitIndexes a list into which this method fills the indexes of rows delimiting intervals.
*Upon return <tt>splitIndexes[splitFrom..splitTo]</tt> will be set accordingly.
*Therefore, must satisfy <tt>splitIndexes.length >= splitters.length</tt>.
*/
public static void partition(DoubleMatrix2D matrix, int[] rowIndexes, int rowFrom, int rowTo, int column, final double[] splitters, int splitFrom, int splitTo, int[] splitIndexes) {
if (rowFrom < 0 || rowTo >= matrix.rows() || rowTo >= rowIndexes.length)
throw new IllegalArgumentException();
if (column < 0 || column >= matrix.columns())
throw new IllegalArgumentException();
if (splitFrom < 0 || splitTo >= splitters.length)
throw new IllegalArgumentException();
if (splitIndexes.length < splitters.length)
throw new IllegalArgumentException();
// this one knows how to swap two row indexes (a,b)
final int[] g = rowIndexes;
Swapper swapper = new Swapper() {
public void swap(int b, int c) {
int tmp = g[b];
g[b] = g[c];
g[c] = tmp;
}
};
// compare splitter[a] with columnView[rowIndexes[b]]
final DoubleMatrix1D columnView = matrix.viewColumn(column);
IntComparator comp = new IntComparator() {
public int compare(int a, int b) {
double av = splitters[a];
double bv = columnView.getQuick(g[b]);
return av < bv ? -1 : (av == bv ? 0 : 1);
}
};
// compare columnView[rowIndexes[a]] with columnView[rowIndexes[b]]
IntComparator comp2 = new IntComparator() {
public int compare(int a, int b) {
double av = columnView.getQuick(g[a]);
double bv = columnView.getQuick(g[b]);
return av < bv ? -1 : (av == bv ? 0 : 1);
}
};
// compare splitter[a] with splitter[b]
IntComparator comp3 = new IntComparator() {
public int compare(int a, int b) {
double av = splitters[a];
double bv = splitters[b];
return av < bv ? -1 : (av == bv ? 0 : 1);
}
};
// generic partitioning does the main work of reordering row indexes
cern.colt.Partitioning.genericPartition(rowFrom, rowTo, splitFrom, splitTo, splitIndexes, comp, comp2, comp3, swapper);
}
Aggregations