Use of ubic.gemma.core.analysis.preprocess.UnknownLogScaleException in project Gemma by PavlidisLab.
Example: the filterAndLog2Transform method of the ExpressionDataDoubleMatrixUtil class.
/**
 * Log2 transform if necessary, do any required filtering prior to analysis. Count data is converted to log2CPM (but
 * we store log2cpm as the processed data, so that is what would generally be used).
 *
 * @param quantitationType QT
 * @param dmatrix matrix
 * @return ee data double matrix
 */
public static ExpressionDataDoubleMatrix filterAndLog2Transform( QuantitationType quantitationType,
        ExpressionDataDoubleMatrix dmatrix ) {

    ScaleType scale = ExpressionDataDoubleMatrixUtil.findScale( quantitationType, dmatrix.getMatrix() );

    switch ( scale ) {
        case LOG2:
            // Nothing to do.
            ExpressionDataDoubleMatrixUtil.log.info( "Data is already on a log2 scale" );
            break;
        case LN:
            ExpressionDataDoubleMatrixUtil.log.info( " **** Converting from ln to log2 **** " );
            MatrixStats.convertToLog2( dmatrix.getMatrix(), Math.E );
            break;
        case LOG10:
            ExpressionDataDoubleMatrixUtil.log.info( " **** Converting from log10 to log2 **** " );
            MatrixStats.convertToLog2( dmatrix.getMatrix(), 10 );
            break;
        case LINEAR:
            ExpressionDataDoubleMatrixUtil.log.info( " **** LOG TRANSFORMING **** " );
            MatrixStats.logTransform( dmatrix.getMatrix() );
            break;
        case COUNT: {
            /*
             * Since we store log2cpm this shouldn't be reached any more. We don't do it in place.
             */
            ExpressionDataDoubleMatrixUtil.log.info( " **** Converting from count to log2 counts per million **** " );
            DoubleMatrix1D librarySize = MatrixStats.colSums( dmatrix.getMatrix() );
            dmatrix = new ExpressionDataDoubleMatrix( dmatrix,
                    MatrixStats.convertToLog2Cpm( dmatrix.getMatrix(), librarySize ) );
            break;
        }
        default:
            throw new UnknownLogScaleException( "Can't figure out what scale the data are on" );
    }

    /*
     * Filtering is deliberately done AFTER the transformation: doing it first caused some kind of subtle problem
     * (round off? it was never fully tracked down).
     *
     * Drop zero-variance rows, and also rows dominated by a single repeated value even when the variance is
     * non-zero — this occurs when data is "clipped" (e.g., all values under 10 set to 10).
     */
    int before = dmatrix.rows();
    dmatrix = ExpressionExperimentFilter.zeroVarianceFilter( dmatrix );
    int removed = before - dmatrix.rows();
    if ( removed > 0 ) {
        ExpressionDataDoubleMatrixUtil.log.info( removed + " rows removed due to low variance" );
    }

    before = dmatrix.rows();
    // The distinct-values filter is only applied to datasets with enough samples for it to be meaningful.
    if ( dmatrix.columns() > ExpressionDataDoubleMatrixUtil.COLUMNS_LIMIT ) {
        dmatrix = ExpressionExperimentFilter.tooFewDistinctValues( dmatrix, ExpressionDataDoubleMatrixUtil.VALUES_LIMIT );
        removed = before - dmatrix.rows();
        if ( removed > 0 ) {
            ExpressionDataDoubleMatrixUtil.log.info( removed + " rows removed due to too many identical values" );
        }
    }

    return dmatrix;
}
Aggregations