Search in sources :

Example 1 with UnknownLogScaleException

Usage of ubic.gemma.core.analysis.preprocess.UnknownLogScaleException in the Gemma project by PavlidisLab.

From the class ExpressionDataDoubleMatrixUtil, method filterAndLog2Transform.

/**
 * Brings the data onto a log2 scale when it is not there already, then filters out rows that are unsuitable for
 * analysis. Count data is turned into log2 counts per million (log2cpm is what gets stored as the processed data,
 * so that is the form that would normally be passed in).
 *
 * @param quantitationType quantitation type handed to the scale detection
 * @param dmatrix          expression data to transform and filter
 * @return the log2-scale, filtered expression data matrix
 */
public static ExpressionDataDoubleMatrix filterAndLog2Transform(QuantitationType quantitationType, ExpressionDataDoubleMatrix dmatrix) {
    // Work on a local reference; only the count branch and the filters swap in a different matrix object.
    ExpressionDataDoubleMatrix working = dmatrix;
    ScaleType detectedScale = findScale(quantitationType, working.getMatrix());

    if (detectedScale.equals(ScaleType.LOG2)) {
        log.info("Data is already on a log2 scale");
    } else if (detectedScale.equals(ScaleType.LN)) {
        log.info(" **** Converting from ln to log2 **** ");
        MatrixStats.convertToLog2(working.getMatrix(), Math.E);
    } else if (detectedScale.equals(ScaleType.LOG10)) {
        log.info(" **** Converting from log10 to log2 **** ");
        MatrixStats.convertToLog2(working.getMatrix(), 10);
    } else if (detectedScale.equals(ScaleType.LINEAR)) {
        log.info(" **** LOG TRANSFORMING **** ");
        MatrixStats.logTransform(working.getMatrix());
    } else if (detectedScale.equals(ScaleType.COUNT)) {
        // Not expected to be hit any more because log2cpm is what is stored. Unlike the branches above, the
        // converted values go into a new matrix instead of being written in place.
        log.info(" **** Converting from count to log2 counts per million **** ");
        DoubleMatrix1D columnTotals = MatrixStats.colSums(working.getMatrix());
        DoubleMatrix<CompositeSequence, BioMaterial> countsPerMillion = MatrixStats.convertToLog2Cpm(working.getMatrix(), columnTotals);
        working = new ExpressionDataDoubleMatrix(working, countsPerMillion);
    } else {
        throw new UnknownLogScaleException("Can't figure out what scale the data are on");
    }

    // Filtering deliberately comes after the scale conversion: running it first caused a subtle problem
    // (possibly round-off) that was never fully tracked down.

    // Step 1: drop zero-variance rows.
    int rowsBeforeFiltering = working.rows();
    working = ExpressionExperimentFilter.zeroVarianceFilter(working);
    int rowsAfterVarianceFilter = working.rows();
    if (rowsAfterVarianceFilter < rowsBeforeFiltering) {
        log.info((rowsBeforeFiltering - rowsAfterVarianceFilter) + " rows removed due to low variance");
    }

    // Step 2 is applied only when there are more columns than COLUMNS_LIMIT.
    if (working.columns() <= COLUMNS_LIMIT) {
        return working;
    }

    // Step 2: drop rows with many equal values even though their variance is non-zero, which happens when data
    // has been "clipped" (e.g., all values under 10 set to 10).
    working = ExpressionExperimentFilter.tooFewDistinctValues(working, VALUES_LIMIT);
    int rowsAfterDistinctFilter = working.rows();
    if (rowsAfterDistinctFilter < rowsAfterVarianceFilter) {
        log.info((rowsAfterVarianceFilter - rowsAfterDistinctFilter) + " rows removed due to too many identical values");
    }
    return working;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) ScaleType(ubic.gemma.model.common.quantitationtype.ScaleType) DoubleMatrix1D(cern.colt.matrix.DoubleMatrix1D) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) UnknownLogScaleException(ubic.gemma.core.analysis.preprocess.UnknownLogScaleException)

Aggregations

DoubleMatrix1D (cern.colt.matrix.DoubleMatrix1D)1 UnknownLogScaleException (ubic.gemma.core.analysis.preprocess.UnknownLogScaleException)1 ScaleType (ubic.gemma.model.common.quantitationtype.ScaleType)1 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)1 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)1