Search in sources :

Example 26 with ExpressionDataDoubleMatrix

use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.

the class ExpressionExperimentFilter method doFilter.

/**
 * Apply filters as configured by the command line parameters and technology type. See getFilteredMatrix for the
 * details of what filters are applied and the ordering.
 *
 * @param eeDoubleMatrix , already masked for missing values.
 * @return A data matrix in which filters have been applied and missing values (in the PRESENTABSENT quantitation
 * type, if present) are masked
 */
private ExpressionDataDoubleMatrix doFilter(ExpressionDataDoubleMatrix eeDoubleMatrix) {
    ExpressionDataDoubleMatrix filteredMatrix = eeDoubleMatrix;
    int startingRows = eeDoubleMatrix.rows();
    config.setStartingRows(startingRows);
    if (config.isRequireSequences()) {
        filteredMatrix = this.noSequencesFilter(eeDoubleMatrix);
        if (filteredMatrix.rows() == 0) {
            // This can happen if the array design is not populated. To avoid problems with useless failures, just
            // skip this step.
            ExpressionExperimentFilter.log.warn("There were no sequences for the platform(s), but allowing filtering to go forward anyway despite config settings.");
            filteredMatrix = eeDoubleMatrix;
        // throw new IllegalStateException( "No rows left after removing elements without sequences" );
        }
    }
    int afterSequenceRemovalRows = filteredMatrix.rows();
    int afterAffyControlsFilter = afterSequenceRemovalRows;
    int afterMinPresentFilter = afterSequenceRemovalRows;
    int afterLowVarianceCut = afterSequenceRemovalRows;
    int afterLowExpressionCut = afterSequenceRemovalRows;
    int afterZeroVarianceCut;
    if (this.usesAffymetrix()) {
        ExpressionExperimentFilter.log.debug("Filtering Affymetrix controls");
        filteredMatrix = this.affyControlProbeFilter(filteredMatrix);
        afterAffyControlsFilter = filteredMatrix.rows();
    }
    config.setAfterInitialFilter(afterAffyControlsFilter);
    if (config.isMinPresentFractionIsSet() && !config.isIgnoreMinimumSampleThreshold()) {
        ExpressionExperimentFilter.log.debug("Filtering for missing data");
        filteredMatrix = this.minPresentFilter(filteredMatrix);
        afterMinPresentFilter = filteredMatrix.rows();
        config.setAfterMinPresentFilter(afterMinPresentFilter);
        if (filteredMatrix.rows() == 0) {
            throw new IllegalStateException("No rows left after minimum non-missing data filtering");
        }
    }
    /*
         * Always remove rows that have a variance of zero.
         */
    ExpressionExperimentFilter.log.debug("Filtering rows with zero variance");
    filteredMatrix = ExpressionExperimentFilter.zeroVarianceFilter(filteredMatrix);
    afterZeroVarianceCut = filteredMatrix.rows();
    config.setAfterZeroVarianceCut(afterZeroVarianceCut);
    if (filteredMatrix.rows() == 0) {
        throw new IllegalStateException("No rows left after filtering rows with zero variance");
    }
    /*
         * Filtering lowly expressed genes.
         */
    if (config.isLowExpressionCutIsSet()) {
        ExpressionExperimentFilter.log.debug("Filtering for low or too high expression");
        Map<CompositeSequence, Double> ranks = eeDoubleMatrix.getRanks();
        filteredMatrix = this.lowExpressionFilter(filteredMatrix, ranks);
        afterLowExpressionCut = filteredMatrix.rows();
        config.setAfterLowExpressionCut(afterLowExpressionCut);
        if (filteredMatrix.rows() == 0) {
            throw new IllegalStateException("No rows left after expression level filtering");
        }
    }
    /*
         *
         * Variance filtering is a little tricky. For ratiometric arrays, you clearly should use the variance. For
         * 'signal' arrays, we tried using the CV, but this has problems when the mean is near zero (filtering by low
         * expression first helps). If the data are on a log scale, and furthermore variance-stabilized (RMA for
         * example), this is less of an issue.
         *
         * The variance is probably the safest bet and seems to be what others use. For example see Hackstadt and Hess,
         * BMC Bioinformatics 2009 10:11 (http://www.ncbi.nlm.nih.gov/pubmed/19133141)
         */
    if (config.isLowVarianceCutIsSet()) {
        ExpressionExperimentFilter.log.debug("Filtering for low variance ");
        filteredMatrix = this.lowVarianceFilter(filteredMatrix);
        afterLowVarianceCut = filteredMatrix.rows();
        config.setAfterLowVarianceCut(afterLowVarianceCut);
        if (filteredMatrix.rows() == 0) {
            throw new IllegalStateException("No rows left after variance filtering");
        }
    }
    if (ExpressionExperimentFilter.log.isInfoEnabled()) {
        StringBuilder buf = new StringBuilder();
        buf.append("Filter summary:\n");
        buf.append("Filter summary for ").append(eeDoubleMatrix.getExpressionExperiment()).append(":\n");
        buf.append("Started with\t").append(startingRows).append(" (").append(eeDoubleMatrix.columns()).append(" columns) ").append("\n");
        if (config.isRequireSequences())
            buf.append("After Seq\t").append(afterSequenceRemovalRows).append("\n");
        if (this.usesAffymetrix())
            buf.append("After removing Affy controls\t").append(afterAffyControlsFilter).append("\n");
        if (config.isMinPresentFractionIsSet() && !config.isIgnoreMinimumSampleThreshold())
            buf.append("After MinPresent\t").append(afterMinPresentFilter).append("\n");
        buf.append("After ZeroVar\t").append(afterZeroVarianceCut).append("\n");
        if (config.isLowExpressionCutIsSet())
            buf.append("After LowExpr\t").append(afterLowExpressionCut).append("\n");
        if (config.isLowVarianceCutIsSet())
            buf.append("After LowVar\t").append(afterLowVarianceCut).append("\n");
        buf.append("================================================================\n");
        ExpressionExperimentFilter.log.info(buf.toString());
    }
    return filteredMatrix;
}
Also used : ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 27 with ExpressionDataDoubleMatrix

use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.

the class ExpressionExperimentFilter method getFilteredMatrix.

/**
 * Provides a ready-to-use expression data matrix that is transformed and filtered. The processes that are applied,
 * in this order:
 * <ol>
 * <li>Log transform, if requested and not already done
 * <li>Use the missing value data to mask the preferred data (ratiometric data only)
 * <li>Remove rows that don't have biosequences (always applied)
 * <li>Remove Affymetrix control probes (Affymetrix only)
 * <li>Remove rows that have too many missing values (as configured)
 * <li>Remove rows with low variance (ratiometric) or CV (one-color) (as configured)
 * <li>Remove rows with very high or low expression (as configured)
 * </ol>
 *
 * @param dataVectors data vectors
 * @return filtered matrix
 */
public ExpressionDataDoubleMatrix getFilteredMatrix(Collection<ProcessedExpressionDataVector> dataVectors) {
    ExpressionDataDoubleMatrix eeDoubleMatrix = new ExpressionDataDoubleMatrix(dataVectors);
    this.transform(eeDoubleMatrix);
    return this.filter(eeDoubleMatrix);
}
Also used : ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)

Example 28 with ExpressionDataDoubleMatrix

use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.

the class RowMissingValueFilter method filter.

@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    int numRows = data.rows();
    int numCols = data.columns();
    IntArrayList present = new IntArrayList(numRows);
    List<CompositeSequence> kept = new ArrayList<>();
    /*
         * Do not allow minPresentFraction to override minPresent if minPresent is higher.
         */
    if (minPresentFractionIsSet) {
        int proposedMinimumNumberOfSamples = (int) Math.ceil(minPresentFraction * numCols);
        if (!minPresentIsSet) {
            this.setMinPresentCount(proposedMinimumNumberOfSamples);
        } else if (proposedMinimumNumberOfSamples > minPresentCount) {
            RowMissingValueFilter.log.info("The minimum number of samples is already set to " + this.minPresentCount + " but computed missing threshold from fraction of " + minPresentFraction + " is higher (" + proposedMinimumNumberOfSamples + ")");
            this.setMinPresentCount(proposedMinimumNumberOfSamples);
        } else {
            RowMissingValueFilter.log.info("The minimum number of samples is already set to " + this.minPresentCount + " and computed missing threshold from fraction of " + minPresentFraction + " is lower (" + proposedMinimumNumberOfSamples + "), keeping higher value.");
        }
    }
    if (minPresentCount > numCols) {
        throw new IllegalStateException("Minimum present count is set to " + minPresentCount + " but there are only " + numCols + " columns in the matrix.");
    }
    if (!minPresentIsSet) {
        RowMissingValueFilter.log.info("No filtering was requested");
        return data;
    }
    /* first pass - determine how many missing values there are per row */
    for (int i = 0; i < numRows; i++) {
        CompositeSequence designElementForRow = data.getDesignElementForRow(i);
        /* allow for the possibility that the absent/present matrix is not in the same order, etc. */
        int absentPresentRow = absentPresentCalls == null ? -1 : absentPresentCalls.getRowIndex(designElementForRow);
        int presentCount = 0;
        for (int j = 0; j < numCols; j++) {
            boolean callIsPresent = true;
            if (absentPresentRow >= 0) {
                callIsPresent = absentPresentCalls.get(absentPresentRow, j);
            }
            if (!Double.isNaN(data.get(i, j)) && callIsPresent) {
                presentCount++;
            }
        }
        present.add(presentCount);
        if (presentCount >= RowMissingValueFilter.ABSOLUTE_MIN_PRESENT && presentCount >= minPresentCount) {
            kept.add(designElementForRow);
        }
    }
    /* decide whether we need to invoke the 'too many removed' clause, to avoid removing too many rows. */
    if (maxFractionRemoved != 0.0 && kept.size() < numRows * (1.0 - maxFractionRemoved)) {
        IntArrayList sortedPresent = present.copy();
        sortedPresent.sort();
        sortedPresent.reverse();
        RowMissingValueFilter.log.info("There are " + kept.size() + " rows that meet criterion of at least " + minPresentCount + " non-missing values, but that's too many given the max fraction of " + maxFractionRemoved + "; minPresent adjusted to " + sortedPresent.get((int) (numRows * (maxFractionRemoved))));
        minPresentCount = sortedPresent.get((int) (numRows * (maxFractionRemoved)));
        // Do another pass to add rows we missed before.
        for (int i = 0; i < numRows; i++) {
            if (present.get(i) >= minPresentCount && present.get(i) >= RowMissingValueFilter.ABSOLUTE_MIN_PRESENT) {
                CompositeSequence designElementForRow = data.getDesignElementForRow(i);
                if (kept.contains(designElementForRow))
                    continue;
                kept.add(designElementForRow);
            }
        }
    }
    RowMissingValueFilter.log.info("Retaining " + kept.size() + " rows that meet criterion of at least " + minPresentCount + " non-missing values");
    return new ExpressionDataDoubleMatrix(data, kept);
}
Also used : ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) IntArrayList(cern.colt.list.IntArrayList) ArrayList(java.util.ArrayList) IntArrayList(cern.colt.list.IntArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 29 with ExpressionDataDoubleMatrix

use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.

the class ExpressionExperimentBatchCorrectionServiceImpl method doComBat.

private ExpressionDataDoubleMatrix doComBat(ExpressionExperiment ee, ExpressionDataDoubleMatrix originalDataMatrix, ObjectMatrix<BioMaterial, ExperimentalFactor, Object> design) {
    ObjectMatrix<BioMaterial, String, Object> designU = this.convertFactorValuesToStrings(design);
    DoubleMatrix<CompositeSequence, BioMaterial> matrix = originalDataMatrix.getMatrix();
    designU = this.orderMatrix(matrix, designU);
    ScaleType scale = originalDataMatrix.getQuantitationTypes().iterator().next().getScale();
    boolean transformed = false;
    if (!(scale.equals(ScaleType.LOG2) || scale.equals(ScaleType.LOG10) || scale.equals(ScaleType.LOGBASEUNKNOWN) || scale.equals(ScaleType.LN))) {
        ExpressionExperimentBatchCorrectionServiceImpl.log.info(" *** COMBAT: LOG TRANSFORMING ***");
        transformed = true;
        MatrixStats.logTransform(matrix);
    }
    /*
         * Process
         */
    ComBat<CompositeSequence, BioMaterial> comBat = new ComBat<>(matrix, designU);
    // false: NONPARAMETRIC
    DoubleMatrix2D results = comBat.run(true);
    // note these plots always reflect the parametric setup.
    // TEMPORARY?
    comBat.plot(ee.getId() + "." + FileTools.cleanForFileName(ee.getShortName()));
    /*
         * Postprocess. Results is a raw matrix/
         */
    DoubleMatrix<CompositeSequence, BioMaterial> correctedDataMatrix = new DenseDoubleMatrix<>(results.toArray());
    correctedDataMatrix.setRowNames(matrix.getRowNames());
    correctedDataMatrix.setColumnNames(matrix.getColNames());
    if (transformed) {
        MatrixStats.unLogTransform(correctedDataMatrix);
    }
    ExpressionDataDoubleMatrix correctedExpressionDataMatrix = new ExpressionDataDoubleMatrix(originalDataMatrix, correctedDataMatrix);
    assert correctedExpressionDataMatrix.getQuantitationTypes().size() == 1;
    /*
         * It is easier if we make a new quantitationtype.
         */
    QuantitationType oldQt = correctedExpressionDataMatrix.getQuantitationTypes().iterator().next();
    QuantitationType newQt = this.makeNewQuantitationType(oldQt);
    correctedExpressionDataMatrix.getQuantitationTypes().clear();
    correctedExpressionDataMatrix.getQuantitationTypes().add(newQt);
    // Sanity check...
    for (int i = 0; i < correctedExpressionDataMatrix.columns(); i++) {
        assert correctedExpressionDataMatrix.getBioMaterialForColumn(i).equals(originalDataMatrix.getBioMaterialForColumn(i));
    }
    return correctedExpressionDataMatrix;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) ScaleType(ubic.gemma.model.common.quantitationtype.ScaleType) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) DoubleMatrix2D(cern.colt.matrix.DoubleMatrix2D) DenseDoubleMatrix(ubic.basecode.dataStructure.matrix.DenseDoubleMatrix) QuantitationType(ubic.gemma.model.common.quantitationtype.QuantitationType)

Example 30 with ExpressionDataDoubleMatrix

use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.

the class RowsWithSequencesFilter method filter.

@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix dataMatrix) {
    List<CompositeSequence> kept = new ArrayList<>();
    int numRows = dataMatrix.rows();
    for (int i = 0; i < numRows; i++) {
        CompositeSequence cs = dataMatrix.getDesignElementForRow(i);
        if (cs.getBiologicalCharacteristic() != null) {
            kept.add(cs);
        }
    }
    RowsWithSequencesFilter.log.info("Retaining " + kept.size() + "/" + numRows + " rows that have associated BioSequences");
    return new ExpressionDataDoubleMatrix(dataMatrix, kept);
}
Also used : ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) ArrayList(java.util.ArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Aggregations

ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)41 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)18 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)12 Test (org.junit.Test)9 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)7 ArrayList (java.util.ArrayList)6 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)6 DenseDoubleMatrix (ubic.basecode.dataStructure.matrix.DenseDoubleMatrix)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)5 AlreadyExistsInSystemException (ubic.gemma.core.loader.util.AlreadyExistsInSystemException)5 ProcessedExpressionDataVector (ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector)5 DoubleMatrix1D (cern.colt.matrix.DoubleMatrix1D)4 DoubleMatrix2D (cern.colt.matrix.DoubleMatrix2D)4 InputStream (java.io.InputStream)4 DoubleVectorValueObject (ubic.gemma.model.expression.bioAssayData.DoubleVectorValueObject)4 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)4 Algebra (cern.colt.matrix.linalg.Algebra)3 GeoDomainObjectGeneratorLocal (ubic.gemma.core.loader.expression.geo.GeoDomainObjectGeneratorLocal)3 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)3 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)3