Search in sources :

Example 96 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

In the class DataUpdater, the method matchElementsToRowNames.

/**
 * Re-keys the rows of a string-labelled matrix with the platform elements that carry the same names. Rows whose
 * name has no identically named element on the platform are dropped; the first 20 such rows are reported
 * individually in the log, followed by a single notice that further warnings are suppressed.
 *
 * @param targetArrayDesign platform whose composite sequences are looked up by name
 * @param rawMatrix         matrix whose row names are matched against the platform's element names
 * @return matrix with row names fixed up. ColumnNames still need to be done.
 * @throws IllegalArgumentException if not a single row name matches a platform element
 * @throws IllegalStateException    if the resulting matrix reports no row names after they were set
 */
private DoubleMatrix<CompositeSequence, BioMaterial> matchElementsToRowNames(ArrayDesign targetArrayDesign, DoubleMatrix<String, String> rawMatrix) {
    // Index the platform elements by name for constant-time lookup.
    Map<String, CompositeSequence> elementsByName = new HashMap<>();
    for (CompositeSequence element : targetArrayDesign.getCompositeSequences()) {
        elementsByName.put(element.getName(), element);
    }

    assert !rawMatrix.getRowNames().isEmpty();

    // The two lists are filled in lock-step: entry i of one corresponds to entry i of the other.
    List<String> matchedNames = new ArrayList<>();
    List<CompositeSequence> matchedElements = new ArrayList<>();
    int unmatched = 0;
    for (String rowName : rawMatrix.getRowNames()) {
        CompositeSequence element = elementsByName.get(rowName);
        if (element != null) {
            matchedNames.add(rowName);
            matchedElements.add(element);
            continue;
        }

        // A few misses are tolerated; misses 1..20 are logged one by one, miss 21 logs the cut-off notice.
        unmatched++;
        if (unmatched <= 20) {
            DataUpdater.log.warn("No platform match to element named: " + rowName);
        } else if (unmatched == 21) {
            DataUpdater.log.warn("Further warnings suppressed");
        }
    }

    if (matchedElements.isEmpty()) {
        throw new IllegalArgumentException("None of the rows matched the given platform elements");
    }

    // Only subset the input when something actually has to be left out.
    DoubleMatrix<String, String> source = rawMatrix;
    if (unmatched > 0) {
        DataUpdater.log.warn(unmatched + "/" + rawMatrix.rows() + " elements could not be matched to the platform. Lines that did not match will be ignored.");
        source = rawMatrix.subsetRows(matchedNames);
    }

    DoubleMatrix<CompositeSequence, BioMaterial> rekeyed = new DenseDoubleMatrix<>(source.getRawMatrix());
    rekeyed.setRowNames(matchedElements);
    if (rekeyed.getRowNames().isEmpty()) {
        throw new IllegalStateException("Failed to get row names");
    }
    // Column names are not set here; the caller still has to do that.
    return rekeyed;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) DoubleArrayList(cern.colt.list.DoubleArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 97 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

In the class DataUpdater, the method makeBioMaterialNameMap.

/**
 * Builds a lookup from every label a sample may go by to the sample itself. For each bioassay of the experiment
 * the sample used is registered under its name, under the bioassay's accession (when present) and under the
 * sample's own external accession (when present).
 *
 * @param ee experiment whose bioassays are scanned
 * @return map of strings to biomaterials, where the keys are likely column names used in the input files.
 * @throws IllegalStateException if any label is about to be registered a second time
 */
private Map<String, BioMaterial> makeBioMaterialNameMap(ExpressionExperiment ee) {
    Map<String, BioMaterial> samplesByLabel = new HashMap<>();

    for (BioAssay assay : ee.getBioAssays()) {
        BioMaterial sample = assay.getSampleUsed();

        // 1. The sample's name. A clash may be harmless, but it is treated as an error to be on the safe side.
        String sampleName = sample.getName();
        if (samplesByLabel.containsKey(sampleName)) {
            throw new IllegalStateException("Two biomaterials from the same experiment with the same name ");
        }
        samplesByLabel.put(sampleName, sample);

        // 2. The bioassay's accession, e.g. GSM123455.
        if (assay.getAccession() != null) {
            String assayAccession = assay.getAccession().getAccession();
            if (samplesByLabel.containsKey(assayAccession)) {
                throw new IllegalStateException("Two bioassays with the same accession");
            }
            samplesByLabel.put(assayAccession, sample);
        }

        // 3. The sample's external accession; probably always null for GEO-derived data.
        if (sample.getExternalAccession() != null) {
            String sampleAccession = sample.getExternalAccession().getAccession();
            if (samplesByLabel.containsKey(sampleAccession)) {
                throw new IllegalStateException("Two biomaterials with the same accession");
            }
            samplesByLabel.put(sampleAccession, sample);
        }
    }

    return samplesByLabel;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 98 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

In the class DataUpdater, the method dealWithMissingSamples.

/**
 * Reconciles the number of bioassays in the experiment with the number of columns in the data. When the data has
 * fewer columns and missing samples are allowed, the bioassays that no column refers to are removed from the
 * experiment and deleted.
 *
 * @param ee                  experiment to check (and possibly prune)
 * @param countMatrix         data whose column names identify the samples that have data
 * @param allowMissingSamples whether bioassays without a data column may be removed
 * @return the experiment; a freshly loaded and thawed instance if bioassays were removed, otherwise the argument
 * @throws IllegalArgumentException if the data has more columns than the experiment has bioassays, or fewer and
 *                                  missing samples are not allowed
 * @throws IllegalStateException    if a column name matches no biomaterial, or the bioassay count still differs
 *                                  from the column count after the removal
 */
private ExpressionExperiment dealWithMissingSamples(ExpressionExperiment ee, DoubleMatrix<String, String> countMatrix, boolean allowMissingSamples) {
    if (ee.getBioAssays().size() < countMatrix.columns()) {
        throw new IllegalArgumentException("Extra data provided: The experiment has " + ee.getBioAssays().size() + " samples but the data has " + countMatrix.columns() + " columns.");
    }
    if (ee.getBioAssays().size() == countMatrix.columns()) {
        // Counts agree: nothing to reconcile.
        return ee;
    }

    // From here on the experiment has more bioassays than the data has columns.
    if (!allowMissingSamples) {
        throw new IllegalArgumentException("Too little data provided: The experiment has " + ee.getBioAssays().size() + " samples but the data has " + countMatrix.columns() + " columns.");
    }

    // Collect the bioassays that the data columns refer to.
    Map<String, BioMaterial> samplesByLabel = this.makeBioMaterialNameMap(ee);
    List<BioAssay> assaysWithData = new ArrayList<>();
    for (String columnLabel : countMatrix.getColNames()) {
        BioMaterial sample = samplesByLabel.get(columnLabel);
        if (sample == null) {
            throw new IllegalStateException("Could not match a column name to a biomaterial: " + columnLabel);
        }
        assaysWithData.addAll(sample.getBioAssaysUsedIn());
    }
    assert assaysWithData.size() == countMatrix.columns();

    // Everything else in the experiment is a candidate for removal.
    Collection<BioAssay> assaysWithoutData = new HashSet<>();
    for (BioAssay assay : ee.getBioAssays()) {
        if (assaysWithData.contains(assay)) {
            continue;
        }
        assaysWithoutData.add(assay);
        DataUpdater.log.info("Will remove unused bioassay from experiment: " + assay);
    }
    if (assaysWithoutData.isEmpty()) {
        return ee;
    }

    // Detach the unused bioassays, persist, then reload so the size check below runs on a fresh instance.
    ee.getBioAssays().removeAll(assaysWithoutData);
    experimentService.update(ee);
    ee = experimentService.thawLite(experimentService.load(ee.getId()));
    if (ee.getBioAssays().size() != countMatrix.columns()) {
        throw new IllegalStateException("Something went wrong, could not remove unused samples");
    }

    // The update above should already have taken care of this; remove each one explicitly anyway.
    for (BioAssay orphan : assaysWithoutData) {
        bioAssayService.remove(orphan);
    }
    return ee;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) DoubleArrayList(cern.colt.list.DoubleArrayList) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 99 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

In the class DataUpdater, the method replaceData.

/**
 * Replaces the data associated with the experiment, or adds it if there is none, using a matrix that is still
 * labelled with plain strings. The resulting data become the 'preferred' quantitation type. Note that this
 * replaces the "raw" data. Similar to AffyPowerToolsProbesetSummarize.convertDesignElementDataVectors and code in
 * SimpleExpressionDataLoaderService.
 * This overload complements the other replaceData so that data read more directly from files can be used: the
 * matching of rows to platform elements and of columns to samples happens here before delegating.
 *
 * @param ee             experiment whose data is replaced
 * @param targetPlatform platform the row names are matched against (this only works for a single-platform data
 *                       set)
 * @param qt             quantitation type for the new data
 * @param data           matrix with string row and column names
 * @return the experiment as returned by the delegated replaceData call
 */
// Possible external use
@SuppressWarnings("UnusedReturnValue")
public ExpressionExperiment replaceData(ExpressionExperiment ee, ArrayDesign targetPlatform, QuantitationType qt, DoubleMatrix<String, String> data) {
    ArrayDesign thawedPlatform = this.arrayDesignService.thaw(targetPlatform);
    ExpressionExperiment thawedExperiment = this.experimentService.thawLite(ee);

    // Rows first, then columns: the column step fills in the matrix produced by the row step.
    DoubleMatrix<CompositeSequence, BioMaterial> matched = this.matchElementsToRowNames(thawedPlatform, data);
    this.matchBioMaterialsToColNames(thawedExperiment, data, matched);

    return this.replaceData(thawedExperiment, thawedPlatform, new ExpressionDataDoubleMatrix(thawedExperiment, qt, matched));
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 100 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

In the class DataUpdater, the method addCountData.

/**
 * Replaces data. Starting from the count data, log2cpm is computed and stored as the experiment's replacement
 * data; it is the preferred quantitation type used internally. Counts and FPKM/RPKM (if provided) are stored in
 * addition.
 *
 * @param ee                  experiment to receive the data
 * @param targetArrayDesign   - this should be one of the "Generic" gene-based platforms. The data set will be
 *                            switched to use it.
 * @param countMatrix         Representing 'raw' counts; required.
 * @param rpkmMatrix          Representing per-gene normalized data, optional (RPKM or FPKM); added after the
 *                            counts.
 * @param readLength          passed on to addTotalCountInformation together with the count data
 * @param isPairedReads       passed on to addTotalCountInformation together with the count data
 * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
 * @throws IllegalArgumentException if countMatrix is null
 */
public void addCountData(ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength, Boolean isPairedReads, boolean allowMissingSamples) {
    if (countMatrix == null) {
        throw new IllegalArgumentException("You must provide count matrix (rpkm is optional)");
    }

    ArrayDesign platform = arrayDesignService.thaw(targetArrayDesign);
    ee = experimentService.thawLite(ee);
    ee = this.dealWithMissingSamples(ee, countMatrix, allowMissingSamples);

    // Counts: label rows with platform elements and columns with samples.
    DoubleMatrix<CompositeSequence, BioMaterial> counts = this.matchElementsToRowNames(platform, countMatrix);
    this.matchBioMaterialsToColNames(ee, countMatrix, counts);
    assert !counts.getColNames().isEmpty();
    assert !counts.getRowNames().isEmpty();
    QuantitationType countType = this.makeCountQt();
    ExpressionDataDoubleMatrix countData = new ExpressionDataDoubleMatrix(ee, countType, counts);

    // log2cpm: column sums are taken over the matrix as supplied, the conversion over the matched matrix.
    QuantitationType log2cpmType = this.makelog2cpmQt();
    DoubleMatrix1D columnTotals = MatrixStats.colSums(countMatrix);
    DoubleMatrix<CompositeSequence, BioMaterial> log2cpm = MatrixStats.convertToLog2Cpm(counts, columnTotals);
    ExpressionDataDoubleMatrix log2cpmData = new ExpressionDataDoubleMatrix(ee, log2cpmType, log2cpm);

    // log2cpm replaces whatever was there; the counts are then added alongside.
    ee = this.replaceData(ee, platform, log2cpmData);
    ee = this.addData(ee, platform, countData);
    this.addTotalCountInformation(ee, countData, readLength, isPairedReads);

    if (rpkmMatrix != null) {
        // Optional normalized data goes through the same labelling steps and is added last.
        DoubleMatrix<CompositeSequence, BioMaterial> rpkm = this.matchElementsToRowNames(platform, rpkmMatrix);
        this.matchBioMaterialsToColNames(ee, rpkmMatrix, rpkm);
        assert !rpkm.getColNames().isEmpty();
        assert !rpkm.getRowNames().isEmpty();
        QuantitationType rpkmType = this.makeRPKMQt();
        ExpressionDataDoubleMatrix rpkmData = new ExpressionDataDoubleMatrix(ee, rpkmType, rpkm);
        ee = this.addData(ee, platform, rpkmData);
    }

    assert !processedExpressionDataVectorService.getProcessedDataVectors(ee).isEmpty();
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) DoubleMatrix1D(cern.colt.matrix.DoubleMatrix1D) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Aggregations

BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)132 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)67 FactorValue (ubic.gemma.model.expression.experiment.FactorValue)27 ExperimentalFactor (ubic.gemma.model.expression.experiment.ExperimentalFactor)22 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)19 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)15 HashSet (java.util.HashSet)13 Test (org.junit.Test)13 ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)12 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)12 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)10 InputStream (java.io.InputStream)7 DenseDoubleMatrix (ubic.basecode.dataStructure.matrix.DenseDoubleMatrix)7 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)7 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)7 Characteristic (ubic.gemma.model.common.description.Characteristic)6 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)6 DoubleArrayList (cern.colt.list.DoubleArrayList)5 DoubleMatrix1D (cern.colt.matrix.DoubleMatrix1D)5 ArrayList (java.util.ArrayList)5