use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdater method matchElementsToRowNames.
/**
 * Re-keys the rows of a raw matrix by platform element. Each row name is looked up among the names of the
 * elements of the given platform; rows with no match are dropped from the result and a warning is logged (only the
 * first 20 are reported individually).
 *
 * @param targetArrayDesign platform whose elements are matched, by name, against the row names of the matrix
 * @param rawMatrix         matrix whose row names are element names
 * @return matrix with row names fixed up. ColumnNames still need to be done.
 * @throws IllegalArgumentException if not a single row matches an element of the platform
 * @throws IllegalStateException    if the resulting matrix ends up without row names
 */
private DoubleMatrix<CompositeSequence, BioMaterial> matchElementsToRowNames(ArrayDesign targetArrayDesign, DoubleMatrix<String, String> rawMatrix) {
    // Index the platform elements by name for the lookups below.
    Map<String, CompositeSequence> elementsByName = new HashMap<>();
    for (CompositeSequence element : targetArrayDesign.getCompositeSequences()) {
        elementsByName.put(element.getName(), element);
    }

    assert !rawMatrix.getRowNames().isEmpty();

    // These two lists are filled in lock-step: the i-th element corresponds to the i-th retained row name.
    List<CompositeSequence> matchedElements = new ArrayList<>();
    List<String> matchedRowNames = new ArrayList<>();
    int unmatched = 0;
    for (String rowName : rawMatrix.getRowNames()) {
        CompositeSequence element = elementsByName.get(rowName);
        if (element != null) {
            matchedRowNames.add(rowName);
            matchedElements.add(element);
            continue;
        }

        // This might be okay, but not too much. Report the first 20 misses, then say once that we stop reporting.
        if (unmatched < 20) {
            DataUpdater.log.warn("No platform match to element named: " + rowName);
        } else if (unmatched == 20) {
            DataUpdater.log.warn("Further warnings suppressed");
        }
        unmatched++;
    }

    if (matchedElements.isEmpty()) {
        throw new IllegalArgumentException("None of the rows matched the given platform elements");
    }

    // Only pay for the row subset when something actually has to be dropped.
    DoubleMatrix<String, String> source = rawMatrix;
    if (unmatched > 0) {
        DataUpdater.log.warn(unmatched + "/" + rawMatrix.rows() + " elements could not be matched to the platform. Lines that did not match will be ignored.");
        source = rawMatrix.subsetRows(matchedRowNames);
    }

    DoubleMatrix<CompositeSequence, BioMaterial> result = new DenseDoubleMatrix<>(source.getRawMatrix());
    result.setRowNames(matchedElements);
    if (result.getRowNames().isEmpty()) {
        throw new IllegalStateException("Failed to get row names");
    }
    // Column names are not set here; the caller still has to do that.
    return result;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdater method makeBioMaterialNameMap.
/**
 * Builds a lookup from candidate column names to biomaterials. For every bioassay of the experiment, the
 * biomaterial it uses is registered under the biomaterial's name, under the bioassay's accession (if it has one)
 * and under the biomaterial's own external accession (if it has one).
 *
 * @param ee ee
 * @return map of strings to biomaterials, where the keys are likely column names used in the input files.
 * @throws IllegalStateException if any key would be registered twice; the message names the offending key
 */
private Map<String, BioMaterial> makeBioMaterialNameMap(ExpressionExperiment ee) {
    Map<String, BioMaterial> bmMap = new HashMap<>();
    Collection<BioAssay> bioAssays = ee.getBioAssays();
    for (BioAssay bioAssay : bioAssays) {
        BioMaterial bm = bioAssay.getSampleUsed();

        String name = bm.getName();
        if (bmMap.containsKey(name)) {
            // this might not actually be an error - but just in case...
            throw new IllegalStateException("Two biomaterials from the same experiment with the same name " + name);
        }
        bmMap.put(name, bm);

        if (bioAssay.getAccession() != null) {
            // e.g. GSM123455
            String accession = bioAssay.getAccession().getAccession();
            if (bmMap.containsKey(accession)) {
                throw new IllegalStateException("Two bioassays with the same accession: " + accession);
            }
            bmMap.put(accession, bm);
        }

        // I think it will always be null, if it is from GEO anyway.
        if (bm.getExternalAccession() != null) {
            String externalAccession = bm.getExternalAccession().getAccession();
            if (bmMap.containsKey(externalAccession)) {
                throw new IllegalStateException("Two biomaterials with the same accession: " + externalAccession);
            }
            bmMap.put(externalAccession, bm);
        }
    }
    return bmMap;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdater method dealWithMissingSamples.
/**
 * Reconciles the number of bioassays of the experiment with the number of columns of the data.
 * <p>
 * If the counts agree, the experiment is returned as is. If the data has more columns than the experiment has
 * bioassays, this fails. If the experiment has more bioassays than there are columns, then either this fails
 * (allowMissingSamples is false) or the bioassays that cannot be reached from any column name are removed from the
 * experiment, the experiment is updated, reloaded and thawed, and the removed bioassays are deleted.
 *
 * @param ee                  experiment
 * @param countMatrix         data; its column names are matched to biomaterials via makeBioMaterialNameMap
 * @param allowMissingSamples whether bioassays lacking data may be removed from the experiment
 * @return the experiment; a freshly loaded instance if bioassays were removed, otherwise the one passed in
 * @throws IllegalArgumentException if the data has too many columns, or too few while removal is not allowed
 * @throws IllegalStateException    if a column name matches no biomaterial, or if the bioassay count still differs
 *                                  from the column count after the removal
 */
private ExpressionExperiment dealWithMissingSamples(ExpressionExperiment ee, DoubleMatrix<String, String> countMatrix, boolean allowMissingSamples) {
    final int sampleCount = ee.getBioAssays().size();

    if (sampleCount < countMatrix.columns()) {
        throw new IllegalArgumentException("Extra data provided: The experiment has " + sampleCount + " samples but the data has " + countMatrix.columns() + " columns.");
    }
    if (sampleCount == countMatrix.columns()) {
        return ee;
    }

    // From here on the experiment has more bioassays than the data has columns.
    if (!allowMissingSamples) {
        throw new IllegalArgumentException("Too little data provided: The experiment has " + sampleCount + " samples but the data has " + countMatrix.columns() + " columns.");
    }

    // Collect the bioassays that are backed by a column of the data.
    Map<String, BioMaterial> materialsByName = this.makeBioMaterialNameMap(ee);
    List<BioAssay> assaysWithData = new ArrayList<>();
    for (String columnName : countMatrix.getColNames()) {
        BioMaterial material = materialsByName.get(columnName);
        if (material == null) {
            throw new IllegalStateException("Could not match a column name to a biomaterial: " + columnName);
        }
        assaysWithData.addAll(material.getBioAssaysUsedIn());
    }
    assert assaysWithData.size() == countMatrix.columns();

    // Everything else is unused.
    Collection<BioAssay> unused = new HashSet<>();
    for (BioAssay assay : ee.getBioAssays()) {
        if (assaysWithData.contains(assay)) {
            continue;
        }
        unused.add(assay);
        DataUpdater.log.info("Will remove unused bioassay from experiment: " + assay);
    }
    if (unused.isEmpty()) {
        return ee;
    }

    // Detach the unused bioassays, persist, then work with a freshly loaded and thawed experiment.
    ee.getBioAssays().removeAll(unused);
    experimentService.update(ee);
    ee = experimentService.load(ee.getId());
    ee = experimentService.thawLite(ee);
    if (ee.getBioAssays().size() != countMatrix.columns()) {
        throw new IllegalStateException("Something went wrong, could not remove unused samples");
    }

    // this should already be done...
    for (BioAssay assay : unused) {
        bioAssayService.remove(assay);
    }
    return ee;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdater method replaceData.
/**
 * Replace the data associated with the experiment (or add it if there is none). These data become the 'preferred'
 * quantitation type. Note that this replaces the "raw" data. Similar to
 * AffyPowerToolsProbesetSummarize.convertDesignElementDataVectors and code in
 * SimpleExpressionDataLoaderService.
 * <p>
 * This overload takes a matrix keyed by plain strings, so that data can be read more directly from files: the
 * matching of rows to platform elements and of columns to samples happens here, and the result is handed to the
 * other replaceData.
 *
 * @param ee             ee (thawed here)
 * @param targetPlatform (this only works for a single-platform data set); thawed here
 * @param qt             quantitation type of the data
 * @param data           data, with element names as row names
 * @return ee, as returned by the other replaceData
 */
// Possible external use
@SuppressWarnings("UnusedReturnValue")
public ExpressionExperiment replaceData(ExpressionExperiment ee, ArrayDesign targetPlatform, QuantitationType qt, DoubleMatrix<String, String> data) {
    ArrayDesign thawedPlatform = this.arrayDesignService.thaw(targetPlatform);
    ExpressionExperiment thawedExperiment = this.experimentService.thawLite(ee);

    // Rows first (this creates the matrix), then the columns are filled in on that same matrix.
    DoubleMatrix<CompositeSequence, BioMaterial> matched = this.matchElementsToRowNames(thawedPlatform, data);
    this.matchBioMaterialsToColNames(thawedExperiment, data, matched);

    return this.replaceData(thawedExperiment, thawedPlatform, new ExpressionDataDoubleMatrix(thawedExperiment, qt, matched));
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdater method addCountData.
/**
 * Replaces data. Starting with the count data, we compute the log2cpm, which is the preferred quantitation type we
 * use internally. Counts and FPKM (if provided) are stored in addition.
 *
 * @param ee                  ee
 * @param targetArrayDesign   - this should be one of the "Generic" gene-based platforms. The data set will be
 *                            switched to use it.
 * @param countMatrix         Representing 'raw' counts (added after rpkm, if provided). Required.
 * @param rpkmMatrix          Representing per-gene normalized data, optional (RPKM or FPKM)
 * @param readLength          handed to addTotalCountInformation together with the count data
 * @param isPairedReads       handed to addTotalCountInformation together with the count data
 * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
 * @throws IllegalArgumentException if countMatrix is null
 */
public void addCountData(ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength, Boolean isPairedReads, boolean allowMissingSamples) {
    if (countMatrix == null) {
        throw new IllegalArgumentException("You must provide count matrix (rpkm is optional)");
    }

    ArrayDesign platform = arrayDesignService.thaw(targetArrayDesign);
    ExpressionExperiment experiment = this.dealWithMissingSamples(experimentService.thawLite(ee), countMatrix, allowMissingSamples);

    // Counts: match rows to platform elements, then columns to samples.
    DoubleMatrix<CompositeSequence, BioMaterial> counts = this.matchElementsToRowNames(platform, countMatrix);
    this.matchBioMaterialsToColNames(experiment, countMatrix, counts);
    assert !counts.getColNames().isEmpty();
    assert !counts.getRowNames().isEmpty();
    ExpressionDataDoubleMatrix countData = new ExpressionDataDoubleMatrix(experiment, this.makeCountQt(), counts);

    // log2cpm: note the library sizes are the column sums of the matrix as supplied, not of the row-matched one.
    QuantitationType log2cpmQt = this.makelog2cpmQt();
    DoubleMatrix1D librarySize = MatrixStats.colSums(countMatrix);
    DoubleMatrix<CompositeSequence, BioMaterial> log2cpm = MatrixStats.convertToLog2Cpm(counts, librarySize);
    ExpressionDataDoubleMatrix log2cpmData = new ExpressionDataDoubleMatrix(experiment, log2cpmQt, log2cpm);

    // The log2cpm data replaces whatever was there; the counts are then added on top.
    experiment = this.replaceData(experiment, platform, log2cpmData);
    experiment = this.addData(experiment, platform, countData);
    this.addTotalCountInformation(experiment, countData, readLength, isPairedReads);

    if (rpkmMatrix != null) {
        DoubleMatrix<CompositeSequence, BioMaterial> rpkm = this.matchElementsToRowNames(platform, rpkmMatrix);
        this.matchBioMaterialsToColNames(experiment, rpkmMatrix, rpkm);
        assert !rpkm.getColNames().isEmpty();
        assert !rpkm.getRowNames().isEmpty();
        ExpressionDataDoubleMatrix rpkmData = new ExpressionDataDoubleMatrix(experiment, this.makeRPKMQt(), rpkm);
        experiment = this.addData(experiment, platform, rpkmData);
    }

    assert !processedExpressionDataVectorService.getProcessedDataVectors(experiment).isEmpty();
}
Aggregations