use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.
the class DataUpdaterTest method testLoadRNASeqData.
/*
 * End-to-end RNA-seq loading test using GSE19166: the experiment is fetched, count and RPKM matrices are read
 * from test resources and attached via the data updater, and the resulting quantitation types, bioassays and
 * data vectors are checked.
 */
@Test
public void testLoadRNASeqData() throws Exception {
    // Number of rows in the test matrices; every per-element count below is expected to match it.
    final int expectedElements = 199;

    geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGenerator());

    // Fetch the experiment, or pick it out of the exception payload when it is already in the system.
    ExpressionExperiment ee;
    try {
        Collection<?> loaded = geoService.fetchAndLoad("GSE19166", false, false, false);
        ee = (ExpressionExperiment) loaded.iterator().next();
    } catch (AlreadyExistsInSystemException alreadyThere) {
        List<?> existing = (List<?>) alreadyThere.getData();
        ee = (ExpressionExperiment) existing.get(0);
    }
    ee = experimentService.thaw(ee);

    // Read both matrices from flat files on the classpath.
    DoubleMatrixReader matrixReader = new DoubleMatrixReader();
    try (InputStream countData = this.getClass().getResourceAsStream("/data/loader/expression/flatfileload/GSE19166_expression_count.test.txt");
            InputStream rpkmData = this.getClass().getResourceAsStream("/data/loader/expression/flatfileload/GSE19166_expression_RPKM.test.txt")) {
        DoubleMatrix<String, String> countMatrix = matrixReader.read(countData);
        DoubleMatrix<String, String> rpkmMatrix = matrixReader.read(rpkmData);

        List<String> elementNames = countMatrix.getRowNames();
        assertEquals(expectedElements, elementNames.size());

        // Build a platform whose elements carry the row names of the count matrix.
        targetArrayDesign = this.getTestPersistentArrayDesign(elementNames, taxonService.findByCommonName("human"));
        targetArrayDesign = arrayDesignService.thaw(targetArrayDesign);
        assertEquals(expectedElements, targetArrayDesign.getCompositeSequences().size());

        // The step under test.
        dataUpdater.addCountData(ee, targetArrayDesign, countMatrix, rpkmMatrix, 36, true, false);
    }

    ee = experimentService.thaw(ee);

    // Expected quantitation types: log2cpm, counts, rpkm, and counts-masked ('preferred').
    assertEquals(4, ee.getQuantitationTypes().size());

    // Every bioassay must now point at the target platform.
    for (BioAssay assay : ee.getBioAssays()) {
        assertEquals(targetArrayDesign, assay.getArrayDesignUsed());
    }

    assertNotNull(ee.getNumberOfDataVectors());
    assertEquals(expectedElements, ee.getNumberOfDataVectors().intValue());

    // Column sums of the count file, per sample:
    // GSM475204 GSM475205 GSM475206 GSM475207 GSM475208 GSM475209
    // 3949585 3929008 3712314 3693219 3574068 3579631
    ExpressionDataDoubleMatrix processedMatrix = dataMatrixService.getProcessedExpressionDataMatrix(ee);
    assertEquals(expectedElements, processedMatrix.rows());
    TestUtils.assertBAs(ee, targetArrayDesign, "GSM475204", 3949585);

    // Three raw vector sets plus one processed set, each with one vector per element.
    assertEquals(3 * expectedElements, ee.getRawExpressionDataVectors().size());
    assertEquals(expectedElements, ee.getProcessedExpressionDataVectors().size());

    Collection<DoubleVectorValueObject> processedArrays = dataVectorService.getProcessedDataArrays(ee);
    assertEquals(expectedElements, processedArrays.size());
    for (DoubleVectorValueObject vector : processedArrays) {
        assertEquals(6, vector.getBioAssays().size());
    }

    // Processed vectors must also be retrievable for a freshly loaded instance of the experiment.
    assertTrue(!dataVectorService.getProcessedDataVectors(experimentService.load(ee.getId())).isEmpty());
}
use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.
the class LinkAnalysisServiceImpl method analyze.
/**
 * Runs the link analysis pipeline for one experiment: QC check, matrix filtering, set-up, removal of probes
 * without a mapped gene, normalization, the analysis itself and, if configured, a check of the resulting
 * correlation distribution.
 *
 * @param ee                 the experiment being analyzed
 * @param filterConfig       settings handed to the matrix filtering step
 * @param linkAnalysisConfig settings for QC, normalization and the optional distribution check
 * @param la                 the analysis object that is set up and then run
 * @param dataVectors        the processed vectors the filtered matrix is built from
 */
private void analyze(ExpressionExperiment ee, FilterConfig filterConfig, LinkAnalysisConfig linkAnalysisConfig, LinkAnalysis la, Collection<ProcessedExpressionDataVector> dataVectors) {
    this.qcCheck(linkAnalysisConfig, ee);

    ExpressionDataDoubleMatrix filteredMatrix = expressionDataMatrixService.getFilteredMatrix(ee, filterConfig, dataVectors);
    this.setUpForAnalysis(ee, la, dataVectors, filteredMatrix);

    Map<CompositeSequence, Set<Gene>> genesByProbe = la.getProbeToGeneMap();
    assert !genesByProbe.isEmpty();

    // Drop probes with no gene mapped to them, not only those lacking a sequence.
    ExpressionDataDoubleMatrix mappedMatrix = this.filterUnmappedProbes(filteredMatrix, genesByProbe);
    this.checkDatamatrix(mappedMatrix);

    LinkAnalysisServiceImpl.log.info("Starting link analysis... " + ee);
    this.normalize(mappedMatrix, linkAnalysisConfig);

    // Link analysis proper.
    this.addAnalysisObj(ee, filterConfig, linkAnalysisConfig, la);
    la.analyze();

    // Optional second QC pass over the correlation distribution.
    CoexpCorrelationDistribution distribution = la.getCorrelationDistribution();
    if (linkAnalysisConfig.isCheckCorrelationDistribution()) {
        this.diagnoseCorrelationDistribution(ee, distribution);
    }
}
use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.
the class GeeqServiceImpl method scoreMissingValues.
/**
 * Sets the missing-values score on the given Geeq. Experiments with raw data always get the positive score;
 * otherwise the processed data matrix is inspected and the positive score is given only when the matrix could
 * be obtained and reports no missing values.
 *
 * @param ee         the experiment whose processed matrix is inspected when there is no raw data
 * @param gq         receives the no-vectors flag, any problem message and the score
 * @param hasRawData whether the experiment has raw data; if true the matrix is not looked at
 */
private void scoreMissingValues(ExpressionExperiment ee, Geeq gq, boolean hasRawData) {
    boolean vectorsAvailable = true;
    boolean missingValuesFound = false;
    String issues = "";

    if (!hasRawData) {
        try {
            missingValuesFound = expressionDataMatrixService.getProcessedExpressionDataMatrix(ee).hasMissingValues();
        } catch (IllegalArgumentException e) {
            // Handled as "no processed vectors" without recording a problem message.
            vectorsAvailable = false;
        } catch (Exception e) {
            // Any other failure also counts as "no processed vectors", and its message is recorded.
            vectorsAvailable = false;
            issues = GeeqServiceImpl.ERR_MSG_MISSING_VALS + e.getMessage();
        }
    }

    double score;
    if (hasRawData || (vectorsAvailable && !missingValuesFound)) {
        score = GeeqServiceImpl.P_10;
    } else {
        score = GeeqServiceImpl.N_10;
    }

    gq.setNoVectors(!vectorsAvailable);
    gq.addOtherIssues(issues);
    gq.setsScoreMissingValues(score);
}
use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.
the class DataUpdater method replaceData.
/**
 * Replaces the data associated with the experiment, or adds it if there is none. These data become the
 * 'preferred' quantitation type; note that it is the "raw" data that gets replaced. Comparable to
 * AffyPowerToolsProbesetSummarize.convertDesignElementDataVectors and to code in
 * SimpleExpressionDataLoaderService.
 * <p>
 * Unlike the other replaceData, this variant accepts a matrix keyed by plain names, so that data read directly
 * from files can be used; matching of elements and samples is done here before delegating.
 *
 * @param ee             the experiment, thawed (lite) before use
 * @param targetPlatform the platform, thawed before use (this only works for a single-platform data set)
 * @param qt             quantitation type given to the resulting expression matrix
 * @param data           matrix with names as row and column keys
 * @return the experiment as returned by the delegate replaceData
 */
// Possible external use
@SuppressWarnings("UnusedReturnValue")
public ExpressionExperiment replaceData(ExpressionExperiment ee, ArrayDesign targetPlatform, QuantitationType qt, DoubleMatrix<String, String> data) {
    ArrayDesign thawedPlatform = this.arrayDesignService.thaw(targetPlatform);
    ExpressionExperiment thawedExperiment = this.experimentService.thawLite(ee);

    // Match rows against the platform first, then columns against the experiment.
    DoubleMatrix<CompositeSequence, BioMaterial> matched = this.matchElementsToRowNames(thawedPlatform, data);
    this.matchBioMaterialsToColNames(thawedExperiment, data, matched);

    ExpressionDataDoubleMatrix expressionMatrix = new ExpressionDataDoubleMatrix(thawedExperiment, qt, matched);
    return this.replaceData(thawedExperiment, thawedPlatform, expressionMatrix);
}
use of ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix in project Gemma by PavlidisLab.
the class DataUpdater method addCountData.
/**
 * Replaces the experiment's data using RNA-seq counts. Starting from the count data, log2cpm is computed; it is
 * the preferred quantitation type we use internally and is stored through replaceData. The counts themselves,
 * followed by RPKM/FPKM values if provided, are stored in addition through addData.
 *
 * @param ee                  ee
 * @param targetArrayDesign   this should be one of the "Generic" gene-based platforms. The data set will be
 *                            switched to use it.
 * @param countMatrix         Representing 'raw' counts; required. Stored after log2cpm and before rpkm.
 * @param rpkmMatrix          Representing per-gene normalized data, optional (RPKM or FPKM)
 * @param readLength          handed to addTotalCountInformation together with the count data
 * @param isPairedReads       handed to addTotalCountInformation together with the count data
 * @param allowMissingSamples if true, samples that are missing data will be deleted from the experiment.
 * @throws IllegalArgumentException if countMatrix is null
 */
public void addCountData(ExpressionExperiment ee, ArrayDesign targetArrayDesign, DoubleMatrix<String, String> countMatrix, DoubleMatrix<String, String> rpkmMatrix, Integer readLength, Boolean isPairedReads, boolean allowMissingSamples) {
    if (countMatrix == null) {
        throw new IllegalArgumentException("You must provide count matrix (rpkm is optional)");
    }

    targetArrayDesign = arrayDesignService.thaw(targetArrayDesign);
    ee = experimentService.thawLite(ee);
    ee = this.dealWithMissingSamples(ee, countMatrix, allowMissingSamples);

    // Counts, keyed by platform elements and biomaterials.
    DoubleMatrix<CompositeSequence, BioMaterial> alignedCounts = this.alignToPlatformAndSamples(ee, targetArrayDesign, countMatrix);
    QuantitationType countType = this.makeCountQt();
    ExpressionDataDoubleMatrix countData = new ExpressionDataDoubleMatrix(ee, countType, alignedCounts);

    // log2cpm from the aligned counts; library sizes are the column sums of the matrix as supplied.
    QuantitationType log2cpmType = this.makelog2cpmQt();
    DoubleMatrix1D librarySizes = MatrixStats.colSums(countMatrix);
    DoubleMatrix<CompositeSequence, BioMaterial> log2cpmValues = MatrixStats.convertToLog2Cpm(alignedCounts, librarySizes);
    ExpressionDataDoubleMatrix log2cpmData = new ExpressionDataDoubleMatrix(ee, log2cpmType, log2cpmValues);

    // log2cpm replaces what is there; counts are added alongside.
    ee = this.replaceData(ee, targetArrayDesign, log2cpmData);
    ee = this.addData(ee, targetArrayDesign, countData);
    this.addTotalCountInformation(ee, countData, readLength, isPairedReads);

    if (rpkmMatrix != null) {
        DoubleMatrix<CompositeSequence, BioMaterial> alignedRpkm = this.alignToPlatformAndSamples(ee, targetArrayDesign, rpkmMatrix);
        QuantitationType rpkmType = this.makeRPKMQt();
        ExpressionDataDoubleMatrix rpkmData = new ExpressionDataDoubleMatrix(ee, rpkmType, alignedRpkm);
        ee = this.addData(ee, targetArrayDesign, rpkmData);
    }

    assert !processedExpressionDataVectorService.getProcessedDataVectors(ee).isEmpty();
}

/**
 * Runs matchElementsToRowNames against the platform and matchBioMaterialsToColNames against the experiment for
 * a name-keyed matrix, asserts that the result has both columns and rows, and returns it.
 */
private DoubleMatrix<CompositeSequence, BioMaterial> alignToPlatformAndSamples(ExpressionExperiment ee, ArrayDesign platform, DoubleMatrix<String, String> rawMatrix) {
    DoubleMatrix<CompositeSequence, BioMaterial> aligned = this.matchElementsToRowNames(platform, rawMatrix);
    this.matchBioMaterialsToColNames(ee, rawMatrix, aligned);
    assert !aligned.getColNames().isEmpty();
    assert !aligned.getRowNames().isEmpty();
    return aligned;
}
Aggregations