use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class ExpressionDataDoubleMatrixTest method testConstructExpressionDataDoubleMatrixWithGeoValues.
/**
* This is a self-contained test. That is, it does not depend on the setup in onSetUpInTransaction}. It tests
* creating an {@link ExpressionDataDoubleMatrix} using real values from the Gene Expression Omnibus (GEO). That is,
* we have obtained information from GSE994. The probe sets used are 218120_s_at and 121_at, and the samples used
* are GSM15697 and GSM15744. Specifically, we the Gemma objects that correspond to the GEO objects are:
* DesignElement 1 = 218120_s_at, DesignElement 2 = 121_at
* BioAssay 1 = "Current Smoker 73", BioAssay 2 = "Former Smoker 34"
* BioMaterial 1 = "GSM15697", BioMaterial 2 = "GSM15744"
* BioAssayDimension = "GSM15697, GSM15744" (the names of all the biomaterials).
*/
@Test
public void testConstructExpressionDataDoubleMatrixWithGeoValues() {
ByteArrayConverter bac = new ByteArrayConverter();
ee = ExpressionExperiment.Factory.newInstance();
QuantitationType qt = QuantitationType.Factory.newInstance();
qt.setName("VALUE");
qt.setIsBackgroundSubtracted(false);
qt.setIsNormalized(false);
qt.setIsBackground(false);
qt.setIsRatio(false);
qt.setIsPreferred(true);
qt.setIsMaskedPreferred(false);
qt.setRepresentation(PrimitiveType.DOUBLE);
BioAssayDimension bioAssayDimension = BioAssayDimension.Factory.newInstance();
bioAssayDimension.setName("GSM15697, GSM15744");
List<BioAssay> assays = new ArrayList<>();
BioAssay assay1 = BioAssay.Factory.newInstance();
assay1.setName("Current Smoker 73");
BioMaterial sample1 = BioMaterial.Factory.newInstance();
sample1.setName("GSM15697");
assay1.setSampleUsed(sample1);
assays.add(assay1);
BioAssay assay2 = BioAssay.Factory.newInstance();
assay2.setName("Former Smoker 34");
BioMaterial sample2 = BioMaterial.Factory.newInstance();
sample2.setName("GSM15744");
assay2.setSampleUsed(sample2);
assays.add(assay2);
bioAssayDimension.setBioAssays(assays);
RawExpressionDataVector vector1 = RawExpressionDataVector.Factory.newInstance();
double[] ddata1 = { 74.9, 101.7 };
byte[] bdata1 = bac.doubleArrayToBytes(ddata1);
vector1.setData(bdata1);
vector1.setQuantitationType(qt);
vector1.setBioAssayDimension(bioAssayDimension);
RawExpressionDataVector vector2 = RawExpressionDataVector.Factory.newInstance();
double[] ddata2 = { 404.6, 318.7 };
byte[] bdata2 = bac.doubleArrayToBytes(ddata2);
vector2.setData(bdata2);
vector2.setQuantitationType(qt);
vector2.setBioAssayDimension(bioAssayDimension);
ArrayDesign ad = ArrayDesign.Factory.newInstance();
ad.setName("test ar");
CompositeSequence de1 = CompositeSequence.Factory.newInstance();
de1.setName("218120_s_at");
vector1.setDesignElement(de1);
BioSequence bs1 = BioSequence.Factory.newInstance();
bs1.setName("test1");
de1.setBiologicalCharacteristic(bs1);
de1.setArrayDesign(ad);
CompositeSequence de2 = CompositeSequence.Factory.newInstance();
de2.setName("121_at");
BioSequence bs2 = BioSequence.Factory.newInstance();
bs2.setName("test2");
de2.setBiologicalCharacteristic(bs2);
de2.setArrayDesign(ad);
vector2.setDesignElement(de2);
Collection<RawExpressionDataVector> eeVectors = new LinkedHashSet<>();
eeVectors.add(vector1);
eeVectors.add(vector2);
ee.setRawExpressionDataVectors(eeVectors);
ExpressionDataDoubleMatrix expressionDataMatrix = new ExpressionDataDoubleMatrix(eeVectors);
assertNotNull(expressionDataMatrix);
assertEquals(expressionDataMatrix.rows(), 2);
assertEquals(expressionDataMatrix.columns(), 2);
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class DataUpdaterTest method testAddData.
@Test
public void testAddData() throws Exception {
/*
* Load a regular data set that has no data. Platform is (basically) irrelevant.
*/
geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGeneratorLocal(this.getTestFileBasePath()));
ExpressionExperiment ee;
try {
// RNA-seq data.
Collection<?> results = geoService.fetchAndLoad("GSE37646", false, true, false);
ee = (ExpressionExperiment) results.iterator().next();
} catch (AlreadyExistsInSystemException e) {
// log.warn( "Test skipped because GSE37646 was not removed from the system prior to test" );
ee = (ExpressionExperiment) ((List<?>) e.getData()).get(0);
}
ee = experimentService.thawLite(ee);
List<BioAssay> bioAssays = new ArrayList<>(ee.getBioAssays());
assertEquals(31, bioAssays.size());
List<BioMaterial> bms = new ArrayList<>();
for (BioAssay ba : bioAssays) {
bms.add(ba.getSampleUsed());
}
targetArrayDesign = this.getTestPersistentArrayDesign(100, true);
DoubleMatrix<CompositeSequence, BioMaterial> rawMatrix = new DenseDoubleMatrix<>(targetArrayDesign.getCompositeSequences().size(), bms.size());
/*
* make up some fake data on another platform, and match it to those samples
*/
for (int i = 0; i < rawMatrix.rows(); i++) {
for (int j = 0; j < rawMatrix.columns(); j++) {
rawMatrix.set(i, j, (i + 1) * (j + 1) * Math.random() / 100.0);
}
}
List<CompositeSequence> probes = new ArrayList<>(targetArrayDesign.getCompositeSequences());
rawMatrix.setRowNames(probes);
rawMatrix.setColumnNames(bms);
QuantitationType qt = this.makeQt(true);
ExpressionDataDoubleMatrix data = new ExpressionDataDoubleMatrix(ee, qt, rawMatrix);
assertNotNull(data.getBestBioAssayDimension());
assertEquals(rawMatrix.columns(), data.getBestBioAssayDimension().getBioAssays().size());
assertEquals(probes.size(), data.getMatrix().rows());
/*
* Replace it.
*/
ee = dataUpdater.replaceData(ee, targetArrayDesign, data);
for (BioAssay ba : ee.getBioAssays()) {
assertEquals(targetArrayDesign, ba.getArrayDesignUsed());
}
ee = experimentService.thaw(ee);
for (BioAssay ba : ee.getBioAssays()) {
assertEquals(targetArrayDesign, ba.getArrayDesignUsed());
}
assertEquals(100, ee.getRawExpressionDataVectors().size());
for (RawExpressionDataVector v : ee.getRawExpressionDataVectors()) {
assertTrue(v.getQuantitationType().getIsPreferred());
}
assertEquals(100, ee.getProcessedExpressionDataVectors().size());
Collection<DoubleVectorValueObject> processedDataArrays = dataVectorService.getProcessedDataArrays(ee);
for (DoubleVectorValueObject v : processedDataArrays) {
assertEquals(31, v.getBioAssays().size());
}
/*
* Test adding data (non-preferred)
*/
qt = this.makeQt(false);
ExpressionDataDoubleMatrix moreData = new ExpressionDataDoubleMatrix(ee, qt, rawMatrix);
ee = dataUpdater.addData(ee, targetArrayDesign, moreData);
ee = experimentService.thaw(ee);
try {
// add preferred data twice.
dataUpdater.addData(ee, targetArrayDesign, data);
fail("Should have gotten an exception");
} catch (IllegalArgumentException e) {
// okay.
}
dataUpdater.deleteData(ee, qt);
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class ExperimentalDesignImportDuplicateValueTest method testParse.
/*
* Note that this test will fail if you run it again on a dirty DB. Sorry!
*/
@Test
public final void testParse() throws Exception {
try (InputStream is = this.getClass().getResourceAsStream("/data/loader/expression/expdesign.import.testfull.txt")) {
experimentalDesignImporter.importDesign(ee, is);
}
Collection<BioMaterial> bms = new HashSet<>();
for (BioAssay ba : ee.getBioAssays()) {
BioMaterial bm = ba.getSampleUsed();
bms.add(bm);
}
checkResults(bms);
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class LinearModelAnalyzer method run.
@Override
public Collection<DifferentialExpressionAnalysis> run(ExpressionExperiment expressionExperiment, ExpressionDataDoubleMatrix dmatrix, DifferentialExpressionAnalysisConfig config) {
/*
* I apologize for this being so complicated. Basically there are four phases:
*
* 1. Get the data matrix and factors
*
* 2. Determine baseline groups; build model and contrasts
*
* 3. Run the analysis
*
* 4. Postprocess the analysis
*
* By far the most complex is #2 -- it depends on which factors and what kind they are.
*/
/*
* Initialize our matrix and factor lists...
*/
List<ExperimentalFactor> factors = config.getFactorsToInclude();
/*
* FIXME this is the place to strip put the outliers.
*/
List<BioMaterial> samplesUsed = ExperimentalDesignUtils.getOrderedSamples(dmatrix, factors);
// enforce ordering
dmatrix = new ExpressionDataDoubleMatrix(samplesUsed, dmatrix);
/*
* Do the analysis, by subsets if requested
*/
Collection<DifferentialExpressionAnalysis> results = new HashSet<>();
ExperimentalFactor subsetFactor = config.getSubsetFactor();
if (subsetFactor != null) {
if (factors.contains(subsetFactor)) {
throw new IllegalStateException("Subset factor cannot also be included in the analysis [ Factor was: " + subsetFactor + "]");
}
Map<FactorValue, ExpressionDataDoubleMatrix> subsets = this.makeSubSets(config, dmatrix, samplesUsed, subsetFactor);
LinearModelAnalyzer.log.info("Total number of subsets: " + subsets.size());
/*
* Now analyze each subset
*/
for (FactorValue subsetFactorValue : subsets.keySet()) {
LinearModelAnalyzer.log.info("Analyzing subset: " + subsetFactorValue);
/*
* Checking for DE_Exclude characteristics, which should not be included in the analysis.
* As requested in issue #4458 (bugzilla)
*/
boolean include = true;
for (Characteristic c : subsetFactorValue.getCharacteristics()) {
if (LinearModelAnalyzer.EXCLUDE_CHARACTERISTICS_VALUES.contains(c.getValue())) {
include = false;
break;
}
}
if (!include) {
LinearModelAnalyzer.log.warn(LinearModelAnalyzer.EXCLUDE_WARNING);
continue;
}
List<BioMaterial> bioMaterials = ExperimentalDesignUtils.getOrderedSamples(subsets.get(subsetFactorValue), factors);
/*
* make a EESubSet
*/
ExpressionExperimentSubSet eeSubSet = ExpressionExperimentSubSet.Factory.newInstance();
eeSubSet.setSourceExperiment(expressionExperiment);
eeSubSet.setName("Subset for " + subsetFactorValue);
Collection<BioAssay> bioAssays = new HashSet<>();
for (BioMaterial bm : bioMaterials) {
bioAssays.addAll(bm.getBioAssaysUsedIn());
}
eeSubSet.getBioAssays().addAll(bioAssays);
Collection<ExperimentalFactor> subsetFactors = this.fixFactorsForSubset(subsets.get(subsetFactorValue), eeSubSet, factors);
DifferentialExpressionAnalysisConfig subsetConfig = this.fixConfigForSubset(factors, config, subsetFactorValue);
if (subsetFactors.isEmpty()) {
LinearModelAnalyzer.log.warn("Experimental design is not valid for subset: " + subsetFactorValue + "; skipping");
continue;
}
/*
* Run analysis on the subset.
*/
DifferentialExpressionAnalysis analysis = this.doAnalysis(eeSubSet, subsetConfig, subsets.get(subsetFactorValue), bioMaterials, new ArrayList<>(subsetFactors), subsetFactorValue);
if (analysis == null) {
LinearModelAnalyzer.log.warn("No analysis results were obtained for subset: " + subsetFactorValue);
continue;
}
results.add(analysis);
}
} else {
/*
* Analyze the whole thing as one
*/
DifferentialExpressionAnalysis analysis = this.doAnalysis(expressionExperiment, config, dmatrix, samplesUsed, factors, null);
if (analysis == null) {
LinearModelAnalyzer.log.warn("No analysis results were obtained");
} else {
results.add(analysis);
}
}
return results;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class LinearModelAnalyzer method doAnalysis.
/**
* @param bioAssaySet source data, could be a SubSet
* @param dmatrix data
* @param samplesUsed analyzed
* @param factors included in the model
* @param subsetFactorValue null unless analyzing a subset (only used for book-keeping)
* @return analysis, or null if there was a problem.
*/
private DifferentialExpressionAnalysis doAnalysis(BioAssaySet bioAssaySet, DifferentialExpressionAnalysisConfig config, ExpressionDataDoubleMatrix dmatrix, List<BioMaterial> samplesUsed, List<ExperimentalFactor> factors, FactorValue subsetFactorValue) {
// We may want to change this to fall back to running normally, though the real fix is to just finish the ebayes implementation.
if (config.getModerateStatistics() && dmatrix.hasMissingValues()) {
throw new UnsupportedOperationException("Ebayes cannot be used when there are values missing in the data");
}
if (factors.isEmpty()) {
LinearModelAnalyzer.log.error("Must provide at least one factor");
return null;
}
if (samplesUsed.size() <= factors.size()) {
LinearModelAnalyzer.log.error("Must have more samples than factors");
return null;
}
final Map<String, Collection<ExperimentalFactor>> label2Factors = this.getRNames(factors);
Map<ExperimentalFactor, FactorValue> baselineConditions = ExperimentalDesignUtils.getBaselineConditions(samplesUsed, factors);
this.dropIncompleteFactors(samplesUsed, factors, baselineConditions);
if (factors.isEmpty()) {
LinearModelAnalyzer.log.error("Must provide at least one factor; they were all removed due to incomplete values");
return null;
}
QuantitationType quantitationType = dmatrix.getQuantitationTypes().iterator().next();
ExperimentalFactor interceptFactor = this.determineInterceptFactor(factors, quantitationType);
/*
* Build our factor terms, with interactions handled specially
*/
List<String[]> interactionFactorLists = new ArrayList<>();
ObjectMatrix<String, String, Object> designMatrix = ExperimentalDesignUtils.buildDesignMatrix(factors, samplesUsed, baselineConditions);
config.setBaseLineFactorValues(baselineConditions);
boolean oneSampleTTest = interceptFactor != null && factors.size() == 1;
if (!oneSampleTTest) {
this.buildModelFormula(config, label2Factors, interactionFactorLists);
}
// calculate library size before log transformation (FIXME we compute this twice)
DoubleMatrix1D librarySize = MatrixStats.colSums(dmatrix.getMatrix());
/*
* FIXME: remove columns that are marked as outliers.
*/
dmatrix = ExpressionDataDoubleMatrixUtil.filterAndLog2Transform(quantitationType, dmatrix);
DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix = dmatrix.getMatrix();
if (LinearModelAnalyzer.log.isDebugEnabled())
this.outputForDebugging(dmatrix, designMatrix);
/*
* PREPARATION FOR 'NATIVE' FITTING
*/
DoubleMatrix<String, String> sNamedMatrix = this.makeDataMatrix(designMatrix, namedMatrix);
DesignMatrix properDesignMatrix = this.makeDesignMatrix(designMatrix, interactionFactorLists, baselineConditions);
/*
* Run the analysis NOTE this can be simplified if we strip out R code.
*/
final Map<String, LinearModelSummary> rawResults = this.runAnalysis(namedMatrix, sNamedMatrix, properDesignMatrix, librarySize, config);
if (rawResults.size() == 0) {
LinearModelAnalyzer.log.error("Got no results from the analysis");
return null;
}
/*
* Initialize data structures we need to hold results.
*/
// this used to be a Set, but a List is much faster.
Map<String, List<DifferentialExpressionAnalysisResult>> resultLists = new HashMap<>();
Map<String, List<Double>> pvaluesForQvalue = new HashMap<>();
for (String factorName : label2Factors.keySet()) {
resultLists.put(factorName, new ArrayList<DifferentialExpressionAnalysisResult>());
pvaluesForQvalue.put(factorName, new ArrayList<Double>());
}
for (String[] fs : interactionFactorLists) {
String intF = StringUtils.join(fs, ":");
resultLists.put(intF, new ArrayList<DifferentialExpressionAnalysisResult>());
pvaluesForQvalue.put(intF, new ArrayList<Double>());
}
if (pvaluesForQvalue.isEmpty()) {
LinearModelAnalyzer.log.warn("No results were obtained for the current stage of analysis.");
return null;
}
/*
* Create result objects for each model fit. Keeping things in order is important.
*/
final Transformer rowNameExtractor = TransformerUtils.invokerTransformer("getId");
boolean warned = false;
int notUsable = 0;
int processed = 0;
for (CompositeSequence el : namedMatrix.getRowNames()) {
if (++processed % 15000 == 0) {
LinearModelAnalyzer.log.info("Processed results for " + processed + " elements ...");
}
LinearModelSummary lm = rawResults.get(rowNameExtractor.transform(el).toString());
if (LinearModelAnalyzer.log.isDebugEnabled())
LinearModelAnalyzer.log.debug(el.getName() + "\n" + lm);
if (lm == null) {
if (!warned) {
LinearModelAnalyzer.log.warn("No result for " + el + ", further warnings suppressed");
warned = true;
}
notUsable++;
continue;
}
for (String factorName : label2Factors.keySet()) {
if (!pvaluesForQvalue.containsKey(factorName)) {
// was dropped.
continue;
}
Double overallPValue;
DifferentialExpressionAnalysisResult probeAnalysisResult = DifferentialExpressionAnalysisResult.Factory.newInstance();
probeAnalysisResult.setProbe(el);
if (lm.getCoefficients() == null) {
// probeAnalysisResult.setPvalue( null );
// pvaluesForQvalue.get( factorName ).add( overallPValue );
// resultLists.get( factorName ).add( probeAnalysisResult );
notUsable++;
continue;
}
Collection<ExperimentalFactor> factorsForName = label2Factors.get(factorName);
if (factorsForName.size() > 1) {
/*
* Interactions
*/
if (factorsForName.size() > 2) {
LinearModelAnalyzer.log.error("Handling more than two-way interactions is not implemented");
return null;
}
assert factorName.contains(":");
String[] factorNames = StringUtils.split(factorName, ":");
assert factorNames.length == factorsForName.size();
overallPValue = lm.getInteractionEffectP(factorNames);
if (overallPValue != null && !Double.isNaN(overallPValue)) {
Map<String, Double> interactionContrastTStats = lm.getContrastTStats(factorName);
Map<String, Double> interactionContrastCoeffs = lm.getContrastCoefficients(factorName);
Map<String, Double> interactionContrastPValues = lm.getContrastPValues(factorName);
for (String term : interactionContrastPValues.keySet()) {
Double contrastPvalue = interactionContrastPValues.get(term);
this.makeContrast(probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, interactionContrastTStats, interactionContrastCoeffs);
}
} else {
if (!warned) {
LinearModelAnalyzer.log.warn("Interaction could not be computed for " + el + ", further warnings suppressed");
warned = true;
}
if (LinearModelAnalyzer.log.isDebugEnabled())
LinearModelAnalyzer.log.debug("Interaction could not be computed for " + el + ", further warnings suppressed");
// will over count?
notUsable++;
continue;
}
} else {
/*
* Main effect
*/
assert factorsForName.size() == 1;
ExperimentalFactor experimentalFactor = factorsForName.iterator().next();
if (interceptFactor != null && factorsForName.size() == 1 && experimentalFactor.equals(interceptFactor)) {
overallPValue = lm.getInterceptP();
} else {
overallPValue = lm.getMainEffectP(factorName);
}
/*
* Add contrasts unless overall pvalue is NaN
*/
if (overallPValue != null && !Double.isNaN(overallPValue)) {
Map<String, Double> mainEffectContrastTStats = lm.getContrastTStats(factorName);
Map<String, Double> mainEffectContrastPvalues = lm.getContrastPValues(factorName);
Map<String, Double> mainEffectContrastCoeffs = lm.getContrastCoefficients(factorName);
for (String term : mainEffectContrastPvalues.keySet()) {
Double contrastPvalue = mainEffectContrastPvalues.get(term);
this.makeContrast(probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, mainEffectContrastTStats, mainEffectContrastCoeffs);
}
} else {
if (!warned) {
LinearModelAnalyzer.log.warn("ANOVA could not be done for " + experimentalFactor + " on " + el + ", further warnings suppressed");
warned = true;
}
if (LinearModelAnalyzer.log.isDebugEnabled())
LinearModelAnalyzer.log.debug("ANOVA could not be done for " + experimentalFactor + " on " + el);
// will over count?
notUsable++;
continue;
}
}
assert !Double.isNaN(overallPValue) : "We should not be keeping non-number pvalues (null or NaNs)";
probeAnalysisResult.setPvalue(this.nan2Null(overallPValue));
pvaluesForQvalue.get(factorName).add(overallPValue);
resultLists.get(factorName).add(probeAnalysisResult);
}
// over terms
}
if (notUsable > 0) {
LinearModelAnalyzer.log.info(notUsable + " elements or results were not usable - model could not be fit, etc.");
}
this.getRanksAndQvalues(resultLists, pvaluesForQvalue);
DifferentialExpressionAnalysis expressionAnalysis = this.makeAnalysisEntity(bioAssaySet, config, label2Factors, baselineConditions, interceptFactor, interactionFactorLists, oneSampleTTest, resultLists, subsetFactorValue);
LinearModelAnalyzer.log.info("Analysis processing phase done ...");
return expressionAnalysis;
}
Aggregations