use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class BatchInfoPopulationServiceImpl method getBatchDataFromRawFiles.
/**
* @param files Local copies of raw data files obtained from the data provider (e.g. GEO), adds audit event.
* @param ee ee
* @return boolean
*/
private boolean getBatchDataFromRawFiles(ExpressionExperiment ee, Collection<LocalFile> files) {
BatchInfoParser batchInfoParser = new BatchInfoParser();
ee = expressionExperimentService.thaw(ee);
if (ee.getAccession() == null) {
// in fact, currently it has to be from GEO.
throw new IllegalArgumentException("The experiment does not seem to be from an external source that would have batch information available.");
}
Map<BioMaterial, Date> dates = batchInfoParser.getBatchInfo(ee, files);
this.removeExistingBatchFactor(ee);
ExperimentalFactor factor = batchInfoPopulationHelperService.createBatchFactor(ee, dates);
if (!dates.isEmpty()) {
int numberOfBatches = factor == null || factor.getFactorValues().size() == 0 ? 0 : factor.getFactorValues().size();
List<Date> allDates = new ArrayList<>(dates.values());
Collections.sort(allDates);
String datesString = StringUtils.join(allDates, "\n");
BatchInfoPopulationServiceImpl.log.info("Got batch information for: " + ee.getShortName() + ", with " + numberOfBatches + " batches.");
this.auditTrailService.addUpdateEvent(ee, BatchInformationFetchingEvent.class, batchInfoParser.getScanDateExtractor().getClass().getSimpleName() + "; " + numberOfBatches + " batches.", "Dates of sample runs: " + datesString);
return true;
}
return false;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class ExpressionExperimentBatchCorrectionServiceImpl method comBat.
@Override
public ExpressionDataDoubleMatrix comBat(ExpressionExperiment ee) {
/*
* is there a batch to use?
*/
ExperimentalFactor batch = this.getBatchFactor(ee);
if (batch == null) {
ExpressionExperimentBatchCorrectionServiceImpl.log.warn("No batch factor found");
return null;
}
/*
* Extract data
*/
Collection<ProcessedExpressionDataVector> vectos = processedExpressionDataVectorService.getProcessedDataVectors(ee);
processedExpressionDataVectorService.thaw(vectos);
ExpressionDataDoubleMatrix mat = new ExpressionDataDoubleMatrix(vectos);
return this.comBat(mat);
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class DifferentialExpressionAnalysisCli method getSubsetFactor.
private ExperimentalFactor getSubsetFactor(ExpressionExperiment ee) {
ExperimentalFactorService efs = this.getBean(ExperimentalFactorService.class);
ExperimentalFactor subsetFactor = null;
if (StringUtils.isNotBlank(this.subsetFactorName)) {
Collection<ExperimentalFactor> experimentalFactors = ee.getExperimentalDesign().getExperimentalFactors();
for (ExperimentalFactor experimentalFactor : experimentalFactors) {
// has already implemented way of figuring out human-friendly name of factor value.
ExperimentalFactorValueObject fvo = new ExperimentalFactorValueObject(experimentalFactor);
if (ignoreBatch && BatchInfoPopulationServiceImpl.isBatchFactor(experimentalFactor)) {
AbstractCLI.log.info("Ignoring batch factor:" + experimentalFactor);
continue;
}
if (subsetFactorName.equals(experimentalFactor.getName().replaceAll(" ", "_"))) {
subsetFactor = experimentalFactor;
} else if (fvo.getCategory() != null && subsetFactorName.equals(fvo.getCategory().replaceAll(" ", "_"))) {
subsetFactor = experimentalFactor;
}
}
if (subsetFactor == null)
throw new IllegalArgumentException("Didn't find factor for provided subset factor name");
return subsetFactor;
} else if (this.subsetFactorId != null) {
subsetFactor = efs.load(subsetFactorId);
if (subsetFactor == null) {
throw new IllegalArgumentException("No factor for id=" + subsetFactorId);
}
return subsetFactor;
}
return null;
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class SVDServiceHelperImpl method analyzeComponent.
/**
* Do the factor comparisons for one component.
*
* @param bioMaterialFactorMap Map of factors to biomaterials to the value we're going to use. Even for
* non-continuous factors the value is a double.
*/
private void analyzeComponent(SVDValueObject svo, int componentNumber, DoubleMatrix<Long, Integer> vMatrix, Map<Long, Date> bioMaterialDates, Map<ExperimentalFactor, Map<Long, Double>> bioMaterialFactorMap, Long[] svdBioMaterials) {
DoubleArrayList eigenGene = new DoubleArrayList(vMatrix.getColumn(componentNumber));
// since we use rank correlation/anova, we just use the casted ids (two-groups) or dates as the covariate
int numWithDates = 0;
for (Long id : bioMaterialDates.keySet()) {
if (bioMaterialDates.get(id) != null) {
numWithDates++;
}
}
if (numWithDates > 2) {
/*
* Get the dates in order, - no rounding.
*/
boolean initializingDates = svo.getDates().isEmpty();
double[] dates = new double[svdBioMaterials.length];
/*
* If dates are all the same, skip.
*/
Set<Date> uniqueDate = new HashSet<>();
for (int j = 0; j < svdBioMaterials.length; j++) {
Date date = bioMaterialDates.get(svdBioMaterials[j]);
if (date == null) {
SVDServiceHelperImpl.log.warn("Incomplete date information, missing for biomaterial " + svdBioMaterials[j]);
dates[j] = Double.NaN;
} else {
Date roundDate = DateUtils.round(date, Calendar.MINUTE);
uniqueDate.add(roundDate);
// round to minute; make int, cast to
dates[j] = roundDate.getTime();
// double
}
if (initializingDates) {
svo.getDates().add(date);
}
}
if (uniqueDate.size() == 1) {
SVDServiceHelperImpl.log.warn("All scan dates the same, skipping data analysis");
svo.getDates().clear();
}
if (eigenGene.size() != dates.length) {
SVDServiceHelperImpl.log.warn("Could not compute correlation, dates and eigenGene had different lengths.");
return;
}
double dateCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(dates));
svo.setPCDateCorrelation(componentNumber, dateCorrelation);
svo.setPCDateCorrelationPval(componentNumber, CorrelationStats.spearmanPvalue(dateCorrelation, eigenGene.size()));
}
/*
* Compare each factor (including batch information that is somewhat redundant with the dates) to the
* eigen-genes. Using rank statistics.
*/
for (ExperimentalFactor ef : bioMaterialFactorMap.keySet()) {
Map<Long, Double> bmToFv = bioMaterialFactorMap.get(ef);
double[] fvs = new double[svdBioMaterials.length];
assert fvs.length > 0;
int numNotMissing = 0;
boolean initializing = false;
if (!svo.getFactors().containsKey(ef.getId())) {
svo.getFactors().put(ef.getId(), new ArrayList<Double>());
initializing = true;
}
for (int j = 0; j < svdBioMaterials.length; j++) {
fvs[j] = bmToFv.get(svdBioMaterials[j]);
if (!Double.isNaN(fvs[j])) {
numNotMissing++;
}
// value.
if (initializing) {
if (SVDServiceHelperImpl.log.isDebugEnabled())
SVDServiceHelperImpl.log.debug("EF:" + ef.getId() + " fv=" + bmToFv.get(svdBioMaterials[j]));
svo.getFactors().get(ef.getId()).add(bmToFv.get(svdBioMaterials[j]));
}
}
if (fvs.length != eigenGene.size()) {
SVDServiceHelperImpl.log.debug(fvs.length + " factor values (biomaterials) but " + eigenGene.size() + " values in the eigenGene");
continue;
}
if (numNotMissing < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
SVDServiceHelperImpl.log.debug("Insufficient values to compare " + ef + " to eigenGenes");
continue;
}
if (ExperimentalDesignUtils.isContinuous(ef)) {
double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGene.size()));
} else {
Collection<Integer> groups = new HashSet<>();
IntArrayList groupings = new IntArrayList(fvs.length);
int k = 0;
DoubleArrayList eigenGeneWithoutMissing = new DoubleArrayList();
for (double d : fvs) {
if (Double.isNaN(d)) {
k++;
continue;
}
groupings.add((int) d);
groups.add((int) d);
eigenGeneWithoutMissing.add(eigenGene.get(k));
k++;
}
if (groups.size() < 2) {
SVDServiceHelperImpl.log.debug("Factor had less than two groups: " + ef + ", SVD comparison can't be done.");
continue;
}
if (eigenGeneWithoutMissing.size() < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
SVDServiceHelperImpl.log.debug("Too few non-missing values for factor to compare to eigenGenes: " + ef);
continue;
}
if (groups.size() == 2) {
// use the one that still has missing values.
double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size()));
} else {
// one-way ANOVA on ranks.
double kwPVal = KruskalWallis.test(eigenGeneWithoutMissing, groupings);
svo.setPCFactorCorrelationPval(componentNumber, ef, kwPVal);
double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
double corrPvalue = CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size());
// sanity.
assert Math.abs(factorCorrelation) < 1.0 + 1e-2;
/*
* Avoid storing a pvalue, as it's hard to compare. If the regular linear correlation is strong,
* then we should just use that -- basically, it means the order we have the groups happens to be a
* good one. Of course we could just store pvalues, but that's not easy to use either.
*/
if (corrPvalue <= kwPVal) {
svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
} else {
// hack. A bit like turning pvalues into prob it
double approxCorr = CorrelationStats.correlationForPvalue(kwPVal, eigenGeneWithoutMissing.size());
svo.setPCFactorCorrelation(componentNumber, ef, approxCorr);
}
}
}
}
}
use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.
the class GeoDatasetServiceTest method testFetchAndLoadMultiChipPerSeriesShort.
@Test
public void testFetchAndLoadMultiChipPerSeriesShort() throws Exception {
geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGeneratorLocal(this.getTestFileBasePath("shortTest")));
/*
* HG-U133A. GDS473 is for the other chip (B). Series is GSE674. see
* http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search
*/
ExpressionExperiment newee;
try {
Collection<?> results = geoService.fetchAndLoad("GSE674", false, true, false);
newee = (ExpressionExperiment) results.iterator().next();
} catch (AlreadyExistsInSystemException e) {
log.info("Skipping test, data already exists in db");
return;
}
assertNotNull(newee);
newee = eeService.thaw(newee);
/*
* Test for bug 468 (merging of subsets across GDS's)
*/
ExperimentalFactor factor = newee.getExperimentalDesign().getExperimentalFactors().iterator().next();
// otherwise get 4.
assertEquals(2, factor.getFactorValues().size());
Collection<RawExpressionDataVector> vectors = newee.getRawExpressionDataVectors();
rawExpressionDataVectorService.thaw(vectors);
ExpressionDataMatrixBuilder builder = new ExpressionDataMatrixBuilder(vectors);
ExpressionDataMatrix<Double> matrix = builder.getPreferredData();
assertNotNull(matrix);
assertEquals(31, matrix.rows());
assertEquals(15, matrix.columns());
// GSM10363 = D1-U133B
this.testMatrixValue(newee, matrix, "200000_s_at", "GSM10363", 5722.0);
// GSM10380 = C7-U133A
this.testMatrixValue(newee, matrix, "1007_s_at", "GSM10380", 1272.0);
}
Aggregations