Search in sources :

Example 16 with ExperimentalFactor

use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.

the class BatchInfoPopulationServiceImpl method getBatchDataFromRawFiles.

/**
 * @param files Local copies of raw data files obtained from the data provider (e.g. GEO), adds audit event.
 * @param ee    ee
 * @return boolean
 */
private boolean getBatchDataFromRawFiles(ExpressionExperiment ee, Collection<LocalFile> files) {
    BatchInfoParser batchInfoParser = new BatchInfoParser();
    ee = expressionExperimentService.thaw(ee);
    if (ee.getAccession() == null) {
        // in fact, currently it has to be from GEO.
        throw new IllegalArgumentException("The experiment does not seem to be from an external source that would have batch information available.");
    }
    Map<BioMaterial, Date> dates = batchInfoParser.getBatchInfo(ee, files);
    this.removeExistingBatchFactor(ee);
    ExperimentalFactor factor = batchInfoPopulationHelperService.createBatchFactor(ee, dates);
    if (!dates.isEmpty()) {
        int numberOfBatches = factor == null || factor.getFactorValues().size() == 0 ? 0 : factor.getFactorValues().size();
        List<Date> allDates = new ArrayList<>(dates.values());
        Collections.sort(allDates);
        String datesString = StringUtils.join(allDates, "\n");
        BatchInfoPopulationServiceImpl.log.info("Got batch information for: " + ee.getShortName() + ", with " + numberOfBatches + " batches.");
        this.auditTrailService.addUpdateEvent(ee, BatchInformationFetchingEvent.class, batchInfoParser.getScanDateExtractor().getClass().getSimpleName() + "; " + numberOfBatches + " batches.", "Dates of sample runs: " + datesString);
        return true;
    }
    return false;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) ExperimentalFactor(ubic.gemma.model.expression.experiment.ExperimentalFactor)

Example 17 with ExperimentalFactor

use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.

the class ExpressionExperimentBatchCorrectionServiceImpl method comBat.

@Override
public ExpressionDataDoubleMatrix comBat(ExpressionExperiment ee) {
    /*
         * is there a batch to use?
         */
    ExperimentalFactor batch = this.getBatchFactor(ee);
    if (batch == null) {
        ExpressionExperimentBatchCorrectionServiceImpl.log.warn("No batch factor found");
        return null;
    }
    /*
         * Extract data
         */
    Collection<ProcessedExpressionDataVector> vectos = processedExpressionDataVectorService.getProcessedDataVectors(ee);
    processedExpressionDataVectorService.thaw(vectos);
    ExpressionDataDoubleMatrix mat = new ExpressionDataDoubleMatrix(vectos);
    return this.comBat(mat);
}
Also used : ExperimentalFactor(ubic.gemma.model.expression.experiment.ExperimentalFactor) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) ProcessedExpressionDataVector(ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector)

Example 18 with ExperimentalFactor

use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.

the class DifferentialExpressionAnalysisCli method getSubsetFactor.

private ExperimentalFactor getSubsetFactor(ExpressionExperiment ee) {
    ExperimentalFactorService efs = this.getBean(ExperimentalFactorService.class);
    ExperimentalFactor subsetFactor = null;
    if (StringUtils.isNotBlank(this.subsetFactorName)) {
        Collection<ExperimentalFactor> experimentalFactors = ee.getExperimentalDesign().getExperimentalFactors();
        for (ExperimentalFactor experimentalFactor : experimentalFactors) {
            // has already implemented way of figuring out human-friendly name of factor value.
            ExperimentalFactorValueObject fvo = new ExperimentalFactorValueObject(experimentalFactor);
            if (ignoreBatch && BatchInfoPopulationServiceImpl.isBatchFactor(experimentalFactor)) {
                AbstractCLI.log.info("Ignoring batch factor:" + experimentalFactor);
                continue;
            }
            if (subsetFactorName.equals(experimentalFactor.getName().replaceAll(" ", "_"))) {
                subsetFactor = experimentalFactor;
            } else if (fvo.getCategory() != null && subsetFactorName.equals(fvo.getCategory().replaceAll(" ", "_"))) {
                subsetFactor = experimentalFactor;
            }
        }
        if (subsetFactor == null)
            throw new IllegalArgumentException("Didn't find factor for provided subset factor name");
        return subsetFactor;
    } else if (this.subsetFactorId != null) {
        subsetFactor = efs.load(subsetFactorId);
        if (subsetFactor == null) {
            throw new IllegalArgumentException("No factor for id=" + subsetFactorId);
        }
        return subsetFactor;
    }
    return null;
}
Also used : ExperimentalFactor(ubic.gemma.model.expression.experiment.ExperimentalFactor) ExperimentalFactorValueObject(ubic.gemma.model.expression.experiment.ExperimentalFactorValueObject) ExperimentalFactorService(ubic.gemma.persistence.service.expression.experiment.ExperimentalFactorService)

Example 19 with ExperimentalFactor

use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.

the class SVDServiceHelperImpl method analyzeComponent.

/**
 * Do the factor comparisons for one component.
 *
 * @param bioMaterialFactorMap Map of factors to biomaterials to the value we're going to use. Even for
 *                             non-continuous factors the value is a double.
 */
private void analyzeComponent(SVDValueObject svo, int componentNumber, DoubleMatrix<Long, Integer> vMatrix, Map<Long, Date> bioMaterialDates, Map<ExperimentalFactor, Map<Long, Double>> bioMaterialFactorMap, Long[] svdBioMaterials) {
    DoubleArrayList eigenGene = new DoubleArrayList(vMatrix.getColumn(componentNumber));
    // since we use rank correlation/anova, we just use the casted ids (two-groups) or dates as the covariate
    int numWithDates = 0;
    for (Long id : bioMaterialDates.keySet()) {
        if (bioMaterialDates.get(id) != null) {
            numWithDates++;
        }
    }
    if (numWithDates > 2) {
        /*
             * Get the dates in order, - no rounding.
             */
        boolean initializingDates = svo.getDates().isEmpty();
        double[] dates = new double[svdBioMaterials.length];
        /*
             * If dates are all the same, skip.
             */
        Set<Date> uniqueDate = new HashSet<>();
        for (int j = 0; j < svdBioMaterials.length; j++) {
            Date date = bioMaterialDates.get(svdBioMaterials[j]);
            if (date == null) {
                SVDServiceHelperImpl.log.warn("Incomplete date information, missing for biomaterial " + svdBioMaterials[j]);
                dates[j] = Double.NaN;
            } else {
                Date roundDate = DateUtils.round(date, Calendar.MINUTE);
                uniqueDate.add(roundDate);
                // round to minute; make int, cast to
                dates[j] = roundDate.getTime();
            // double
            }
            if (initializingDates) {
                svo.getDates().add(date);
            }
        }
        if (uniqueDate.size() == 1) {
            SVDServiceHelperImpl.log.warn("All scan dates the same, skipping data analysis");
            svo.getDates().clear();
        }
        if (eigenGene.size() != dates.length) {
            SVDServiceHelperImpl.log.warn("Could not compute correlation, dates and eigenGene had different lengths.");
            return;
        }
        double dateCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(dates));
        svo.setPCDateCorrelation(componentNumber, dateCorrelation);
        svo.setPCDateCorrelationPval(componentNumber, CorrelationStats.spearmanPvalue(dateCorrelation, eigenGene.size()));
    }
    /*
         * Compare each factor (including batch information that is somewhat redundant with the dates) to the
         * eigen-genes. Using rank statistics.
         */
    for (ExperimentalFactor ef : bioMaterialFactorMap.keySet()) {
        Map<Long, Double> bmToFv = bioMaterialFactorMap.get(ef);
        double[] fvs = new double[svdBioMaterials.length];
        assert fvs.length > 0;
        int numNotMissing = 0;
        boolean initializing = false;
        if (!svo.getFactors().containsKey(ef.getId())) {
            svo.getFactors().put(ef.getId(), new ArrayList<Double>());
            initializing = true;
        }
        for (int j = 0; j < svdBioMaterials.length; j++) {
            fvs[j] = bmToFv.get(svdBioMaterials[j]);
            if (!Double.isNaN(fvs[j])) {
                numNotMissing++;
            }
            // value.
            if (initializing) {
                if (SVDServiceHelperImpl.log.isDebugEnabled())
                    SVDServiceHelperImpl.log.debug("EF:" + ef.getId() + " fv=" + bmToFv.get(svdBioMaterials[j]));
                svo.getFactors().get(ef.getId()).add(bmToFv.get(svdBioMaterials[j]));
            }
        }
        if (fvs.length != eigenGene.size()) {
            SVDServiceHelperImpl.log.debug(fvs.length + " factor values (biomaterials) but " + eigenGene.size() + " values in the eigenGene");
            continue;
        }
        if (numNotMissing < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
            SVDServiceHelperImpl.log.debug("Insufficient values to compare " + ef + " to eigenGenes");
            continue;
        }
        if (ExperimentalDesignUtils.isContinuous(ef)) {
            double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
            svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
            svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGene.size()));
        } else {
            Collection<Integer> groups = new HashSet<>();
            IntArrayList groupings = new IntArrayList(fvs.length);
            int k = 0;
            DoubleArrayList eigenGeneWithoutMissing = new DoubleArrayList();
            for (double d : fvs) {
                if (Double.isNaN(d)) {
                    k++;
                    continue;
                }
                groupings.add((int) d);
                groups.add((int) d);
                eigenGeneWithoutMissing.add(eigenGene.get(k));
                k++;
            }
            if (groups.size() < 2) {
                SVDServiceHelperImpl.log.debug("Factor had less than two groups: " + ef + ", SVD comparison can't be done.");
                continue;
            }
            if (eigenGeneWithoutMissing.size() < SVDServiceHelperImpl.MINIMUM_POINTS_TO_COMPARE_TO_EIGEN_GENE) {
                SVDServiceHelperImpl.log.debug("Too few non-missing values for factor to compare to eigenGenes: " + ef);
                continue;
            }
            if (groups.size() == 2) {
                // use the one that still has missing values.
                double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
                svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
                svo.setPCFactorCorrelationPval(componentNumber, ef, CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size()));
            } else {
                // one-way ANOVA on ranks.
                double kwPVal = KruskalWallis.test(eigenGeneWithoutMissing, groupings);
                svo.setPCFactorCorrelationPval(componentNumber, ef, kwPVal);
                double factorCorrelation = Distance.spearmanRankCorrelation(eigenGene, new DoubleArrayList(fvs));
                double corrPvalue = CorrelationStats.spearmanPvalue(factorCorrelation, eigenGeneWithoutMissing.size());
                // sanity.
                assert Math.abs(factorCorrelation) < 1.0 + 1e-2;
                /*
                     * Avoid storing a pvalue, as it's hard to compare. If the regular linear correlation is strong,
                     * then we should just use that -- basically, it means the order we have the groups happens to be a
                     * good one. Of course we could just store pvalues, but that's not easy to use either.
                     */
                if (corrPvalue <= kwPVal) {
                    svo.setPCFactorCorrelation(componentNumber, ef, factorCorrelation);
                } else {
                    // hack. A bit like turning pvalues into prob it
                    double approxCorr = CorrelationStats.correlationForPvalue(kwPVal, eigenGeneWithoutMissing.size());
                    svo.setPCFactorCorrelation(componentNumber, ef, approxCorr);
                }
            }
        }
    }
}
Also used : ExperimentalFactor(ubic.gemma.model.expression.experiment.ExperimentalFactor) DoubleArrayList(cern.colt.list.DoubleArrayList) IntArrayList(cern.colt.list.IntArrayList)

Example 20 with ExperimentalFactor

use of ubic.gemma.model.expression.experiment.ExperimentalFactor in project Gemma by PavlidisLab.

the class GeoDatasetServiceTest method testFetchAndLoadMultiChipPerSeriesShort.

@Test
public void testFetchAndLoadMultiChipPerSeriesShort() throws Exception {
    geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGeneratorLocal(this.getTestFileBasePath("shortTest")));
    /*
         * HG-U133A. GDS473 is for the other chip (B). Series is GSE674. see
         * http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search
         */
    ExpressionExperiment newee;
    try {
        Collection<?> results = geoService.fetchAndLoad("GSE674", false, true, false);
        newee = (ExpressionExperiment) results.iterator().next();
    } catch (AlreadyExistsInSystemException e) {
        log.info("Skipping test, data already exists in db");
        return;
    }
    assertNotNull(newee);
    newee = eeService.thaw(newee);
    /*
         * Test for bug 468 (merging of subsets across GDS's)
         */
    ExperimentalFactor factor = newee.getExperimentalDesign().getExperimentalFactors().iterator().next();
    // otherwise get 4.
    assertEquals(2, factor.getFactorValues().size());
    Collection<RawExpressionDataVector> vectors = newee.getRawExpressionDataVectors();
    rawExpressionDataVectorService.thaw(vectors);
    ExpressionDataMatrixBuilder builder = new ExpressionDataMatrixBuilder(vectors);
    ExpressionDataMatrix<Double> matrix = builder.getPreferredData();
    assertNotNull(matrix);
    assertEquals(31, matrix.rows());
    assertEquals(15, matrix.columns());
    // GSM10363 = D1-U133B
    this.testMatrixValue(newee, matrix, "200000_s_at", "GSM10363", 5722.0);
    // GSM10380 = C7-U133A
    this.testMatrixValue(newee, matrix, "1007_s_at", "GSM10380", 1272.0);
}
Also used : ExpressionDataMatrixBuilder(ubic.gemma.core.analysis.preprocess.ExpressionDataMatrixBuilder) RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) ExperimentalFactor(ubic.gemma.model.expression.experiment.ExperimentalFactor) AlreadyExistsInSystemException(ubic.gemma.core.loader.util.AlreadyExistsInSystemException) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) GeoDomainObjectGeneratorLocal(ubic.gemma.core.loader.expression.geo.GeoDomainObjectGeneratorLocal) AbstractGeoServiceTest(ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest) Test(org.junit.Test)

Aggregations

ExperimentalFactor (ubic.gemma.model.expression.experiment.ExperimentalFactor)88 Test (org.junit.Test)31 FactorValue (ubic.gemma.model.expression.experiment.FactorValue)30 DifferentialExpressionAnalysis (ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis)26 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)22 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)18 HashSet (java.util.HashSet)17 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)17 ExpressionAnalysisResultSet (ubic.gemma.model.analysis.expression.diff.ExpressionAnalysisResultSet)16 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)14 DifferentialExpressionAnalysisResult (ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysisResult)12 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)11 GeoDomainObjectGeneratorLocal (ubic.gemma.core.loader.expression.geo.GeoDomainObjectGeneratorLocal)10 AlreadyExistsInSystemException (ubic.gemma.core.loader.util.AlreadyExistsInSystemException)10 Before (org.junit.Before)8 Collection (java.util.Collection)7 ContrastResult (ubic.gemma.model.analysis.expression.diff.ContrastResult)6 AnalysisType (ubic.gemma.core.analysis.expression.diff.DifferentialExpressionAnalyzerServiceImpl.AnalysisType)5 ExperimentalFactorValueObject (ubic.gemma.model.expression.experiment.ExperimentalFactorValueObject)5 InputStream (java.io.InputStream)4