Search in sources :

Example 36 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

the class ExperimentalDesignImporterImpl method mapBioMaterialsToNamePossibilities.

/**
 * Builds a lookup table from every string a design file might use to identify a sample to the biomaterial it
 * refers to.
 * <p>
 * For each biomaterial the following keys are registered: the biomaterial's own name; for each bioassay it is used
 * in, the bioassay's accession string (only when an accession is present and non-blank) and the bioassay's name;
 * and finally the name produced by {@code ExpressionDataWriterUtils.constructBioAssayName} for the biomaterial and
 * its bioassays. When two keys collide, the entry written last wins.
 *
 * @param bioMaterials bio materials
 * @return a map of various strings that we might find in a design importing file to the biomaterials.
 */
private Map<String, BioMaterial> mapBioMaterialsToNamePossibilities(Collection<BioMaterial> bioMaterials) {
    Map<String, BioMaterial> byIdentifier = new HashMap<>();
    // The table is rebuilt from scratch on every call; acceptable, though it could be cached.
    for (BioMaterial material : bioMaterials) {
        byIdentifier.put(material.getName(), material);

        // A biomaterial may be used in several bioassays (e.g. run on more than one platform); every one of them
        // contributes its identifiers.
        for (BioAssay assay : material.getBioAssaysUsedIn()) {
            // Be flexible: also accept the bioassay's external accession, when it has a usable one.
            String externalId = assay.getAccession() == null ? null : assay.getAccession().getAccession();
            if (StringUtils.isNotBlank(externalId)) {
                byIdentifier.put(externalId, material);
            }
            // ... and the bioassay's name.
            byIdentifier.put(assay.getName(), material);
        }

        // The mangled name used by the 'native' Gemma export format. It includes the ID, so it is not useful
        // for tests.
        String exportName = ExpressionDataWriterUtils.constructBioAssayName(material, material.getBioAssaysUsedIn());
        byIdentifier.put(exportName, material);
    }
    return byIdentifier;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 37 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

the class ExperimentalDesignImporterImpl method importDesign.

/**
 * Reads an experimental design from a tab-delimited stream and attaches it to the given experiment.
 * <p>
 * Lines are sorted into three groups: experimental factor description lines (those starting with
 * {@code EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR}), the sample header (the first remaining line that is
 * neither blank nor a {@code #} comment), and factor value lines (everything after the header that is neither blank
 * nor a comment). The groups are validated, the factors are added to the experiment's design, the design is
 * updated, and every biomaterial that received factor values is updated.
 * <p>
 * The stream is read with the platform default charset and is not closed by this method.
 *
 * @param experiment the experiment whose experimental design is extended
 * @param is         the design file contents
 * @throws IOException if reading from the stream fails
 */
@Override
@Transactional
public void importDesign(ExpressionExperiment experiment, InputStream is) throws IOException {
    this.efoService = this.ontologyService.getExperimentalFactorOntologyService();
    ExperimentalDesignImporterImpl.log.debug("Parsing input file");

    BufferedReader reader = new BufferedReader(new InputStreamReader(is));

    ExperimentalDesign design = experiment.getExperimentalDesign();
    boolean alreadyHasFactors = !design.getExperimentalFactors().isEmpty();
    if (alreadyHasFactors) {
        ExperimentalDesignImporterImpl.log.warn("Experimental design already has factors, import will add new ones");
    }
    design.setDescription("Parsed from file.");

    // Sort the lines of the file into factor descriptions, the sample header and factor values.
    List<String> factorDescriptionLines = new ArrayList<>();
    List<String> valueLines = new ArrayList<>();
    String headerLine = "";
    boolean headerSeen = false;
    for (String row = reader.readLine(); row != null; row = reader.readLine()) {
        if (row.startsWith(ExperimentalDesignImporterImpl.EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) {
            factorDescriptionLines.add(row);
            continue;
        }
        if (row.startsWith("#") || StringUtils.isBlank(row)) {
            // other comment lines and blank lines carry no information
            continue;
        }
        if (headerSeen) {
            valueLines.add(row);
            continue;
        }
        // the first meaningful line is the sample header
        headerLine = row;
        headerSeen = true;
    }

    String[] headerColumns = StringUtils.splitPreserveAllTokens(headerLine, "\t");
    Collection<BioMaterial> knownBioMaterials = this.bioMaterialService.findByExperiment(experiment);

    // Reject malformed input before anything is modified.
    this.validateFileComponents(factorDescriptionLines, headerLine, valueLines);
    this.validateExperimentalFactorFileContent(factorDescriptionLines, headerLine);
    this.validateFactorFileContent(factorDescriptionLines.size(), valueLines);
    this.validateBioMaterialFileContent(knownBioMaterials, valueLines);

    // Factors first (with their values), then the per-sample assignments.
    this.addExperimentalFactorsToExperimentalDesign(design, factorDescriptionLines, headerColumns, valueLines);
    assert !design.getExperimentalFactors().isEmpty();
    assert !experiment.getExperimentalDesign().getExperimentalFactors().isEmpty();
    experimentalDesignService.update(design);

    Collection<BioMaterial> touchedBioMaterials = this.addFactorValuesToBioMaterialsInExpressionExperiment(knownBioMaterials, design, valueLines, headerColumns);
    for (BioMaterial touched : touchedBioMaterials) {
        this.bioMaterialService.update(touched);
    }
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) Transactional(org.springframework.transaction.annotation.Transactional)

Example 38 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

the class ExperimentalDesignImporterImpl method validateBioMaterialFileContent.

/**
 * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an
 * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient
 * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should
 * match what is stored
 *
 * @param bioMaterials     biomaterials of the experiment that the file's entries are matched against
 * @param factorValueLines Lines containing biomaterial names and their factor values
 * @throws IllegalArgumentException if a line has fewer than two tab-separated columns, or if no biomaterial is
 *                                  found for the first two columns of a line
 */
private void validateBioMaterialFileContent(Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException {
    for (String factorValueLine : factorValueLines) {
        String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t');
        if (vals.length < 2) {
            throw new IllegalArgumentException("Expected a file with at least two columns separated by tabs, got " + factorValueLine);
        }
        // The first two columns are the identifiers used to look the biomaterial up.
        BioMaterial bioMaterialInFile = this.getBioMaterialFromExpressionExperiment(bioMaterials, vals[0], vals[1]);
        if (bioMaterialInFile == null) {
            // Report the offending name together with the full line so the user can locate it in the file.
            throw new IllegalArgumentException("The uploaded file has a biomaterial name that does not match the study: " + vals[0] + " (from input line: " + factorValueLine + ")");
        }
    }
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial)

Example 39 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

the class SimpleExpressionDataLoaderServiceImpl method convertBioAssayDimension.

/**
 * Creates a bioassay dimension holding one new bioassay (with its own new biomaterial) per column of the matrix.
 * <p>
 * Both the bioassay and its biomaterial are named after the matrix column. Each bioassay is assigned the given
 * array design and is flagged as neither an outlier nor paired-read sequencing; each biomaterial is assigned the
 * given taxon.
 *
 * @param ee          experiment whose short name is used in the generated names and descriptions
 * @param arrayDesign platform assigned to every generated bioassay
 * @param taxon       source taxon assigned to every generated biomaterial
 * @param matrix      data matrix whose column names become the sample names
 * @return BioAssayDimension
 */
private BioAssayDimension convertBioAssayDimension(ExpressionExperiment ee, ArrayDesign arrayDesign, Taxon taxon, DoubleMatrix<String, String> matrix) {
    BioAssayDimension dimension = BioAssayDimension.Factory.newInstance();
    dimension.setName("For " + ee.getShortName());
    dimension.setDescription("Generated from flat file");

    String sampleDescription = "Generated by Gemma for: " + ee.getShortName();
    int column = 0;
    while (column < matrix.columns()) {
        String sampleName = matrix.getColName(column);

        // the sample itself
        BioMaterial sample = BioMaterial.Factory.newInstance();
        sample.setName(sampleName);
        sample.setDescription(sampleDescription);
        sample.setSourceTaxon(taxon);

        // the assay that was run on that sample
        BioAssay bioAssay = BioAssay.Factory.newInstance();
        bioAssay.setName(sampleName);
        bioAssay.setArrayDesignUsed(arrayDesign);
        bioAssay.setSampleUsed(sample);
        bioAssay.setIsOutlier(false);
        bioAssay.setSequencePairedReads(false);

        dimension.getBioAssays().add(bioAssay);
        column++;
    }

    SimpleExpressionDataLoaderServiceImpl.log.info("Generated " + dimension.getBioAssays().size() + " bioAssays");
    return dimension;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 40 with BioMaterial

use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.

the class GeoConverterImpl method convertSeriesSingle.

/**
 * Converts one GEO series into an ExpressionExperiment.
 * <p>
 * In order, this method: decides which datasets and samples to skip; fills in the experiment's description, name,
 * short name, contacts, PubMed ids, accession and raw data file; builds an experimental design from the series'
 * variables and replicates; creates one BioMaterial per non-empty set of corresponding samples and one BioAssay per
 * matching, non-skipped sample; and finally converts the data, either from the series itself (when it has no
 * datasets) or from each dataset that is not skipped.
 *
 * @param series the GEO series to convert; may be null
 * @return the converted experiment, or null if the series is null, if it has datasets and all of them are to be
 *         skipped, or if the series is not usable
 * @throws IllegalArgumentException if the series has no sample correspondence
 */
private ExpressionExperiment convertSeriesSingle(GeoSeries series) {
    if (series == null)
        return null;
    GeoConverterImpl.log.info("Converting series: " + series.getGeoAccession());
    Collection<GeoDataset> dataSets = series.getDatasets();
    Collection<String> dataSetsToSkip = new HashSet<>();
    Collection<GeoSample> samplesToSkip = new HashSet<>();
    // presumably fills the two "to skip" collections passed in -- confirm against checkForDataToSkip
    this.checkForDataToSkip(series, dataSetsToSkip, samplesToSkip);
    // Nothing to convert if there are datasets and every one of them is to be skipped.
    if (dataSets.size() > 0 && dataSetsToSkip.size() == dataSets.size()) {
        return null;
    }
    if (!this.isUsable(series)) {
        GeoConverterImpl.log.warn("Series was not usable: types=" + StringUtils.join(series.getSeriesTypes(), " "));
        return null;
    }
    ExpressionExperiment expExp = ExpressionExperiment.Factory.newInstance();
    // NOTE(review): this empty description is immediately overwritten by the next statement.
    expExp.setDescription("");
    // Description is the series summaries, newline-terminated.
    // NOTE(review): throws NullPointerException if getSummaries() returns null -- confirm that it cannot.
    expExp.setDescription(series.getSummaries() + (series.getSummaries().endsWith("\n") ? "" : "\n"));
    if (series.getLastUpdateDate() != null) {
        expExp.setDescription(expExp.getDescription() + "Last Updated (by provider): " + series.getLastUpdateDate() + "\n");
    }
    expExp.setName(series.getTitle());
    expExp.setShortName(series.getGeoAccession());
    this.convertContacts(series, expExp);
    this.convertPubMedIds(series, expExp);
    expExp.setAccession(this.convertDatabaseEntry(series));
    LocalFile expExpRawDataFile = this.convertSupplementaryFileToLocalFile(series);
    expExp.setRawDataFile(expExpRawDataFile);
    // Build the experimental design: one factor per GEO variable, then one per GEO replication (below).
    ExperimentalDesign design = ExperimentalDesign.Factory.newInstance();
    design.setDescription("");
    design.setName("");
    Collection<GeoVariable> variables = series.getVariables().values();
    for (GeoVariable variable : variables) {
        GeoConverterImpl.log.debug("Adding variable " + variable);
        ExperimentalFactor ef = this.convertVariableToFactor(variable);
        this.convertVariableToFactorValue(variable, ef);
        design.getExperimentalFactors().add(ef);
        // The design name accumulates the variable descriptions, newest first.
        design.setName(variable.getDescription() + " " + design.getName());
    }
    // NOTE(review): each Characteristic built in this loop is never attached to anything in this method, so
    // the keywords appear to be dropped; also the first setDescription value is overwritten by the second.
    if (series.getKeyWords().size() > 0) {
        for (String keyWord : series.getKeyWords()) {
            // design.setDescription( design.getDescription() + " Keyword: " + keyWord );
            Characteristic o = Characteristic.Factory.newInstance();
            o.setDescription("GEO Keyword");
            o.setValue(keyWord);
            o.setEvidenceCode(GOEvidenceCode.IIA);
            o.setDescription("Keyword from GEO series definition file.");
        }
    }
    if (series.getOverallDesign() != null) {
        design.setDescription(design.getDescription() + " Overall design: " + series.getOverallDesign());
    }
    Collection<GeoReplication> replication = series.getReplicates().values();
    for (GeoReplication replicate : replication) {
        GeoConverterImpl.log.debug("Adding replication " + replicate);
        ExperimentalFactor ef = this.convertReplicationToFactor(replicate);
        this.convertReplicationToFactorValue(replicate, ef);
        design.getExperimentalFactors().add(ef);
    }
    expExp.setExperimentalDesign(design);
    // GEO does not have the concept of a biomaterial.
    Collection<GeoSample> allSeriesSamples = series.getSamples();
    GeoConverterImpl.log.info("Series has " + series.getSamples().size() + " samples");
    if (samplesToSkip.size() > 0) {
        GeoConverterImpl.log.info(samplesToSkip.size() + " samples will be skipped");
    }
    // Start from an empty set of bioassays; it is filled in the correspondence loop below.
    expExp.setBioAssays(new HashSet<BioAssay>());
    if (series.getSampleCorrespondence().size() == 0) {
        throw new IllegalArgumentException("No sample correspondence!");
    }
    // spits out a big summary of the correspondence.
    if (GeoConverterImpl.log.isDebugEnabled())
        GeoConverterImpl.log.debug(series.getSampleCorrespondence());
    int numBioMaterials = 0;
    /*
         * For each _set_ of "corresponding" samples (from the same RNA, or so we think) we make up a new BioMaterial.
         */
    // Accessions already converted, used only to report duplicates across correspondence sets.
    Collection<String> seen = new HashSet<>();
    for (Iterator<Set<String>> iter = series.getSampleCorrespondence().iterator(); iter.hasNext(); ) {
        Set<String> correspondingSamples = iter.next();
        if (correspondingSamples.isEmpty())
            // can happen after removing samples (multitaxon)
            continue;
        BioMaterial bioMaterial = BioMaterial.Factory.newInstance();
        // The counter is only advanced for non-empty sets, so biomaterial numbering has no gaps.
        String bioMaterialName = this.getBiomaterialPrefix(series, ++numBioMaterials);
        StringBuilder bioMaterialDescription = new StringBuilder(GeoConverterImpl.BIOMATERIAL_DESCRIPTION_PREFIX + series.getGeoAccession());
        // From the series samples, find the sample that corresponds and convert it.
        for (String cSample : correspondingSamples) {
            boolean found = false;
            for (GeoSample sample : allSeriesSamples) {
                if (sample == null || sample.getGeoAccession() == null) {
                    GeoConverterImpl.log.warn("Null sample or no accession for " + sample);
                    continue;
                }
                if (samplesToSkip.contains(sample)) {
                    continue;
                }
                String accession = sample.getGeoAccession();
                if (accession.equals(cSample)) {
                    // A duplicate is only logged; the sample is still converted again.
                    if (seen.contains(accession)) {
                        GeoConverterImpl.log.error("Got " + accession + " twice, this time in set " + correspondingSamples);
                    }
                    seen.add(accession);
                    BioAssay ba = this.convertSample(sample, bioMaterial, expExp.getExperimentalDesign());
                    assert (ba != null);
                    LocalFile rawDataFile = this.convertSupplementaryFileToLocalFile(sample);
                    // deal with null at UI
                    ba.setRawDataFile(rawDataFile);
                    ba.setDescription(ba.getDescription() + "\nSource GEO sample is " + sample.getGeoAccession() + "\nLast updated (according to GEO): " + sample.getLastUpdateDate());
                    assert ba.getSampleUsed() != null;
                    bioMaterial.getBioAssaysUsedIn().add(ba);
                    // Appends the sample's string representation (not explicitly its accession).
                    bioMaterialDescription.append(",").append(sample);
                    expExp.getBioAssays().add(ba);
                    found = true;
                    // First matching sample wins; stop scanning the series samples for this accession.
                    break;
                }
            }
            if (!found) {
                if (GeoConverterImpl.log.isDebugEnabled())
                    GeoConverterImpl.log.debug("No sample found in " + series + " to match " + cSample + "; this can happen if some samples were not run on all platforms.");
            }
        }
        // Name and description are set last, once all corresponding samples have been appended.
        bioMaterial.setName(bioMaterialName);
        bioMaterial.setDescription(bioMaterialDescription.toString());
    }
    GeoConverterImpl.log.info("Expression Experiment from " + series + " has " + expExp.getBioAssays().size() + " bioassays and " + numBioMaterials + " biomaterials.");
    // Warn when fewer bioassays were created than there are non-skipped samples in the series.
    int expectedNumSamples = series.getSamples().size() - samplesToSkip.size();
    int actualNumSamples = expExp.getBioAssays().size();
    if (expectedNumSamples > actualNumSamples) {
        GeoConverterImpl.log.warn((expectedNumSamples - actualNumSamples) + " samples were not in the 'sample correspondence'" + " and have been omitted. Possibly they were in the Series (GSE) but not in the corresponding Dataset (GDS)?");
    }
    // Data conversion: straight from the series when there are no datasets, otherwise per non-skipped dataset.
    if (dataSets.size() == 0) {
        // we miss extra description and the subset information.
        if (series.getValues().hasData())
            this.convertSeriesDataVectors(series, expExp);
    } else {
        for (GeoDataset dataset : dataSets) {
            if (dataSetsToSkip.contains(dataset.getGeoAccession()))
                continue;
            this.convertDataset(dataset, expExp);
        }
    }
    return expExp;
}
Also used : BioMaterial(ubic.gemma.model.expression.biomaterial.BioMaterial) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Aggregations

BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)132 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)67 FactorValue (ubic.gemma.model.expression.experiment.FactorValue)27 ExperimentalFactor (ubic.gemma.model.expression.experiment.ExperimentalFactor)22 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)19 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)15 HashSet (java.util.HashSet)13 Test (org.junit.Test)13 ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)12 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)12 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)10 InputStream (java.io.InputStream)7 DenseDoubleMatrix (ubic.basecode.dataStructure.matrix.DenseDoubleMatrix)7 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)7 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)7 Characteristic (ubic.gemma.model.common.description.Characteristic)6 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)6 DoubleArrayList (cern.colt.list.DoubleArrayList)5 DoubleMatrix1D (cern.colt.matrix.DoubleMatrix1D)5 ArrayList (java.util.ArrayList)5