use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class ExperimentalDesignImporterImpl method mapBioMaterialsToNamePossibilities.
/**
 * Build a lookup from every identifier that might appear in a design file to the biomaterial it denotes.
 * <p>
 * For each biomaterial the following keys are registered: the biomaterial's own name; for every bioassay it is
 * used in, the bioassay's accession (when present and non-blank) and the bioassay's name; and finally the
 * composite name produced by {@code ExpressionDataWriterUtils.constructBioAssayName}. When two biomaterials yield
 * the same key, the one visited later replaces the earlier entry.
 *
 * @param bioMaterials bio materials to index
 * @return a map of the various strings that we might find in a design importing file to the biomaterials
 */
private Map<String, BioMaterial> mapBioMaterialsToNamePossibilities(Collection<BioMaterial> bioMaterials) {
    // Note: the whole map is rebuilt on every call; nothing is cached between invocations.
    Map<String, BioMaterial> lookup = new HashMap<>();

    for (BioMaterial material : bioMaterials) {
        // 1. The plain biomaterial name.
        lookup.put(material.getName(), material);

        // 2. Identifiers of each bioassay run on this biomaterial (there may be several bioassays per
        // biomaterial; every one of them contributes keys).
        for (BioAssay assay : material.getBioAssaysUsedIn()) {
            // External accession of the bioassay, if it has a usable one. isNotBlank(null) is false, so a
            // missing database entry and a blank accession string are both skipped.
            String externalId = assay.getAccession() == null ? null : assay.getAccession().getAccession();
            if (StringUtils.isNotBlank(externalId)) {
                lookup.put(externalId, material);
            }

            // The bioassay name is always registered.
            lookup.put(assay.getName(), material);
        }

        // 3. The composite name built by ExpressionDataWriterUtils (per the original author, this is the
        // mangled name used by the 'native' Gemma export format and includes the ID).
        String exportName = ExpressionDataWriterUtils.constructBioAssayName(material, material.getBioAssaysUsedIn());
        lookup.put(exportName, material);
    }

    return lookup;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class ExperimentalDesignImporterImpl method importDesign.
/**
 * Parse an experimental design file and attach the factors and factor values it describes to the experiment.
 * <p>
 * The input is read line by line and sorted into three groups: lines starting with
 * {@code EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR} are experimental factor descriptions (this test is made
 * first); remaining lines that start with {@code #} or are blank are ignored; the first other line is the
 * tab-separated sample header and all subsequent ones are factor value lines. The content is validated, the
 * factors are added to the experimental design, the design is updated, and every biomaterial that received
 * factor values is updated.
 * <p>
 * The stream is decoded as UTF-8 so that parsing does not depend on the platform default charset. The stream is
 * not closed here; it remains the caller's responsibility.
 *
 * @param experiment the experiment whose experimental design is to be populated
 * @param is         stream over the design file contents
 * @throws IOException if reading from the stream fails
 */
@Override
@Transactional
public void importDesign(ExpressionExperiment experiment, InputStream is) throws IOException {
    this.efoService = this.ontologyService.getExperimentalFactorOntologyService();
    ExperimentalDesignImporterImpl.log.debug("Parsing input file");
    boolean readHeader = false;
    // Explicit charset: the no-charset constructor would fall back to the platform default encoding.
    // The reader is intentionally not closed because that would also close the caller-owned stream.
    BufferedReader r = new BufferedReader(new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8));
    String line;
    // experiment = expressionExperimentService.thawBioAssays( experiment );
    ExperimentalDesign experimentalDesign = experiment.getExperimentalDesign();
    if (!experimentalDesign.getExperimentalFactors().isEmpty()) {
        ExperimentalDesignImporterImpl.log.warn("Experimental design already has factors, import will add new ones");
    }
    experimentalDesign.setDescription("Parsed from file.");

    // Sort the file's lines into factor descriptions, the sample header and factor value rows.
    List<String> experimentalFactorLines = new ArrayList<>();
    String sampleHeaderLine = "";
    List<String> factorValueLines = new ArrayList<>();
    while ((line = r.readLine()) != null) {
        if (line.startsWith(ExperimentalDesignImporterImpl.EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) {
            experimentalFactorLines.add(line);
        } else if (line.startsWith("#") || StringUtils.isBlank(line)) {
            // noinspection UnnecessaryContinue // Better for readability
            continue;
        } else if (!readHeader) {
            // The first non-comment, non-blank line is the header.
            sampleHeaderLine = line;
            readHeader = true;
        } else {
            factorValueLines.add(line);
        }
    }

    String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");
    Collection<BioMaterial> experimentBioMaterials = this.bioMaterialService.findByExperiment(experiment);

    // Validate everything before anything is modified or persisted.
    this.validateFileComponents(experimentalFactorLines, sampleHeaderLine, factorValueLines);
    this.validateExperimentalFactorFileContent(experimentalFactorLines, sampleHeaderLine);
    this.validateFactorFileContent(experimentalFactorLines.size(), factorValueLines);
    this.validateBioMaterialFileContent(experimentBioMaterials, factorValueLines);

    // build up the composite: create experimental factor then add the experimental value
    this.addExperimentalFactorsToExperimentalDesign(experimentalDesign, experimentalFactorLines, headerFields, factorValueLines);
    assert !experimentalDesign.getExperimentalFactors().isEmpty();
    assert !experiment.getExperimentalDesign().getExperimentalFactors().isEmpty();
    experimentalDesignService.update(experimentalDesign);

    // Only the biomaterials that actually received factor values are written back.
    Collection<BioMaterial> bioMaterialsWithFactorValues = this.addFactorValuesToBioMaterialsInExpressionExperiment(experimentBioMaterials, experimentalDesign, factorValueLines, headerFields);
    for (BioMaterial bioMaterial : bioMaterialsWithFactorValues) {
        this.bioMaterialService.update(bioMaterial);
    }
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class ExperimentalDesignImporterImpl method validateBioMaterialFileContent.
/**
 * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an
 * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient
 * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should
 * match what is stored.
 * <p>
 * Each line is split on tabs; the first two columns are handed to
 * {@code getBioMaterialFromExpressionExperiment} to locate the corresponding biomaterial.
 *
 * @param bioMaterials     the biomaterials of the experiment the file is being imported into
 * @param factorValueLines Lines containing biomaterial names and their factor values
 * @throws IllegalArgumentException if a line has fewer than two tab-separated columns, or if no biomaterial of
 *                                  the experiment matches the first two columns of a line
 */
private void validateBioMaterialFileContent(Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException {
    for (String factorValueLine : factorValueLines) {
        String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t');
        if (vals.length < 2) {
            throw new IllegalArgumentException("Expected a file with at least two columns separated by tabs, got " + factorValueLine);
        }
        BioMaterial bioMaterialInFile = this.getBioMaterialFromExpressionExperiment(bioMaterials, vals[0], vals[1]);
        if (bioMaterialInFile == null) {
            // Report both identifying columns; vals[0] is the same token the old message obtained by
            // re-splitting the line.
            throw new IllegalArgumentException("The uploaded file has a biomaterial name that does not match the study: " + vals[0] + " (second column: " + vals[1] + ")");
        }
    }
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class SimpleExpressionDataLoaderServiceImpl method convertBioAssayDimension.
/**
 * Create a bioassay dimension with one new bioassay (and one new biomaterial) per column of the data matrix.
 * <p>
 * Both the bioassay and its biomaterial are named after the matrix column. Each bioassay is linked to the given
 * array design and flagged as neither an outlier nor paired-read sequencing; each biomaterial gets the given
 * taxon as its source taxon.
 *
 * @param ee          experiment whose short name is used in the generated names and descriptions
 * @param arrayDesign platform assigned to every generated bioassay
 * @param taxon       source taxon assigned to every generated biomaterial
 * @param matrix      data matrix whose columns define the samples
 * @return BioAssayDimension holding the generated bioassays
 */
private BioAssayDimension convertBioAssayDimension(ExpressionExperiment ee, ArrayDesign arrayDesign, Taxon taxon, DoubleMatrix<String, String> matrix) {
    final String shortName = ee.getShortName();

    BioAssayDimension dimension = BioAssayDimension.Factory.newInstance();
    dimension.setName("For " + shortName);
    dimension.setDescription("Generated from flat file");

    for (int column = 0; column < matrix.columns(); column++) {
        final String sampleName = matrix.getColName(column);

        // The sample itself.
        BioMaterial sample = BioMaterial.Factory.newInstance();
        sample.setName(sampleName);
        sample.setDescription("Generated by Gemma for: " + shortName);
        sample.setSourceTaxon(taxon);

        // The assay run on that sample.
        BioAssay bioAssay = BioAssay.Factory.newInstance();
        bioAssay.setName(sampleName);
        bioAssay.setArrayDesignUsed(arrayDesign);
        bioAssay.setSampleUsed(sample);
        bioAssay.setIsOutlier(false);
        bioAssay.setSequencePairedReads(false);

        dimension.getBioAssays().add(bioAssay);
    }

    int generated = dimension.getBioAssays().size();
    SimpleExpressionDataLoaderServiceImpl.log.info("Generated " + generated + " bioAssays");
    return dimension;
}
use of ubic.gemma.model.expression.biomaterial.BioMaterial in project Gemma by PavlidisLab.
the class GeoConverterImpl method convertSeriesSingle.
/**
 * Convert one GEO series into an ExpressionExperiment.
 * <p>
 * The steps are: decide which datasets/samples to skip; copy description, name, short name, contacts, PubMed ids,
 * accession and raw data file from the series; build an experimental design from the series' variables and
 * replicates; create one biomaterial per non-empty set of corresponding samples and convert the matching samples
 * into bioassays; and finally convert either the series' own data vectors (when there are no datasets) or each
 * non-skipped dataset.
 *
 * @param series the GEO series to convert; may be null
 * @return the converted experiment, or null when the series is null, when it has datasets and all of them are
 *         marked to be skipped, or when isUsable rejects the series
 * @throws IllegalArgumentException if the series has no sample correspondence
 */
private ExpressionExperiment convertSeriesSingle(GeoSeries series) {
if (series == null)
return null;
GeoConverterImpl.log.info("Converting series: " + series.getGeoAccession());
Collection<GeoDataset> dataSets = series.getDatasets();
// Both collections are filled in by checkForDataToSkip below.
Collection<String> dataSetsToSkip = new HashSet<>();
Collection<GeoSample> samplesToSkip = new HashSet<>();
this.checkForDataToSkip(series, dataSetsToSkip, samplesToSkip);
// Nothing to convert if there are datasets and every one of them is to be skipped.
if (dataSets.size() > 0 && dataSetsToSkip.size() == dataSets.size()) {
return null;
}
if (!this.isUsable(series)) {
GeoConverterImpl.log.warn("Series was not usable: types=" + StringUtils.join(series.getSeriesTypes(), " "));
return null;
}
ExpressionExperiment expExp = ExpressionExperiment.Factory.newInstance();
// NOTE(review): this empty description is overwritten by the very next statement, so it looks redundant.
expExp.setDescription("");
// Description is the series summaries, terminated by a newline if they do not already end with one.
expExp.setDescription(series.getSummaries() + (series.getSummaries().endsWith("\n") ? "" : "\n"));
if (series.getLastUpdateDate() != null) {
expExp.setDescription(expExp.getDescription() + "Last Updated (by provider): " + series.getLastUpdateDate() + "\n");
}
expExp.setName(series.getTitle());
expExp.setShortName(series.getGeoAccession());
this.convertContacts(series, expExp);
this.convertPubMedIds(series, expExp);
expExp.setAccession(this.convertDatabaseEntry(series));
LocalFile expExpRawDataFile = this.convertSupplementaryFileToLocalFile(series);
expExp.setRawDataFile(expExpRawDataFile);
// Build the experimental design; name and description start empty and are extended below.
ExperimentalDesign design = ExperimentalDesign.Factory.newInstance();
design.setDescription("");
design.setName("");
Collection<GeoVariable> variables = series.getVariables().values();
for (GeoVariable variable : variables) {
GeoConverterImpl.log.debug("Adding variable " + variable);
ExperimentalFactor ef = this.convertVariableToFactor(variable);
this.convertVariableToFactorValue(variable, ef);
design.getExperimentalFactors().add(ef);
// Each variable's description is prepended to the design name accumulated so far.
design.setName(variable.getDescription() + " " + design.getName());
}
if (series.getKeyWords().size() > 0) {
for (String keyWord : series.getKeyWords()) {
// design.setDescription( design.getDescription() + " Keyword: " + keyWord );
// NOTE(review): the characteristic built here is never attached to the design, the experiment or anything
// else within this method, so the keywords appear to be dropped. The description is also set twice; the
// second value replaces the first. Confirm whether the keyword was meant to be stored somewhere.
Characteristic o = Characteristic.Factory.newInstance();
o.setDescription("GEO Keyword");
o.setValue(keyWord);
o.setEvidenceCode(GOEvidenceCode.IIA);
o.setDescription("Keyword from GEO series definition file.");
}
}
if (series.getOverallDesign() != null) {
design.setDescription(design.getDescription() + " Overall design: " + series.getOverallDesign());
}
// Replicates are turned into experimental factors in the same way as variables.
Collection<GeoReplication> replication = series.getReplicates().values();
for (GeoReplication replicate : replication) {
GeoConverterImpl.log.debug("Adding replication " + replicate);
ExperimentalFactor ef = this.convertReplicationToFactor(replicate);
this.convertReplicationToFactorValue(replicate, ef);
design.getExperimentalFactors().add(ef);
}
expExp.setExperimentalDesign(design);
// GEO does not have the concept of a biomaterial.
Collection<GeoSample> allSeriesSamples = series.getSamples();
GeoConverterImpl.log.info("Series has " + series.getSamples().size() + " samples");
if (samplesToSkip.size() > 0) {
GeoConverterImpl.log.info(samplesToSkip.size() + " samples will be skipped");
}
expExp.setBioAssays(new HashSet<BioAssay>());
if (series.getSampleCorrespondence().size() == 0) {
throw new IllegalArgumentException("No sample correspondence!");
}
// spits out a big summary of the correspondence.
if (GeoConverterImpl.log.isDebugEnabled())
GeoConverterImpl.log.debug(series.getSampleCorrespondence());
// Counts non-empty correspondence sets; incremented even when none of a set's samples ends up matching.
int numBioMaterials = 0;
/*
* For each _set_ of "corresponding" samples (from the same RNA, or so we think) we make up a new BioMaterial.
*/
// Accessions already converted, used only to log an error when one shows up a second time.
Collection<String> seen = new HashSet<>();
for (Iterator<Set<String>> iter = series.getSampleCorrespondence().iterator(); iter.hasNext(); ) {
Set<String> correspondingSamples = iter.next();
if (correspondingSamples.isEmpty())
// can happen after removing samples (multitaxon)
continue;
BioMaterial bioMaterial = BioMaterial.Factory.newInstance();
String bioMaterialName = this.getBiomaterialPrefix(series, ++numBioMaterials);
StringBuilder bioMaterialDescription = new StringBuilder(GeoConverterImpl.BIOMATERIAL_DESCRIPTION_PREFIX + series.getGeoAccession());
// From the series samples, find the sample that corresponds and convert it.
for (String cSample : correspondingSamples) {
boolean found = false;
for (GeoSample sample : allSeriesSamples) {
if (sample == null || sample.getGeoAccession() == null) {
GeoConverterImpl.log.warn("Null sample or no accession for " + sample);
continue;
}
if (samplesToSkip.contains(sample)) {
continue;
}
String accession = sample.getGeoAccession();
if (accession.equals(cSample)) {
// A repeated accession is only reported; conversion still proceeds for it.
if (seen.contains(accession)) {
GeoConverterImpl.log.error("Got " + accession + " twice, this time in set " + correspondingSamples);
}
seen.add(accession);
BioAssay ba = this.convertSample(sample, bioMaterial, expExp.getExperimentalDesign());
assert (ba != null);
LocalFile rawDataFile = this.convertSupplementaryFileToLocalFile(sample);
// deal with null at UI
ba.setRawDataFile(rawDataFile);
ba.setDescription(ba.getDescription() + "\nSource GEO sample is " + sample.getGeoAccession() + "\nLast updated (according to GEO): " + sample.getLastUpdateDate());
// Presumably convertSample links the bioassay to the biomaterial passed in; the assert relies on that.
assert ba.getSampleUsed() != null;
bioMaterial.getBioAssaysUsedIn().add(ba);
// Appends the string form of the sample object (not just its accession) to the description.
bioMaterialDescription.append(",").append(sample);
expExp.getBioAssays().add(ba);
found = true;
// Stop at the first series sample whose accession matches this corresponding sample.
break;
}
}
if (!found) {
if (GeoConverterImpl.log.isDebugEnabled())
GeoConverterImpl.log.debug("No sample found in " + series + " to match " + cSample + "; this can happen if some samples were not run on all platforms.");
}
}
// Name and description are assigned once all samples of the set have been processed.
bioMaterial.setName(bioMaterialName);
bioMaterial.setDescription(bioMaterialDescription.toString());
}
GeoConverterImpl.log.info("Expression Experiment from " + series + " has " + expExp.getBioAssays().size() + " bioassays and " + numBioMaterials + " biomaterials.");
// Warn when fewer bioassays were produced than there are non-skipped samples in the series.
int expectedNumSamples = series.getSamples().size() - samplesToSkip.size();
int actualNumSamples = expExp.getBioAssays().size();
if (expectedNumSamples > actualNumSamples) {
GeoConverterImpl.log.warn((expectedNumSamples - actualNumSamples) + " samples were not in the 'sample correspondence'" + " and have been omitted. Possibly they were in the Series (GSE) but not in the corresponding Dataset (GDS)?");
}
// Without datasets, data comes straight from the series values; otherwise each non-skipped dataset is converted.
if (dataSets.size() == 0) {
// we miss extra description and the subset information.
if (series.getValues().hasData())
this.convertSeriesDataVectors(series, expExp);
} else {
for (GeoDataset dataset : dataSets) {
if (dataSetsToSkip.contains(dataset.getGeoAccession()))
continue;
this.convertDataset(dataset, expExp);
}
}
return expExp;
}
Aggregations