use of ubic.gemma.model.expression.bioAssay.BioAssay in project Gemma by PavlidisLab.
the class SampleCoexpressionMatrixServiceImpl method getMatrix.
/**
 * Builds a bioassay-by-bioassay matrix from the given expression data by transposing it, re-keying the rows from
 * biomaterials to bioassays, and handing the result to {@code MatrixStats.correlationMatrix}.
 *
 * @param matrix expression data; its underlying matrix is transposed so that biomaterials become the rows
 * @return the matrix returned by {@code MatrixStats.correlationMatrix}, keyed by bioassay on both axes
 */
private static DoubleMatrix<BioAssay, BioAssay> getMatrix(ExpressionDataDoubleMatrix matrix) {
    // After transposing, rows are keyed by biomaterial and columns by design element.
    DoubleMatrix<BioMaterial, CompositeSequence> byBioMaterial = matrix.getMatrix().transpose();

    // Same raw values and column names, but the rows will be keyed by bioassay instead.
    DoubleMatrix<BioAssay, CompositeSequence> byBioAssay = new DenseDoubleMatrix<>(byBioMaterial.getRawMatrix());
    byBioAssay.setColumnNames(byBioMaterial.getColNames());

    int row = 0;
    while (row < byBioAssay.rows()) {
        BioMaterial sample = byBioMaterial.getRowName(row);
        // Only the first bioassay reported for the biomaterial is used as the row key.
        BioAssay assay = sample.getBioAssaysUsedIn().iterator().next();
        byBioAssay.setRowName(assay, row);
        row++;
    }

    return MatrixStats.correlationMatrix(byBioAssay);
}
use of ubic.gemma.model.expression.bioAssay.BioAssay in project Gemma by PavlidisLab.
the class BatchInfoParser method getBatchInformationFromFiles.
/**
 * Now we can parse the files to get the batch information.
 * We allow ourselves to add dates to _some_ of the bioassays. It turns out to be common for there to be a single
 * corrupted date in CEL files, for example. However, downstream code has to be careful, and the batch factor could
 * be a problem too.
 * <p>
 * A file counts as "missing a date" when it cannot be read or parsed, when the extracted date lies in the future,
 * or when the extractor yields no date at all. Such files are logged and skipped.
 *
 * @param bioAssays2Files BA 2 files
 * @return map of biomaterials to dates. Biomaterials which did not have associated dates are not included in the
 * map.
 * @throws IllegalStateException if the number of files without a usable date equals the number of entries in
 *                               {@code bioAssays2Files}
 */
private Map<BioMaterial, Date> getBatchInformationFromFiles(Map<BioAssay, File> bioAssays2Files) {
    Map<BioMaterial, Date> result = new HashMap<>();
    Collection<File> missingDate = new HashSet<>();
    for (Map.Entry<BioAssay, File> entry : bioAssays2Files.entrySet()) {
        BioAssay ba = entry.getKey();
        File f = entry.getValue();
        ArrayDesign arrayDesignUsed = ba.getArrayDesignUsed();
        try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath())) {
            // NOTE(review): presumably this sets up scanDateExtractor for the platform/file - confirm.
            this.locateExtractor(arrayDesignUsed, ba, f);
            Date d = scanDateExtractor.extract(is);
            if (d == null) {
                // No date in this file: keep the biomaterial out of the result, as the contract promises.
                missingDate.add(f);
                continue;
            }
            // Sanity check. There may be edge cases in which a date slightly in the future turns out
            // to be okay, but let's assume we're not getting data the same day it was generated!
            if (d.after(new Date())) {
                // Thrown deliberately so the catch below records the file as lacking a usable date.
                throw new RuntimeException("Date was in the future for: " + ba + " from " + f.getName());
            }
            BioMaterial bm = ba.getSampleUsed();
            result.put(bm, d);
        } catch (RuntimeException | IOException e) {
            BatchInfoParser.log.warn("Failure while parsing: " + f + ": " + e.getMessage());
            missingDate.add(f);
        }
    }
    if (missingDate.size() == bioAssays2Files.size()) {
        throw new IllegalStateException("Dates were not found for any of the files.");
    }
    if (!missingDate.isEmpty()) {
        // Report the count here; the individual files are listed below.
        BatchInfoParser.log.warn("Dates were not obtained for " + missingDate.size() + " files: ");
        for (File f : missingDate) {
            BatchInfoParser.log.info("Missing date for: " + f.getName());
        }
    }
    return result;
}
use of ubic.gemma.model.expression.bioAssay.BioAssay in project Gemma by PavlidisLab.
the class BatchInfoParser method matchBioAssaysToRawDataFiles.
/**
 * From the file names, match to the bioassays. GEO names things consistently (??) so this should work but not
 * ideal.
 * <p>
 * Only files whose name starts with "GSM" are considered, and files whose name contains one of the extensions
 * .CHP, .DAT, .EXP, .RPT or .TIF (case-insensitive) are ignored. When several files match the same bioassay, a
 * CEL file is never displaced; otherwise the file seen last wins.
 *
 * @param files           files
 * @param assayAccessions bioassays keyed by the accession (the "GSMnnnn" prefix) expected at the start of the
 *                        file name
 * @return map of each matched bioassay to the file selected for it; bioassays without a matching file are absent
 */
private Map<BioAssay, File> matchBioAssaysToRawDataFiles(Collection<LocalFile> files, Map<String, BioAssay> assayAccessions) {
    Pattern regex = Pattern.compile("(GSM[0-9]+).+");
    Map<BioAssay, File> bioAssays2Files = new HashMap<>();
    for (LocalFile file : files) {
        File f = file.asFile();
        String n = f.getName();
        /*
         * We only support the newer style of storing these.
         */
        if (!n.startsWith("GSM")) {
            continue;
        }
        // Locale-independent upper-casing: under e.g. a Turkish default locale "tif" would not become "TIF".
        String upperName = n.toUpperCase(java.util.Locale.ROOT);
        if (upperName.contains(".CHP") || upperName.contains(".DAT") || upperName.contains(".EXP") || upperName.contains(".RPT") || upperName.contains(".TIF")) {
            continue;
        }
        /*
         * keep just the GSMNNNNNN part. FIXME: only works with GEO
         */
        Matcher matcher = regex.matcher(n);
        if (!matcher.matches()) {
            continue;
        }
        String acc = matcher.group(1);
        BioAssay ba = assayAccessions.get(acc);
        if (ba == null) {
            /*
             * Warn? Throw exception?
             */
            continue;
        }
        File existing = bioAssays2Files.get(ba);
        if (existing != null) {
            /*
             * Don't clobber a valid file. For affymetrix, CEL is what we want. Other cases harder to predict, but
             * .txt files can be either good or bad. (We could do this check earlier)
             */
            if (existing.getName().toUpperCase(java.util.Locale.ROOT).contains(".CEL")) {
                BatchInfoParser.log.debug("Retaining CEL file, ignoring " + f.getName());
                continue;
            }
            // A CEL file displacing a non-CEL file needs no warning; anything else replacing it does.
            if (!upperName.contains(".CEL")) {
                BatchInfoParser.log.warn("Multiple files matching " + ba + ": " + existing + "; using new file: " + f);
            }
        }
        bioAssays2Files.put(ba, f);
    }
    return bioAssays2Files;
}
use of ubic.gemma.model.expression.bioAssay.BioAssay in project Gemma by PavlidisLab.
the class SVDServiceHelperImpl method getFactorsForAnalysis.
/**
 * Fills the supplied maps with per-sample information for each bioassay: the bioassay's processing date is
 * recorded under the ID of its biomaterial, and the biomaterial is passed to {@code populateBMFMap} together with
 * the factor map.
 *
 * @param bioAssays            bioassays to visit
 * @param bioMaterialDates     output: biomaterial ID to processing date (values may be null)
 * @param bioMaterialFactorMap output: handed to {@code populateBMFMap} for each biomaterial
 */
private void getFactorsForAnalysis(Collection<BioAssay> bioAssays, Map<Long, Date> bioMaterialDates, Map<ExperimentalFactor, Map<Long, Double>> bioMaterialFactorMap) {
    for (BioAssay assay : bioAssays) {
        // The date can be null; it is stored as-is.
        final Date processedOn = assay.getProcessingDate();
        final BioMaterial sample = assay.getSampleUsed();

        bioMaterialDates.put(sample.getId(), processedOn);
        SVDServiceHelperImpl.populateBMFMap(bioMaterialFactorMap, sample);
    }
}
use of ubic.gemma.model.expression.bioAssay.BioAssay in project Gemma by PavlidisLab.
the class SVDServiceHelperImpl method svdFactorAnalysis.
/**
 * Builds an {@link SVDValueObject} for the given analysis and compares its leading components against the
 * dates and experimental factors gathered for the analysis' bioassays.
 *
 * @param pca the principal component analysis to evaluate
 * @return the populated value object; the value object without correlations when no date or factor information
 * was gathered; or {@code null} when the value object could not be constructed (the failure is logged)
 */
@Override
public SVDValueObject svdFactorAnalysis(PrincipalComponentAnalysis pca) {
    BioAssayDimension bad = pca.getBioAssayDimension();
    List<BioAssay> bioAssays = bad.getBioAssays();
    SVDValueObject svo;
    try {
        svo = new SVDValueObject(pca);
    } catch (Exception e) {
        // Pass the exception along so its type and stack trace are not lost; the message alone may be null.
        SVDServiceHelperImpl.log.error(e.getLocalizedMessage(), e);
        return null;
    }
    Map<Long, Date> bioMaterialDates = new HashMap<>();
    Map<ExperimentalFactor, Map<Long, Double>> bioMaterialFactorMap = new HashMap<>();
    this.prepareForFactorComparisons(svo, bioAssays, bioMaterialDates, bioMaterialFactorMap);
    if (bioMaterialDates.isEmpty() && bioMaterialFactorMap.isEmpty()) {
        SVDServiceHelperImpl.log.warn("No factor or date information to compare to the eigenGenes");
        return svo;
    }
    Long[] svdBioMaterials = svo.getBioMaterialIds();
    // Start from a clean slate so only the results computed below remain on the value object.
    svo.getDateCorrelations().clear();
    svo.getFactorCorrelations().clear();
    svo.getDates().clear();
    svo.getFactors().clear();
    // Test at most MAX_EIGEN_GENES_TO_TEST components, or fewer if the v matrix has fewer columns.
    for (int componentNumber = 0; componentNumber < Math.min(svo.getvMatrix().columns(), SVDServiceHelperImpl.MAX_EIGEN_GENES_TO_TEST); componentNumber++) {
        this.analyzeComponent(svo, componentNumber, svo.getvMatrix(), bioMaterialDates, bioMaterialFactorMap, svdBioMaterials);
    }
    return svo;
}
Aggregations