use of ubic.gemma.model.expression.arrayDesign.ArrayDesign in project Gemma by PavlidisLab.
the class VectorMergingServiceImpl method mergeVectors.
/**
 * Merges the raw expression data vectors of an experiment whose data is spread over several
 * bioAssayDimensions, producing one vector per quantitation type and design element on the dimension
 * returned by {@code getNewBioAssayDimension}.
 * <p>
 * In order, the method: checks that at most one platform is used, thaws the experiment, collects the
 * bioAssayDimensions of its bioAssays (skipping those whose description starts with
 * {@code MERGED_DIM_DESC_PREFIX}), builds the merged vectors per quantitation type, persists them, removes
 * the old vectors, updates the experiment, cleans up, writes an audit message and finally runs the
 * preprocessor. A {@code PreprocessingException} from that last step is logged and not rethrown.
 *
 * @param ee the experiment to process; it is thawed here
 * @return the thawed experiment; returned without further changes when exactly one un-merged
 *         bioAssayDimension is found
 * @throws IllegalArgumentException if the experiment uses more than one platform
 * @throws IllegalStateException    if no bioAssayDimensions are found, if a merged vector ends up with the
 *                                  wrong number of values, or if no quantitation type was merged
 */
@Override
public ExpressionExperiment mergeVectors(ExpressionExperiment ee) {
    Collection<ArrayDesign> arrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
    if (arrayDesigns.size() > 1) {
        throw new IllegalArgumentException("Cannot cope with more than one platform; switch experiment to use a (merged) platform first");
    }

    ee = expressionExperimentService.thaw(ee);
    Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee);
    VectorMergingServiceImpl.log.info(qts.size() + " quantitation types for potential merge");

    /*
     * Load all the bioassay dimensions, which will be merged.
     */
    Collection<BioAssayDimension> allOldBioAssayDims = new HashSet<>();
    for (BioAssay ba : ee.getBioAssays()) {
        for (BioAssayDimension bioAssayDim : bioAssayService.findBioAssayDimensions(ba)) {
            // A dimension without a description cannot carry the merged prefix, so it is kept.
            String description = bioAssayDim.getDescription();
            if (description != null && description.startsWith(VectorMergingServiceImpl.MERGED_DIM_DESC_PREFIX)) {
                // not foolproof, but avoids some artifacts - e.g. if there were previous failed attempts at this.
                continue;
            }
            allOldBioAssayDims.add(bioAssayDim);
        }
    }

    if (allOldBioAssayDims.isEmpty()) {
        throw new IllegalStateException("No bioAssayDimensions found to merge (previously merged ones are filtered, data may be corrupt?");
    }
    if (allOldBioAssayDims.size() == 1) {
        VectorMergingServiceImpl.log.warn("Experiment already has only a single bioAssayDimension, nothing seems to need merging. Bailing");
        return ee;
    }

    // Captured now so the audit message reports the count seen before clean-up runs.
    int numOldDims = allOldBioAssayDims.size();
    VectorMergingServiceImpl.log.info(numOldDims + " bioAssayDimensions to merge");

    List<BioAssayDimension> sortedOldDims = this.sortedBioAssayDimensions(allOldBioAssayDims);
    BioAssayDimension newBioAd = this.getNewBioAssayDimension(sortedOldDims);
    int totalBioAssays = newBioAd.getBioAssays().size();
    assert totalBioAssays == ee.getBioAssays().size() : "experiment has " + ee.getBioAssays().size() + " but new bioAssayDimension has " + totalBioAssays;

    Map<QuantitationType, Collection<RawExpressionDataVector>> qt2Vec = this.getVectors(ee, qts, allOldBioAssayDims);

    /*
     * This will run into problems if there are excess quantitation types
     */
    int numSuccessfulMergers = 0;
    for (Map.Entry<QuantitationType, Collection<RawExpressionDataVector>> qtEntry : qt2Vec.entrySet()) {
        QuantitationType type = qtEntry.getKey();
        Collection<RawExpressionDataVector> oldVecs = qtEntry.getValue();
        if (oldVecs.isEmpty()) {
            VectorMergingServiceImpl.log.warn("No vectors for " + type);
            continue;
        }

        Map<CompositeSequence, Collection<RawExpressionDataVector>> deVMap = this.getDevMap(oldVecs);
        if (deVMap == null) {
            VectorMergingServiceImpl.log.info("Vector merging will not be done for " + type + " as there is only one vector per element already");
            continue;
        }

        VectorMergingServiceImpl.log.info("Processing " + oldVecs.size() + " vectors for " + type);
        Collection<RawExpressionDataVector> newVectors = new HashSet<>();
        int numAllMissing = 0;
        int missingValuesForQt = 0;
        for (Map.Entry<CompositeSequence, Collection<RawExpressionDataVector>> deEntry : deVMap.entrySet()) {
            CompositeSequence de = deEntry.getKey();
            RawExpressionDataVector vector = this.initializeNewVector(ee, newBioAd, type, de);
            Collection<RawExpressionDataVector> dedvs = deEntry.getValue();

            /*
             * these ugly nested loops are to ENSURE that we get the vector reconstructed properly. For each of the
             * old bioassayDimensions, find the designElementDataVector that uses it. If there isn't one, fill in
             * the values for that dimension with missing data. We go through the dimensions in the same order that
             * we joined them up.
             */
            List<Object> data = new ArrayList<>();
            int totalMissingInVector = this.makeMergedData(sortedOldDims, newBioAd, type, de, dedvs, data);
            missingValuesForQt += totalMissingInVector;
            if (totalMissingInVector == totalBioAssays) {
                numAllMissing++;
                // we don't save data that is all missing.
                continue;
            }
            if (data.size() != totalBioAssays) {
                throw new IllegalStateException("Wrong number of values for " + de + " / " + type + ", expected " + totalBioAssays + ", got " + data.size());
            }

            byte[] newDataAr = converter.toBytes(data.toArray());
            vector.setData(newDataAr);
            newVectors.add(vector);
        }

        // TRANSACTION
        vectorMergingHelperService.persist(ee, type, newVectors);

        if (numAllMissing > 0) {
            VectorMergingServiceImpl.log.info(numAllMissing + " vectors had all missing values and were junked for " + type);
        }
        if (missingValuesForQt > 0) {
            VectorMergingServiceImpl.log.info(missingValuesForQt + " total missing values: " + type);
        }

        VectorMergingServiceImpl.log.info("Removing " + oldVecs.size() + " old vectors for " + type);
        rawExpressionDataVectorService.remove(oldVecs);
        ee.getRawExpressionDataVectors().removeAll(oldVecs);
        numSuccessfulMergers++;
    }

    if (numSuccessfulMergers == 0) {
        /*
         * Try to clean up
         */
        this.bioAssayDimensionService.remove(newBioAd);
        throw new IllegalStateException("Nothing was merged. Maybe all the vectors are effectively merged already");
    }

    expressionExperimentService.update(ee);

    // Several transactions
    this.cleanUp(ee, allOldBioAssayDims, newBioAd);

    // transaction; report the number of merged dimensions rather than dumping the whole collection
    this.audit(ee, "Vector merging performed, merged " + numOldDims + " old bioassay dimensions for " + qts.size() + " quantitation types.");

    // several transactions
    try {
        preprocessorService.process(ee);
    } catch (PreprocessingException e) {
        VectorMergingServiceImpl.log.error("Error during postprocessing", e);
    }
    return ee;
}
use of ubic.gemma.model.expression.arrayDesign.ArrayDesign in project Gemma by PavlidisLab.
the class ArrayDesignProbeCleanupCLI method doWork.
/**
 * Deletes the probes named in a file from a single platform.
 * <p>
 * The file is read line by line; blank lines are skipped and the text before the first tab on each
 * remaining line is taken as the probe name. For every probe found on the platform, its raw and processed
 * expression data are removed before the probe itself is removed.
 *
 * @param args command line arguments, handed to {@code processCommandLine}
 * @return the exception from command line processing or from reading the file, or {@code null} when the
 *         run completed
 * @throws IllegalArgumentException if zero or more than one platform was selected
 */
@Override
protected Exception doWork(String[] args) {
    Exception err = this.processCommandLine(args);
    if (err != null) {
        return err;
    }

    File f = new File(file);
    if (!f.canRead()) {
        AbstractCLI.log.fatal("Cannot read from " + file);
        this.bail(ErrorCode.INVALID_OPTION);
    }

    // Fail with a clear message instead of a bare NoSuchElementException from iterator().next().
    if (this.arrayDesignsToProcess.isEmpty()) {
        throw new IllegalArgumentException("No platform was given to the '-a' option");
    }
    if (this.arrayDesignsToProcess.size() > 1) {
        throw new IllegalArgumentException("Cannot be applied to more than one platform given to the '-a' option");
    }
    ArrayDesign arrayDesign = this.arrayDesignsToProcess.iterator().next();

    try (InputStream is = new FileInputStream(f);
            BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
        String line;
        int count = 0;
        while ((line = br.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            // Only the first tab-delimited field is used.
            String[] fields = line.split("\t");
            String probe = fields[0];
            CompositeSequence cs = compositeSequenceService.findByName(arrayDesign, probe);
            if (cs != null) {
                AbstractCLI.log.info("Removing: " + cs);
                // Data vectors are removed before the probe they refer to.
                rawExpressionDataVectorService.removeDataForCompositeSequence(cs);
                processedExpressionDataVectorService.removeDataForCompositeSequence(cs);
                compositeSequenceService.remove(cs);
                count++;
            }
        }
        AbstractCLI.log.info("Deleted " + count + " probes");
    } catch (IOException e) {
        return e;
    }
    return null;
}
use of ubic.gemma.model.expression.arrayDesign.ArrayDesign in project Gemma by PavlidisLab.
the class ArrayDesignSequenceAssociationCli method doWork.
/**
 * Associates sequences with the selected platforms, or processes one accession when option 's' is set.
 * <p>
 * With option 's' a single accession is processed and the method returns. Otherwise each platform is
 * thawed and handled in one of three ways: sequences read from a file (option 'f'), sequence identifiers
 * read from a file (option 'i'), or sequences looked up from the configured databases. Option 't' selects
 * a taxon by common name for the two file-based modes. An audit message is written after each platform.
 *
 * @param args command line arguments, handed to {@code processCommandLine}
 * @return the exception from command line processing or any exception raised while working (it is also
 *         logged), or {@code null} otherwise
 */
@Override
protected Exception doWork(String[] args) {
    // Database names used by every lookup in this tool.
    final String[] blastDatabases = { "nt", "est_others", "est_human", "est_mouse" };
    try {
        Exception err = this.processCommandLine(args);
        if (err != null) {
            return err;
        }

        // this is kind of an oddball function of this tool.
        if (this.hasOption('s')) {
            BioSequence updated = arrayDesignSequenceProcessingService.processSingleAccession(this.sequenceId, blastDatabases, null, force);
            if (updated != null) {
                AbstractCLI.log.info("Updated or created " + updated);
            }
            return null;
        }

        for (ArrayDesign arrayDesign : this.arrayDesignsToProcess) {
            arrayDesign = this.thaw(arrayDesign);

            SequenceType sequenceTypeEn = SequenceType.fromString(sequenceType);
            if (sequenceTypeEn == null) {
                AbstractCLI.log.error("No sequenceType " + sequenceType + " found");
                this.bail(ErrorCode.INVALID_OPTION);
            }

            if (this.hasOption('f')) {
                try (InputStream sequenceFileIs = FileTools.getInputStreamFromPlainOrCompressedFile(sequenceFile)) {
                    if (sequenceFileIs == null) {
                        AbstractCLI.log.error("No file " + sequenceFile + " was readable");
                        this.bail(ErrorCode.INVALID_OPTION);
                        return null;
                    }
                    Taxon taxon = null;
                    if (this.hasOption('t')) {
                        taxon = taxonService.findByCommonName(this.taxonName);
                        if (taxon == null) {
                            throw new IllegalArgumentException("No taxon named " + taxonName);
                        }
                    }
                    AbstractCLI.log.info("Processing ArrayDesign...");
                    arrayDesignSequenceProcessingService.processArrayDesign(arrayDesign, sequenceFileIs, sequenceTypeEn, taxon);
                    this.audit(arrayDesign, "Sequences read from file: " + sequenceFile);
                }
            } else if (this.hasOption('i')) {
                try (InputStream idFileIs = FileTools.getInputStreamFromPlainOrCompressedFile(idFile)) {
                    if (idFileIs == null) {
                        AbstractCLI.log.error("No file " + idFile + " was readable");
                        this.bail(ErrorCode.INVALID_OPTION);
                        // Same as the 'f' branch: never hand a null stream to the processing service.
                        return null;
                    }
                    Taxon taxon = null;
                    if (this.hasOption('t')) {
                        taxon = taxonService.findByCommonName(this.taxonName);
                        if (taxon == null) {
                            throw new IllegalArgumentException("No taxon named " + taxonName);
                        }
                    }
                    AbstractCLI.log.info("Processing ArrayDesign...");
                    arrayDesignSequenceProcessingService.processArrayDesign(arrayDesign, idFileIs, blastDatabases, null, taxon, force);
                    this.audit(arrayDesign, "Sequences identifiers from file: " + idFile);
                }
            } else {
                AbstractCLI.log.info("Retrieving sequences from BLAST databases");
                arrayDesignSequenceProcessingService.processArrayDesign(arrayDesign, blastDatabases, null, force);
                this.audit(arrayDesign, "Sequence looked up from BLAST databases");
            }
        }
    } catch (Exception e) {
        AbstractCLI.log.error(e, e);
        return e;
    }
    return null;
}
use of ubic.gemma.model.expression.arrayDesign.ArrayDesign in project Gemma by PavlidisLab.
the class PreprocessorServiceImpl method processForMissingValues.
/**
 * Computes missing values for the experiment when its platform has technology type TWOCOLOR or DUALMODE;
 * does nothing for any other technology type.
 *
 * @param ee the experiment to check; it is thawed (lite) before missing values are computed
 * @throws UnsupportedOperationException if the experiment uses more than one platform
 */
private void processForMissingValues(ExpressionExperiment ee) {
    Collection<ArrayDesign> platforms = expressionExperimentService.getArrayDesignsUsed(ee);
    if (platforms.size() > 1) {
        throw new UnsupportedOperationException("Skipping postprocessing because experiment uses " + "multiple platform types. Please check valid entry and run postprocessing separately.");
    }

    TechnologyType technology = platforms.iterator().next().getTechnologyType();
    boolean twoChannel = technology == TechnologyType.TWOCOLOR || technology == TechnologyType.DUALMODE;
    if (!twoChannel) {
        return;
    }

    PreprocessorServiceImpl.log.info(ee + " uses a two-color array design, processing for missing values ...");
    ExpressionExperiment thawed = expressionExperimentService.thawLite(ee);
    twoChannelMissingValueService.computeMissingValues(thawed);
}
use of ubic.gemma.model.expression.arrayDesign.ArrayDesign in project Gemma by PavlidisLab.
the class ProcessedExpressionDataVectorCreateHelperServiceImpl method loadIntensities.
/**
 * Produces the intensity matrix for an experiment, choosing the approach from the technology type of the
 * first platform the experiment uses.
 * <p>
 * For ONECOLOR and NONE the matrix is built directly from the processed vectors. For any other technology
 * type the vectors of the useful quantitation types are loaded (raw first, processed as a fallback),
 * thawed and given to an {@code ExpressionDataMatrixBuilder}; the resulting intensities are masked with
 * the missing value matrix when both are available.
 *
 * @param ee               the experiment
 * @param processedVectors the processed vectors of the experiment
 * @return the intensity matrix; on the non-one-color path this is unmasked when no missing value matrix
 *         is found, and {@code null} when the builder provides no intensity matrix
 * @throws IllegalStateException if there are no useful quantitation types, or no vectors for them
 */
private ExpressionDataDoubleMatrix loadIntensities(ExpressionExperiment ee, Collection<ProcessedExpressionDataVector> processedVectors) {
    Collection<ArrayDesign> platforms = this.eeService.getArrayDesignsUsed(ee);
    assert !platforms.isEmpty();
    ArrayDesign platform = platforms.iterator().next();
    assert platform != null && platform.getTechnologyType() != null;

    TechnologyType technology = platform.getTechnologyType();
    boolean directFromProcessed = technology.equals(TechnologyType.ONECOLOR) || technology.equals(TechnologyType.NONE);
    if (directFromProcessed) {
        ProcessedExpressionDataVectorCreateHelperServiceImpl.log.info("Computing intensities directly from processed data");
        return new ExpressionDataDoubleMatrix(processedVectors);
    }

    ProcessedExpressionDataVectorCreateHelperServiceImpl.log.info("Computing intensities for two-color data from underlying data");

    // Work out which quantitation types can contribute to the intensities.
    Collection<QuantitationType> allTypes = eeService.getQuantitationTypes(ee);
    Collection<QuantitationType> usefulTypes = ExpressionDataMatrixBuilder.getUsefulQuantitationTypes(allTypes);
    if (usefulTypes.isEmpty()) {
        throw new IllegalStateException("No useful quantitation types for " + ee.getShortName());
    }

    // Raw vectors are preferred; processed vectors are the fallback.
    Collection<? extends DesignElementDataVector> vectors = rawExpressionDataVectorService.find(usefulTypes);
    if (vectors.isEmpty()) {
        vectors = processedExpressionDataVectorService.find(usefulTypes);
    }
    if (vectors.isEmpty()) {
        throw new IllegalStateException("No vectors for useful quantitation types for " + ee.getShortName());
    }
    ProcessedExpressionDataVectorCreateHelperServiceImpl.log.info("Vectors loaded ...");

    Collection<DesignElementDataVector> toThaw = new HashSet<>(vectors);
    rawExpressionDataVectorService.thawRawAndProcessed(toThaw);

    ExpressionDataMatrixBuilder builder = new ExpressionDataMatrixBuilder(processedVectors, vectors);
    ExpressionDataDoubleMatrix intensities = builder.getIntensity();
    ExpressionDataBooleanMatrix missingValues = builder.getMissingValueData();

    if (missingValues == null) {
        ProcessedExpressionDataVectorCreateHelperServiceImpl.log.warn("Could not locate missing value matrix for " + ee + ", rank computation skipped (needed for two-color data)");
        return intensities;
    }
    if (intensities == null) {
        ProcessedExpressionDataVectorCreateHelperServiceImpl.log.warn("Could not locate intensity matrix for " + ee + ", rank computation skipped (needed for two-color data)");
        return null;
    }

    ProcessedExpressionDataVectorCreateHelperServiceImpl.log.info("Masking ...");
    this.maskMissingValues(intensities, missingValues);
    return intensities;
}
Aggregations