use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.
the class VectorMergingServiceImpl method mergeVectors.
/**
 * Merges the raw expression data vectors of an experiment that are spread over several bioassay dimensions
 * into vectors that all use one newly created, combined bioassay dimension.
 * <p>
 * For every quantitation type that has more than one vector per design element, a new vector is built per
 * design element, persisted, and the old vectors are removed. Vectors whose values are all missing are not
 * saved. Afterwards the experiment is updated, old dimensions are cleaned up, an audit event is recorded and
 * the experiment is preprocessed (a {@code PreprocessingException} is logged, not rethrown).
 *
 * @param ee the experiment to process; it must use a single platform
 * @return the thawed experiment; returned without modification when only one bioassay dimension is found
 * @throws IllegalArgumentException if the experiment uses more than one platform
 * @throws IllegalStateException    if no bioassay dimensions are found, if a merged vector has the wrong number
 *                                  of values, or if no quantitation type could be merged
 */
@Override
public ExpressionExperiment mergeVectors(ExpressionExperiment ee) {
    Collection<ArrayDesign> arrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
    if (arrayDesigns.size() > 1) {
        throw new IllegalArgumentException("Cannot cope with more than one platform; switch experiment to use a (merged) platform first");
    }
    ee = expressionExperimentService.thaw(ee);
    Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee);
    VectorMergingServiceImpl.log.info(qts.size() + " quantitation types for potential merge");

    /*
     * Load all the bioassay dimensions, which will be merged.
     */
    Collection<BioAssayDimension> allOldBioAssayDims = new HashSet<>();
    for (BioAssay ba : ee.getBioAssays()) {
        Collection<BioAssayDimension> oldBioAssayDims = bioAssayService.findBioAssayDimensions(ba);
        for (BioAssayDimension bioAssayDim : oldBioAssayDims) {
            // A dimension without a description cannot carry the "merged" prefix, so it is kept
            // (guards against a NullPointerException on startsWith).
            String description = bioAssayDim.getDescription();
            if (description != null && description.startsWith(VectorMergingServiceImpl.MERGED_DIM_DESC_PREFIX)) {
                // not foolproof, but avoids some artifacts - e.g. if there were previous failed attempts at this.
                continue;
            }
            allOldBioAssayDims.add(bioAssayDim);
        }
    }
    if (allOldBioAssayDims.isEmpty()) {
        throw new IllegalStateException("No bioAssayDimensions found to merge (previously merged ones are filtered, data may be corrupt?)");
    }
    if (allOldBioAssayDims.size() == 1) {
        VectorMergingServiceImpl.log.warn("Experiment already has only a single bioAssayDimension, nothing seems to need merging. Bailing");
        return ee;
    }
    VectorMergingServiceImpl.log.info(allOldBioAssayDims.size() + " bioAssayDimensions to merge");

    List<BioAssayDimension> sortedOldDims = this.sortedBioAssayDimensions(allOldBioAssayDims);
    BioAssayDimension newBioAd = this.getNewBioAssayDimension(sortedOldDims);
    int totalBioAssays = newBioAd.getBioAssays().size();
    assert totalBioAssays == ee.getBioAssays().size() : "experiment has " + ee.getBioAssays().size() + " but new bioAssayDimension has " + totalBioAssays;
    Map<QuantitationType, Collection<RawExpressionDataVector>> qt2Vec = this.getVectors(ee, qts, allOldBioAssayDims);

    /*
     * This will run into problems if there are excess quantitation types
     */
    int numSuccessfulMergers = 0;
    for (Map.Entry<QuantitationType, Collection<RawExpressionDataVector>> qtEntry : qt2Vec.entrySet()) {
        QuantitationType type = qtEntry.getKey();
        Collection<RawExpressionDataVector> oldVecs = qtEntry.getValue();
        if (oldVecs.isEmpty()) {
            VectorMergingServiceImpl.log.warn("No vectors for " + type);
            continue;
        }
        // getDevMap returns null when no design element has more than one vector.
        Map<CompositeSequence, Collection<RawExpressionDataVector>> deVMap = this.getDevMap(oldVecs);
        if (deVMap == null) {
            VectorMergingServiceImpl.log.info("Vector merging will not be done for " + type + " as there is only one vector per element already");
            continue;
        }
        VectorMergingServiceImpl.log.info("Processing " + oldVecs.size() + " vectors for " + type);
        Collection<RawExpressionDataVector> newVectors = new HashSet<>();
        int numAllMissing = 0;
        int missingValuesForQt = 0;
        for (Map.Entry<CompositeSequence, Collection<RawExpressionDataVector>> deEntry : deVMap.entrySet()) {
            CompositeSequence de = deEntry.getKey();
            Collection<RawExpressionDataVector> dedvs = deEntry.getValue();
            RawExpressionDataVector vector = this.initializeNewVector(ee, newBioAd, type, de);
            /*
             * makeMergedData is meant to ENSURE that we get the vector reconstructed properly. For each of the
             * old bioassayDimensions, find the designElementDataVector that uses it. If there isn't one, fill in
             * the values for that dimension with missing data. We go through the dimensions in the same order that
             * we joined them up.
             */
            List<Object> data = new ArrayList<>();
            int totalMissingInVector = this.makeMergedData(sortedOldDims, newBioAd, type, de, dedvs, data);
            missingValuesForQt += totalMissingInVector;
            if (totalMissingInVector == totalBioAssays) {
                numAllMissing++;
                // we don't save data that is all missing.
                continue;
            }
            if (data.size() != totalBioAssays) {
                throw new IllegalStateException("Wrong number of values for " + de + " / " + type + ", expected " + totalBioAssays + ", got " + data.size());
            }
            byte[] newDataAr = converter.toBytes(data.toArray());
            vector.setData(newDataAr);
            newVectors.add(vector);
        }
        // TRANSACTION
        vectorMergingHelperService.persist(ee, type, newVectors);
        if (numAllMissing > 0) {
            VectorMergingServiceImpl.log.info(numAllMissing + " vectors had all missing values and were junked for " + type);
        }
        if (missingValuesForQt > 0) {
            VectorMergingServiceImpl.log.info(missingValuesForQt + " total missing values: " + type);
        }
        VectorMergingServiceImpl.log.info("Removing " + oldVecs.size() + " old vectors for " + type);
        rawExpressionDataVectorService.remove(oldVecs);
        ee.getRawExpressionDataVectors().removeAll(oldVecs);
        numSuccessfulMergers++;
    }

    if (numSuccessfulMergers == 0) {
        /*
         * Try to clean up
         */
        this.bioAssayDimensionService.remove(newBioAd);
        throw new IllegalStateException("Nothing was merged. Maybe all the vectors are effectively merged already");
    }
    expressionExperimentService.update(ee);
    // Several transactions
    this.cleanUp(ee, allOldBioAssayDims, newBioAd);
    // transaction; the note reports how many dimensions were merged rather than dumping the whole collection
    this.audit(ee, "Vector merging performed, merged " + allOldBioAssayDims.size() + " old bioassay dimensions for " + qts.size() + " quantitation types.");
    // several transactions
    try {
        preprocessorService.process(ee);
    } catch (PreprocessingException e) {
        // Merging itself succeeded at this point, so a postprocessing failure is only logged.
        VectorMergingServiceImpl.log.error("Error during postprocessing", e);
    }
    return ee;
}
use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.
the class VectorMergingServiceImpl method getDevMap.
/**
 * Groups the given vectors by their design element.
 *
 * @param oldVectors old vectors; asserted to be non-empty
 * @return map of design element to the vectors that use it, or {@code null} when no design element has more
 *         than one vector
 */
private Map<CompositeSequence, Collection<RawExpressionDataVector>> getDevMap(Collection<RawExpressionDataVector> oldVectors) {
    assert !oldVectors.isEmpty();
    Map<CompositeSequence, Collection<RawExpressionDataVector>> vectorsByElement = new HashMap<>();
    boolean someElementHasSeveral = false;
    for (RawExpressionDataVector candidate : oldVectors) {
        CompositeSequence element = candidate.getDesignElement();
        Collection<RawExpressionDataVector> bucket = vectorsByElement.get(element);
        if (bucket == null) {
            // First vector seen for this element: start a new bucket.
            if (VectorMergingServiceImpl.log.isDebugEnabled()) {
                VectorMergingServiceImpl.log.debug("adding " + element + " " + element.getBiologicalCharacteristic());
            }
            bucket = new HashSet<RawExpressionDataVector>();
            vectorsByElement.put(element, bucket);
        }
        bucket.add(candidate);
        if (bucket.size() > 1) {
            someElementHasSeveral = true;
        }
    }
    // null signals to the caller that there is nothing to group together.
    return someElementHasSeveral ? vectorsByElement : null;
}
use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.
the class VectorMergingServiceTest method test.
/**
 * Loads GSE3443 from local test files, merges its platforms, switches the experiment to the merged platform,
 * merges the vectors and checks the resulting counts.
 */
@Test
public final void test() throws Exception {
    /*
     * Need a persistent experiment that uses multiple array designs. Then merge the designs, switch the vectors,
     * and merge the vectors. GSE3443 uses the following GPLs:
     *
     * GPL2868, GPL2933, GPL2934, GPL2935, GPL2936, GPL2937, GPL2938
     *
     * Example of a sequence appearing on more than one platform: N57553
     */
    geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGeneratorLocal(this.getTestFileBasePath("gse3443merge")));
    Collection<?> loaded = geoService.fetchAndLoad("GSE3443", false, false, false);
    ee = (ExpressionExperiment) loaded.iterator().next();
    ee = this.eeService.thawLite(ee);
    Collection<ArrayDesign> platforms = eeService.getArrayDesignsUsed(ee);
    assertEquals(7, platforms.size());

    /*
     * Check number of sequences across all platforms. This is how many elements we need on the new platform, plus
     * extras for duplicated sequences (e.g. elements that don't have a sequence...)
     */
    Collection<ArrayDesign> thawedPlatforms = new HashSet<>();
    Set<BioSequence> originalSequences = new HashSet<>();
    for (ArrayDesign platform : platforms) {
        ArrayDesign thawed = arrayDesignService.thaw(platform);
        thawedPlatforms.add(thawed);
        for (CompositeSequence element : thawed.getCompositeSequences()) {
            log.info(element + " " + element.getBiologicalCharacteristic());
            originalSequences.add(element.getBiologicalCharacteristic());
        }
    }
    assertEquals(63, originalSequences.size());

    /*
     * Check total size of elements across all 7 platforms.
     */
    int elementCount = 0;
    for (ArrayDesign thawed : thawedPlatforms) {
        elementCount += thawed.getCompositeSequences().size();
    }
    assertEquals(140, elementCount);

    // Merge the platforms, using one of the thawed ones as the starting point.
    ArrayDesign firstPlatform = thawedPlatforms.iterator().next();
    platforms.remove(firstPlatform);
    assertEquals(null, firstPlatform.getMergedInto());
    String testMergeLabel = "testMerge" + RandomStringUtils.randomAlphabetic(5);
    String mergedLabel = "merged" + RandomStringUtils.randomAlphabetic(5);
    mergedAA = arrayDesignMergeService.merge(firstPlatform, thawedPlatforms, testMergeLabel, mergedLabel, false);
    assertEquals(72, mergedAA.getCompositeSequences().size());

    Set<BioSequence> mergedSequences = new HashSet<>();
    for (CompositeSequence element : mergedAA.getCompositeSequences()) {
        mergedSequences.add(element.getBiologicalCharacteristic());
    }
    assertEquals(63, mergedSequences.size());
    // just to make this explicit. The new array design has to contain all the old sequences.
    assertEquals(originalSequences.size(), mergedSequences.size());

    ee = eeService.thaw(ee);
    assertEquals(1828, ee.getRawExpressionDataVectors().size());
    ee = eePlatformSwitchService.switchExperimentToArrayDesign(ee, mergedAA);
    ee = eeService.thaw(ee);

    // check we actually got switched over.
    for (BioAssay assay : ee.getBioAssays()) {
        assertEquals(mergedAA, assay.getArrayDesignUsed());
    }
    for (RawExpressionDataVector rawVector : ee.getRawExpressionDataVectors()) {
        assertEquals(mergedAA, rawVector.getDesignElement().getArrayDesign());
    }
    assertEquals(15, ee.getQuantitationTypes().size());
    assertEquals(1828, ee.getRawExpressionDataVectors().size());

    ee = vectorMergingService.mergeVectors(ee);

    // check we got the right processed data
    Collection<ProcessedExpressionDataVector> processedVectors = processedExpressionDataVectorService.getProcessedDataVectors(ee);
    assertEquals(72, processedVectors.size());
    ee = eeService.thaw(ee);
    Collection<DoubleVectorValueObject> processedArrays = processedExpressionDataVectorService.getProcessedDataArrays(ee, 50);
    assertEquals(50, processedArrays.size());
}
use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.
the class ExpressionExperimentServiceTest method testGetDesignElementDataVectorsByQt.
/**
 * Looks up raw vectors by the quantitation type of one of the experiment's vectors and checks that 12 are found.
 */
@Test
public final void testGetDesignElementDataVectorsByQt() {
    // Any vector of the experiment will do; only its quantitation type is needed for the query.
    RawExpressionDataVector anyVector = ee.getRawExpressionDataVectors().iterator().next();
    Collection<QuantitationType> query = new HashSet<>();
    query.add(anyVector.getQuantitationType());
    Collection<RawExpressionDataVector> found = rawExpressionDataVectorService.find(query);
    assertEquals(12, found.size());
}
use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.
the class ExpressionDataDoubleMatrixTest method testConstructExpressionDataDoubleMatrix.
/**
 * Tests the construction of an ExpressionDataDoubleMatrix from the experiment's raw vectors: a row can be
 * fetched for one of the design elements, and the raw matrix has 200 rows of 59 values.
 */
@Test
public void testConstructExpressionDataDoubleMatrix() {
    /* test creating the ExpressionDataDoubleMatrix */
    // NOTE(review): this quantitation type is configured but not passed to anything below - confirm whether
    // it is still needed.
    QuantitationType quantitationType = QuantitationType.Factory.newInstance();
    quantitationType.setName(metaData.getQuantitationTypeName());
    quantitationType.setIsPreferred(true);
    quantitationType.setRepresentation(PrimitiveType.DOUBLE);
    quantitationType.setIsMaskedPreferred(false);
    quantitationType.setIsRatio(true);
    quantitationType.setIsBackground(false);
    quantitationType.setIsBackgroundSubtracted(true);
    quantitationType.setIsNormalized(true);

    Collection<RawExpressionDataVector> designElementDataVectors = ee.getRawExpressionDataVectors();
    Collection<CompositeSequence> designElements = new HashSet<>();
    for (DesignElementDataVector designElementDataVector : designElementDataVectors) {
        CompositeSequence de = designElementDataVector.getDesignElement();
        designElements.add(de);
    }

    /* Constructor 1 */
    ExpressionDataDoubleMatrix expressionDataDoubleMatrix = new ExpressionDataDoubleMatrix(designElementDataVectors);

    /* Assertions */
    CompositeSequence deToQuery = designElements.iterator().next();
    Double[] row = expressionDataDoubleMatrix.getRow(deToQuery);
    assertNotNull(row);
    for (Double aRow : row) {
        log.debug(aRow);
    }
    Double[][] dMatrix = expressionDataDoubleMatrix.getRawMatrix();
    // Expected value goes first so that a failure reports expected/actual the right way round.
    assertEquals(200, dMatrix.length);
    assertEquals(59, dMatrix[0].length);
}
Aggregations