Search in sources :

Example 36 with RawExpressionDataVector

use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.

the class VectorMergingServiceImpl method mergeVectors.

/**
 * Merges the raw expression data vectors of an experiment whose data is spread over several
 * bioassay dimensions, so that each quantitation type ends up with one vector per design element
 * on a single new bioassay dimension.
 * <p>
 * Outline: collect the bioassay dimensions that were not produced by a previous merge, build a new
 * dimension from them, and then for every quantitation type build the merged vectors, persist
 * them and remove the old vectors. Afterwards the experiment is updated, cleaned up, audited and
 * passed to the preprocessor.
 *
 * @param ee the experiment to process; it is thawed here
 * @return the thawed experiment; returned without any merging if only one bioassay dimension was
 *         found
 * @throws IllegalArgumentException if the experiment uses more than one platform
 * @throws IllegalStateException    if no bioassay dimensions were found to merge, if a merged
 *                                  vector has the wrong number of values, or if nothing was merged
 *                                  for any quantitation type
 */
@Override
public ExpressionExperiment mergeVectors(ExpressionExperiment ee) {
    Collection<ArrayDesign> arrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee);
    if (arrayDesigns.size() > 1) {
        throw new IllegalArgumentException("Cannot cope with more than one platform; switch experiment to use a (merged) platform first");
    }
    ee = expressionExperimentService.thaw(ee);
    Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee);
    VectorMergingServiceImpl.log.info(qts.size() + " quantitation types for potential merge");

    /*
     * Load all the bioassay dimensions, which will be merged.
     */
    Collection<BioAssayDimension> allOldBioAssayDims = new HashSet<>();
    for (BioAssay ba : ee.getBioAssays()) {
        Collection<BioAssayDimension> oldBioAssayDims = bioAssayService.findBioAssayDimensions(ba);
        for (BioAssayDimension bioAssayDim : oldBioAssayDims) {
            // A dimension without a description cannot carry the "merged" prefix, so it is kept;
            // the null check avoids a NullPointerException on such dimensions.
            String description = bioAssayDim.getDescription();
            if (description != null && description.startsWith(VectorMergingServiceImpl.MERGED_DIM_DESC_PREFIX)) {
                // not foolproof, but avoids some artifacts - e.g. if there were previous failed attempts at this.
                continue;
            }
            allOldBioAssayDims.add(bioAssayDim);
        }
    }
    if (allOldBioAssayDims.isEmpty()) {
        throw new IllegalStateException("No bioAssayDimensions found to merge (previously merged ones are filtered, data may be corrupt?");
    }
    if (allOldBioAssayDims.size() == 1) {
        VectorMergingServiceImpl.log.warn("Experiment already has only a single bioAssayDimension, nothing seems to need merging. Bailing");
        return ee;
    }
    VectorMergingServiceImpl.log.info(allOldBioAssayDims.size() + " bioAssayDimensions to merge");

    List<BioAssayDimension> sortedOldDims = this.sortedBioAssayDimensions(allOldBioAssayDims);
    BioAssayDimension newBioAd = this.getNewBioAssayDimension(sortedOldDims);
    int totalBioAssays = newBioAd.getBioAssays().size();
    assert totalBioAssays == ee.getBioAssays().size() : "experiment has " + ee.getBioAssays().size() + " but new bioAssayDimension has " + totalBioAssays;
    Map<QuantitationType, Collection<RawExpressionDataVector>> qt2Vec = this.getVectors(ee, qts, allOldBioAssayDims);

    /*
     * This will run into problems if there are excess quantitation types
     */
    int numSuccessfulMergers = 0;
    for (Map.Entry<QuantitationType, Collection<RawExpressionDataVector>> qtEntry : qt2Vec.entrySet()) {
        QuantitationType type = qtEntry.getKey();
        Collection<RawExpressionDataVector> oldVecs = qtEntry.getValue();
        if (oldVecs.isEmpty()) {
            VectorMergingServiceImpl.log.warn("No vectors for " + type);
            continue;
        }
        // getDevMap returns null when no design element has more than one vector.
        Map<CompositeSequence, Collection<RawExpressionDataVector>> deVMap = this.getDevMap(oldVecs);
        if (deVMap == null) {
            VectorMergingServiceImpl.log.info("Vector merging will not be done for " + type + " as there is only one vector per element already");
            continue;
        }
        VectorMergingServiceImpl.log.info("Processing " + oldVecs.size() + " vectors  for " + type);
        Collection<RawExpressionDataVector> newVectors = new HashSet<>();
        int numAllMissing = 0;
        int missingValuesForQt = 0;
        for (Map.Entry<CompositeSequence, Collection<RawExpressionDataVector>> deEntry : deVMap.entrySet()) {
            CompositeSequence de = deEntry.getKey();
            RawExpressionDataVector vector = this.initializeNewVector(ee, newBioAd, type, de);
            Collection<RawExpressionDataVector> dedvs = deEntry.getValue();
            /*
             * these ugly nested loops are to ENSURE that we get the vector reconstructed properly. For each of the
             * old bioassayDimensions, find the designElementDataVector that uses it. If there isn't one, fill in
             * the values for that dimension with missing data. We go through the dimensions in the same order that
             * we joined them up.
             */
            List<Object> data = new ArrayList<>();
            int totalMissingInVector = this.makeMergedData(sortedOldDims, newBioAd, type, de, dedvs, data);
            missingValuesForQt += totalMissingInVector;
            if (totalMissingInVector == totalBioAssays) {
                numAllMissing++;
                // we don't save data that is all missing.
                continue;
            }
            if (data.size() != totalBioAssays) {
                throw new IllegalStateException("Wrong number of values for " + de + " / " + type + ", expected " + totalBioAssays + ", got " + data.size());
            }
            byte[] newDataAr = converter.toBytes(data.toArray());
            vector.setData(newDataAr);
            newVectors.add(vector);
        }
        // TRANSACTION
        vectorMergingHelperService.persist(ee, type, newVectors);
        if (numAllMissing > 0) {
            VectorMergingServiceImpl.log.info(numAllMissing + " vectors had all missing values and were junked for " + type);
        }
        if (missingValuesForQt > 0) {
            VectorMergingServiceImpl.log.info(missingValuesForQt + " total missing values: " + type);
        }
        VectorMergingServiceImpl.log.info("Removing " + oldVecs.size() + " old vectors for " + type);
        rawExpressionDataVectorService.remove(oldVecs);
        ee.getRawExpressionDataVectors().removeAll(oldVecs);
        numSuccessfulMergers++;
    }
    if (numSuccessfulMergers == 0) {
        /*
         * Try to clean up
         */
        this.bioAssayDimensionService.remove(newBioAd);
        throw new IllegalStateException("Nothing was merged. Maybe all the vectors are effectively merged already");
    }
    expressionExperimentService.update(ee);
    // Several transactions
    this.cleanUp(ee, allOldBioAssayDims, newBioAd);
    // transaction
    // Report the number of merged dimensions rather than dumping the whole collection into the note.
    this.audit(ee, "Vector merging performed, merged " + allOldBioAssayDims.size() + " old bioassay dimensions for " + qts.size() + " quantitation types.");
    // several transactions
    try {
        preprocessorService.process(ee);
    } catch (PreprocessingException e) {
        // A failure here is logged only; the merged experiment is still returned.
        VectorMergingServiceImpl.log.error("Error during postprocessing", e);
    }
    return ee;
}
Also used : ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension) RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) QuantitationType(ubic.gemma.model.common.quantitationtype.QuantitationType) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 37 with RawExpressionDataVector

use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.

the class VectorMergingServiceImpl method getDevMap.

/**
 * Groups the given vectors by their design element.
 *
 * @param oldVectors old vectors; asserted to be non-empty
 * @return map of design element to the vectors for that element, or null if no design element
 *         has more than one vector
 */
private Map<CompositeSequence, Collection<RawExpressionDataVector>> getDevMap(Collection<RawExpressionDataVector> oldVectors) {
    assert !oldVectors.isEmpty();
    Map<CompositeSequence, Collection<RawExpressionDataVector>> grouped = new HashMap<>();
    boolean sawDuplicate = false;
    for (RawExpressionDataVector oldVector : oldVectors) {
        CompositeSequence element = oldVector.getDesignElement();
        Collection<RawExpressionDataVector> bucket = grouped.get(element);
        if (bucket == null) {
            // First vector seen for this element: start a new bucket.
            if (VectorMergingServiceImpl.log.isDebugEnabled()) {
                VectorMergingServiceImpl.log.debug("adding " + element + " " + element.getBiologicalCharacteristic());
            }
            bucket = new HashSet<>();
            grouped.put(element, bucket);
        }
        bucket.add(oldVector);
        if (bucket.size() > 1) {
            sawDuplicate = true;
        }
    }
    // Callers treat null as "there is nothing to merge".
    return sawDuplicate ? grouped : null;
}
Also used : RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 38 with RawExpressionDataVector

use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.

the class VectorMergingServiceTest method test.

/**
 * End-to-end check of vector merging: loads GSE3443 from local test files, merges the platforms
 * it uses into one, switches the experiment to the merged platform, merges the vectors, and then
 * checks the processed data that results.
 */
@Test
public final void test() throws Exception {
    /*
     * Need a persistent experiment that uses multiple array designs. Then merge the designs, switch the vectors,
     * and merge the vectors. GSE3443
     */
    /*
     * The experiment uses the following GPLs
     *
     * GPL2868, GPL2933, GPL2934, GPL2935, GPL2936, GPL2937, GPL2938
     *
     * Example of a sequence appearing on more than one platform: N57553
     */
    geoService.setGeoDomainObjectGenerator(new GeoDomainObjectGeneratorLocal(this.getTestFileBasePath("gse3443merge")));
    Collection<?> results = geoService.fetchAndLoad("GSE3443", false, false, false);
    ee = (ExpressionExperiment) results.iterator().next();
    ee = this.eeService.thawLite(ee);
    Collection<ArrayDesign> aas = eeService.getArrayDesignsUsed(ee);
    assertEquals(7, aas.size());
    /*
     * Check number of sequences across all platforms. This is how many elements we need on the new platform, plus
     * extras for duplicated sequences (e.g. elements that don't have a sequence...)
     */
    // taas holds the thawed platforms; oldbs the distinct sequences seen across them.
    Collection<ArrayDesign> taas = new HashSet<>();
    Set<BioSequence> oldbs = new HashSet<>();
    for (ArrayDesign arrayDesign : aas) {
        arrayDesign = arrayDesignService.thaw(arrayDesign);
        taas.add(arrayDesign);
        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            log.info(cs + " " + cs.getBiologicalCharacteristic());
            oldbs.add(cs.getBiologicalCharacteristic());
        }
    }
    assertEquals(63, oldbs.size());
    /*
     * Check total size of elements across all 7 platforms.
     */
    int totalElements = 0;
    for (ArrayDesign arrayDesign : taas) {
        totalElements += arrayDesign.getCompositeSequences().size();
    }
    assertEquals(140, totalElements);
    // Merge the platforms, using an arbitrary one of the thawed platforms as the first argument.
    ArrayDesign firstaa = taas.iterator().next();
    aas.remove(firstaa);
    assertEquals(null, firstaa.getMergedInto());
    mergedAA = arrayDesignMergeService.merge(firstaa, taas, "testMerge" + RandomStringUtils.randomAlphabetic(5), "merged" + RandomStringUtils.randomAlphabetic(5), false);
    assertEquals(72, mergedAA.getCompositeSequences().size());
    Set<BioSequence> seenBs = new HashSet<>();
    for (CompositeSequence cs : mergedAA.getCompositeSequences()) {
        seenBs.add(cs.getBiologicalCharacteristic());
    }
    assertEquals(63, seenBs.size());
    // just to make this explicit. The new array design has to contain all the old sequences.
    assertEquals(oldbs.size(), seenBs.size());
    ee = eeService.thaw(ee);
    assertEquals(1828, ee.getRawExpressionDataVectors().size());
    // Switch the experiment over to the merged platform.
    ee = eePlatformSwitchService.switchExperimentToArrayDesign(ee, mergedAA);
    ee = eeService.thaw(ee);
    // check we actually got switched over.
    for (BioAssay ba : ee.getBioAssays()) {
        assertEquals(mergedAA, ba.getArrayDesignUsed());
    }
    for (RawExpressionDataVector v : ee.getRawExpressionDataVectors()) {
        assertEquals(mergedAA, v.getDesignElement().getArrayDesign());
    }
    // The switch is expected to leave the number of raw vectors unchanged (1828 before and after).
    assertEquals(15, ee.getQuantitationTypes().size());
    assertEquals(1828, ee.getRawExpressionDataVectors().size());
    // The operation under test.
    ee = vectorMergingService.mergeVectors(ee);
    // check we got the right processed data
    Collection<ProcessedExpressionDataVector> pvs = processedExpressionDataVectorService.getProcessedDataVectors(ee);
    // 72 matches the number of elements asserted on the merged platform above.
    assertEquals(72, pvs.size());
    ee = eeService.thaw(ee);
    // Request with a second argument of 50 and expect exactly 50 arrays back.
    Collection<DoubleVectorValueObject> processedDataArrays = processedExpressionDataVectorService.getProcessedDataArrays(ee, 50);
    assertEquals(50, processedDataArrays.size());
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) ProcessedExpressionDataVector(ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) DoubleVectorValueObject(ubic.gemma.model.expression.bioAssayData.DoubleVectorValueObject) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay) GeoDomainObjectGeneratorLocal(ubic.gemma.core.loader.expression.geo.GeoDomainObjectGeneratorLocal) HashSet(java.util.HashSet) AbstractGeoServiceTest(ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest) Test(org.junit.Test)

Example 39 with RawExpressionDataVector

use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.

the class ExpressionExperimentServiceTest method testGetDesignElementDataVectorsByQt.

/**
 * Looks up raw vectors by a single quantitation type, taken from an arbitrary raw vector of the
 * fixture experiment, and checks how many vectors come back.
 */
@Test
public final void testGetDesignElementDataVectorsByQt() {
    RawExpressionDataVector anyVector = ee.getRawExpressionDataVectors().iterator().next();
    Collection<QuantitationType> query = new HashSet<>();
    query.add(anyVector.getQuantitationType());
    Collection<RawExpressionDataVector> found = rawExpressionDataVectorService.find(query);
    assertEquals(12, found.size());
}
Also used : RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) QuantitationType(ubic.gemma.model.common.quantitationtype.QuantitationType) HashSet(java.util.HashSet) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 40 with RawExpressionDataVector

use of ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector in project Gemma by PavlidisLab.

the class ExpressionDataDoubleMatrixTest method testConstructExpressionDataDoubleMatrix.

/**
 * Tests the construction of an ExpressionDataDoubleMatrix from the raw vectors of the fixture
 * experiment: a row can be fetched by design element, and the raw matrix has the expected
 * dimensions.
 */
@Test
public void testConstructExpressionDataDoubleMatrix() {
    /* test creating the ExpressionDataDoubleMatrix */
    // NOTE(review): this quantitation type is configured but not passed to the matrix constructor
    // used below; kept as in the original.
    QuantitationType quantitationType = QuantitationType.Factory.newInstance();
    quantitationType.setName(metaData.getQuantitationTypeName());
    quantitationType.setIsPreferred(true);
    quantitationType.setRepresentation(PrimitiveType.DOUBLE);
    quantitationType.setIsMaskedPreferred(false);
    quantitationType.setIsRatio(true);
    quantitationType.setIsBackground(false);
    quantitationType.setIsBackgroundSubtracted(true);
    quantitationType.setIsNormalized(true);
    Collection<RawExpressionDataVector> designElementDataVectors = ee.getRawExpressionDataVectors();
    // Collect the distinct design elements so one can be used to query a row below.
    Collection<CompositeSequence> designElements = new HashSet<>();
    for (DesignElementDataVector designElementDataVector : designElementDataVectors) {
        CompositeSequence de = designElementDataVector.getDesignElement();
        designElements.add(de);
    }
    /* Constructor 1 */
    ExpressionDataDoubleMatrix expressionDataDoubleMatrix = new ExpressionDataDoubleMatrix(designElementDataVectors);
    /* Assertions */
    CompositeSequence deToQuery = designElements.iterator().next();
    Double[] row = expressionDataDoubleMatrix.getRow(deToQuery);
    assertNotNull(row);
    for (Double aRow : row) {
        log.debug(aRow);
    }
    Double[][] dMatrix = expressionDataDoubleMatrix.getRawMatrix();
    // JUnit's argument order is (expected, actual); the swapped order gives misleading failure messages.
    assertEquals(200, dMatrix.length);
    assertEquals(59, dMatrix[0].length);
}
Also used : RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) DesignElementDataVector(ubic.gemma.model.expression.bioAssayData.DesignElementDataVector) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) AbstractGeoServiceTest(ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest) Test(org.junit.Test)

Aggregations

RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) 53, CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence) 18, ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign) 16, BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay) 16, BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension) 16, Test (org.junit.Test) 15, QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType) 13, ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment) 12, BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest) 9, BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial) 7, InputStream (java.io.InputStream) 6, Collection (java.util.Collection) 6, HashSet (java.util.HashSet) 6, GZIPInputStream (java.util.zip.GZIPInputStream) 6, ByteArrayConverter (ubic.basecode.io.ByteArrayConverter) 6, AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest) 6, GeoSeries (ubic.gemma.core.loader.expression.geo.model.GeoSeries) 6, DesignElementDataVector (ubic.gemma.model.expression.bioAssayData.DesignElementDataVector) 6, ProcessedExpressionDataVector (ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector) 5, Transactional (org.springframework.transaction.annotation.Transactional) 4