Search in sources :

Example 66 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class GeoConverterImpl method processId.

/**
 * Converts a single platform row into a {@link CompositeSequence} (with an associated minimal
 * {@link BioSequence}) and registers it, unless the row is rejected by strict selection.
 * <p>
 * The row is addressed by {@code i} in every per-row list passed in ({@code sequences},
 * {@code probeOrganism}, {@code cloneIdentifiers}, {@code externalRefs}, platform column data), and
 * {@code descIter} is consumed once per call. The returned index is therefore always {@code i + 1},
 * including when the row is skipped, so that the caller's next invocation reads the next row.
 *
 * @param platform               GEO platform providing column data and the probe-name mapping
 * @param arrayDesign            array design the new element is attached to
 * @param probeOrganismColumn    name of the per-probe organism column; when {@code null} the
 *                               {@code primaryTaxon} is used for every probe
 * @param externalDb             passed through to {@code setBsProps}
 * @param sequences              passed through to {@code setBsProps}
 * @param probeOrganism          per-row organism values, or {@code null}
 * @param primaryTaxon           taxon used when there is no probe organism column
 * @param cloneIdentifiers       per-row clone identifiers, or {@code null}
 * @param externalRefs           external reference columns, or {@code null}
 * @param descIter               iterator over per-row descriptions, or {@code null}
 * @param refSeqAccessionPattern passed through to {@code setBsProps}
 * @param strictSelection        when {@code true}, rows with a blank external accession are kept only if
 *                               the platform has a non-empty "gene_assignment" value for them
 * @param skipped                receives the ids of rows rejected via the "gene_assignment" check
 * @param compositeSequences     receives the newly created element
 * @param i                      index of the current row
 * @param id                     platform identifier of the current row
 * @return the index of the next row, i.e. {@code i + 1}
 */
private int processId(GeoPlatform platform, ArrayDesign arrayDesign, String probeOrganismColumn, ExternalDatabase externalDb, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon, List<String> cloneIdentifiers, List<List<String>> externalRefs, Iterator<String> descIter, Pattern refSeqAccessionPattern, boolean strictSelection, List<String> skipped, Collection<CompositeSequence> compositeSequences, int i, String id) {
    // Consume this row's description before any early return: the iterator advances once per row, so
    // leaving it untouched on a skipped row would attach every later description to the wrong probe.
    boolean hasDescriptions = descIter != null;
    String rowDescription = hasDescriptions ? descIter.next() : null;

    String externalAccession = null;
    if (externalRefs != null) {
        externalAccession = this.getExternalAccession(externalRefs, i);
    }
    if (strictSelection && StringUtils.isBlank(externalAccession)) {
        // currently this is crafted to deal with affymetrix exon arrays, but could be expanded.
        // mrna_assignment is less strict than gene_assignment
        // salvage it if it has a gene assignment.
        String filteringColumn = "gene_assignment";
        if (!platform.getColumnNames().contains(filteringColumn)) {
            // we just skip ones that don't have an external accession; still move on to the next row.
            return i + 1;
        }
        String cd = platform.getColumnData(filteringColumn).get(i);
        if (StringUtils.isBlank(cd) || cd.equals("---")) {
            skipped.add(id);
            if (skipped.size() % 10000 == 0) {
                GeoConverterImpl.log.info("Skipped " + skipped.size() + " elements due to strict selection; last was " + id);
            }
            return i + 1;
        }
        // remaining case here: externalAccession is blank, but there is another column that we think saves it.
    }
    String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);
    String description = "";
    if (externalAccession != null) {
        String[] refs = externalAccession.split(",");
        if (refs.length > 1) {
            // keep the full list in the description, but use only the first accession as the reference.
            description = "Multiple external sequence references: " + externalAccession + "; ";
            externalAccession = refs[0];
        }
    }
    if (hasDescriptions) {
        description = description + " " + rowDescription;
    }
    CompositeSequence cs = CompositeSequence.Factory.newInstance();
    String probeName = platform.getProbeNamesInGemma().get(id);
    if (probeName == null) {
        probeName = id;
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Probe retaining original name: " + probeName);
        }
        // must make sure this is populated.
        platform.getProbeNamesInGemma().put(id, id);
    } else {
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Found probe: " + probeName);
        }
    }
    cs.setName(probeName);
    cs.setDescription(description);
    cs.setArrayDesign(arrayDesign);
    // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
    // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
    // biosequence
    Taxon probeTaxon = Taxon.Factory.newInstance();
    if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
        probeTaxon = this.convertProbeOrganism(probeOrganism.get(i));
    }
    // if there are no probe taxons then all the probes should take the taxon from the primary taxon
    if (probeOrganismColumn == null) {
        probeTaxon = primaryTaxon;
    }
    BioSequence bs = this.createMinimalBioSequence(probeTaxon);
    this.setBsProps(platform, externalDb, sequences, refSeqAccessionPattern, i, id, externalAccession, cloneIdentifier, bs);
    this.checkCs(arrayDesign, externalAccession, cloneIdentifier, cs, probeTaxon, bs);
    compositeSequences.add(cs);
    platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);
    return i + 1;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 67 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method getRanks.

/**
 * Looks up the stored rank of every processed expression data vector of an experiment.
 *
 * @param expressionExperiment experiment whose processed vectors are queried (by id)
 * @param method               which stored rank to report: {@code mean} selects rankByMean, {@code max}
 *                             selects rankByMax; any other value yields no entries
 * @return map from design element to the selected rank; a rank stored as null is reported as
 *         {@link Double#NaN}
 */
@Override
public Map<CompositeSequence, Double> getRanks(ExpressionExperiment expressionExperiment, RankMethod method) {
    // language=HQL
    final String hql = "select dedv.designElement, dedv.rankByMean, dedv.rankByMax from ProcessedExpressionDataVector dedv " + "where dedv.expressionExperiment.id = :ee";
    List<?> rows = this.getSessionFactory().getCurrentSession()
            .createQuery(hql)
            .setParameter("ee", expressionExperiment.getId())
            .list();

    Map<CompositeSequence, Double> ranksByElement = new HashMap<>();
    for (Object row : rows) {
        // each row is [designElement, rankByMean, rankByMax], in the order of the select clause
        Object[] columns = (Object[]) row;
        CompositeSequence element = (CompositeSequence) columns[0];

        Double byMean = Double.NaN;
        if (columns[1] != null) {
            byMean = (Double) columns[1];
        }
        Double byMax = Double.NaN;
        if (columns[2] != null) {
            byMax = (Double) columns[2];
        }

        switch (method) {
            case mean:
                ranksByElement.put(element, byMean);
                break;
            case max:
                ranksByElement.put(element, byMax);
                break;
            default:
                // no other rank is selected by the query; nothing is recorded
                break;
        }
    }
    return ranksByElement;
}
Also used : BioAssayValueObject(ubic.gemma.model.expression.bioAssay.BioAssayValueObject) ExpressionExperimentValueObject(ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject) CompositeSequenceValueObject(ubic.gemma.model.expression.designElement.CompositeSequenceValueObject) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 68 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method maskAndUnpack.

/**
 * Unpacks the preferred data vectors and blanks out values flagged as missing.
 * <p>
 * Every returned vector is marked as masked, whether or not a mask could actually be applied to it.
 *
 * @param preferredData    raw vectors holding the preferred data
 * @param missingValueData raw vectors holding the present/absent flags; may be empty, in which case the
 *                         data are used unchanged
 * @return the unpacked vectors as returned by {@code unpack}, with each value whose mask flag is
 *         {@code false} replaced by {@link Double#NaN}
 * @throws IllegalStateException if a mask vector and its data vector differ in length
 */
private Map<CompositeSequence, DoubleVectorValueObject> maskAndUnpack(Collection<RawExpressionDataVector> preferredData, Collection<RawExpressionDataVector> missingValueData) {
    Map<CompositeSequence, DoubleVectorValueObject> vectors = this.unpack(preferredData);

    boolean noMaskInformation = missingValueData.size() == 0;
    if (noMaskInformation) {
        AbstractDao.log.info("There is no separate missing data information, simply using the data as is");
        for (DoubleVectorValueObject vector : vectors.values()) {
            vector.setMasked(true);
        }
        return vectors;
    }

    // index the mask vectors by their design element for lookup below
    Map<CompositeSequenceValueObject, BooleanVectorValueObject> maskByElement = new HashMap<>();
    for (BooleanVectorValueObject mask : this.unpackBooleans(missingValueData)) {
        maskByElement.put(mask.getDesignElement(), mask);
    }

    boolean alreadyWarned = false;
    for (DoubleVectorValueObject vector : vectors.values()) {
        double[] values = vector.getData();
        CompositeSequenceValueObject element = vector.getDesignElement();
        BooleanVectorValueObject mask = maskByElement.get(element);

        if (mask != null) {
            boolean[] present = mask.getData();
            if (present.length != values.length) {
                throw new IllegalStateException("Missing value data didn't match data length");
            }
            for (int idx = 0; idx < values.length; idx++) {
                if (!present[idx]) {
                    values[idx] = Double.NaN;
                }
            }
            vector.setMasked(true);
            continue;
        }

        // no mask vector for this element: warn once per call, but still flag it as effectively masked.
        if (!alreadyWarned && AbstractDao.log.isWarnEnabled()) {
            AbstractDao.log.warn("No mask vector for " + element + ", additional warnings for missing masks for this job will be skipped");
        }
        vector.setMasked(true);
        alreadyWarned = true;
    }
    return vectors;
}
Also used : CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) CompositeSequenceValueObject(ubic.gemma.model.expression.designElement.CompositeSequenceValueObject)

Example 69 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ProcessedExpressionDataVectorDaoImpl method createProcessedDataVectors.

/**
 * Rebuilds the processed expression data vectors of an experiment from its preferred raw vectors.
 * <p>
 * The experiment is re-read from the current Hibernate session by id, its existing processed vectors are
 * removed, the preferred raw vectors are unpacked and masked (using missing-value vectors only when
 * {@code isTwoChannel} reports true), optionally renormalized, converted to processed vectors, and the
 * experiment is then updated in the session. Finally the processed-data-vector cache entry for the
 * experiment is cleared.
 *
 * @param ee the experiment to process; only its id is used to load the working instance
 * @return the session-loaded experiment instance carrying the new processed vectors
 * @throws IllegalStateException    if {@code ee} is null, or if a design element is encountered twice
 *                                  while building the vectors
 * @throws IllegalArgumentException if the experiment has no preferred data vectors
 */
@Override
public ExpressionExperiment createProcessedDataVectors(ExpressionExperiment ee) {
    if (ee == null) {
        throw new IllegalStateException("ExpressionExperiment cannot be null");
    }
    // Work on the instance attached to the current session rather than on the argument.
    ExpressionExperiment expressionExperiment = (ExpressionExperiment) this.getSessionFactory().getCurrentSession().get(ExpressionExperiment.class, ee.getId());
    // NOTE(review): this is only checked when assertions are enabled; otherwise a null here surfaces later
    // as a NullPointerException.
    assert expressionExperiment != null;
    this.removeProcessedDataVectors(expressionExperiment);
    // Force-load the associations that are read or modified below.
    Hibernate.initialize(expressionExperiment);
    Hibernate.initialize(expressionExperiment.getQuantitationTypes());
    Hibernate.initialize(expressionExperiment.getProcessedExpressionDataVectors());
    expressionExperiment.getProcessedExpressionDataVectors().clear();
    AbstractDao.log.info("Computing processed expression vectors for " + expressionExperiment);
    boolean isTwoChannel = this.isTwoChannel(expressionExperiment);
    // Stays empty unless the experiment is two-channel; an empty collection makes maskAndUnpack use the
    // data as is.
    Collection<RawExpressionDataVector> missingValueVectors = new HashSet<>();
    if (isTwoChannel) {
        missingValueVectors = this.getMissingValueVectors(expressionExperiment);
    }
    Collection<RawExpressionDataVector> preferredDataVectors = this.getPreferredDataVectors(expressionExperiment);
    if (preferredDataVectors.isEmpty()) {
        throw new IllegalArgumentException("No preferred data vectors for " + expressionExperiment);
    }
    Map<CompositeSequence, DoubleVectorValueObject> maskedVectorObjects = this.maskAndUnpack(preferredDataVectors, missingValueVectors);
    /*
     * Create the vectors. Do a sanity check that we don't have more than we should
     */
    Collection<CompositeSequence> seenDes = new HashSet<>();
    // Any one preferred vector is used to obtain the source quantitation type.
    RawExpressionDataVector preferredDataVectorExemplar = preferredDataVectors.iterator().next();
    QuantitationType preferredMaskedDataQuantitationType = this.getPreferredMaskedDataQuantitationType(preferredDataVectorExemplar.getQuantitationType());
    /*
     * Note that we used to not normalize count data, but we've removed this restriction; and in any case we have
     * moved to using non-count summaries for the primary data type.
     */
    if (preferredMaskedDataQuantitationType.getType().equals(StandardQuantitationType.COUNT)) {
        /*
         * Backfill target
         */
        AbstractDao.log.warn("Preferred data are counts; please convert to log2cpm");
    }
    // Renormalize only non-ratio data with more than MIN_SIZE_FOR_RENORMALIZATION vectors.
    if (!preferredMaskedDataQuantitationType.getIsRatio() && maskedVectorObjects.size() > ProcessedExpressionDataVectorDaoImpl.MIN_SIZE_FOR_RENORMALIZATION) {
        AbstractDao.log.info("Normalizing the data");
        this.renormalize(maskedVectorObjects);
    } else {
        AbstractDao.log.info("Normalization skipped for this data set (not suitable)");
    }
    // Counts vectors built so far, for progress logging only.
    int i = 0;
    for (CompositeSequence cs : maskedVectorObjects.keySet()) {
        DoubleVectorValueObject dvvo = maskedVectorObjects.get(cs);
        if (seenDes.contains(cs)) {
            // defensive programming, this happens.
            throw new IllegalStateException("Duplicated design element: " + cs + "; make sure the experiment has only one 'preferred' quantitation type. " + "Perhaps you need to run vector merging following an array design switch?");
        }
        // NOTE(review): the vector is built against the method argument 'ee', not the session-loaded
        // 'expressionExperiment' it is added to on the next line — confirm this is intended.
        ProcessedExpressionDataVector vec = (ProcessedExpressionDataVector) dvvo.toDesignElementDataVector(ee, cs, preferredMaskedDataQuantitationType);
        expressionExperiment.getProcessedExpressionDataVectors().add(vec);
        seenDes.add(cs);
        if (++i % 5000 == 0) {
            AbstractDao.log.info(i + " vectors built");
        }
    }
    AbstractDao.log.info("Persisting " + expressionExperiment.getProcessedExpressionDataVectors().size() + " processed data vectors");
    // Register the quantitation type of the new vectors on the experiment and record the vector count.
    expressionExperiment.getQuantitationTypes().add(preferredMaskedDataQuantitationType);
    expressionExperiment.setNumberOfDataVectors(expressionExperiment.getProcessedExpressionDataVectors().size());
    this.getSessionFactory().getCurrentSession().update(expressionExperiment);
    assert expressionExperiment.getNumberOfDataVectors() != null;
    // Drop cached vectors for this experiment now that they have been replaced.
    this.processedDataVectorCache.clearCache(expressionExperiment.getId());
    return expressionExperiment;
}
Also used : ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) StandardQuantitationType(ubic.gemma.model.common.quantitationtype.StandardQuantitationType) QuantitationType(ubic.gemma.model.common.quantitationtype.QuantitationType)

Example 70 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class CompositeSequenceDaoImpl method getGenesWithSpecificity.

/**
 * Builds the probe -> alignment specificity map for the given composite sequences.
 * <p>
 * The input is processed in batches of {@code PROBE_TO_GENE_MAP_BATCH_SIZE}; each batch is handed to
 * {@code batchGetGenesWithSpecificity}, which fills the shared result map.
 *
 * @param compositeSequences the probes to look up
 * @return map populated by the batch lookups
 */
@Override
public Map<CompositeSequence, Collection<BioSequence2GeneProduct>> getGenesWithSpecificity(Collection<CompositeSequence> compositeSequences) {
    AbstractDao.log.info("Getting cs -> alignment specificity map for " + compositeSequences.size() + " composite sequences");

    Map<CompositeSequence, Collection<BioSequence2GeneProduct>> specificityByProbe = new HashMap<>();
    BatchIterator<CompositeSequence> batches = BatchIterator.batches(compositeSequences, CompositeSequenceDaoImpl.PROBE_TO_GENE_MAP_BATCH_SIZE);

    StopWatch watch = new StopWatch();
    watch.start();
    int processed = 0;
    while (batches.hasNext()) {
        Collection<CompositeSequence> chunk = batches.next();
        this.batchGetGenesWithSpecificity(chunk, specificityByProbe);
        processed += chunk.size();
    }
    watch.stop();

    // only report timing for slow runs (more than 10000 ms)
    long elapsedMs = watch.getTime();
    if (elapsedMs > 10000) {
        AbstractDao.log.info("Probe to gene map finished: " + processed + " retrieved in " + elapsedMs + "ms");
    }
    return specificityByProbe;
}
Also used : CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) StopWatch(org.apache.commons.lang3.time.StopWatch)

Aggregations

CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)206 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)43 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)40 Gene (ubic.gemma.model.genome.Gene)32 Test (org.junit.Test)30 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)19 ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)18 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)18 DesignElementDataVector (ubic.gemma.model.expression.bioAssayData.DesignElementDataVector)18 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)18 StopWatch (org.apache.commons.lang3.time.StopWatch)17 HashSet (java.util.HashSet)15 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)15 CompositeSequenceValueObject (ubic.gemma.model.expression.designElement.CompositeSequenceValueObject)15 ArrayList (java.util.ArrayList)14 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)14 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)13 Taxon (ubic.gemma.model.genome.Taxon)12 Collection (java.util.Collection)11 ByteArrayConverter (ubic.basecode.io.ByteArrayConverter)11