Search in sources :

Example 26 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ArrayDesignAnnotationServiceImpl method generateAnnotationFile.

/**
 * Writes one annotation line per composite sequence in {@code genesWithSpecificity} and then closes the writer.
 * <p>
 * Three cases are distinguished per composite sequence: no associated gene products (an empty line is written),
 * exactly one (written directly), and several (symbols, names, ids and NCBI ids are joined with "|").
 * <p>
 * The writer is closed when this method returns, whether it completes normally or throws.
 *
 * @param writer               destination for the annotation lines; closed by this method
 * @param genesWithSpecificity composite sequences mapped to their gene product associations
 * @param ty                   passed through to {@code getGoTerms} when collecting GO terms
 * @return the number of composite sequences processed
 * @throws IOException if writing a line or closing the writer fails
 */
@Override
public int generateAnnotationFile(Writer writer, Map<CompositeSequence, Collection<BioSequence2GeneProduct>> genesWithSpecificity, OutputType ty) throws IOException {
    int compositeSequencesProcessed = 0;
    int simple = 0;
    int empty = 0;
    int complex = 0;
    // we use LinkedHashSets to keep everything in a predictable order - this is important for the gene symbols,
    // descriptions and NCBIIds (but not important for GO terms). When a probe maps to multiple genes, we list those
    // three items for the genes in the same order. There is a feature request to make
    // the order deterministic (i.e., lexicographic sort), this could be done by using little gene objects or whatever.
    Collection<OntologyTerm> goTerms = new LinkedHashSet<>();
    Set<String> genes = new LinkedHashSet<>();
    Set<String> geneDescriptions = new LinkedHashSet<>();
    Set<String> geneIds = new LinkedHashSet<>();
    Set<String> ncbiIds = new LinkedHashSet<>();
    // try-with-resources so the writer is released even if a write fails part-way through.
    try (Writer out = writer) {
        Map<Gene, Collection<VocabCharacteristic>> goMappings = this.getGOMappings(genesWithSpecificity);
        for (Map.Entry<CompositeSequence, Collection<BioSequence2GeneProduct>> entry : genesWithSpecificity.entrySet()) {
            CompositeSequence cs = entry.getKey();
            Collection<BioSequence2GeneProduct> geneclusters = entry.getValue();
            if (++compositeSequencesProcessed % 2000 == 0 && ArrayDesignAnnotationServiceImpl.log.isInfoEnabled()) {
                ArrayDesignAnnotationServiceImpl.log.info("Processed " + compositeSequencesProcessed + "/" + genesWithSpecificity.size() + " compositeSequences " + empty + " empty; " + simple + " simple; " + complex + " complex;");
            }
            if (geneclusters.isEmpty()) {
                this.writeAnnotationLine(out, cs.getName(), "", "", null, "", "");
                empty++;
                continue;
            }
            if (geneclusters.size() == 1) {
                // common case, do it quickly.
                BioSequence2GeneProduct b2g = geneclusters.iterator().next();
                Gene g = b2g.getGeneProduct().getGene();
                // Use a local here: the shared accumulator must not be replaced by a collection we do not own,
                // since it is cleared and appended to in the multi-gene case below.
                Collection<OntologyTerm> singleGeneGoTerms = this.getGoTerms(goMappings.get(g), ty);
                String gemmaId = g.getId() == null ? "" : g.getId().toString();
                String ncbiId = g.getNcbiGeneId() == null ? "" : g.getNcbiGeneId().toString();
                this.writeAnnotationLine(out, cs.getName(), g.getOfficialSymbol(), g.getOfficialName(), singleGeneGoTerms, gemmaId, ncbiId);
                simple++;
                continue;
            }
            // Multi-gene case: reset the reusable accumulators before collecting.
            goTerms.clear();
            genes.clear();
            geneDescriptions.clear();
            geneIds.clear();
            ncbiIds.clear();
            for (BioSequence2GeneProduct bioSequence2GeneProduct : geneclusters) {
                Gene g = bioSequence2GeneProduct.getGeneProduct().getGene();
                genes.add(g.getOfficialSymbol());
                geneDescriptions.add(g.getOfficialName());
                // The single-gene path tolerates a null id, so do the same here instead of throwing an NPE.
                if (g.getId() != null) {
                    geneIds.add(g.getId().toString());
                }
                Integer ncbiGeneId = g.getNcbiGeneId();
                if (ncbiGeneId != null) {
                    ncbiIds.add(ncbiGeneId.toString());
                }
                goTerms.addAll(this.getGoTerms(goMappings.get(g), ty));
            }
            String geneString = StringUtils.join(genes, "|");
            String geneDescriptionString = StringUtils.join(geneDescriptions, "|");
            String geneIdsString = StringUtils.join(geneIds, "|");
            String ncbiIdsString = StringUtils.join(ncbiIds, "|");
            this.writeAnnotationLine(out, cs.getName(), geneString, geneDescriptionString, goTerms, geneIdsString, ncbiIdsString);
            complex++;
        }
    }
    return compositeSequencesProcessed;
}
Also used : BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) OntologyTerm(ubic.basecode.ontology.model.OntologyTerm) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) Gene(ubic.gemma.model.genome.Gene)

Example 27 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class AffyProbeNameFilter method filter.

/**
 * Returns a matrix containing only the rows whose probe name passes the enabled Affymetrix name rules.
 * <p>
 * The name tested is that of the design element's biological characteristic when one is present, otherwise the
 * design element's own name. Each rule is applied only when its corresponding {@code skip_*} flag is set.
 *
 * @param data the matrix to filter
 * @return a new matrix built from {@code data} and the retained design elements
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    final int rowCount = data.rows();
    List<CompositeSequence> retained = new ArrayList<>();
    for (int row = 0; row < rowCount; row++) {
        CompositeSequence element = data.getDesignElementForRow(row);
        assert element != null;
        BioSequence bioSequence = element.getBiologicalCharacteristic();
        String probeName = bioSequence != null ? bioSequence.getName() : element.getName();
        // Rules, in order: "_st" (sense strand), "AFFX" (control probes), "_f_at" (gene family), "_x_at", "_g_at".
        boolean excluded = (skip_ST && probeName.contains("_st"))
                || (skip_AFFX && probeName.contains("AFFX"))
                || (skip_F && probeName.contains("_f_at"))
                || (skip_X && probeName.contains("_x_at"))
                || (skip_G && probeName.contains("_g_at"));
        if (!excluded) {
            retained.add(element);
        }
    }
    AffyProbeNameFilter.log.info("There are " + retained.size() + " rows left after Affy probe name filtering.");
    return new ExpressionDataDoubleMatrix(data, retained);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) ArrayList(java.util.ArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 28 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class RowLevelFilter method filter.

/**
 * Keeps the rows whose per-row criterion lies in the interval (low threshold, high threshold].
 * <p>
 * When both {@code lowCut} and {@code highCut} are at their sentinel extremes, no filtering is done and the input
 * is returned unchanged.
 *
 * @param data the matrix to filter
 * @return {@code data} itself if no filtering was requested, otherwise a new matrix with the retained rows
 * @throws IllegalStateException if the computed high threshold is NaN
 * @throws RuntimeException      if the high threshold is not strictly greater than the low threshold
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    if (lowCut == -Double.MAX_VALUE && highCut == Double.MAX_VALUE) {
        RowLevelFilter.log.info("No filtering requested");
        return data;
    }

    final int rowCount = data.rows();
    DoubleArrayList rowScores = new DoubleArrayList(new double[rowCount]);
    // NOTE(review): computeCriteria presumably fills rowScores and returns the number of all-negative rows - confirm.
    int allNegativeCount = this.computeCriteria(data, rowScores);

    // Thresholds are derived from a sorted copy; the unsorted scores stay aligned with the row indices.
    DoubleArrayList orderedScores = rowScores.copy();
    orderedScores.sort();

    int rowsConsidered = removeAllNegative ? rowCount - allNegativeCount : rowCount;
    int firstIndex = removeAllNegative ? allNegativeCount : 0;

    double upperBound = this.getHighThreshold(orderedScores, rowsConsidered);
    double lowerBound = this.getLowThreshold(rowCount, orderedScores, rowsConsidered, firstIndex);
    if (Double.isNaN(upperBound)) {
        throw new IllegalStateException("High threshold cut is NaN");
    }
    RowLevelFilter.log.debug("Low cut = " + lowerBound);
    RowLevelFilter.log.debug("High cut = " + upperBound);
    if (upperBound <= lowerBound) {
        throw new RuntimeException("High cut " + upperBound + " is lower or same as low cut " + lowerBound);
    }

    List<CompositeSequence> retained = new ArrayList<>();
    for (int row = 0; row < rowCount; row++) {
        double score = rowScores.get(row);
        // Lower bound is exclusive, upper bound inclusive; a NaN score fails both comparisons and is dropped.
        if (score > lowerBound && score <= upperBound) {
            retained.add(data.getDesignElementForRow(row));
        }
    }
    this.logInfo(rowCount, retained);
    return new ExpressionDataDoubleMatrix(data, retained);
}
Also used : ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) DoubleArrayList(cern.colt.list.DoubleArrayList) ArrayList(java.util.ArrayList) DoubleArrayList(cern.colt.list.DoubleArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 29 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class ArrayDesignProbeRenamerCli method rename.

/**
 * Renames the composite sequences of an array design according to a mapping parsed from the given stream, then
 * updates the array design through {@code arrayDesignService}.
 * <p>
 * Each renamed element also gets a note appended to (or set as) its description recording the previous name.
 *
 * @param arrayDesign the array design whose composite sequences are to be renamed
 * @param newIdFile   stream handed to {@code parseIdFile} to obtain the old-name to new-name mapping
 * @throws RuntimeException wrapping any {@link IOException} raised while parsing the mapping
 */
private void rename(ArrayDesign arrayDesign, InputStream newIdFile) {
    Map<String, String> old2new;
    try {
        old2new = this.parseIdFile(newIdFile);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    AbstractCLI.log.info(old2new.size() + " potential renaming items read");

    int renamed = 0;
    for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
        String previousName = cs.getName();
        if (!old2new.containsKey(previousName)) {
            continue;
        }
        // Record the old name in the description before it is overwritten.
        String note = " [Renamed by Gemma from " + previousName + "]";
        String existingDescription = cs.getDescription();
        cs.setDescription(StringUtils.isNotBlank(existingDescription) ? existingDescription + note : note);
        cs.setName(old2new.get(previousName));
        renamed++;
        if (renamed % 2000 == 0) {
            AbstractCLI.log.info("Renamed " + renamed + " composite sequences, last to be renamed was " + cs);
        }
    }
    arrayDesignService.update(arrayDesign);
}
Also used : CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 30 with CompositeSequence

use of ubic.gemma.model.expression.designElement.CompositeSequence in project Gemma by PavlidisLab.

the class SVDServiceHelperImpl method getTopLoadedVectors.

/**
 * Pairs the top-loaded probes of a principal component with their processed expression vectors.
 * <p>
 * An empty map is returned when the experiment has no PCA, when no top-loaded probes are available, or when no
 * vectors are returned for those probes.
 *
 * @param ee        the experiment whose PCA is consulted
 * @param component the component index passed to {@code getTopLoadedProbes}
 * @param count     the number of probes requested from {@code getTopLoadedProbes}
 * @return a map from probe loading to its vector; possibly empty, never null
 */
@Override
public Map<ProbeLoading, DoubleVectorValueObject> getTopLoadedVectors(ExpressionExperiment ee, int component, int count) {
    Map<ProbeLoading, DoubleVectorValueObject> vectorsByLoading = new HashMap<>();
    PrincipalComponentAnalysis analysis = principalComponentAnalysisService.loadForExperiment(ee);
    if (analysis == null) {
        return vectorsByLoading;
    }

    List<ProbeLoading> topLoadings = principalComponentAnalysisService.getTopLoadedProbes(ee, component, count);
    if (topLoadings == null) {
        SVDServiceHelperImpl.log.warn("No probes?");
        return vectorsByLoading;
    }

    // Index the loadings by probe id and collect the probes themselves for the vector lookup.
    Map<Long, ProbeLoading> loadingByProbeId = new LinkedHashMap<>();
    Set<CompositeSequence> probeSet = new HashSet<>();
    for (ProbeLoading loading : topLoadings) {
        CompositeSequence cs = loading.getProbe();
        loadingByProbeId.put(cs.getId(), loading);
        probeSet.add(cs);
    }
    if (loadingByProbeId.isEmpty()) {
        return vectorsByLoading;
    }
    assert loadingByProbeId.size() <= count;

    Collection<ExpressionExperiment> experiments = new HashSet<>();
    experiments.add(ee);
    Collection<DoubleVectorValueObject> vectors = processedExpressionDataVectorService.getProcessedDataArraysByProbe(experiments, probeSet);
    if (vectors.isEmpty()) {
        SVDServiceHelperImpl.log.warn("No vectors came back from the call; check the Gene2CS table?");
        return vectorsByLoading;
    }

    BioAssayDimension dimension = analysis.getBioAssayDimension();
    assert dimension != null;
    assert !dimension.getBioAssays().isEmpty();

    for (DoubleVectorValueObject vector : vectors) {
        ProbeLoading loading = loadingByProbeId.get(vector.getDesignElement().getId());
        // A vector without a matching loading is skipped: the lookup can return vectors for probes that were
        // not among the top-loaded ones.
        if (loading != null) {
            assert dimension.getBioAssays().size() == vector.getData().length;
            vector.setRank(loading.getLoadingRank().doubleValue());
            vector.setExpressionExperiment(new ExpressionExperimentValueObject(ee));
            vectorsByLoading.put(loading, vector);
        }
    }

    if (vectorsByLoading.isEmpty()) {
        SVDServiceHelperImpl.log.warn("No results, something went wrong; there were " + vectors.size() + " vectors to start but they all got filtered out.");
    }
    return vectorsByLoading;
}
Also used : ProbeLoading(ubic.gemma.model.analysis.expression.pca.ProbeLoading) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) BioAssayDimension(ubic.gemma.model.expression.bioAssayData.BioAssayDimension) PrincipalComponentAnalysis(ubic.gemma.model.analysis.expression.pca.PrincipalComponentAnalysis) ExpressionExperimentValueObject(ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject) DoubleVectorValueObject(ubic.gemma.model.expression.bioAssayData.DoubleVectorValueObject)

Aggregations

CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)206 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)43 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)40 Gene (ubic.gemma.model.genome.Gene)32 Test (org.junit.Test)30 BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial)19 ExpressionDataDoubleMatrix (ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix)18 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)18 DesignElementDataVector (ubic.gemma.model.expression.bioAssayData.DesignElementDataVector)18 RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector)18 StopWatch (org.apache.commons.lang3.time.StopWatch)17 HashSet (java.util.HashSet)15 BioAssayDimension (ubic.gemma.model.expression.bioAssayData.BioAssayDimension)15 CompositeSequenceValueObject (ubic.gemma.model.expression.designElement.CompositeSequenceValueObject)15 ArrayList (java.util.ArrayList)14 QuantitationType (ubic.gemma.model.common.quantitationtype.QuantitationType)14 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)13 Taxon (ubic.gemma.model.genome.Taxon)12 Collection (java.util.Collection)11 ByteArrayConverter (ubic.basecode.io.ByteArrayConverter)11