Search in sources :

Example 31 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method initializeFetchList.

/**
 * Collects the BioSequences of an array design whose sequence should be fetched, keyed by accession.
 * <p>
 * Elements without a BioSequence are counted (and warned about); elements whose BioSequence already has a
 * non-blank sequence are counted and left out unless {@code force} is set; elements for which no accession can be
 * determined are left out. The counts are reported through {@code informAboutFetchListResults}.
 *
 * @param arrayDesign the array design whose composite sequences are examined
 * @param force       if true, sequence will be replaced even if it is already there.
 * @return map of biosequence accessions to BioSequences (the existing ones)
 */
private Map<String, BioSequence> initializeFetchList(ArrayDesign arrayDesign, boolean force) {
    Map<String, BioSequence> toFetch = new HashMap<>();
    int alreadyHaveSequence = 0;
    int missingBioSequence = 0;
    boolean alreadyWarned = false;
    for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
        BioSequence bioSequence = probe.getBiologicalCharacteristic();
        if (bioSequence == null) {
            // the helper receives the count as it was *before* this element is tallied
            alreadyWarned = this.warnAboutMissingSequence(missingBioSequence, alreadyWarned, probe);
            missingBioSequence++;
        } else if (!force && StringUtils.isNotBlank(bioSequence.getSequence())) {
            // sequence already present and we were not asked to replace it
            alreadyHaveSequence++;
        } else {
            String accession = this.getAccession(probe);
            if (accession != null) {
                toFetch.put(accession, bioSequence);
            } else if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled()) {
                ArrayDesignSequenceProcessingServiceImpl.log.debug("No accession for " + probe + ": " + bioSequence);
            }
        }
    }
    this.informAboutFetchListResults(arrayDesign, toFetch, alreadyHaveSequence, missingBioSequence);
    return toFetch;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 32 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoServiceImpl method matchToExistingPlatform.

/**
 * Reconciles the probe names of a raw GEO platform with a platform that may already exist in Gemma.
 * <p>
 * The GEO platform is converted (the conversion result is only used for lookup and then discarded) and looked up
 * by short name. If nothing is found, every probe name is mapped to itself. If a platform is found, its
 * probe-to-sequence map is loaded, registered in the cache and used to re-match the probe names.
 *
 * @param geoConverter   converter used for the throw-away conversion of the GEO platform
 * @param rawGEOPlatform the platform as parsed from GEO; its probe-name map is populated here
 * @param c              cache that receives the existing platform together with its probes
 * @throws IllegalStateException if the conversion yields nothing although the platform uses data from GEO
 */
private void matchToExistingPlatform(GeoConverter geoConverter, GeoPlatform rawGEOPlatform, ArrayDesignsForExperimentCache c) {
    // we have to populate this.
    Map<String, String> probeNamesInGemma = rawGEOPlatform.getProbeNamesInGemma();

    // partial conversion only; the result is thrown away afterwards
    ArrayDesign geoArrayDesign = (ArrayDesign) geoConverter.convert(rawGEOPlatform);
    if (geoArrayDesign == null) {
        if (rawGEOPlatform.useDataFromGeo()) {
            throw new IllegalStateException("Platform is missing");
        }
        // MPSS, exon arrays
        return;
    }

    // Look up by short name only: the full name can change in GEO, causing trouble.
    ArrayDesign existing = arrayDesignService.findByShortName(geoArrayDesign.getShortName());

    if (existing != null) {
        AbstractGeoService.log.info("Platform " + rawGEOPlatform.getGeoAccession() + " exists in Gemma, checking for correct probe names and re-matching if necessary ...");
        String columnWithGeoNames = this.getGEOIDColumnName(rawGEOPlatform, geoArrayDesign);
        if (columnWithGeoNames != null) {
            AbstractGeoService.log.info("Loading probes ...");
            Map<CompositeSequence, BioSequence> m = arrayDesignService.getBioSequences(existing);
            c.add(existing, m);
            this.getGemmaIDColumnNameInGEO(rawGEOPlatform, m, columnWithGeoNames);
        }
        // a null column is no problem: the design has no elements, so it is actually a placeholder (e.g., MPSS)
        return;
    }

    AbstractGeoService.log.info(rawGEOPlatform + " looks new to Gemma");
    // No mapping needed. NB the converter fills this in already, we're just being defensive here.
    for (CompositeSequence cs : geoArrayDesign.getCompositeSequences()) {
        String geoProbeName = cs.getName();
        probeNamesInGemma.put(geoProbeName, geoProbeName);
    }
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 33 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class SimpleExpressionDataLoaderServiceImpl method provideImageClone.

/**
 * Attaches a new BioSequence, named after an IMAGE clone, to the given composite sequence.
 * <p>
 * The clone name is derived from the composite sequence's name: a trailing {@code ___} followed by a single digit
 * is stripped, and the prefix {@code IMAGE:} is added when the name does not already start with it.
 * <p>
 * This will eventually go - no special IMAGE clone support.
 *
 * @param cs    composite sequence that receives the new BioSequence; its name must already be set
 * @param taxon taxon assigned to the new BioSequence
 * @throws IllegalArgumentException if {@code cs} has no name
 */
private void provideImageClone(CompositeSequence cs, Taxon taxon) {
    // Validate before allocating anything, so a nameless element fails without side work.
    String imageId = cs.getName();
    if (imageId == null) {
        throw new IllegalArgumentException("CompositeSequence must have name filled in first");
    }
    imageId = imageId.replaceFirst("___\\d$", "");
    if (!imageId.startsWith("IMAGE:")) {
        imageId = "IMAGE:" + imageId;
    }
    // Sanity check only; not enforced when assertions are disabled.
    assert imageId.matches("^IMAGE:\\d+$");

    BioSequence bs = BioSequence.Factory.newInstance();
    bs.setTaxon(taxon);
    bs.setName(imageId);
    cs.setBiologicalCharacteristic(bs);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 34 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoConverterImpl method processId.

/**
 * Converts one platform row (the probe {@code id} found at column index {@code i}) into a CompositeSequence with
 * a minimal BioSequence and registers it with the array design.
 * <p>
 * Under strict selection, a row without an external accession is skipped unless the platform has a
 * {@code gene_assignment} column with a usable value for that row.
 *
 * @param platform               GEO platform supplying column data and the probe-name map (which is updated
 *                               when the probe keeps its original name)
 * @param arrayDesign            array design the new composite sequence is attached to
 * @param probeOrganismColumn    if null, the probe takes {@code primaryTaxon}
 * @param externalDb             passed through to the BioSequence property setup
 * @param sequences              passed through to the BioSequence property setup
 * @param probeOrganism          per-row organism values, or null; a non-blank value at {@code i} is converted
 *                               to the probe taxon
 * @param primaryTaxon           taxon used when there is no probe organism column
 * @param cloneIdentifiers       per-row clone identifiers, or null
 * @param externalRefs           external reference columns, or null
 * @param descIter               iterator over descriptions, or null; advanced once for each converted row
 * @param refSeqAccessionPattern passed through to the BioSequence property setup
 * @param strictSelection        whether rows lacking an external accession should be filtered
 * @param skipped                receives the ids of rows rejected by the {@code gene_assignment} filter
 * @param compositeSequences     receives the newly created composite sequence
 * @param i                      column index of the row being processed
 * @param id                     probe identifier of the row being processed
 * @return {@code i + 1}, on every path (converted or skipped)
 */
private int processId(GeoPlatform platform, ArrayDesign arrayDesign, String probeOrganismColumn, ExternalDatabase externalDb, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon, List<String> cloneIdentifiers, List<List<String>> externalRefs, Iterator<String> descIter, Pattern refSeqAccessionPattern, boolean strictSelection, List<String> skipped, Collection<CompositeSequence> compositeSequences, int i, String id) {
    String externalAccession = null;
    if (externalRefs != null) {
        externalAccession = this.getExternalAccession(externalRefs, i);
    }
    if (strictSelection && StringUtils.isBlank(externalAccession)) {
        // currently this is crafted to deal with affymetrix exon arrays, but could be expanded.
        // mrna_assignment is less strict than gene_assignment
        // salvage it if it has a gene assignment.
        String filteringColumn = "gene_assignment";
        if (!platform.getColumnNames().contains(filteringColumn)) {
            // we just skip ones that don't have an external accession. The index is still advanced so that
            // the next id is not processed against this row's column data (every other exit returns i + 1).
            // NOTE(review): descIter is not advanced for skipped rows, here or below; if it runs parallel to
            // the ids, descriptions drift after a skip - confirm against the caller.
            return i + 1;
        }
        String cd = platform.getColumnData(filteringColumn).get(i);
        if (StringUtils.isBlank(cd) || cd.equals("---")) {
            skipped.add(id);
            if (skipped.size() % 10000 == 0) {
                GeoConverterImpl.log.info("Skipped " + skipped.size() + " elements due to strict selection; last was " + id);
            }
            return i + 1;
        }
        // remaining case here: externalAccession is blank, but there is another column that we think saves it.
    }
    String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);
    String description = "";
    if (externalAccession != null) {
        String[] refs = externalAccession.split(",");
        if (refs.length > 1) {
            // keep the full list in the description, but only use the first reference as the accession
            description = "Multiple external sequence references: " + externalAccession + "; ";
            externalAccession = refs[0];
        }
    }
    if (descIter != null) {
        description = description + " " + descIter.next();
    }
    CompositeSequence cs = CompositeSequence.Factory.newInstance();
    String probeName = platform.getProbeNamesInGemma().get(id);
    if (probeName == null) {
        probeName = id;
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Probe retaining original name: " + probeName);
        }
        // must make sure this is populated.
        platform.getProbeNamesInGemma().put(id, id);
    } else {
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Found probe: " + probeName);
        }
    }
    cs.setName(probeName);
    cs.setDescription(description);
    cs.setArrayDesign(arrayDesign);
    // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
    // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
    // biosequence
    Taxon probeTaxon = Taxon.Factory.newInstance();
    if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
        probeTaxon = this.convertProbeOrganism(probeOrganism.get(i));
    }
    // if there are no probe taxons then all the probes should take the taxon from the primary taxon
    if (probeOrganismColumn == null) {
        probeTaxon = primaryTaxon;
    }
    BioSequence bs = this.createMinimalBioSequence(probeTaxon);
    this.setBsProps(platform, externalDb, sequences, refSeqAccessionPattern, i, id, externalAccession, cloneIdentifier, bs);
    this.checkCs(arrayDesign, externalAccession, cloneIdentifier, cs, probeTaxon, bs);
    compositeSequences.add(cs);
    platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);
    return i + 1;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 35 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class FastaParser method parseHeader.

/**
 * Recognizes Defline format as described at <a href='http://en.wikipedia.org/wiki/Fasta_format#Sequence_identifiers'>wikipedia</a>.
 * <p>
 * The first group of the matcher is split on '&gt;' into individual record headers; each is handed to the
 * defline, Affymetrix-style or NIA parser depending on its shape, and kept only if that parser accepts it.
 * <p>
 * Our amendments:
 * FIXME: recognize multi-line headers separated by ^A.(used for redundant sequences)
 * FIXME: parsing of more obscure (to us) headers might not be complete.
 *
 * @param matcher matcher positioned before the header; {@code find()} is called on it here
 * @return the BioSequences for the record headers that were accepted
 * @throws IllegalArgumentException if the matcher finds nothing
 */
private Collection<BioSequence> parseHeader(Matcher matcher) {
    if (!matcher.find()) {
        throw new IllegalArgumentException("Invalid FASTA record");
    }
    Collection<BioSequence> parsed = new HashSet<>();
    for (String recordHeader : StringUtils.split(matcher.group(1), '>')) {
        BioSequence candidate = BioSequence.Factory.newInstance();
        candidate.setName(recordHeader);

        // Look for either a '|' or a ':'. Both may occur; whichever comes first decides the format.
        int pipeAt = recordHeader.indexOf('|');
        int colonAt = recordHeader.indexOf(':');
        boolean pipeComesFirst = pipeAt > 0 && (colonAt < 0 || pipeAt < colonAt);

        boolean accepted;
        if (!pipeComesFirst && colonAt > 0) {
            accepted = this.parseAffyHeader(candidate, recordHeader);
        } else if (!pipeComesFirst && recordHeader.matches(FastaParser.NIA_HEADER_REGEX)) {
            accepted = this.parseNIA(candidate, recordHeader);
        } else {
            // pipe-delimited defline, or nothing recognizable: in the latter case the whole header is
            // just treated as the sequence name.
            accepted = this.parseDeflineHeader(candidate, recordHeader);
        }

        if (accepted) {
            parsed.add(candidate);
        }
    }
    return parsed;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashSet(java.util.HashSet)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4