Search in sources :

Example 16 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoConverterTest method testWithImages.

/*
     * Has image clones.
     */
@Test
public final void testWithImages() throws Exception {
    GeoFamilyParser parser = new GeoFamilyParser();
    parser.setProcessPlatformsOnly(true);
    try (InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/GPL890_family.soft.gz"))) {
        parser.parse(is);
    }
    GeoPlatform platform = ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get("GPL890");
    Object result = this.gc.convert(platform);
    ArrayDesign ad = (ArrayDesign) result;
    for (CompositeSequence cs : ad.getCompositeSequences()) {
        BioSequence bs = cs.getBiologicalCharacteristic();
        if (bs != null && bs.getSequence() != null) {
            return;
        }
    }
    fail("No sequences!");
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeoPlatform(ubic.gemma.core.loader.expression.geo.model.GeoPlatform) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 17 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoConverterTest method testMultipleTaxaIdentifiedBYAbbreviationsOnArrayWithOrganismColumn.

/*
     * Method to test that an array design can have multiple taxa stored against it and that if abbreviations used as
     * probe names mapped to the scientific names correctly if the abbreviation is stored in DB.
     */
@Test
public void testMultipleTaxaIdentifiedBYAbbreviationsOnArrayWithOrganismColumn() throws Exception {
    Taxon rainbowTroat = taxonService.findByAbbreviation("omyk");
    Taxon whiteFish = taxonService.findByAbbreviation("cclu");
    Taxon rainbowSmelt = taxonService.findByAbbreviation("omor");
    Taxon atlanticSalm = taxonService.findByAbbreviation("ssal");
    assertNotNull(atlanticSalm);
    // prototype bean.
    gc = this.getBean(GeoConverter.class);
    InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/GPL2899_family.soft.gz"));
    GeoFamilyParser parser = new GeoFamilyParser();
    // parse only the plaform
    parser.setProcessPlatformsOnly(true);
    parser.parse(is);
    GeoPlatform platform = ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get("GPL2899");
    Object result = gc.convert(platform);
    ArrayDesign ad = (ArrayDesign) result;
    assertNotNull(ad);
    Set<Taxon> taxa = new HashSet<>();
    BioSequence bs;
    for (CompositeSequence cs : ad.getCompositeSequences()) {
        bs = cs.getBiologicalCharacteristic();
        if (bs != null) {
            assertNotNull(bs.getTaxon());
            taxa.add(bs.getTaxon());
        }
    }
    assertEquals(4, taxa.size());
    // original file has five taxa, test file just kept four.
    assertTrue(taxa.contains(atlanticSalm));
    assertTrue(taxa.contains(rainbowTroat));
    assertTrue(taxa.contains(whiteFish));
    assertTrue(taxa.contains(rainbowSmelt));
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) Taxon(ubic.gemma.model.genome.Taxon) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeoPlatform(ubic.gemma.core.loader.expression.geo.model.GeoPlatform) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 18 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoConverterTest method testConvertGSE4229IMAGE.

/*
     * See bug 3328 - we don't want to use IMAGE clone IDs
     */
@Test
public void testConvertGSE4229IMAGE() throws Exception {
    InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/gse4229Short/GSE4229.soft.gz"));
    GeoFamilyParser parser = new GeoFamilyParser();
    parser.parse(is);
    GeoSeries series = ((GeoParseResult) parser.getResults().iterator().next()).getSeriesMap().get("GSE4229");
    DatasetCombiner datasetCombiner = new DatasetCombiner();
    GeoSampleCorrespondence correspondence = datasetCombiner.findGSECorrespondence(series);
    series.setSampleCorrespondence(correspondence);
    Object result = this.gc.convert(series);
    assertNotNull(result);
    @SuppressWarnings("unchecked") Collection<ExpressionExperiment> ees = (Collection<ExpressionExperiment>) result;
    ExpressionExperiment ee = ees.iterator().next();
    ArrayDesign platform = ee.getBioAssays().iterator().next().getArrayDesignUsed();
    BioSequence seq = platform.getCompositeSequences().iterator().next().getBiologicalCharacteristic();
    assertNotNull(seq.getSequenceDatabaseEntry());
    String acc = seq.getSequenceDatabaseEntry().getAccession();
    assertEquals("Genbank", seq.getSequenceDatabaseEntry().getExternalDatabase().getName());
    assertTrue(!acc.startsWith("IMAGE"));
}
Also used : GeoSeries(ubic.gemma.core.loader.expression.geo.model.GeoSeries) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) GZIPInputStream(java.util.zip.GZIPInputStream) Test(org.junit.Test) BaseSpringContextTest(ubic.gemma.core.testing.BaseSpringContextTest)

Example 19 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processBlatResults.

@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
    if (log.isDebugEnabled()) {
        log.debug(blatResults.size() + " Blat results to map ");
    }
    assert goldenPathDb != null;
    Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
    int count = 0;
    Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
    assert !biosequenceToBlatResults.isEmpty();
    // Do them one sequence at a time.
    for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
        Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
        if (log.isDebugEnabled()) {
            log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
        }
        assert !blatResultsForSequence.isEmpty();
        Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
        /*
             * Screen out likely repeats, where cross-hybridization is a problem.
             */
        Double fractionRepeats = sequence.getFractionRepeats();
        if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
             * just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
             * is to the site of a known gene.
             */
        if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Sanity check to make sure user is paying attention to what they are putting in.
             */
        if (trimmedBlatResultsForSequence.size() > 25) {
            log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
        }
        Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
        int skipped = 0;
        // map each blat result.
        for (BlatResult blatResult : trimmedBlatResultsForSequence) {
            if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
                log.warn("Blat result had no sequence: " + blatResult);
                continue;
            }
            assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
            assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
            if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
                if (log.isDebugEnabled())
                    log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
                skipped++;
                continue;
            }
            if (blatResult.getQuerySequence().getTaxon() == null) {
                Taxon taxon = goldenPathDb.getTaxon();
                assert taxon != null;
                blatResult.getQuerySequence().setTaxon(taxon);
            }
            // here's the key line! Find gene products that map to the given blat result.
            Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
            if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
                // would be very unlikely.
                if (resultsForOneBlatResult.size() > 100) {
                    log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
                }
                blatAssociationsForSequence.addAll(resultsForOneBlatResult);
            }
            // there are rarely this many, but it does happen.
            if (++count % 100 == 0 && log.isDebugEnabled())
                log.debug("Annotations computed for " + count + " blat results for " + sequence);
        }
        // Another important step: fill in the specificity, remove duplicates
        if (!blatAssociationsForSequence.isEmpty()) {
            BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
            // Remove hits not meeting criteria
            blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
        }
        // it might be empty now...
        if (blatAssociationsForSequence.isEmpty()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
            }
            config.incrementWarnings();
            continue;
        } else if (log.isDebugEnabled()) {
            log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
        }
        if (skipped > 0) {
            if (log.isDebugEnabled())
                log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
        }
        String queryName = sequence.getName();
        assert StringUtils.isNotBlank(queryName);
        if (!allRes.containsKey(queryName)) {
            allRes.put(queryName, blatAssociationsForSequence);
        }
        allRes.get(queryName).addAll(blatAssociationsForSequence);
    // if ( log.isDebugEnabled() ) {
    // log.info( blatAssociationsForSequence.size() + " associations for " + sequence
    // + " after redundancy reduction and score filtering; "
    // + ( scored - blatAssociationsForSequence.size() ) + " not used" );
    // }
    }
    return allRes;
}
Also used : HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Example 20 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processSequences.

@Override
public Map<String, Collection<BlatAssociation>> processSequences(GoldenPathSequenceAnalysis goldenpath, Collection<BioSequence> sequences, ProbeMapperConfig config) {
    Blat b = new ShellDelegatingBlat();
    b.setBlatScoreThreshold(config.getBlatScoreThreshold());
    try {
        Map<BioSequence, Collection<BlatResult>> results = b.blatQuery(sequences, goldenpath.getTaxon());
        Collection<BlatResult> blatRes = new HashSet<>();
        for (Collection<BlatResult> coll : results.values()) {
            blatRes.addAll(coll);
        }
        return this.processBlatResults(goldenpath, blatRes);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : Blat(ubic.gemma.core.apps.Blat) ShellDelegatingBlat(ubic.gemma.core.apps.ShellDelegatingBlat) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ShellDelegatingBlat(ubic.gemma.core.apps.ShellDelegatingBlat) Collection(java.util.Collection) IOException(java.io.IOException) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4