Search in sources :

Example 31 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class NCBIGene2GOAssociationLoaderCLI method doWork.

@Override
protected Exception doWork(String[] args) {
    Exception e = this.processCommandLine(args);
    if (e != null) {
        AbstractCLI.log.error(e);
        return e;
    }
    TaxonService taxonService = this.getBean(TaxonService.class);
    NCBIGene2GOAssociationLoader gene2GOAssLoader = new NCBIGene2GOAssociationLoader();
    gene2GOAssLoader.setPersisterHelper(this.getPersisterHelper());
    Collection<Taxon> taxa = taxonService.loadAll();
    gene2GOAssLoader.setParser(new NCBIGene2GOAssociationParser(taxa));
    HttpFetcher fetcher = new HttpFetcher();
    Collection<LocalFile> files;
    if (filePath != null) {
        File f = new File(filePath);
        if (!f.canRead()) {
            return new IOException("Cannot read from " + filePath);
        }
        files = new HashSet<>();
        LocalFile lf = LocalFile.Factory.newInstance();
        try {
            lf.setLocalURL(f.toURI().toURL());
        } catch (MalformedURLException e1) {
            return e1;
        }
        files.add(lf);
    } else {
        files = fetcher.fetch("ftp://ftp.ncbi.nih.gov/gene/DATA/" + NCBIGene2GOAssociationLoaderCLI.GENE2GO_FILE);
    }
    assert files.size() == 1;
    LocalFile gene2Gofile = files.iterator().next();
    Gene2GOAssociationService ggoserv = this.getBean(Gene2GOAssociationService.class);
    AbstractCLI.log.info("Removing all old GO associations");
    ggoserv.removeAll();
    AbstractCLI.log.info("Done, loading new ones");
    gene2GOAssLoader.load(gene2Gofile);
    AbstractCLI.log.info("Don't forget to update the annotation files for platforms.");
    return null;
}
Also used : HttpFetcher(ubic.gemma.core.loader.util.fetcher.HttpFetcher) MalformedURLException(java.net.MalformedURLException) TaxonService(ubic.gemma.persistence.service.genome.taxon.TaxonService) NCBIGene2GOAssociationParser(ubic.gemma.core.loader.association.NCBIGene2GOAssociationParser) Taxon(ubic.gemma.model.genome.Taxon) IOException(java.io.IOException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) NCBIGene2GOAssociationLoader(ubic.gemma.core.loader.association.NCBIGene2GOAssociationLoader) LocalFile(ubic.gemma.model.common.description.LocalFile) Gene2GOAssociationService(ubic.gemma.persistence.service.association.Gene2GOAssociationService) File(java.io.File) LocalFile(ubic.gemma.model.common.description.LocalFile)

Example 32 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class ProbeMapperImpl method processBlatResults.

@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
    if (log.isDebugEnabled()) {
        log.debug(blatResults.size() + " Blat results to map ");
    }
    assert goldenPathDb != null;
    Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
    int count = 0;
    Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
    assert !biosequenceToBlatResults.isEmpty();
    // Do them one sequence at a time.
    for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
        Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
        if (log.isDebugEnabled()) {
            log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
        }
        assert !blatResultsForSequence.isEmpty();
        Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
        /*
             * Screen out likely repeats, where cross-hybridization is a problem.
             */
        Double fractionRepeats = sequence.getFractionRepeats();
        if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
             * just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
             * is to the site of a known gene.
             */
        if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
                config.incrementWarnings();
            }
            continue;
        }
        /*
             * Sanity check to make sure user is paying attention to what they are putting in.
             */
        if (trimmedBlatResultsForSequence.size() > 25) {
            log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
        }
        Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
        int skipped = 0;
        // map each blat result.
        for (BlatResult blatResult : trimmedBlatResultsForSequence) {
            if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
                log.warn("Blat result had no sequence: " + blatResult);
                continue;
            }
            assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
            assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
            if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
                if (log.isDebugEnabled())
                    log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
                skipped++;
                continue;
            }
            if (blatResult.getQuerySequence().getTaxon() == null) {
                Taxon taxon = goldenPathDb.getTaxon();
                assert taxon != null;
                blatResult.getQuerySequence().setTaxon(taxon);
            }
            // here's the key line! Find gene products that map to the given blat result.
            Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
            if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
                // would be very unlikely.
                if (resultsForOneBlatResult.size() > 100) {
                    log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
                }
                blatAssociationsForSequence.addAll(resultsForOneBlatResult);
            }
            // there are rarely this many, but it does happen.
            if (++count % 100 == 0 && log.isDebugEnabled())
                log.debug("Annotations computed for " + count + " blat results for " + sequence);
        }
        // Another important step: fill in the specificity, remove duplicates
        if (!blatAssociationsForSequence.isEmpty()) {
            BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
            // Remove hits not meeting criteria
            blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
        }
        // it might be empty now...
        if (blatAssociationsForSequence.isEmpty()) {
            if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
                log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
                if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
                    log.info("Further non-mappings will not be logged");
            }
            config.incrementWarnings();
            continue;
        } else if (log.isDebugEnabled()) {
            log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
        }
        if (skipped > 0) {
            if (log.isDebugEnabled())
                log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
        }
        String queryName = sequence.getName();
        assert StringUtils.isNotBlank(queryName);
        if (!allRes.containsKey(queryName)) {
            allRes.put(queryName, blatAssociationsForSequence);
        }
        allRes.get(queryName).addAll(blatAssociationsForSequence);
    // if ( log.isDebugEnabled() ) {
    // log.info( blatAssociationsForSequence.size() + " associations for " + sequence
    // + " after redundancy reduction and score filtering; "
    // + ( scored - blatAssociationsForSequence.size() ) + " not used" );
    // }
    }
    return allRes;
}
Also used : HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Example 33 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class ExpressionDataFileServiceImpl method writeCoexpressionData.

/**
 * Loads the probe to probe coexpression link information for a given expression experiment and writes it to disk.
 */
private void writeCoexpressionData(File file, ExpressionExperiment ee) throws IOException {
    Taxon tax = expressionExperimentService.getTaxon(ee);
    assert tax != null;
    Collection<CoexpressionValueObject> geneLinks = gene2geneCoexpressionService.getCoexpression(ee, true);
    Date timestamp = new Date(System.currentTimeMillis());
    StringBuilder buf = new StringBuilder();
    // Write header information
    buf.append("# Coexpression data for:  ").append(ee.getShortName()).append(" : ").append(ee.getName()).append(" \n");
    buf.append("# Generated On: ").append(timestamp).append(" \n");
    buf.append("# Links are listed in an arbitrary order with an indication of positive or negative correlation\n");
    buf.append(ExpressionDataFileService.DISCLAIMER);
    buf.append("GeneSymbol1\tGeneSymbol2\tDirection\tSupport\n");
    // Data
    for (CoexpressionValueObject link : geneLinks) {
        buf.append(link.getQueryGeneSymbol()).append("\t").append(link.getCoexGeneSymbol()).append("\t");
        buf.append(link.isPositiveCorrelation() ? "+" : "-" + "\n");
    }
    // Write coexpression data to file (zipped of course)
    try (Writer writer = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
        writer.write(buf.toString());
    }
}
Also used : GZIPOutputStream(java.util.zip.GZIPOutputStream) Taxon(ubic.gemma.model.genome.Taxon) CoexpressionValueObject(ubic.gemma.persistence.service.association.coexpression.CoexpressionValueObject) MatrixWriter(ubic.gemma.core.datastructure.matrix.MatrixWriter) ExperimentalDesignWriter(ubic.gemma.core.datastructure.matrix.ExperimentalDesignWriter)

Example 34 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class ExpressionExperimentManipulatingCLI method findExpressionExperimentsByQuery.

/**
 * Use the search engine to locate expression experiments.
 */
private Set<BioAssaySet> findExpressionExperimentsByQuery(String query) {
    Set<BioAssaySet> ees = new HashSet<>();
    Collection<SearchResult> eeSearchResults = searchService.search(SearchSettingsImpl.expressionExperimentSearch(query)).get(ExpressionExperiment.class);
    AbstractCLI.log.info(ees.size() + " Expression experiments matched '" + query + "'");
    // Filter out all the ee that are not of correct taxon
    for (SearchResult sr : eeSearchResults) {
        ExpressionExperiment ee = (ExpressionExperiment) sr.getResultObject();
        Taxon t = eeService.getTaxon(ee);
        if (t != null && t.getCommonName().equalsIgnoreCase(taxon.getCommonName())) {
            ees.add(ee);
        }
    }
    return ees;
}
Also used : BioAssaySet(ubic.gemma.model.expression.experiment.BioAssaySet) Taxon(ubic.gemma.model.genome.Taxon) SearchResult(ubic.gemma.core.search.SearchResult) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment) HashSet(java.util.HashSet)

Example 35 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class ExpressionExperimentSetValueObjectHelperTest method setUp.

@Before
public void setUp() {
    Taxon tax1 = this.getTaxon("human");
    ee = this.getTestPersistentExpressionExperiment(tax1);
    Collection<BioAssaySet> ees = new HashSet<>();
    ees.add(ee);
    eeSet = ExpressionExperimentSet.Factory.newInstance();
    eeSet.setName("CreateTest");
    eeSet.setDescription("CreateDesc");
    eeSet.getExperiments().addAll(ees);
    eeSet.setTaxon(tax1);
    eeSet = expressionExperimentSetService.create(eeSet);
}
Also used : Taxon(ubic.gemma.model.genome.Taxon) HashSet(java.util.HashSet) Before(org.junit.Before)

Aggregations

Taxon (ubic.gemma.model.genome.Taxon)161 Gene (ubic.gemma.model.genome.Gene)34 Test (org.junit.Test)31 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)29 HashSet (java.util.HashSet)23 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)23 InputStream (java.io.InputStream)17 Before (org.junit.Before)16 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)15 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)14 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)12 StopWatch (org.apache.commons.lang3.time.StopWatch)11 Transactional (org.springframework.transaction.annotation.Transactional)11 ArrayList (java.util.ArrayList)10 File (java.io.File)9 SimpleExpressionExperimentMetaData (ubic.gemma.core.loader.expression.simple.model.SimpleExpressionExperimentMetaData)9 Chromosome (ubic.gemma.model.genome.Chromosome)8 Collection (java.util.Collection)7 Element (org.w3c.dom.Element)7 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)7