Search in sources :

Example 51 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class GeoConverterImpl method convertProbeOrganism.

/**
 * Retrieve taxon details for a probe given an abbreviation, common name or scientific name. Scientific names are
 * expected to already be in the scientific-name map (populated during platform conversion). Lookups are tried in
 * this order: scientific-name map, abbreviation map, then the taxon service by abbreviation and by common name.
 * A taxon found through the service is cached in the abbreviation map under its own abbreviation.
 * <p>
 * Unrecognized organisms (e.g. "taxa" like "ILMN Controls") no longer raise an exception; a warning is logged and
 * null is returned instead (see bug 3207).
 *
 * @param probeOrganism scientific name, common name or abbreviation of organism associated to a biosequence.
 * @return Taxon of biosequence; null if the name is blank or 'n/a', or if a taxon service is available and does not
 *         recognize the name. When no taxon service is configured, a new taxon instance carrying the supplied
 *         name as its abbreviation is returned.
 */
private Taxon convertProbeOrganism(String probeOrganism) {
    assert probeOrganism != null;
    /*
     * Detect blank taxon. We support 'n/a' here .... a little kludgy but shows up in some files.
     */
    if (StringUtils.isBlank(probeOrganism) || probeOrganism.equals("n/a")) {
        return null;
    }
    // Check if we have processed this organism before as defined by scientific or abbreviation definition.
    if (taxonScientificNameMap.containsKey(probeOrganism)) {
        return taxonScientificNameMap.get(probeOrganism);
    }
    if (taxonAbbreviationMap.containsKey(probeOrganism)) {
        return taxonAbbreviationMap.get(probeOrganism);
    }
    if (taxonService == null) {
        // No service to consult: hand back a fresh taxon that only carries the supplied name as abbreviation.
        Taxon placeholder = Taxon.Factory.newInstance();
        placeholder.setAbbreviation(probeOrganism);
        return placeholder;
    }
    // Taxon not processed before: check the database. Lower-case with a fixed locale so the lookup key does not
    // depend on the JVM's default locale.
    String lookupKey = probeOrganism.toLowerCase(java.util.Locale.ROOT);
    Taxon found = taxonService.findByAbbreviation(lookupKey);
    if (found == null) {
        found = taxonService.findByCommonName(lookupKey);
    }
    if (found == null) {
        // if probe organism can not be found i.e it is not a known abbreviation or scientific name
        // and it was not already created during platform organism processing then warn user. Examples would
        // be "taxa" like "ILMN Controls". See bug 3207 (we used to throw an exception)
        GeoConverterImpl.log.warn("'" + probeOrganism + "' is not recognized as a taxon in Gemma");
        return null;
    }
    taxonAbbreviationMap.put(found.getAbbreviation(), found);
    return found;
}
Also used : Taxon(ubic.gemma.model.genome.Taxon)

Example 52 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class NcbiGeneConverter method convert.

/**
 * Build a {@link Gene} from one parsed NCBI gene-info record.
 * <p>
 * Copies the identifier, symbol, name and Ensembl id; records previous/discontinued NCBI ids (multiple previous
 * ids are joined with commas); attaches a new {@link Taxon} carrying only the NCBI taxon id; sets a physical
 * location holding just the chromosome; adds one alias per synonym; and adds a database entry for every
 * cross-reference whose database name equals "Ensembl" (case-insensitive).
 *
 * @param info the NCBI gene-info record to convert
 * @return the newly created gene
 */
public Gene convert(NCBIGeneInfo info) {
    Gene converted = Gene.Factory.newInstance();
    converted.setNcbiGeneId(Integer.parseInt(info.getGeneId()));
    converted.setName(info.getDefaultSymbol());
    converted.setOfficialSymbol(info.getDefaultSymbol());
    converted.setOfficialName(info.getDescription());
    converted.setEnsemblId(info.getEnsemblId());

    /*
     * NOTE we allow multiple discontinued or previous ids, separated by commas. This is a hack to account for cases
     * uncovered recently...can be minimized by running this regularly.
     */
    if (info.getHistory() == null) {
        // No history record: fall back on the discontinued id, when one is present.
        if (StringUtils.isNotBlank(info.getDiscontinuedId())) {
            if (NcbiGeneConverter.log.isDebugEnabled()) {
                NcbiGeneConverter.log.debug("Gene matches a gene that was discontinued: " + converted + " matches gene that had id " + info.getDiscontinuedId());
            }
            converted.setPreviousNcbiId(info.getDiscontinuedId());
        }
    } else {
        assert info.getHistory().getCurrentId() == null || info.getGeneId().equals(info.getHistory().getCurrentId());
        assert info.getHistory().getPreviousIds() != null;
        if (!info.getHistory().getPreviousIds().isEmpty()) {
            converted.setPreviousNcbiId(StringUtils.join(info.getHistory().getPreviousIds(), ","));
        }
    }

    converted.setDescription("Imported from NCBI gene; Nomenclature status: " + info.getNomenclatureStatus());

    Taxon taxon = Taxon.Factory.newInstance();
    taxon.setNcbiId(info.getTaxId());
    taxon.setIsGenesUsable(false);
    taxon.setIsSpecies(true);
    converted.setTaxon(taxon);

    /*
     * We are going to stop maintaining this information
     */
    PhysicalLocation location = PhysicalLocation.Factory.newInstance();
    location.setChromosome(new Chromosome(info.getChromosome(), taxon));
    converted.setPhysicalLocation(location);

    Collection<GeneAlias> geneAliases = converted.getAliases();
    for (String synonym : info.getSynonyms()) {
        GeneAlias geneAlias = GeneAlias.Factory.newInstance();
        geneAlias.setAlias(synonym);
        geneAliases.add(geneAlias);
    }

    // Only Ensembl cross-references are turned into accessions; all other databases are ignored.
    for (String databaseName : info.getDbXrefs().keySet()) {
        if (databaseName.equalsIgnoreCase("Ensembl")) {
            String accession = info.getDbXrefs().get(databaseName);
            DatabaseEntry ensemblEntry = DatabaseEntry.Factory.newInstance();
            ensemblEntry.setAccession(accession);
            ensemblEntry.setExternalDatabase(NcbiGeneConverter.getEnsembl());
            converted.getAccessions().add(ensemblEntry);
        }
    }
    return converted;
}
Also used : Gene(ubic.gemma.model.genome.Gene) GeneAlias(ubic.gemma.model.genome.gene.GeneAlias) Taxon(ubic.gemma.model.genome.Taxon) Chromosome(ubic.gemma.model.genome.Chromosome) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Example 53 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class NcbiGeneLoader method updateTaxaWithGenesUsable.

/**
 * Update each taxon to indicate whether the genes loaded for it are usable. If a taxon has a parent taxon whose
 * genes are already usable, the parent's genes are used instead: the child taxon's genesUsable flag is set to false.
 * Otherwise, a taxon whose flag is not yet set is switched to genesUsable = true. Every change is persisted through
 * the taxon service. Null entries in the collection are skipped with a warning.
 *
 * @param taxaGenesLoaded List of taxa that have had genes loaded into GEMMA from NCBI.
 * @throws IllegalArgumentException if the collection is null or empty
 */
public void updateTaxaWithGenesUsable(Collection<Taxon> taxaGenesLoaded) {
    if (taxaGenesLoaded == null || taxaGenesLoaded.isEmpty()) {
        throw new IllegalArgumentException("No taxa were processed for this NCBI load");
    }
    for (Taxon loaded : taxaGenesLoaded) {
        if (loaded == null) {
            NcbiGeneLoader.log.warn("null taxon");
            continue;
        }
        Taxon parent = loaded.getParentTaxon();
        if (parent != null && parent.getIsGenesUsable()) {
            // The parent already provides usable genes, so the child's genes are not used.
            loaded.setIsGenesUsable(false);
            taxonService.update(loaded);
            NcbiGeneLoader.log.debug("Parent taxon found: " + parent + ": Not using genes from taxon: " + loaded);
        } else if (!loaded.getIsGenesUsable()) {
            // No usable parent: this taxon's own genes become the usable ones.
            loaded.setIsGenesUsable(true);
            taxonService.update(loaded);
            NcbiGeneLoader.log.debug("Updating taxon genes usable to true for taxon " + loaded);
        }
    }
}
Also used : Taxon(ubic.gemma.model.genome.Taxon) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean)

Example 54 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class StringProteinInteractionLoader method load.

/**
 * Entry point for loading STRING protein-protein interactions. After basic parameter validation, the STRING data
 * are turned into value objects grouped by taxon, the Ensembl to NCBI id mappings are obtained, and the
 * interactions are then loaded one taxon at a time.
 *
 * @param stringProteinFileNameLocal     The name of the string file on the local system
 * @param stringProteinFileNameRemote    The name of the string file on the remote system (just in case the string name
 *                                       proves to be too variable) - can be null
 * @param localEnsembl2EntrezMappingFile The name of the local biomart file - can be null?
 * @param taxa                           taxa to load data for. List of taxon to process
 * @throws IOException io problems
 */
public void load(File stringProteinFileNameLocal, String stringProteinFileNameRemote, File localEnsembl2EntrezMappingFile, Collection<Taxon> taxa) throws IOException {
    // very basic validation before any processing done
    this.validateLoadParameters(stringProteinFileNameLocal, taxa);

    // retrieve STRING protein protein interactions, grouped by taxon
    StringProteinProteinInteractionObjectGenerator generator = new StringProteinProteinInteractionObjectGenerator(stringProteinFileNameLocal, stringProteinFileNameRemote);
    Map<Taxon, Collection<StringProteinProteinInteraction>> interactionsByTaxon = generator.generate(taxa);

    /*
     * Get ENSEMBL to NCBI id mappings so we can store the STRING interactions
     */
    Map<String, Ensembl2NcbiValueObject> ensemblToNcbi = this.getIdMappings(localEnsembl2EntrezMappingFile, taxa);

    // Do one taxon at a time to reduce memory use
    for (Map.Entry<Taxon, Collection<StringProteinProteinInteraction>> entry : interactionsByTaxon.entrySet()) {
        Taxon current = entry.getKey();
        StringProteinInteractionLoader.log.debug("Loading for taxon " + current);
        Collection<StringProteinProteinInteraction> interactions = entry.getValue();
        StringProteinInteractionLoader.log.info("Found " + interactions.size() + " STRING interactions for: " + current);
        this.loadOneTaxonAtATime(ensemblToNcbi, interactions);
    }
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) StringProteinProteinInteractionObjectGenerator(ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionObjectGenerator) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) StringProteinProteinInteraction(ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction)

Example 55 with Taxon

use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.

the class BiomartEnsemblNcbiFetcher method fetch.

/**
 * Iterates through each taxon supplied and fetches a biomart file for it. Taxa for which no biomart taxon name can
 * be determined (the name lookup returns null) are skipped and do not appear in the result.
 *
 * @param taxa Collection of taxa to retrieve biomart files for.
 * @return A map of biomart files as stored on local file system keyed on taxon.
 * @throws IOException if there is a problem while manipulating the file
 */
public Map<Taxon, File> fetch(Collection<Taxon> taxa) throws IOException {
    Map<Taxon, File> taxonFileMap = new HashMap<>();
    for (Taxon taxon : taxa) {
        String taxonName = this.getBiomartTaxonName(taxon);
        if (taxonName == null) {
            // No biomart name for this taxon, so there is nothing to fetch.
            continue;
        }
        File taxonFile = fetchFileForProteinQuery(taxonName);
        taxonFileMap.put(taxon, taxonFile);
        // The original message lacked a space before "for", gluing the file name to the next word.
        log.debug("Downloading file " + taxonFile + " for taxon " + taxon);
    }
    return taxonFileMap;
}
Also used : HashMap(java.util.HashMap) Taxon(ubic.gemma.model.genome.Taxon)

Aggregations

Taxon (ubic.gemma.model.genome.Taxon)161 Gene (ubic.gemma.model.genome.Gene)34 Test (org.junit.Test)31 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)29 HashSet (java.util.HashSet)23 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)23 InputStream (java.io.InputStream)17 Before (org.junit.Before)16 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)15 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)14 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)12 StopWatch (org.apache.commons.lang3.time.StopWatch)11 Transactional (org.springframework.transaction.annotation.Transactional)11 ArrayList (java.util.ArrayList)10 File (java.io.File)9 SimpleExpressionExperimentMetaData (ubic.gemma.core.loader.expression.simple.model.SimpleExpressionExperimentMetaData)9 Chromosome (ubic.gemma.model.genome.Chromosome)8 Collection (java.util.Collection)7 Element (org.w3c.dom.Element)7 PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation)7