use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class GeoConverterImpl method convertProbeOrganism.
/**
 * Resolves the taxon for a probe from an organism label (scientific name, common name or abbreviation).
 * <p>
 * Lookup order: the scientific-name cache, then the abbreviation cache, then (only if a taxon service is
 * available) the database by abbreviation and finally by common name. A taxon found in the database is added to
 * the abbreviation cache, keyed on the abbreviation of the taxon that was found.
 *
 * @param probeOrganism scientific name, common name or abbreviation of the organism associated to a biosequence;
 *                      must not be null.
 * @return the matching taxon. {@code null} is returned when the label is blank or "n/a", or when the taxon
 *         service does not recognize the label (a warning is logged; this used to throw an exception, see bug
 *         3207). When no taxon service is configured, a new taxon carrying only the given label as its
 *         abbreviation is returned.
 */
private Taxon convertProbeOrganism(String probeOrganism) {
    assert probeOrganism != null;

    /*
     * Detect blank taxon. We support 'n/a' here .... a little kludgy but shows up in some files.
     */
    if (StringUtils.isBlank(probeOrganism) || probeOrganism.equals("n/a")) {
        return null;
    }

    // Check if we have processed this organism before, by scientific name or by abbreviation.
    if (taxonScientificNameMap.containsKey(probeOrganism)) {
        return taxonScientificNameMap.get(probeOrganism);
    }
    if (taxonAbbreviationMap.containsKey(probeOrganism)) {
        return taxonAbbreviationMap.get(probeOrganism);
    }

    // Without a taxon service there is nothing to look up; hand back a placeholder taxon for the label.
    if (taxonService == null) {
        Taxon placeholder = Taxon.Factory.newInstance();
        placeholder.setAbbreviation(probeOrganism);
        return placeholder;
    }

    // Taxon not processed before: check the database. Lower-case with a fixed locale so the lookup key does not
    // depend on the JVM default locale (e.g. the Turkish dotless i).
    String lookupKey = probeOrganism.toLowerCase(java.util.Locale.ROOT);
    Taxon found = taxonService.findByAbbreviation(lookupKey);
    if (found == null) {
        found = taxonService.findByCommonName(lookupKey);
    }

    if (found == null) {
        // The probe organism is not a known abbreviation or common name and it was not already created during
        // platform organism processing, so warn the user. Examples would be "taxa" like "ILMN Controls".
        // See bug 3207 (we used to throw an exception).
        GeoConverterImpl.log.warn("'" + probeOrganism + "' is not recognized as a taxon in Gemma");
        return null;
    }

    taxonAbbreviationMap.put(found.getAbbreviation(), found);
    return found;
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class NcbiGeneConverter method convert.
/**
 * Builds a new Gene from a parsed NCBI gene info record.
 * <p>
 * Copies the identifier, symbol, name and Ensembl id; records previous/discontinued NCBI ids; attaches a freshly
 * created Taxon (identified only by its NCBI id) and a physical location holding just the chromosome; adds one
 * alias per synonym and one database entry per "Ensembl" cross-reference.
 *
 * @param info the NCBI gene record to convert
 * @return the newly created gene
 */
public Gene convert(NCBIGeneInfo info) {
    Gene converted = Gene.Factory.newInstance();

    converted.setNcbiGeneId(Integer.parseInt(info.getGeneId()));
    converted.setName(info.getDefaultSymbol());
    converted.setOfficialSymbol(info.getDefaultSymbol());
    converted.setOfficialName(info.getDescription());
    converted.setEnsemblId(info.getEnsemblId());

    /*
     * NOTE we allow multiple discontinued or previous ids, separated by commas. This is a hack to account for cases
     * uncovered recently...can be minimized by running this regularly.
     */
    if (info.getHistory() == null) {
        // No history record: fall back on the discontinued id, if one was supplied.
        if (StringUtils.isNotBlank(info.getDiscontinuedId())) {
            if (NcbiGeneConverter.log.isDebugEnabled()) {
                NcbiGeneConverter.log.debug("Gene matches a gene that was discontinued: " + converted + " matches gene that had id " + info.getDiscontinuedId());
            }
            converted.setPreviousNcbiId(info.getDiscontinuedId());
        }
    } else {
        assert info.getHistory().getCurrentId() == null || info.getGeneId().equals(info.getHistory().getCurrentId());
        assert info.getHistory().getPreviousIds() != null;
        if (!info.getHistory().getPreviousIds().isEmpty()) {
            converted.setPreviousNcbiId(StringUtils.join(info.getHistory().getPreviousIds(), ","));
        }
    }

    converted.setDescription("Imported from NCBI gene; Nomenclature status: " + info.getNomenclatureStatus());

    // The taxon is identified by its NCBI id only.
    Taxon geneTaxon = Taxon.Factory.newInstance();
    geneTaxon.setNcbiId(info.getTaxId());
    geneTaxon.setIsGenesUsable(false);
    geneTaxon.setIsSpecies(true);
    converted.setTaxon(geneTaxon);

    /*
     * We are going to stop maintaining this information
     */
    PhysicalLocation location = PhysicalLocation.Factory.newInstance();
    location.setChromosome(new Chromosome(info.getChromosome(), geneTaxon));
    converted.setPhysicalLocation(location);

    // One alias per synonym.
    Collection<GeneAlias> geneAliases = converted.getAliases();
    for (String synonym : info.getSynonyms()) {
        GeneAlias geneAlias = GeneAlias.Factory.newInstance();
        geneAlias.setAlias(synonym);
        geneAliases.add(geneAlias);
    }

    // Only Ensembl cross-references are carried over as accessions.
    for (String database : info.getDbXrefs().keySet()) {
        if (database.equalsIgnoreCase("Ensembl")) {
            DatabaseEntry ensemblEntry = DatabaseEntry.Factory.newInstance();
            ensemblEntry.setAccession(info.getDbXrefs().get(database));
            ensemblEntry.setExternalDatabase(NcbiGeneConverter.getEnsembl());
            converted.getAccessions().add(ensemblEntry);
        }
    }

    return converted;
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class NcbiGeneLoader method updateTaxaWithGenesUsable.
/**
 * Updates the "genes usable" flag of each taxon that had genes loaded.
 * <p>
 * If a taxon has a parent taxon whose genes are already usable, the parent's genes are preferred: the flag on the
 * child taxon is set to false. Otherwise, if the flag on the taxon is not yet set, it is set to true. In both
 * cases the change is persisted through the taxon service. Null entries are skipped with a warning.
 *
 * @param taxaGenesLoaded taxa that have had genes loaded into Gemma from NCBI.
 * @throws IllegalArgumentException if the collection is null or empty.
 */
public void updateTaxaWithGenesUsable(Collection<Taxon> taxaGenesLoaded) {
    if (taxaGenesLoaded == null || taxaGenesLoaded.isEmpty()) {
        throw new IllegalArgumentException("No taxa were processed for this NCBI load");
    }

    for (Taxon loaded : taxaGenesLoaded) {
        if (loaded == null) {
            NcbiGeneLoader.log.warn("null taxon");
            continue;
        }

        Taxon parent = loaded.getParentTaxon();
        if (parent != null && parent.getIsGenesUsable()) {
            // The parent already provides usable genes, so this taxon's own genes are not used.
            loaded.setIsGenesUsable(false);
            taxonService.update(loaded);
            NcbiGeneLoader.log.debug("Parent taxon found: " + parent + ": Not using genes from taxon: " + loaded);
        } else if (!loaded.getIsGenesUsable()) {
            // No usable parent: mark this taxon's genes as usable if they are not already.
            loaded.setIsGenesUsable(true);
            taxonService.update(loaded);
            NcbiGeneLoader.log.debug("Updating taxon genes usable to true for taxon " + loaded);
        }
    }
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class StringProteinInteractionLoader method load.
/**
 * Loads STRING protein-protein interactions for the given taxa.
 * <p>
 * After validating the parameters, the STRING interactions are generated per taxon from the supplied local file
 * and/or remote file name, the Ensembl to NCBI id mappings are obtained, and the interactions are then loaded one
 * taxon at a time.
 *
 * @param stringProteinFileNameLocal     the STRING file on the local system
 * @param stringProteinFileNameRemote    the name of the STRING file on the remote system (originally documented as
 *                                       "can be null")
 * @param localEnsembl2EntrezMappingFile the local biomart mapping file (originally documented as "can be null?")
 * @param taxa                           the taxa to load data for
 * @throws IOException io problems
 */
public void load(File stringProteinFileNameLocal, String stringProteinFileNameRemote, File localEnsembl2EntrezMappingFile, Collection<Taxon> taxa) throws IOException {
    // very basic validation before any processing done
    this.validateLoadParameters(stringProteinFileNameLocal, taxa);

    // retrieve STRING protein protein interactions, grouped by taxon
    StringProteinProteinInteractionObjectGenerator generator =
            new StringProteinProteinInteractionObjectGenerator(stringProteinFileNameLocal, stringProteinFileNameRemote);
    Map<Taxon, Collection<StringProteinProteinInteraction>> interactionsByTaxon = generator.generate(taxa);

    /*
     * Get ENSEMBL to NCBI id mappings so we can store the STRING interactions
     */
    Map<String, Ensembl2NcbiValueObject> idMappings = this.getIdMappings(localEnsembl2EntrezMappingFile, taxa);

    // Do one taxon at a time to reduce memory use
    for (Map.Entry<Taxon, Collection<StringProteinProteinInteraction>> entry : interactionsByTaxon.entrySet()) {
        Taxon currentTaxon = entry.getKey();
        StringProteinInteractionLoader.log.debug("Loading for taxon " + currentTaxon);

        Collection<StringProteinProteinInteraction> interactions = entry.getValue();
        StringProteinInteractionLoader.log.info("Found " + interactions.size() + " STRING interactions for: " + currentTaxon);

        this.loadOneTaxonAtATime(idMappings, interactions);
    }
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class BiomartEnsemblNcbiFetcher method fetch.
/**
 * Fetches a biomart file for each supplied taxon.
 * <p>
 * For every taxon the biomart taxon name is looked up; taxa for which no biomart name is available are skipped
 * and do not appear in the result.
 *
 * @param taxa Collection of taxa to retrieve biomart files for.
 * @return A map of biomart files as stored on local file system keyed on taxon.
 * @throws IOException if there is a problem while manipulating the file
 */
public Map<Taxon, File> fetch(Collection<Taxon> taxa) throws IOException {
    Map<Taxon, File> taxonFileMap = new HashMap<>();
    for (Taxon taxon : taxa) {
        String taxonName = this.getBiomartTaxonName(taxon);
        if (taxonName == null) {
            // No biomart name for this taxon: nothing to fetch.
            continue;
        }
        File taxonFile = fetchFileForProteinQuery(taxonName);
        taxonFileMap.put(taxon, taxonFile);
        // Note the space before "for": the file name and "for taxon" previously ran together in the log.
        log.debug("Downloading file " + taxonFile + " for taxon " + taxon);
    }
    return taxonFileMap;
}
Aggregations