use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.
the class StringBiomartProteinConverterTest method setUp.
/**
 * Prepares the fixtures used by the tests in this class: a mouse taxon (NCBI id 10090), a converter built from the
 * id mappings generated out of the short mouse biomart resource file, and three STRING protein-protein
 * interactions collected into the shared interaction list.
 */
@Before
public void setUp() {
    // Locate the biomart fixture on the classpath and expose it as a plain file.
    String biomartResourcePath = "/data/loader/protein/biomart/biomartmmusculusShort.txt";
    URL biomartResource = this.getClass().getResource(biomartResourcePath);
    File biomartFile = new File(biomartResource.getFile());

    // The only taxon the generator is asked about is mouse.
    Taxon mouse = Taxon.Factory.newInstance();
    mouse.setIsGenesUsable(true);
    mouse.setNcbiId(10090);
    mouse.setScientificName("Mus musculus");
    mouse.setIsSpecies(true);
    taxa.add(mouse);

    try {
        BiomartEnsemblNcbiObjectGenerator generator = new BiomartEnsemblNcbiObjectGenerator();
        generator.setBioMartFileName(biomartFile);
        Map<String, Ensembl2NcbiValueObject> idMappings = generator.generate(taxa);
        stringBiomartProteinConverter = new StringProteinProteinInteractionConverter(idMappings);
    } catch (Exception e) {
        // Any failure while building the converter invalidates every test in the class.
        e.printStackTrace();
        fail();
    }

    // The first interaction is kept in a field; the other two are only needed inside the list.
    stringProteinProteinInteractionOne = new StringProteinProteinInteraction("ENSMUSP00000111623", "ENSMUSP00000100396");
    stringProteinProteinInteractions.add(stringProteinProteinInteractionOne);
    stringProteinProteinInteractions.add(new StringProteinProteinInteraction("ENSMUSP00000100395", "ENSMUSP00000100396"));
    stringProteinProteinInteractions.add(new StringProteinProteinInteraction("ENSMUSP00000100407", "ENSMUSP00000100395"));
}
use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.
the class StringProteinInteractionLoader method load.
/**
 * Entry point for loading STRING protein-protein interactions for a set of taxa.
 * <p>
 * The parameters are validated first, then the STRING interactions are generated per taxon from the local and/or
 * remote STRING file, and the Ensembl-to-NCBI id mappings are obtained. Finally each taxon's interactions are handed
 * to {@code loadOneTaxonAtATime} together with the id mappings (that method is expected to convert and store them;
 * its implementation is not part of this block).
 *
 * @param stringProteinFileNameLocal     the STRING file on the local system
 * @param stringProteinFileNameRemote    the name of the STRING file on the remote system - can be null
 * @param localEnsembl2EntrezMappingFile the local biomart file holding the Ensembl to Entrez mapping - may be null
 *                                       (the original documentation marks this as uncertain)
 * @param taxa                           the taxa to load data for
 * @throws IOException on io problems
 */
public void load(File stringProteinFileNameLocal, String stringProteinFileNameRemote, File localEnsembl2EntrezMappingFile, Collection<Taxon> taxa) throws IOException {
    // Fail fast on obviously unusable arguments before doing any work.
    this.validateLoadParameters(stringProteinFileNameLocal, taxa);

    // STRING interactions, grouped by the taxon they belong to.
    StringProteinProteinInteractionObjectGenerator interactionGenerator = new StringProteinProteinInteractionObjectGenerator(stringProteinFileNameLocal, stringProteinFileNameRemote);
    Map<Taxon, Collection<StringProteinProteinInteraction>> interactionsByTaxon = interactionGenerator.generate(taxa);

    // ENSEMBL to NCBI id mappings, needed so the STRING interactions can be stored.
    Map<String, Ensembl2NcbiValueObject> ensemblToNcbi = this.getIdMappings(localEnsembl2EntrezMappingFile, taxa);

    // Handle one taxon at a time to reduce memory use.
    for (Map.Entry<Taxon, Collection<StringProteinProteinInteraction>> taxonEntry : interactionsByTaxon.entrySet()) {
        Taxon taxon = taxonEntry.getKey();
        StringProteinInteractionLoader.log.debug("Loading for taxon " + taxon);
        Collection<StringProteinProteinInteraction> interactions = taxonEntry.getValue();
        StringProteinInteractionLoader.log.info("Found " + interactions.size() + " STRING interactions for: " + taxon);
        this.loadOneTaxonAtATime(ensemblToNcbi, interactions);
    }
}
use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.
the class StringProteinProteinInteractionConverter method getNcbiGene.
/**
 * Resolves an Ensembl protein id to the NCBI (Entrez) genes it maps to. One Ensembl protein id can map to several
 * NCBI genes, so a collection is returned.
 * <p>
 * STRING may prefix the protein id with the taxon id (for example {@code 10090.ENSMUSP00000111623}); such a leading
 * numeric prefix is removed before the lookup. Only a prefix at the very start of the id is stripped, so digits
 * followed by a dot elsewhere in the id are left untouched.
 *
 * @param ensemblProteinId the Ensembl protein id in this interaction, optionally prefixed with {@code <taxonId>.}
 * @return the genes as represented in NCBI Entrez gene; empty if the id is unknown or has no Entrez genes
 * @throws NumberFormatException if a non-empty Entrez gene id in the mapping is not a valid integer
 */
public Collection<Gene> getNcbiGene(String ensemblProteinId) {
    Collection<Gene> genes = new ArrayList<>();
    // Anchored with ^ so that only a leading species prefix (as in 12334.ENSD....) is removed; an unanchored
    // pattern would also eat digits inside the accession when the id carries a ".version" suffix.
    String eid = ensemblProteinId.replaceFirst("^[0-9]+\\.", "");
    Ensembl2NcbiValueObject e2n = ensembl2ncbi.get(eid);
    if (e2n == null || e2n.getEntrezgenes().isEmpty()) {
        return genes;
    }
    String ensemblGeneId = e2n.getEnsemblGeneId();
    Collection<String> entrezGeneIds = e2n.getEntrezgenes();
    for (String entrezGeneId : entrezGeneIds) {
        // Empty entries carry no gene id and are skipped.
        if (!entrezGeneId.isEmpty()) {
            Gene gene = Gene.Factory.newInstance();
            gene.setNcbiGeneId(Integer.parseInt(entrezGeneId));
            gene.setEnsemblId(ensemblGeneId);
            genes.add(gene);
            if (StringProteinProteinInteractionConverter.log.isDebugEnabled())
                StringProteinProteinInteractionConverter.log.debug("Entry found for entrezGeneId " + entrezGeneId);
        }
    }
    return genes;
}
use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.
the class BiomartEnsemblNcbiObjectGenerator method generateRemote.
/**
 * Fetches one biomart file per taxon through the configured fetcher, parses each retrieved file and merges the
 * resulting mappings into a single map.
 *
 * @param validTaxa the taxa to fetch biomart files for
 * @return the merged mappings of all taxa for which a file was retrieved
 * @throws IOException      if there is a problem while manipulating the file
 * @throws RuntimeException if the fetcher returned no files at all
 */
public Map<String, Ensembl2NcbiValueObject> generateRemote(Collection<Taxon> validTaxa) throws IOException {
    Map<Taxon, File> fetchedFiles = this.biomartEnsemblNcbiFetcher.fetch(validTaxa);
    if (fetchedFiles == null || fetchedFiles.isEmpty()) {
        throw new RuntimeException("No files could be downloaded from Biomart for provided taxon");
    }

    Map<String, Ensembl2NcbiValueObject> mergedMappings = new HashMap<>();
    for (Map.Entry<Taxon, File> fetched : fetchedFiles.entrySet()) {
        Taxon taxon = fetched.getKey();
        File biomartFile = fetched.getValue();
        if (biomartFile == null) {
            // A taxon without a file is reported but does not abort the other taxa.
            log.error("No biomart file retrieved for taxon " + taxon);
            continue;
        }
        log.info("Starting processing taxon " + taxon + " for file " + biomartFile);
        mergedMappings.putAll(parseTaxonBiomartFile(taxon, biomartFile));
    }
    return mergedMappings;
}
use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.
the class BiomartEnsembleNcbiParser method createBioMartEnsembleNcbi.
/**
 * Builds an {@link Ensembl2NcbiValueObject} from the fields of one parsed line and registers the Entrez gene under
 * the line's Ensembl protein (peptide) id.
 * <p>
 * Field layout as read here: {@code fields[0]} Ensembl gene id, {@code fields[1]} Ensembl transcript id,
 * {@code fields[2]} Entrez gene, {@code fields[3]} Ensembl protein id, {@code fields[4]} HGNC id (only used when the
 * fifth header field is non-empty).
 * <p>
 * Lines with a blank protein id or a blank Entrez gene are skipped. When the protein id has been seen before, the
 * line maps the same protein to a further Entrez gene: the Ensembl gene id must match the stored record, and the
 * Entrez gene is added to the stored record's collection.
 * <p>
 * NOTE(review): in that duplicate case the value returned is still the object built for the current line, whose
 * Entrez gene collection is not filled by this method - confirm that callers do not rely on it.
 *
 * @param fields parsed line split on delimiter
 * @return the value object built for this line, or {@code null} if the line was skipped
 * @throws NumberFormatException declared for number parsing problems (no parsing is visible in this method)
 * @throws FileFormatException   if a duplicate protein id is found whose Ensembl gene id differs from the stored one
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Ensembl2NcbiValueObject createBioMartEnsembleNcbi(String[] fields) throws NumberFormatException, FileFormatException {
    Ensembl2NcbiValueObject mapping = new Ensembl2NcbiValueObject();
    String entrezGene = fields[2].trim();
    String peptideId = fields[3].trim();

    // Nothing can be keyed without a protein id.
    if (StringUtils.isBlank(peptideId)) {
        if (log.isDebugEnabled())
            log.debug("Blank protein id for line: " + StringUtils.join(fields, " "));
        return null;
    }

    // Records without an entrez gene are of no interest and are skipped.
    if (StringUtils.isBlank(entrezGene)) {
        log.debug(peptideId + " has no entrez gene mapping");
        return null;
    }

    String geneId = fields[0].trim();
    mapping.setNcbiTaxonId(taxon.getNcbiId());
    mapping.setEnsemblGeneId(geneId);
    mapping.setEnsemblTranscriptId(fields[1]);
    mapping.setEnsemblPeptideId(peptideId);

    // Only humans should have the fifth (HGNC) field.
    boolean hgncColumnPresent = !bioMartHeaderFields[4].isEmpty();
    if (hgncColumnPresent && fields[4] != null) {
        mapping.setHgnc_id(fields[4]);
    }

    // Ensembl ids can map to multiple entrez genes, so a collection of entrez genes is maintained per protein id.
    if (this.containsKey(peptideId)) {
        Ensembl2NcbiValueObject existing = this.get(peptideId);
        // Sanity check: a repeated protein id must belong to the same ensembl gene.
        if (!geneId.equals(existing.getEnsemblGeneId())) {
            throw new FileFormatException("A duplicate ensemblProteinId has been found: " + peptideId + " but it does not match with the exisiting objects gene id " + geneId + ", it was " + existing.getEnsemblGeneId() + ", line was:\n" + StringUtils.join(fields, " "));
        }
        existing.getEntrezgenes().add(entrezGene);
        if (log.isDebugEnabled())
            log.debug(peptideId + "added gene to duplicate ");
    } else {
        mapping.getEntrezgenes().add(entrezGene);
        results.put(peptideId, mapping);
        if (log.isDebugEnabled())
            log.debug(peptideId + " has no existing entrez gene mapping");
    }
    return mapping;
}
Aggregations