Search in sources :

Example 1 with Ensembl2NcbiValueObject

use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.

the class StringBiomartProteinConverterTest method setUp.

/**
 * Builds the fixtures shared by the tests: a mouse taxon, a converter backed by the Ensembl to NCBI mapping
 * generated from the short mouse biomart test file, and three STRING protein-protein interactions.
 * <p>
 * Any problem while locating the test file or generating the mapping aborts the set-up with an
 * {@link AssertionError} that carries the original exception as its cause.
 */
@Before
public void setUp() {
    String fileNameBiomartmouse = "/data/loader/protein/biomart/biomartmmusculusShort.txt";
    URL fileNameBiomartmouseURL = this.getClass().getResource(fileNameBiomartmouse);
    // getResource returns null when the resource is missing; report that explicitly instead of a bare NPE
    if (fileNameBiomartmouseURL == null) {
        throw new AssertionError("Test resource not found on the classpath: " + fileNameBiomartmouse);
    }
    Taxon taxon = Taxon.Factory.newInstance();
    taxon.setIsGenesUsable(true);
    taxon.setNcbiId(10090);
    taxon.setScientificName("Mus musculus");
    taxon.setIsSpecies(true);
    taxa.add(taxon);
    try {
        // File(URI) decodes percent-escapes (e.g. spaces in the checkout path) that URL.getFile() leaves encoded
        File taxonBiomartFile = new File(fileNameBiomartmouseURL.toURI());
        BiomartEnsemblNcbiObjectGenerator biomartEnsemblNcbiObjectGenerator = new BiomartEnsemblNcbiObjectGenerator();
        biomartEnsemblNcbiObjectGenerator.setBioMartFileName(taxonBiomartFile);
        Map<String, Ensembl2NcbiValueObject> map = biomartEnsemblNcbiObjectGenerator.generate(taxa);
        stringBiomartProteinConverter = new StringProteinProteinInteractionConverter(map);
    } catch (Exception e) {
        // keep the cause attached so the test report shows why the set-up failed
        throw new AssertionError("Could not build the Ensembl to NCBI mapping from " + fileNameBiomartmouse, e);
    }
    stringProteinProteinInteractionOne = new StringProteinProteinInteraction("ENSMUSP00000111623", "ENSMUSP00000100396");
    StringProteinProteinInteraction stringProteinProteinInteractionTwo = new StringProteinProteinInteraction("ENSMUSP00000100395", "ENSMUSP00000100396");
    StringProteinProteinInteraction stringProteinProteinInteractionThree = new StringProteinProteinInteraction("ENSMUSP00000100407", "ENSMUSP00000100395");
    // collect the interactions used by the tests
    stringProteinProteinInteractions.add(stringProteinProteinInteractionOne);
    stringProteinProteinInteractions.add(stringProteinProteinInteractionTwo);
    stringProteinProteinInteractions.add(stringProteinProteinInteractionThree);
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) BiomartEnsemblNcbiObjectGenerator(ubic.gemma.core.loader.protein.biomart.BiomartEnsemblNcbiObjectGenerator) Taxon(ubic.gemma.model.genome.Taxon) File(java.io.File) URL(java.net.URL) StringProteinProteinInteraction(ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction) Before(org.junit.Before)

Example 2 with Ensembl2NcbiValueObject

use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.

the class StringProteinInteractionLoader method load.

/**
 * Entry point for loading STRING protein-protein interactions.
 * <p>
 * The parameters are validated first. STRING interactions are then generated per taxon from the supplied
 * local/remote file names, the Ensembl to NCBI id mappings are obtained, and finally the interactions are
 * handed over for loading one taxon at a time.
 *
 * @param stringProteinFileNameLocal     the STRING file on the local system
 * @param stringProteinFileNameRemote    the name of the STRING file on the remote system - can be null
 * @param localEnsembl2EntrezMappingFile the local biomart file holding the Ensembl to Entrez mappings
 *                                       (NOTE(review): possibly nullable - confirm against getIdMappings)
 * @param taxa                           the taxa to load data for
 * @throws IOException on I/O problems
 */
public void load(File stringProteinFileNameLocal, String stringProteinFileNameRemote, File localEnsembl2EntrezMappingFile, Collection<Taxon> taxa) throws IOException {
    // cheap sanity checks up front, before any expensive work
    this.validateLoadParameters(stringProteinFileNameLocal, taxa);

    // STRING interactions, grouped by taxon
    StringProteinProteinInteractionObjectGenerator interactionGenerator = new StringProteinProteinInteractionObjectGenerator(stringProteinFileNameLocal, stringProteinFileNameRemote);
    Map<Taxon, Collection<StringProteinProteinInteraction>> interactionsByTaxon = interactionGenerator.generate(taxa);

    // ENSEMBL to NCBI id mappings, needed to store the STRING interactions
    Map<String, Ensembl2NcbiValueObject> ensemblToNcbi = this.getIdMappings(localEnsembl2EntrezMappingFile, taxa);

    // handle a single taxon per iteration to keep memory use down
    for (Map.Entry<Taxon, Collection<StringProteinProteinInteraction>> perTaxon : interactionsByTaxon.entrySet()) {
        Taxon currentTaxon = perTaxon.getKey();
        StringProteinInteractionLoader.log.debug("Loading for taxon " + currentTaxon);
        Collection<StringProteinProteinInteraction> interactions = perTaxon.getValue();
        StringProteinInteractionLoader.log.info("Found " + interactions.size() + " STRING interactions for: " + currentTaxon);
        this.loadOneTaxonAtATime(ensemblToNcbi, interactions);
    }
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) StringProteinProteinInteractionObjectGenerator(ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionObjectGenerator) Taxon(ubic.gemma.model.genome.Taxon) Collection(java.util.Collection) StringProteinProteinInteraction(ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction)

Example 3 with Ensembl2NcbiValueObject

use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.

the class StringProteinProteinInteractionConverter method getNcbiGene.

/**
 * Resolves the NCBI (Entrez) genes mapped to an Ensembl protein id. One Ensembl protein id can map to several
 * NCBI genes, so a collection is returned.
 * <p>
 * STRING may prefix the peptide id with the taxon id (for example {@code 10090.ENSMUSP00000111623}); that
 * leading prefix is removed before the id is looked up in the Ensembl to NCBI mapping.
 *
 * @param ensemblProteinId the Ensembl protein id in this interaction, with or without a leading taxon id
 * @return one gene per non-empty Entrez gene id mapped to the protein, each carrying the NCBI gene id and the
 *         Ensembl gene id; empty if the protein is unknown or has no Entrez genes
 * @throws NumberFormatException if a mapped Entrez gene id is not a valid integer
 */
public Collection<Gene> getNcbiGene(String ensemblProteinId) {
    Collection<Gene> genes = new ArrayList<>();
    // Strip only a LEADING "<taxonId>." prefix. The pattern must be anchored: unanchored, an id without a
    // prefix but with a version suffix (e.g. ENSMUSP00000111623.2) would lose the digits inside the id itself.
    String eid = ensemblProteinId.replaceFirst("^[0-9]+\\.", "");
    Ensembl2NcbiValueObject e2n = ensembl2ncbi.get(eid);
    if (e2n == null || e2n.getEntrezgenes().isEmpty()) {
        return genes;
    }
    String ensemblGeneId = e2n.getEnsemblGeneId();
    for (String entrezGeneId : e2n.getEntrezgenes()) {
        if (entrezGeneId.isEmpty()) {
            continue;
        }
        Gene gene = Gene.Factory.newInstance();
        gene.setNcbiGeneId(Integer.parseInt(entrezGeneId));
        gene.setEnsemblId(ensemblGeneId);
        genes.add(gene);
        if (StringProteinProteinInteractionConverter.log.isDebugEnabled()) {
            StringProteinProteinInteractionConverter.log.debug("Entry found for entrezGeneId " + entrezGeneId);
        }
    }
    return genes;
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) Gene(ubic.gemma.model.genome.Gene) ArrayList(java.util.ArrayList)

Example 4 with Ensembl2NcbiValueObject

use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.

the class BiomartEnsemblNcbiObjectGenerator method generateRemote.

/**
 * Fetches the biomart files for the given taxa and parses each retrieved file, merging the per-taxon results
 * into a single map. Taxa for which no file was retrieved are logged as errors and skipped.
 *
 * @param validTaxa the taxa handed to the biomart fetcher
 * @return the combined Ensembl to NCBI value objects of all taxa for which a file was retrieved
 * @throws IOException      if there is a problem while manipulating the file
 * @throws RuntimeException if the fetcher returned no files at all
 */
public Map<String, Ensembl2NcbiValueObject> generateRemote(Collection<Taxon> validTaxa) throws IOException {
    Map<Taxon, File> fetchedFiles = this.biomartEnsemblNcbiFetcher.fetch(validTaxa);
    // nothing fetched at all: give up straight away
    if (fetchedFiles == null || fetchedFiles.isEmpty()) {
        throw new RuntimeException("No files could be downloaded from Biomart for provided taxon");
    }

    Map<String, Ensembl2NcbiValueObject> merged = new HashMap<>();
    for (Map.Entry<Taxon, File> fetched : fetchedFiles.entrySet()) {
        Taxon taxon = fetched.getKey();
        File biomartFile = fetched.getValue();
        if (biomartFile == null) {
            log.error("No biomart file retrieved for taxon " + taxon);
            continue;
        }
        log.info("Starting processing taxon " + taxon + " for file " + biomartFile);
        merged.putAll(parseTaxonBiomartFile(taxon, biomartFile));
    }
    return merged;
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) HashMap(java.util.HashMap) Taxon(ubic.gemma.model.genome.Taxon) File(java.io.File)

Example 5 with Ensembl2NcbiValueObject

use of ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject in project Gemma by PavlidisLab.

the class BiomartEnsembleNcbiParser method createBioMartEnsembleNcbi.

/**
 * Builds an Ensembl2NcbiValueObject from one parsed line and registers it under its Ensembl peptide id.
 * <p>
 * Field layout as read here: {@code fields[0]} Ensembl gene id, {@code fields[1]} Ensembl transcript id,
 * {@code fields[2]} Entrez gene, {@code fields[3]} Ensembl peptide id, {@code fields[4]} HGNC id (only
 * consulted when the fifth header field is non-empty).
 * <p>
 * A peptide id seen before means the peptide maps to more than one Entrez gene: as a sanity check the Ensembl
 * gene id must equal that of the existing record, and the Entrez gene is then added to the existing record.
 * In that case the returned object is the freshly built one, which is not stored and has no Entrez gene
 * added to it. NOTE(review): confirm callers do not rely on the return value for duplicates.
 *
 * @param fields parsed line split on delimiter
 * @return the value object built from the line, or null when the peptide id or the Entrez gene is blank
 * @throws NumberFormatException declared for parsing a number that is not one
 * @throws FileFormatException   if a duplicate peptide id is found whose Ensembl gene id differs from the
 *                               one already recorded
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Ensembl2NcbiValueObject createBioMartEnsembleNcbi(String[] fields) throws NumberFormatException, FileFormatException {
    Ensembl2NcbiValueObject valueObject = new Ensembl2NcbiValueObject();
    String entrezGeneId = fields[2].trim();
    String peptideId = fields[3].trim();

    // lines without a peptide id cannot be keyed: skip them
    if (StringUtils.isBlank(peptideId)) {
        if (log.isDebugEnabled()) {
            log.debug("Blank protein id for line: " + StringUtils.join(fields, " "));
        }
        return null;
    }
    // lines without an entrez gene carry no mapping: skip them as well
    if (StringUtils.isBlank(entrezGeneId)) {
        log.debug(peptideId + " has no entrez gene mapping");
        return null;
    }

    String geneId = fields[0].trim();
    valueObject.setNcbiTaxonId(taxon.getNcbiId());
    valueObject.setEnsemblGeneId(geneId);
    valueObject.setEnsemblTranscriptId(fields[1]);
    valueObject.setEnsemblPeptideId(peptideId);

    // only humans should have the HGNC field
    boolean hgncHeaderPresent = !bioMartHeaderFields[4].isEmpty();
    if (hgncHeaderPresent && fields[4] != null) {
        valueObject.setHgnc_id(fields[4]);
    }

    // Ensembl ids can map to multiple entrez genes, so a collection of entrez genes is kept per peptide
    if (this.containsKey(peptideId)) {
        Ensembl2NcbiValueObject existing = this.get(peptideId);
        // the duplicate must refer to the same ensembl gene as the record already held
        if (!geneId.equals(existing.getEnsemblGeneId())) {
            throw new FileFormatException("A duplicate ensemblProteinId has been found: " + peptideId + " but it does not match with the exisiting objects gene id " + geneId + ", it was " + existing.getEnsemblGeneId() + ", line was:\n" + StringUtils.join(fields, " "));
        }
        this.get(peptideId).getEntrezgenes().add(entrezGeneId);
        if (log.isDebugEnabled()) {
            log.debug(peptideId + "added gene to duplicate  ");
        }
        return valueObject;
    }

    // first time this peptide is seen: store the new record
    valueObject.getEntrezgenes().add(entrezGeneId);
    results.put(peptideId, valueObject);
    if (log.isDebugEnabled()) {
        log.debug(peptideId + " has no existing  entrez gene mapping");
    }
    return valueObject;
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException)

Aggregations

Ensembl2NcbiValueObject (ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject): 10; Test (org.junit.Test): 5; File (java.io.File): 4; URL (java.net.URL): 3; Taxon (ubic.gemma.model.genome.Taxon): 3; StringProteinProteinInteraction (ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction): 2; ArrayList (java.util.ArrayList): 1; Collection (java.util.Collection): 1; HashMap (java.util.HashMap): 1; Before (org.junit.Before): 1; BiomartEnsemblNcbiObjectGenerator (ubic.gemma.core.loader.protein.biomart.BiomartEnsemblNcbiObjectGenerator): 1; StringProteinProteinInteractionObjectGenerator (ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionObjectGenerator): 1; FileFormatException (ubic.gemma.core.loader.util.parser.FileFormatException): 1; Gene (ubic.gemma.model.genome.Gene): 1