Search in sources :

Example 31 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class StringBiomartGene2GeneProteinLoaderTest method makeGene.

/**
 * Builds a test gene whose name, official name and official symbol are all {@code name}, attaches a single
 * non-persistent test gene product to it, and persists it.
 *
 * @param t      taxon to assign to the gene
 * @param name   value used for the name, official name and official symbol
 * @param ncbiId NCBI gene id as a decimal string; parsed as an integer
 * @return the gene as returned by the persister helper
 */
private Gene makeGene(Taxon t, String name, String ncbiId) {
    Gene gene = Gene.Factory.newInstance();
    gene.setTaxon(t);
    gene.setNcbiGeneId(Integer.valueOf(ncbiId));
    gene.setOfficialSymbol(name);
    gene.setOfficialName(name);
    gene.setName(name);
    // Create the product before touching the gene's product collection, as the original did.
    GeneProduct product = PersistentDummyObjectHelper.getTestNonPersistentGeneProduct(gene);
    gene.getProducts().add(product);
    return (Gene) persisterHelper.persist(gene);
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashSet(java.util.HashSet)

Example 32 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class NcbiGeneConverter method convert.

/**
 * Converts a bundle of NCBI gene data into a gene with its products attached.
 * The gene itself is built from the gene-info record; every gene2accession record is then converted to
 * gene products (associated with that gene), and the pooled set replaces the gene's products.
 *
 * @param data the NCBI gene info plus its accession records
 * @return the converted gene, with its products set
 */
public Gene convert(NcbiGeneData data) {
    // Gene-level fields come from the gene info record.
    Gene converted = this.convert(data.getGeneInfo());
    // Each accession record may yield several products; pool them in a set.
    Collection<NCBIGene2Accession> accessions = data.getAccessions();
    Collection<GeneProduct> products = new HashSet<>();
    for (NCBIGene2Accession accession : accessions) {
        Collection<GeneProduct> fromAccession = this.convert(accession, converted);
        products.addAll(fromAccession);
    }
    converted.setProducts(products);
    return converted;
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) NCBIGene2Accession(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) HashSet(java.util.HashSet)

Example 33 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class GenomePersister method updateGene.

/**
 * Update a gene.
 * <p>
 * Copies information from {@code newGeneInfo} onto {@code existingGene}: the NCBI id (remembering the old one as
 * the previous id), Ensembl id (only if the new one is non-blank), accessions that are not already present,
 * name/description/official name/official symbol, physical location, aliases (replaced wholesale) and gene
 * products (updated, added, or re-attached from another gene). The existing gene is then passed to
 * {@code geneDao.update} and any products returned by {@code handleGeneProductChangedGIs} are removed.
 *
 * @param existingGene the gene being updated; it is modified in place
 * @param newGeneInfo the non-persistent gene we are copying information from
 * @return {@code existingGene}, after the update
 * @throws IllegalStateException if the NCBI id changed, {@code newGeneInfo} lists previous NCBI ids, and none of
 *         them equals the existing gene's NCBI id
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Gene updateGene(Gene existingGene, Gene newGeneInfo) {
    // NCBI id can be null if gene has been loaded from a gene info file.
    Integer existingNcbiId = existingGene.getNcbiGeneId();
    if (existingNcbiId != null && !existingNcbiId.equals(newGeneInfo.getNcbiGeneId())) {
        AbstractPersister.log.info("NCBI ID Change for " + existingGene + ", new id =" + newGeneInfo.getNcbiGeneId());
        String previousIdString = newGeneInfo.getPreviousNcbiId();
        if (StringUtils.isNotBlank(previousIdString)) {
            /*
             * Unfortunately, we need to check multiple 'previous' genes. The example I have run across is MTUS2-AS1
             * (human) which was created by merging two previous genes, LOC728437 and LOC731614; only the former was
             * in Gemma with its gene product GI:22268051. It also has a product we don't have, GI:14676690. This
             * comma-delimited set thing is a hack.
             */
            // NOTE(review): the split tokens are compared verbatim (no trim), so a list written as "1, 2" would
            // not match id 2 - confirm the stored format never contains spaces.
            String[] previousIds = StringUtils.split(previousIdString, ",");
            boolean found = false;
            for (String previousId : previousIds) {
                if (previousId.equals(existingGene.getNcbiGeneId().toString())) {
                    found = true;
                }
            }
            if (!found) {
                throw new IllegalStateException("The NCBI ID for " + newGeneInfo + " has changed and the previous NCBI id on record with NCBI (" + newGeneInfo.getPreviousNcbiId() + ") doesn't match.");
            }
        }
        // swap: the current id becomes the 'previous' id and the new id takes its place
        existingGene.setPreviousNcbiId(existingGene.getNcbiGeneId().toString());
        existingGene.setNcbiGeneId(newGeneInfo.getNcbiGeneId());
        /*
         * Note: On occasion, we have two genes with the same symbol but different NCBI ids. This happens when NCBI
         * screws up somehow (?) and has two records for the same gene with different IDs, and we end up with them
         * both at the time they were considered separate genes. At some later date NCBI decides to (in effect)
         * merge them, so one of the genes has to be deprecated. Such 'relics' are deleted by the DAO, because it
         * results in more than one gene being found.
         */
    }
    /*
     * We might want to change this behaviour to clear the value if the updated one has none. For now I just want to
     * avoid wiping data.
     */
    if (StringUtils.isNotBlank(newGeneInfo.getEnsemblId())) {
        existingGene.setEnsemblId(newGeneInfo.getEnsemblId());
    }
    // We assume the taxon hasn't changed.
    // Index the existing accessions by accession string so only unseen ones are added below.
    Map<String, DatabaseEntry> updatedacMap = new HashMap<>();
    for (DatabaseEntry de : existingGene.getAccessions()) {
        updatedacMap.put(de.getAccession(), de);
    }
    for (DatabaseEntry de : newGeneInfo.getAccessions()) {
        if (!updatedacMap.containsKey(de.getAccession())) {
            this.fillInDatabaseEntry(de);
            existingGene.getAccessions().add(de);
        }
    }
    // Simple fields are overwritten unconditionally, even when the new value is null.
    existingGene.setName(newGeneInfo.getName());
    existingGene.setDescription(newGeneInfo.getDescription());
    existingGene.setOfficialName(newGeneInfo.getOfficialName());
    existingGene.setOfficialSymbol(newGeneInfo.getOfficialSymbol());
    existingGene.setPhysicalLocation(newGeneInfo.getPhysicalLocation());
    this.fillChromosomeLocationAssociations(existingGene.getPhysicalLocation(), existingGene.getTaxon());
    // Aliases are replaced wholesale rather than merged.
    existingGene.getAliases().clear();
    existingGene.getAliases().addAll(newGeneInfo.getAliases());
    /*
     * This is the only tricky part - the gene products. We update them if they are already there, and add them if
     * not. We do not normally remove 'old' ones that the new gene instance does not have, because they might be
     * from different sources. For example, Ensembl or GoldenPath. -- UNLESS the product has an NCBI GI because we
     * know those come from NCBI.
     */
    // One map, keyed by both product name and NCBI GI, pointing at the existing products.
    // NOTE(review): if getNcbiGi() can return null, a null key is stored here and the GI lookup below would match
    // any new product that also has a null GI - confirm whether GI-less products occur.
    Map<String, GeneProduct> updatedGpMap = new HashMap<>();
    for (GeneProduct existingGp : existingGene.getProducts()) {
        updatedGpMap.put(existingGp.getName(), existingGp);
        updatedGpMap.put(existingGp.getNcbiGi(), existingGp);
    }
    // Non-null GIs seen on the incoming products, mapped to the incoming product.
    Map<String, GeneProduct> usedGIs = new HashMap<>();
    for (GeneProduct newGeneProductInfo : newGeneInfo.getProducts()) {
        if (updatedGpMap.containsKey(newGeneProductInfo.getName())) {
            AbstractPersister.log.debug("Updating gene product based on name: " + newGeneProductInfo);
            GeneProduct existingGeneProduct = updatedGpMap.get(newGeneProductInfo.getName());
            this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
        } else if (updatedGpMap.containsKey(newGeneProductInfo.getNcbiGi())) {
            AbstractPersister.log.debug("Updating gene product based on GI: " + newGeneProductInfo);
            GeneProduct existingGeneProduct = updatedGpMap.get(newGeneProductInfo.getNcbiGi());
            this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
        } else {
            // Not on this gene by name or GI: ask the DAO whether the product exists at all.
            GeneProduct existingGeneProduct = geneProductDao.find(newGeneProductInfo);
            if (existingGeneProduct == null) {
                // it is, in fact, new, so far as we can tell.
                newGeneProductInfo.setGene(existingGene);
                this.fillInGeneProductAssociations(newGeneProductInfo);
                AbstractPersister.log.info("New product for " + existingGene + ": " + newGeneProductInfo);
                existingGene.getProducts().add(newGeneProductInfo);
            } else {
                /*
                 * This can only happen if this gene product is associated with a different gene. This generally
                 * happens when a transcript is associated with two genes in NCBI, so the switching is actually not
                 * useful to us, but we do it anyway to be consistent (and in case it really does matter). It is
                 * rare. Causes can be 1) bicistronic genes such as human LUZP6 and MTPN; 2) genome-duplicated
                 * genes; or 3) an error in the data source. The problem for us is at this point in processing, we
                 * don't know if the gene is going to get 'reattached' to its original gene.
                 */
                existingGeneProduct = geneProductDao.thaw(existingGeneProduct);
                Gene oldGeneForExistingGeneProduct = existingGeneProduct.getGene();
                if (oldGeneForExistingGeneProduct != null) {
                    // transient.
                    Gene geneInfo = newGeneProductInfo.getGene();
                    if (!oldGeneForExistingGeneProduct.equals(geneInfo)) {
                        AbstractPersister.log.warn("Switching gene product from one gene to another: " + existingGeneProduct + " switching to " + geneInfo + " (this can also happen if an mRNA is associated with two genes, which we don't allow, so we switch it arbitrarily)");
                        // Here we just remove its old association.
                        oldGeneForExistingGeneProduct = geneDao.thaw(oldGeneForExistingGeneProduct);
                        oldGeneForExistingGeneProduct.getProducts().remove(existingGeneProduct);
                        log.info("Switch: Removing " + existingGeneProduct + " from " + oldGeneForExistingGeneProduct + " GI=" + existingGeneProduct.getNcbiGi());
                        geneDao.update(oldGeneForExistingGeneProduct);
                        if (oldGeneForExistingGeneProduct.getProducts().isEmpty()) {
                            AbstractPersister.log.warn("Gene has no products left after removing that gene product (but it might change later): " + oldGeneForExistingGeneProduct);
                            /*
                             * On occasion, we run into problems with sequences that have two different NCBI GI
                             * IDs (due to an update) and which is also associated with two genes - almost
                             * always in Drosophila. A recent example was GenBank: BT099970, which had the GI
                             * 289666832 but after an update was GI 1108657489 associated with both Lcp65Ab1 and
                             * Lcp65Ab2 in gene2accession. It's proven hard to track down exactly how to fix this as
                             * the failure happens at the transaction flush - but using --restart seems to fix it.
                             */
                        }
                    }
                    // NOTE(review): when the old gene equals geneInfo nothing was removed above, so with
                    // assertions enabled this could fail if the old gene still holds the product - confirm.
                    assert !oldGeneForExistingGeneProduct.getProducts().contains(existingGeneProduct);
                } else {
                    AbstractPersister.log.info("Attaching orphaned gene product to " + existingGene + " : " + existingGeneProduct);
                }
                // In both cases the product ends up attached to the gene being updated.
                existingGeneProduct.setGene(existingGene);
                existingGene.getProducts().add(existingGeneProduct);
                assert existingGeneProduct.getGene().equals(existingGene);
                this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
            }
        }
        if (newGeneProductInfo.getNcbiGi() != null)
            usedGIs.put(newGeneProductInfo.getNcbiGi(), newGeneProductInfo);
    }
    // Products to delete are only computed when at least one incoming product carried a GI.
    Collection<GeneProduct> toRemove = new HashSet<>();
    if (!usedGIs.isEmpty()) {
        toRemove = this.handleGeneProductChangedGIs(existingGene, usedGIs);
    }
    // The gene is updated first; removal of the collected products happens afterwards.
    geneDao.update(existingGene);
    if (!toRemove.isEmpty()) {
        this.removeGeneProducts(toRemove);
    }
    if (existingGene.getProducts().isEmpty()) {
        AbstractPersister.log.debug("No products left for: " + existingGene);
    }
    return existingGene;
}
Also used : HashMap(java.util.HashMap) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) HashSet(java.util.HashSet)

Example 34 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class Gene method computeHashCode.

/**
 * Computes this gene's hash code.
 * <p>
 * When an NCBI gene id is present the hash is derived from it alone. Otherwise the hash sums the official
 * symbol and taxon (when set), then exactly one of official name, physical location or the first gene product
 * (in that order of preference), and finally the superclass hash code.
 *
 * @return the hash code
 */
private int computeHashCode() {
    final int seed = 29;
    // The NCBI id, when available, is the only contributor.
    if (this.getNcbiGeneId() != null) {
        return seed + this.getNcbiGeneId().hashCode();
    }
    int result = seed;
    result += this.getOfficialSymbol() == null ? 0 : this.getOfficialSymbol().hashCode();
    result += this.getTaxon() == null ? 0 : this.getTaxon().hashCode();
    // Only the first available of these three contributes.
    if (this.getOfficialName() != null) {
        result += this.getOfficialName().hashCode();
    } else if (this.getPhysicalLocation() != null) {
        result += this.getPhysicalLocation().hashCode();
    } else if (this.getProducts() != null && !this.getProducts().isEmpty()) {
        result += this.getProducts().iterator().next().hashCode();
    }
    return result + super.hashCode();
}
Also used : GeneProduct(ubic.gemma.model.genome.gene.GeneProduct)

Example 35 with GeneProduct

use of ubic.gemma.model.genome.gene.GeneProduct in project Gemma by PavlidisLab.

the class GoldenPathSequenceAnalysis method findClosestGene.

/**
 * Given a location, find the nearest gene on the same strand, including only "known", "refseq" or "ensembl"
 * transcripts.
 * <p>
 * The search runs in up to five rounds. Round 0 looks inside the query region only; each later round widens the
 * search window by an equal step on <em>both</em> sides of the query. The first round that yields a gene product
 * with a usable location ends the search, and the product whose span lies closest to the query region (distance
 * 0 if they overlap) determines the returned gene. Ties go to the first product encountered.
 *
 * @param chromosome chromosome
 * @param queryStart start
 * @param queryEnd end
 * @param strand Either '+' or '-'
 * @param maxWindow the number of bases on each side to look, at most, in addition to looking inside the given
 *        region.
 * @return the Gene closest to the given location, or null if none was found in any round. This is a transient
 *         instance, not from Gemma's database.
 * @throws IllegalArgumentException if {@code queryEnd} is less than {@code queryStart}
 */
public Gene findClosestGene(String chromosome, Long queryStart, Long queryEnd, String strand, int maxWindow) {
    if (queryEnd < queryStart) {
        throw new IllegalArgumentException("End must not be less than start");
    }
    int numRounds = 5;
    int increment = (int) (maxWindow / (double) numRounds);
    for (int round = 0; round < numRounds; round++) {
        long padding = (long) round * increment;
        // Widen the window on both sides of the query; previously both edges were shifted to the right, so
        // nothing upstream of the query start was ever searched.
        long left = Math.max(0L, queryStart - padding);
        long right = queryEnd + padding;
        Collection<GeneProduct> geneProducts = this.findRefGenesByLocation(chromosome, left, right, strand);
        geneProducts.addAll(this.findKnownGenesByLocation(chromosome, left, right, strand));
        Gene nearest = null;
        long closestSoFar = Long.MAX_VALUE;
        for (GeneProduct geneProduct : geneProducts) {
            PhysicalLocation gpl = geneProduct.getPhysicalLocation();
            if (gpl == null || gpl.getNucleotide() == null) {
                // Cannot place this product on the chromosome, so it cannot be ranked.
                continue;
            }
            long start = gpl.getNucleotide();
            long end = start + (gpl.getNucleotideLength() == null ? 0 : gpl.getNucleotideLength());
            // Distance between the product and the query region itself (not the padded window); 0 on overlap.
            // Kept as a long to avoid overflow when narrowing to int.
            long gap = Math.max(0L, Math.max(start - queryEnd, queryStart - end));
            if (gap < closestSoFar) {
                closestSoFar = gap;
                nearest = geneProduct.getGene();
            }
        }
        if (nearest != null) {
            return nearest;
        }
    }
    return null;
}
Also used : BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) Gene(ubic.gemma.model.genome.Gene) PhysicalLocation(ubic.gemma.model.genome.PhysicalLocation)

Aggregations

GeneProduct (ubic.gemma.model.genome.gene.GeneProduct): 41; Gene (ubic.gemma.model.genome.Gene): 20; HashSet (java.util.HashSet): 16; BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct): 12; DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry): 8; BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation): 8; Test (org.junit.Test): 6; BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest): 5; BioSequence (ubic.gemma.model.genome.biosequence.BioSequence): 5; AnnotationAssociation (ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation): 5; HashMap (java.util.HashMap): 4; PhysicalLocation (ubic.gemma.model.genome.PhysicalLocation): 4; Criteria (org.hibernate.Criteria): 3; CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence): 3; IOException (java.io.IOException): 2; ArrayList (java.util.ArrayList): 2; Collection (java.util.Collection): 2; GeneProductValueObject (ubic.gemma.model.genome.gene.GeneProductValueObject): 2; BufferedReader (java.io.BufferedReader): 1; FileReader (java.io.FileReader): 1