Search in sources :

Example 31 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class GenomePersister method updateGene.

/**
 * Update an existing gene in place using the information carried by a freshly loaded, non-persistent gene, and
 * save it through {@code geneDao.update}.
 * <p>
 * What gets copied:
 * <ul>
 * <li>NCBI gene id: if it differs, the old id is stored as the previous NCBI id and the new id is taken.</li>
 * <li>Ensembl id: only when the new value is not blank (existing data is never wiped).</li>
 * <li>Accessions: entries whose accession string is not already present are added.</li>
 * <li>Name, description, official name/symbol and physical location: overwritten unconditionally.</li>
 * <li>Aliases: replaced wholesale.</li>
 * <li>Gene products: matched by name, then by NCBI GI, then via {@code geneProductDao.find}; matched products
 * are updated, unmatched ones are added, and products found on another gene are moved to this one.</li>
 * </ul>
 *
 * @param existingGene the gene to update; it is modified in place
 * @param newGeneInfo  the non-persistent gene we are copying information from
 * @return {@code existingGene}, after it has been updated
 * @throws IllegalStateException if the NCBI id changed and {@code newGeneInfo} lists previous NCBI ids, none of
 *                               which equals the id currently recorded on {@code existingGene}
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Gene updateGene(Gene existingGene, Gene newGeneInfo) {
    // NCBI id can be null if gene has been loaded from a gene info file.
    Integer existingNcbiId = existingGene.getNcbiGeneId();
    if (existingNcbiId != null && !existingNcbiId.equals(newGeneInfo.getNcbiGeneId())) {
        AbstractPersister.log.info("NCBI ID Change for " + existingGene + ", new id =" + newGeneInfo.getNcbiGeneId());
        // Computed once: used for the previous-id comparison and for the swap below.
        String existingNcbiIdString = existingNcbiId.toString();
        String previousIdString = newGeneInfo.getPreviousNcbiId();
        if (StringUtils.isNotBlank(previousIdString)) {
            /*
             * Unfortunately, we need to check multiple 'previous' genes. The example I have run across is MTUS2-AS1
             * (human) which was created by merging two previous genes, LOC728437 and LOC731614; only the former was
             * in Gemma with its gene product GI:22268051. It also has a product we don't have, GI:14676690. This
             * comma-delimited set thing is a hack.
             */
            String[] previousIds = StringUtils.split(previousIdString, ",");
            boolean found = false;
            for (String previousId : previousIds) {
                if (previousId.equals(existingNcbiIdString)) {
                    found = true;
                    // One match is enough; no need to scan the remaining ids.
                    break;
                }
            }
            if (!found) {
                throw new IllegalStateException("The NCBI ID for " + newGeneInfo + " has changed and the previous NCBI id on record with NCBI (" + previousIdString + ") doesn't match.");
            }
        }
        // swap
        existingGene.setPreviousNcbiId(existingNcbiIdString);
        existingGene.setNcbiGeneId(newGeneInfo.getNcbiGeneId());
        /*
         * Note: On occasion, we have two genes with the same symbol but different NCBI ids. This happens when NCBI
         * screws up somehow (?) and has two records for the same gene with different IDs, and we end up with them
         * both at the time they were considered separate genes. At some later date NCBI decides to (in effect)
         * merge them, so one of the genes has to be deprecated. Such 'relics' are deleted by the DAO, because it
         * results in more than one gene being found.
         */
    }
    /*
     * We might want to change this behaviour to clear the value if the updated one has none. For now I just want to
     * avoid wiping data.
     */
    if (StringUtils.isNotBlank(newGeneInfo.getEnsemblId())) {
        existingGene.setEnsemblId(newGeneInfo.getEnsemblId());
    }
    // We assume the taxon hasn't changed.
    // Index the accessions already on the gene so only genuinely new ones are added.
    Map<String, DatabaseEntry> updatedacMap = new HashMap<>();
    for (DatabaseEntry de : existingGene.getAccessions()) {
        updatedacMap.put(de.getAccession(), de);
    }
    for (DatabaseEntry de : newGeneInfo.getAccessions()) {
        if (!updatedacMap.containsKey(de.getAccession())) {
            this.fillInDatabaseEntry(de);
            existingGene.getAccessions().add(de);
        }
    }
    existingGene.setName(newGeneInfo.getName());
    existingGene.setDescription(newGeneInfo.getDescription());
    existingGene.setOfficialName(newGeneInfo.getOfficialName());
    existingGene.setOfficialSymbol(newGeneInfo.getOfficialSymbol());
    existingGene.setPhysicalLocation(newGeneInfo.getPhysicalLocation());
    this.fillChromosomeLocationAssociations(existingGene.getPhysicalLocation(), existingGene.getTaxon());
    existingGene.getAliases().clear();
    existingGene.getAliases().addAll(newGeneInfo.getAliases());
    /*
     * This is the only tricky part - the gene products. We update them if they are already there, and add them if
     * not. We do not normally remove 'old' ones that the new gene instance does not have, because they might be
     * from different sources. For example, Ensembl or GoldenPath. -- UNLESS the product has an NCBI GI because we
     * know those come from NCBI.
     */
    // Existing products indexed by both name and NCBI GI. Null keys are deliberately skipped: HashMap accepts a
    // null key, so a product lacking a GI (or name) would otherwise "match" any incoming product that also
    // lacks one, and unrelated products would be merged.
    Map<String, GeneProduct> updatedGpMap = new HashMap<>();
    for (GeneProduct existingGp : existingGene.getProducts()) {
        if (existingGp.getName() != null) {
            updatedGpMap.put(existingGp.getName(), existingGp);
        }
        if (existingGp.getNcbiGi() != null) {
            updatedGpMap.put(existingGp.getNcbiGi(), existingGp);
        }
    }
    // GIs seen on the incoming products; handed to handleGeneProductChangedGIs after the loop.
    Map<String, GeneProduct> usedGIs = new HashMap<>();
    for (GeneProduct newGeneProductInfo : newGeneInfo.getProducts()) {
        if (updatedGpMap.containsKey(newGeneProductInfo.getName())) {
            AbstractPersister.log.debug("Updating gene product based on name: " + newGeneProductInfo);
            GeneProduct existingGeneProduct = updatedGpMap.get(newGeneProductInfo.getName());
            this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
        } else if (updatedGpMap.containsKey(newGeneProductInfo.getNcbiGi())) {
            AbstractPersister.log.debug("Updating gene product based on GI: " + newGeneProductInfo);
            GeneProduct existingGeneProduct = updatedGpMap.get(newGeneProductInfo.getNcbiGi());
            this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
        } else {
            GeneProduct existingGeneProduct = geneProductDao.find(newGeneProductInfo);
            if (existingGeneProduct == null) {
                // it is, in fact, new, so far as we can tell.
                newGeneProductInfo.setGene(existingGene);
                this.fillInGeneProductAssociations(newGeneProductInfo);
                AbstractPersister.log.info("New product for " + existingGene + ": " + newGeneProductInfo);
                existingGene.getProducts().add(newGeneProductInfo);
            } else {
                /*
                 * This can only happen if this gene product is associated with a different gene. This generally
                 * happens when a transcript is associated with two genes in NCBI, so the switching is actually not
                 * useful to us, but we do it anyway to be consistent (and in case it really does matter). It is
                 * rare. Causes can be 1) bicistronic genes such as human LUZP6 and MTPN; 2) genome-duplicated
                 * genes; or 3) an error in the data source. The problem for us is at this point in processing, we
                 * don't know if the gene is going to get 'reattached' to its original gene.
                 */
                existingGeneProduct = geneProductDao.thaw(existingGeneProduct);
                Gene oldGeneForExistingGeneProduct = existingGeneProduct.getGene();
                if (oldGeneForExistingGeneProduct != null) {
                    // transient.
                    Gene geneInfo = newGeneProductInfo.getGene();
                    if (!oldGeneForExistingGeneProduct.equals(geneInfo)) {
                        AbstractPersister.log.warn("Switching gene product from one gene to another: " + existingGeneProduct + " switching to " + geneInfo + " (this can also happen if an mRNA is associated with two genes, which we don't allow, so we switch it arbitrarily)");
                        // Here we just remove its old association.
                        oldGeneForExistingGeneProduct = geneDao.thaw(oldGeneForExistingGeneProduct);
                        oldGeneForExistingGeneProduct.getProducts().remove(existingGeneProduct);
                        log.info("Switch: Removing " + existingGeneProduct + " from " + oldGeneForExistingGeneProduct + " GI=" + existingGeneProduct.getNcbiGi());
                        geneDao.update(oldGeneForExistingGeneProduct);
                        if (oldGeneForExistingGeneProduct.getProducts().isEmpty()) {
                            AbstractPersister.log.warn("Gene has no products left after removing that gene product (but it might change later): " + oldGeneForExistingGeneProduct);
                            /*
                             * On occasion, we run into problems with sequences that have two different NCBI GI
                             * IDs (due to an update) and which is also associated with two genes - almost
                             * always in Drosophila. A recent example was GenBank: BT099970, which had the GI
                             * 289666832 but after an update was GI 1108657489 associated with both Lcp65Ab1 and
                             * Lcp65Ab2 in gene2accession. It's proven hard to track down exactly how to fix this as
                             * the failure happens at the transaction flush - but using --restart seems to fix it.
                             */
                        }
                    }
                    assert !oldGeneForExistingGeneProduct.getProducts().contains(existingGeneProduct);
                } else {
                    AbstractPersister.log.info("Attaching orphaned gene product to " + existingGene + " : " + existingGeneProduct);
                }
                existingGeneProduct.setGene(existingGene);
                existingGene.getProducts().add(existingGeneProduct);
                assert existingGeneProduct.getGene().equals(existingGene);
                this.updateGeneProduct(existingGeneProduct, newGeneProductInfo);
            }
        }
        if (newGeneProductInfo.getNcbiGi() != null) {
            usedGIs.put(newGeneProductInfo.getNcbiGi(), newGeneProductInfo);
        }
    }
    Collection<GeneProduct> toRemove = new HashSet<>();
    if (!usedGIs.isEmpty()) {
        toRemove = this.handleGeneProductChangedGIs(existingGene, usedGIs);
    }
    // Save the gene first; the products flagged for removal are removed only after the update.
    geneDao.update(existingGene);
    if (!toRemove.isEmpty()) {
        this.removeGeneProducts(toRemove);
    }
    if (existingGene.getProducts().isEmpty()) {
        AbstractPersister.log.debug("No products left for: " + existingGene);
    }
    return existingGene;
}
Also used : HashMap(java.util.HashMap) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) BioSequence2GeneProduct(ubic.gemma.model.association.BioSequence2GeneProduct) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) HashSet(java.util.HashSet)

Example 32 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class GenomePersister method addAnyNewAccessions.

/**
 * Add to {@code existing} each accession of {@code geneProduct} whose accession string is not already among the
 * accessions of the (thawed) existing product. New entries are passed through {@code fillInDatabaseEntry} before
 * being added.
 *
 * @param existing    the gene product that receives the missing accessions
 * @param geneProduct the gene product supplying candidate accessions
 */
private void addAnyNewAccessions(GeneProduct existing, GeneProduct geneProduct) {
    existing = geneProductDao.thaw(existing);

    // Snapshot of the accession strings present before any additions are made.
    Map<String, DatabaseEntry> knownByAccession = new HashMap<>();
    for (DatabaseEntry known : existing.getAccessions()) {
        knownByAccession.put(known.getAccession(), known);
    }

    for (DatabaseEntry candidate : geneProduct.getAccessions()) {
        boolean alreadyPresent = knownByAccession.containsKey(candidate.getAccession());
        if (alreadyPresent) {
            continue;
        }
        this.fillInDatabaseEntry(candidate);
        existing.getAccessions().add(candidate);
    }
}
Also used : HashMap(java.util.HashMap) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry)

Example 33 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class GenericGenelistDesignGenerator method doWork.

/**
 * Build or refresh the 'generic' gene-list platform for the configured taxon.
 * <p>
 * The platform is looked up by its generated short name; if found, its gene product associations are deleted and
 * it is reloaded, otherwise it is created. Every gene of the taxon that has at least one product is then given one
 * element (named by NCBI id, Ensembl id or official symbol depending on the options), linked to a sequence for
 * the first usable RNA product, and an annotation association is created for that product/sequence pair. Finally
 * the platform report is regenerated, an audit event is added and existing annotation files are deleted.
 *
 * @param args command line arguments, passed to {@code processCommandLine}
 * @return the exception returned by command line processing, or {@code null} when the run completes
 */
@Override
protected Exception doWork(String[] args) {
    Exception exception = super.processCommandLine(args);
    if (exception != null) {
        return exception;
    }
    ExternalDatabase genbank = externalDatabaseService.findByName("Genbank");
    ExternalDatabase ensembl = externalDatabaseService.findByName("Ensembl");
    assert genbank != null;
    assert ensembl != null;
    /*
     * Create the stub array design for the organism. The name and etc. are generated automatically. If the design
     * exists, we update it.
     */
    String shortName = this.generateShortName();
    ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance();
    arrayDesign.setShortName(shortName);
    // common name
    arrayDesign.setPrimaryTaxon(taxon);
    String nameExt = useNCBIIds ? ", indexed by NCBI IDs" : useEnsemblIds ? ", indexed by Ensembl IDs" : "";
    arrayDesign.setName("Generic platform for " + taxon.getScientificName() + nameExt);
    arrayDesign.setDescription("Created by Gemma");
    // this is key
    arrayDesign.setTechnologyType(TechnologyType.NONE);
    // Look the platform up once and reuse the result rather than querying twice for the same stub.
    ArrayDesign existingDesign = arrayDesignService.find(arrayDesign);
    if (existingDesign != null) {
        AbstractCLI.log.info("Platform for " + taxon + " already exists, will update");
        arrayDesign = existingDesign;
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        arrayDesign = arrayDesignService.load(arrayDesign.getId());
    } else {
        AbstractCLI.log.info("Creating new 'generic' platform");
        arrayDesign = arrayDesignService.create(arrayDesign);
    }
    arrayDesign = arrayDesignService.thaw(arrayDesign);
    // temporary: making sure we set it, as it is new.
    arrayDesign.setTechnologyType(TechnologyType.NONE);
    /*
     * Load up the genes for the organism.
     */
    Collection<Gene> knownGenes = geneService.loadAll(taxon);
    AbstractCLI.log.info("Taxon has " + knownGenes.size() + " genes");
    // this would be good for cases where the identifier we are using has changed.
    Map<Gene, CompositeSequence> existingGeneMap = new HashMap<>();
    if (!useNCBIIds && !useEnsemblIds) {
        // only using this for symbol changes.
        existingGeneMap = this.getExistingGeneMap(arrayDesign);
    }
    Map<String, CompositeSequence> existingSymbolMap = this.getExistingProbeNameMap(arrayDesign);
    int count = 0;
    int numWithNoTranscript = 0;
    // int hasGeneAlready = 0;
    // int numNewGenes = 0;
    int numNewElements = 0;
    int numUpdatedElements = 0;
    for (Gene gene : knownGenes) {
        gene = geneService.thaw(gene);
        Collection<GeneProduct> products = gene.getProducts();
        if (products.isEmpty()) {
            numWithNoTranscript++;
            AbstractCLI.log.debug("No transcript for " + gene);
            continue;
        }
        count++;
        // Find the element already representing this gene on the platform, if any.
        CompositeSequence csForGene = null;
        if (useNCBIIds) {
            if (gene.getNcbiGeneId() == null) {
                AbstractCLI.log.debug("No NCBI ID for " + gene + ", skipping");
                continue;
            }
            if (existingSymbolMap.containsKey(gene.getNcbiGeneId().toString())) {
                csForGene = existingSymbolMap.get(gene.getNcbiGeneId().toString());
            }
        } else if (useEnsemblIds) {
            if (gene.getEnsemblId() == null) {
                AbstractCLI.log.debug("No Ensembl ID for " + gene + ", skipping");
                continue;
            }
            if (existingSymbolMap.containsKey(gene.getEnsemblId())) {
                csForGene = existingSymbolMap.get(gene.getEnsemblId());
            }
        } else {
            /*
             * detect when the symbol has changed
             */
            if (existingSymbolMap.containsKey(gene.getOfficialSymbol())) {
                csForGene = existingSymbolMap.get(gene.getOfficialSymbol());
            } else if (existingGeneMap.containsKey(gene)) {
                csForGene = existingGeneMap.get(gene);
                AbstractCLI.log.debug("Gene symbol has changed for: " + gene + "? Current element has name=" + csForGene.getName());
                csForGene.setName(gene.getOfficialSymbol());
            }
        }
        assert csForGene == null || csForGene.getId() != null : "Null id for " + csForGene;
        /*
         * We arbitrarily link the "probe" to one of the gene's RNA transcripts. We could consider other strategies
         * to pick the representative, but it generally doesn't matter.
         */
        for (GeneProduct geneProduct : products) {
            if (!GeneProductType.RNA.equals(geneProduct.getType())) {
                continue;
            }
            /*
             * Name is usually the genbank or ensembl accession
             */
            String name = geneProduct.getName();
            BioSequence bioSequence = BioSequence.Factory.newInstance();
            Collection<DatabaseEntry> accessions = geneProduct.getAccessions();
            bioSequence.setName(name);
            bioSequence.setTaxon(taxon);
            bioSequence.setPolymerType(PolymerType.RNA);
            bioSequence.setType(SequenceType.mRNA);
            BioSequence existing = null;
            if (accessions.isEmpty()) {
                // this should not be hit.
                AbstractCLI.log.warn("No accession for " + name);
                DatabaseEntry de = DatabaseEntry.Factory.newInstance();
                de.setAccession(name);
                if (name.startsWith("ENS") && name.length() > 10) {
                    de.setExternalDatabase(ensembl);
                } else {
                    if (name.matches("^[A-Z]{1,2}(_?)[0-9]+(\\.[0-9]+)?$")) {
                        de.setExternalDatabase(genbank);
                    } else {
                        AbstractCLI.log.info("Name doesn't look like genbank or ensembl, skipping: " + name);
                        continue;
                    }
                }
                bioSequence.setSequenceDatabaseEntry(de);
            } else {
                // Fetch the first accession once so the entry set on the sequence is the one searched for.
                DatabaseEntry firstAccession = accessions.iterator().next();
                bioSequence.setSequenceDatabaseEntry(firstAccession);
                existing = bioSequenceService.findByAccession(firstAccession);
                // FIXME It is possible that this sequence will have been aligned to the genome, which is a bit
                // confusing. So it will map to a gene. Worse case: it maps to more than one gene ...
            }
            if (existing == null) {
                bioSequence = (BioSequence) this.getPersisterHelper().persist(bioSequence);
            } else {
                bioSequence = existing;
            }
            assert bioSequence != null && bioSequence.getId() != null;
            if (bioSequence.getSequenceDatabaseEntry() == null) {
                AbstractCLI.log.info("No DB entry for " + bioSequence + "(" + gene + "), will look for a better sequence to use ...");
                continue;
            }
            if (csForGene == null) {
                if (AbstractCLI.log.isDebugEnabled()) {
                    AbstractCLI.log.debug("New element " + " with " + bioSequence + " for " + gene);
                }
                // Resolve the element name BEFORE instantiating, so that bailing out here cannot leave a
                // transient, id-less element in csForGene for the next product to "update".
                String elementName;
                if (useNCBIIds) {
                    if (gene.getNcbiGeneId() == null) {
                        continue;
                    }
                    elementName = gene.getNcbiGeneId().toString();
                } else if (useEnsemblIds) {
                    if (gene.getEnsemblId() == null) {
                        continue;
                    }
                    elementName = gene.getEnsemblId();
                } else {
                    elementName = gene.getOfficialSymbol();
                }
                csForGene = CompositeSequence.Factory.newInstance();
                csForGene.setName(elementName);
                csForGene.setArrayDesign(arrayDesign);
                csForGene.setBiologicalCharacteristic(bioSequence);
                csForGene.setDescription("Generic expression element for " + gene);
                csForGene = compositeSequenceService.create(csForGene);
                assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                arrayDesign.getCompositeSequences().add(csForGene);
                numNewElements++;
            } else {
                if (AbstractCLI.log.isDebugEnabled()) {
                    AbstractCLI.log.debug("Updating existing element: " + csForGene + " with " + bioSequence + " for " + gene);
                }
                csForGene.setArrayDesign(arrayDesign);
                csForGene.setBiologicalCharacteristic(bioSequence);
                csForGene.setDescription("Generic expression element for " + gene);
                assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                compositeSequenceService.update(csForGene);
                // making sure ...
                csForGene = compositeSequenceService.load(csForGene.getId());
                assert csForGene.getId() != null;
                arrayDesign.getCompositeSequences().add(csForGene);
                numUpdatedElements++;
            }
            assert bioSequence.getId() != null;
            assert geneProduct.getId() != null;
            assert csForGene.getBiologicalCharacteristic() != null && csForGene.getBiologicalCharacteristic().getId() != null;
            AnnotationAssociation aa = AnnotationAssociation.Factory.newInstance();
            aa.setGeneProduct(geneProduct);
            aa.setBioSequence(bioSequence);
            annotationAssociationService.create(aa);
            // One representative transcript per gene is enough.
            break;
        }
        if (count % 100 == 0) {
            AbstractCLI.log.info(count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
        }
    }
    // is this necessary? causes an error sometimes.
    // arrayDesignService.update( arrayDesign );
    AbstractCLI.log.info("Array design has " + arrayDesignService.numCompositeSequenceWithGenes(arrayDesign) + " 'probes' associated with genes.");
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    auditTrailService.addUpdateEvent(arrayDesign, AnnotationBasedGeneMappingEvent.Factory.newInstance(), count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
    arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);
    AbstractCLI.log.info("Don't forget to update the annotation files");
    return null;
}
Also used : AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) ExternalDatabase(ubic.gemma.model.common.description.ExternalDatabase) Gene(ubic.gemma.model.genome.Gene)

Example 34 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class LoadExpressionDataCli method removeIfExists.

/**
 * Remove every experiment that {@code eeService.findByAccession} returns for the given accession in the external
 * database named "GEO". Does nothing when no experiment is found.
 *
 * @param accession accession of the experiment(s) to remove
 */
private void removeIfExists(String accession) {
    // Build a transient lookup key: the accession qualified by a database named "GEO".
    DatabaseEntry lookupEntry = DatabaseEntry.Factory.newInstance();
    lookupEntry.setAccession(accession);
    ExternalDatabase geoDatabase = ExternalDatabase.Factory.newInstance();
    geoDatabase.setName("GEO");
    lookupEntry.setExternalDatabase(geoDatabase);

    Collection<ExpressionExperiment> matches = eeService.findByAccession(lookupEntry);
    if (matches.isEmpty()) {
        return;
    }

    AbstractCLI.log.info("Deleting existing version of " + accession);
    for (ExpressionExperiment match : matches) {
        eeService.remove(match);
    }
}
Also used : ExternalDatabase(ubic.gemma.model.common.description.ExternalDatabase) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment)

Example 35 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class ArrayDesignControllerImpl method setExtRefsAndCounts.

/**
 * Populate the given value object with the platform's external references (wrapped as value objects), its
 * design element count and its expression experiment count.
 *
 * @param result      value object to populate; also the return value
 * @param arrayDesign platform the references and counts are read from
 * @return {@code result}
 */
private ArrayDesignValueObjectExt setExtRefsAndCounts(ArrayDesignValueObjectExt result, ArrayDesign arrayDesign) {
    Integer elementCount = arrayDesignService.getCompositeSequenceCount(arrayDesign).intValue();
    int experimentCount = arrayDesignService.numExperiments(arrayDesign);

    // Wrap each external reference in its value-object form.
    Collection<DatabaseEntryValueObject> referenceVos = new HashSet<>();
    for (DatabaseEntry reference : arrayDesign.getExternalReferences()) {
        DatabaseEntryValueObject referenceVo = new DatabaseEntryValueObject(reference);
        referenceVos.add(referenceVo);
    }

    result.setExternalReferences(referenceVos);
    result.setDesignElementCount(elementCount);
    result.setExpressionExperimentCount(experimentCount);
    return result;
}
Also used : DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) DatabaseEntryValueObject(ubic.gemma.model.common.description.DatabaseEntryValueObject)

Aggregations

DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)37 ExternalDatabase (ubic.gemma.model.common.description.ExternalDatabase)11 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)8 HashSet (java.util.HashSet)6 Test (org.junit.Test)6 ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment)6 Gene (ubic.gemma.model.genome.Gene)6 BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)6 Taxon (ubic.gemma.model.genome.Taxon)5 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)4 BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct)4 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)4 HashMap (java.util.HashMap)3 BibliographicReference (ubic.gemma.model.common.description.BibliographicReference)3 AnnotationAssociation (ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Before (org.junit.Before)2 AlreadyExistsInSystemException (ubic.gemma.core.loader.util.AlreadyExistsInSystemException)2 BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay)2