Search in sources :

Example 91 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GenomePersister method persistOrUpdateBioSequence.

private BioSequence persistOrUpdateBioSequence(BioSequence bioSequence) {
    if (bioSequence == null)
        return null;
    /*
         * Note that this method is only really used by the ArrayDesignSequencePersister: it's for filling in
         * information about probes on arrays.
         */
    BioSequence existingBioSequence = bioSequenceDao.find(bioSequence);
    if (existingBioSequence == null) {
        if (AbstractPersister.log.isDebugEnabled())
            AbstractPersister.log.debug("Creating new: " + bioSequence);
        return this.persistNewBioSequence(bioSequence);
    }
    if (AbstractPersister.log.isDebugEnabled())
        AbstractPersister.log.debug("Found existing: " + existingBioSequence);
    // the sequence is the main field we might update.
    if (bioSequence.getSequence() != null && !bioSequence.getSequence().equals(existingBioSequence.getSequence())) {
        existingBioSequence.setSequence(bioSequence.getSequence());
    }
    /*
         * Can do for all fields that might not be the same: anything besides the name and taxon.
         */
    if (bioSequence.getDescription() != null && !bioSequence.getDescription().equals(existingBioSequence.getDescription())) {
        existingBioSequence.setDescription(bioSequence.getDescription());
    }
    if (bioSequence.getType() != null && !bioSequence.getType().equals(existingBioSequence.getType())) {
        existingBioSequence.setType(bioSequence.getType());
    }
    if (bioSequence.getFractionRepeats() != null && !bioSequence.getFractionRepeats().equals(existingBioSequence.getFractionRepeats())) {
        existingBioSequence.setFractionRepeats(bioSequence.getFractionRepeats());
    }
    if (bioSequence.getLength() != null && !bioSequence.getLength().equals(existingBioSequence.getLength())) {
        existingBioSequence.setLength(bioSequence.getLength());
    }
    if (bioSequence.getIsCircular() != null && !bioSequence.getIsCircular().equals(existingBioSequence.getIsCircular())) {
        existingBioSequence.setIsCircular(bioSequence.getIsCircular());
    }
    if (bioSequence.getPolymerType() != null && !bioSequence.getPolymerType().equals(existingBioSequence.getPolymerType())) {
        existingBioSequence.setPolymerType(bioSequence.getPolymerType());
    }
    if (bioSequence.getSequenceDatabaseEntry() != null && !bioSequence.getSequenceDatabaseEntry().equals(existingBioSequence.getSequenceDatabaseEntry())) {
        existingBioSequence.setSequenceDatabaseEntry((DatabaseEntry) this.persist(bioSequence.getSequenceDatabaseEntry()));
    }
    bioSequenceDao.update(existingBioSequence);
    return existingBioSequence;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 92 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class MatrixWriter method writeJSON.

public void writeJSON(Writer writer, ExpressionDataMatrix<?> matrix) throws IOException {
    int columns = matrix.columns();
    int rows = matrix.rows();
    StringBuilder buf = new StringBuilder();
    buf.append("{ 'numRows' : ").append(matrix.rows()).append(", 'rows': ");
    buf.append("[");
    for (int j = 0; j < rows; j++) {
        if (j > 0)
            buf.append(",");
        buf.append("{");
        buf.append("'id' : \"").append(matrix.getDesignElementForRow(j).getName()).append("\"");
        BioSequence biologicalCharacteristic = matrix.getDesignElementForRow(j).getBiologicalCharacteristic();
        if (biologicalCharacteristic != null)
            buf.append(", 'sequence' : \"").append(biologicalCharacteristic.getName()).append("\"");
        buf.append(", 'data' : [");
        for (int i = 0; i < columns; i++) {
            Object val = matrix.get(j, i);
            if (i > 0)
                buf.append(",");
            buf.append(val);
        }
        buf.append("]}\n");
    }
    buf.append("\n]}");
    writer.write(buf.toString());
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 93 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class MatrixWriter method writeSequence.

private void writeSequence(boolean writeSequence, StringBuffer buf, CompositeSequence probeForRow) {
    if (writeSequence) {
        BioSequence biologicalCharacteristic = probeForRow.getBiologicalCharacteristic();
        if (biologicalCharacteristic != null)
            buf.append(biologicalCharacteristic.getName());
        buf.append("\t");
    }
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 94 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ShellDelegatingBlat method blatQuery.

@Override
public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, boolean sensitive, Taxon taxon) throws IOException {
    Map<BioSequence, Collection<BlatResult>> results = new HashMap<>();
    File querySequenceFile = File.createTempFile("sequences-for-blat", ".fa");
    int count = SequenceWriter.writeSequencesToFile(sequences, querySequenceFile);
    if (count == 0) {
        EntityUtils.deleteFile(querySequenceFile);
        throw new IllegalArgumentException("No sequences!");
    }
    String outputPath = this.getTmpPslFilePath("blat-output");
    Integer port = this.choosePortForQuery(taxon, sensitive);
    if (port == null) {
        throw new IllegalStateException("Could not locate port for BLAT with settings taxon=" + taxon + ", sensitive=" + sensitive + ", check your configuration.");
    }
    Collection<BlatResult> rawResults = this.gfClient(querySequenceFile, outputPath, port);
    ShellDelegatingBlat.log.info("Got " + rawResults.size() + " raw blat results");
    ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);
    for (BlatResult blatResult : rawResults) {
        blatResult.setSearchedDatabase(searchedDatabase);
        BioSequence query = blatResult.getQuerySequence();
        if (!results.containsKey(query)) {
            results.put(query, new HashSet<BlatResult>());
        }
        results.get(query).add(blatResult);
    }
    EntityUtils.deleteFile(querySequenceFile);
    return results;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) ExternalDatabase(ubic.gemma.model.common.description.ExternalDatabase) Collection(java.util.Collection) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 95 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GenericGenelistDesignGenerator method doWork.

@Override
protected Exception doWork(String[] args) {
    Exception exception = super.processCommandLine(args);
    if (exception != null) {
        return exception;
    }
    ExternalDatabase genbank = externalDatabaseService.findByName("Genbank");
    ExternalDatabase ensembl = externalDatabaseService.findByName("Ensembl");
    assert genbank != null;
    assert ensembl != null;
    /*
         * Create the stub array design for the organism. The name and etc. are generated automatically. If the design
         * exists, we update it.
         */
    String shortName = this.generateShortName();
    ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance();
    arrayDesign.setShortName(shortName);
    // common name
    arrayDesign.setPrimaryTaxon(taxon);
    String nameExt = useNCBIIds ? ", indexed by NCBI IDs" : useEnsemblIds ? ", indexed by Ensembl IDs" : "";
    arrayDesign.setName("Generic platform for " + taxon.getScientificName() + nameExt);
    arrayDesign.setDescription("Created by Gemma");
    // this is key
    arrayDesign.setTechnologyType(TechnologyType.NONE);
    if (arrayDesignService.find(arrayDesign) != null) {
        AbstractCLI.log.info("Platform for " + taxon + " already exists, will update");
        arrayDesign = arrayDesignService.find(arrayDesign);
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        arrayDesign = arrayDesignService.load(arrayDesign.getId());
    } else {
        AbstractCLI.log.info("Creating new 'generic' platform");
        arrayDesign = arrayDesignService.create(arrayDesign);
    }
    arrayDesign = arrayDesignService.thaw(arrayDesign);
    // temporary: making sure we set it, as it is new.
    arrayDesign.setTechnologyType(TechnologyType.NONE);
    /*
         * Load up the genes for the organism.
         */
    Collection<Gene> knownGenes = geneService.loadAll(taxon);
    AbstractCLI.log.info("Taxon has " + knownGenes.size() + " genes");
    // this would be good for cases where the identifier we are using has changed.
    Map<Gene, CompositeSequence> existingGeneMap = new HashMap<>();
    if (!useNCBIIds && !useEnsemblIds) {
        // only using this for symbol changes.
        existingGeneMap = this.getExistingGeneMap(arrayDesign);
    }
    Map<String, CompositeSequence> existingSymbolMap = this.getExistingProbeNameMap(arrayDesign);
    int count = 0;
    int numWithNoTranscript = 0;
    // int hasGeneAlready = 0;
    // int numNewGenes = 0;
    int numNewElements = 0;
    int numUpdatedElements = 0;
    for (Gene gene : knownGenes) {
        gene = geneService.thaw(gene);
        Collection<GeneProduct> products = gene.getProducts();
        if (products.isEmpty()) {
            numWithNoTranscript++;
            AbstractCLI.log.debug("No transcript for " + gene);
            continue;
        }
        count++;
        CompositeSequence csForGene = null;
        if (useNCBIIds) {
            if (gene.getNcbiGeneId() == null) {
                AbstractCLI.log.debug("No NCBI ID for " + gene + ", skipping");
                continue;
            }
            if (existingSymbolMap.containsKey(gene.getNcbiGeneId().toString())) {
                csForGene = existingSymbolMap.get(gene.getNcbiGeneId().toString());
            }
        } else if (useEnsemblIds) {
            if (gene.getEnsemblId() == null) {
                AbstractCLI.log.debug("No Ensembl ID for " + gene + ", skipping");
                continue;
            }
            if (existingSymbolMap.containsKey(gene.getEnsemblId())) {
                csForGene = existingSymbolMap.get(gene.getEnsemblId());
            }
        } else {
            /*
                 * detect when the symbol has changed
                 */
            if (existingSymbolMap.containsKey(gene.getOfficialSymbol())) {
                csForGene = existingSymbolMap.get(gene.getOfficialSymbol());
            } else if (existingGeneMap.containsKey(gene)) {
                csForGene = existingGeneMap.get(gene);
                AbstractCLI.log.debug("Gene symbol has changed for: " + gene + "? Current element has name=" + csForGene.getName());
                csForGene.setName(gene.getOfficialSymbol());
            }
        }
        assert csForGene == null || csForGene.getId() != null : "Null id for " + csForGene;
        /*
             * We arbitrarily link the "probe" to one of the gene's RNA transcripts. We could consider other strategies
             * to pick the representative, but it generally doesn't matter.
             */
        for (GeneProduct geneProduct : products) {
            if (!GeneProductType.RNA.equals(geneProduct.getType())) {
                continue;
            }
            /*
                 * Name is usually the genbank or ensembl accession
                 */
            String name = geneProduct.getName();
            BioSequence bioSequence = BioSequence.Factory.newInstance();
            Collection<DatabaseEntry> accessions = geneProduct.getAccessions();
            bioSequence.setName(name);
            bioSequence.setTaxon(taxon);
            bioSequence.setPolymerType(PolymerType.RNA);
            bioSequence.setType(SequenceType.mRNA);
            BioSequence existing = null;
            if (accessions.isEmpty()) {
                // this should not be hit.
                AbstractCLI.log.warn("No accession for " + name);
                DatabaseEntry de = DatabaseEntry.Factory.newInstance();
                de.setAccession(name);
                if (name.startsWith("ENS") && name.length() > 10) {
                    de.setExternalDatabase(ensembl);
                } else {
                    if (name.matches("^[A-Z]{1,2}(_?)[0-9]+(\\.[0-9]+)?$")) {
                        de.setExternalDatabase(genbank);
                    } else {
                        AbstractCLI.log.info("Name doesn't look like genbank or ensembl, skipping: " + name);
                        continue;
                    }
                }
                bioSequence.setSequenceDatabaseEntry(de);
            } else {
                bioSequence.setSequenceDatabaseEntry(accessions.iterator().next());
                existing = bioSequenceService.findByAccession(accessions.iterator().next());
            // FIXME It is possible that this sequence will have been aligned to the genome, which is a bit
            // confusing. So it will map to a gene. Worse case: it maps to more than one gene ...
            }
            if (existing == null) {
                bioSequence = (BioSequence) this.getPersisterHelper().persist(bioSequence);
            } else {
                bioSequence = existing;
            }
            assert bioSequence != null && bioSequence.getId() != null;
            if (bioSequence.getSequenceDatabaseEntry() == null) {
                AbstractCLI.log.info("No DB entry for " + bioSequence + "(" + gene + "), will look for a better sequence to use ...");
                continue;
            }
            if (csForGene == null) {
                if (AbstractCLI.log.isDebugEnabled())
                    AbstractCLI.log.debug("New element " + " with " + bioSequence + " for " + gene);
                csForGene = CompositeSequence.Factory.newInstance();
                if (useNCBIIds) {
                    if (gene.getNcbiGeneId() == null) {
                        continue;
                    }
                    csForGene.setName(gene.getNcbiGeneId().toString());
                } else if (useEnsemblIds) {
                    if (gene.getEnsemblId() == null) {
                        continue;
                    }
                    csForGene.setName(gene.getEnsemblId());
                } else {
                    csForGene.setName(gene.getOfficialSymbol());
                }
                csForGene.setArrayDesign(arrayDesign);
                csForGene.setBiologicalCharacteristic(bioSequence);
                csForGene.setDescription("Generic expression element for " + gene);
                csForGene = compositeSequenceService.create(csForGene);
                assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                arrayDesign.getCompositeSequences().add(csForGene);
                numNewElements++;
            } else {
                if (AbstractCLI.log.isDebugEnabled())
                    AbstractCLI.log.debug("Updating existing element: " + csForGene + " with " + bioSequence + " for " + gene);
                csForGene.setArrayDesign(arrayDesign);
                csForGene.setBiologicalCharacteristic(bioSequence);
                csForGene.setDescription("Generic expression element for " + gene);
                assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                compositeSequenceService.update(csForGene);
                // making sure ...
                csForGene = compositeSequenceService.load(csForGene.getId());
                assert csForGene.getId() != null;
                arrayDesign.getCompositeSequences().add(csForGene);
                numUpdatedElements++;
            }
            assert bioSequence.getId() != null;
            assert geneProduct.getId() != null;
            assert csForGene.getBiologicalCharacteristic() != null && csForGene.getBiologicalCharacteristic().getId() != null;
            AnnotationAssociation aa = AnnotationAssociation.Factory.newInstance();
            aa.setGeneProduct(geneProduct);
            aa.setBioSequence(bioSequence);
            annotationAssociationService.create(aa);
            break;
        }
        if (count % 100 == 0)
            AbstractCLI.log.info(count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
    }
    // is this necessary? causes an error sometimes.
    // arrayDesignService.update( arrayDesign );
    AbstractCLI.log.info("Array design has " + arrayDesignService.numCompositeSequenceWithGenes(arrayDesign) + " 'probes' associated with genes.");
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    auditTrailService.addUpdateEvent(arrayDesign, AnnotationBasedGeneMappingEvent.Factory.newInstance(), count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
    arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);
    AbstractCLI.log.info("Don't forget to update the annotation files");
    return null;
}
Also used : AnnotationAssociation(ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation) HashMap(java.util.HashMap) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) GeneProduct(ubic.gemma.model.genome.gene.GeneProduct) ExternalDatabase(ubic.gemma.model.common.description.ExternalDatabase) Gene(ubic.gemma.model.genome.Gene)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4