Search in sources :

Example 26 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class GeoServiceImpl method checkSamplesAreNew.

/**
 * Prunes from the series every sample that is already attached, under the same GEO accession, to a bioassay
 * known to the system. The situation is typified by the samples in GSE3193; in GEO it mostly arises with
 * 'superseries' that combine other series.
 * <p>
 * Each skipped sample is removed from the series, from the series' sample correspondence and from every subset
 * of the series' datasets. When anything was removed, a note is appended to the series summaries.
 *
 * @param series the GEO series to check; it is modified in place
 * @throws AlreadyExistsInSystemException if no samples remain after pruning
 * @throws IllegalStateException          if fewer than two samples remain after pruning
 */
private void checkSamplesAreNew(GeoSeries series) {
    Collection<GeoSample> alreadyLoaded = new HashSet<>();
    for (GeoSample candidate : series.getSamples()) {
        // Samples that belong to a single series are treated as new; only shared ones are looked up.
        if (candidate.appearsInMultipleSeries()) {
            String sampleId = candidate.getGeoAccession();
            for (BioAssay bioAssay : bioAssayService.findByAccession(sampleId)) {
                DatabaseEntry entry = bioAssay.getAccession();
                if (entry != null) {
                    boolean sameAccession = entry.getAccession().equals(sampleId);
                    // The accession must also come from the GEO external database to count as a match.
                    if (sameAccession && entry.getExternalDatabase().getName().equals(GeoServiceImpl.GEO_DB_NAME)) {
                        AbstractGeoService.log.debug(sampleId + " appears in an expression experiment already in the system, skipping");
                        alreadyLoaded.add(candidate);
                    }
                }
            }
        }
    }

    int removedCount = alreadyLoaded.size();
    boolean anyRemoved = !alreadyLoaded.isEmpty();
    if (anyRemoved) {
        AbstractGeoService.log.info("Found " + removedCount + " samples that are already in the system; they will be removed from the new set (example: " + alreadyLoaded.iterator().next().getGeoAccession() + ")");
    }

    // Detach the skipped samples from the series itself ...
    for (GeoSample skipped : alreadyLoaded) {
        series.getSamples().remove(skipped);
        series.getSampleCorrespondence().removeSample(skipped.getGeoAccession());
    }
    // ... and from every subset of every dataset of the series.
    for (GeoDataset dataset : series.getDatasets()) {
        for (GeoSubset subset : dataset.getSubsets()) {
            for (GeoSample skipped : alreadyLoaded) {
                subset.getSamples().remove(skipped);
            }
        }
    }

    // Keep a record of the removal in the series description.
    if (anyRemoved) {
        series.setSummaries(series.getSummaries()
                + "\nNote: " + removedCount
                + " samples from this series, which appear in other Expression Experiments in Gemma, "
                + "were not imported from the GEO source. The following samples were removed: "
                + StringUtils.join(alreadyLoaded, ","));
    }

    int remaining = series.getSamples().size();
    if (remaining == 0) {
        throw new AlreadyExistsInSystemException("All the samples in " + series + " are in the system already (in other ExpressionExperiments)");
    }
    // No lower limit on series size is set anywhere else; this is the only check.
    if (remaining < 2) {
        throw new IllegalStateException("After removing samples already in the system, this data set is too small to load: " + remaining + " left (removed " + removedCount + ")");
    }
    AbstractGeoService.log.info("Series now contains " + remaining + " (removed " + removedCount + ")");
}
Also used : DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) AlreadyExistsInSystemException(ubic.gemma.core.loader.util.AlreadyExistsInSystemException) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 27 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class GeoServiceImpl method checkForExisting.

/**
 * Verifies that none of the given accessions already matches an expression experiment in the system.
 *
 * @param projectedAccessions the accessions to look up; when null or empty nothing is checked
 * @throws AlreadyExistsInSystemException for the first accession that matches at least one existing experiment;
 *                                        the matching experiments are passed along with the exception
 */
private void checkForExisting(Collection<DatabaseEntry> projectedAccessions) {
    // Having nothing to check is acceptable: the input might have been a GPL.
    boolean nothingToCheck = projectedAccessions == null || projectedAccessions.isEmpty();
    if (nothingToCheck) {
        return;
    }
    for (DatabaseEntry accession : projectedAccessions) {
        Collection<ExpressionExperiment> matches = expressionExperimentService.findByAccession(accession);
        if (matches.isEmpty()) {
            continue;
        }
        String message = "There is already an expression experiment that matches " + accession.getAccession();
        AbstractGeoService.log.info(message);
        throw new AlreadyExistsInSystemException(message, matches);
    }
}
Also used : DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry) AlreadyExistsInSystemException(ubic.gemma.core.loader.util.AlreadyExistsInSystemException) ExpressionExperiment(ubic.gemma.model.expression.experiment.ExpressionExperiment)

Example 28 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class NcbiGeneConverter method getChromosomeDetails.

/**
 * Builds a whole-chromosome {@link BioSequence} from the genomic nucleotide accession carried by {@code acc} and
 * attaches it to the chromosome of the gene's physical location.
 * <p>
 * The sequence is named after the genomic nucleotide accession, takes the gene's taxon, and gets a
 * {@link DatabaseEntry} pointing at {@code NcbiGeneConverter.genBank} with the accession and its version.
 *
 * @param acc  source of the genomic nucleotide accession and accession version
 * @param gene gene whose physical location's chromosome receives the sequence; its physical location must not be
 *             null
 * @throws IllegalStateException if the chromosome's {@code sequence} field cannot be written
 */
private void getChromosomeDetails(NCBIGene2Accession acc, Gene gene) {
    Chromosome chrom = gene.getPhysicalLocation().getChromosome();

    BioSequence chromSeq = BioSequence.Factory.newInstance();
    chromSeq.setName(acc.getGenomicNucleotideAccession());
    chromSeq.setType(SequenceType.WHOLE_CHROMOSOME);
    chromSeq.setTaxon(gene.getTaxon());

    DatabaseEntry dbe = DatabaseEntry.Factory.newInstance();
    dbe.setExternalDatabase(NcbiGeneConverter.genBank);
    dbe.setAccession(acc.getGenomicNucleotideAccession());
    dbe.setAccessionVersion(acc.getGenomicNucleotideAccessionVersion());
    chromSeq.setSequenceDatabaseEntry(dbe);

    try {
        // The field is written reflectively with forced access
        // (presumably because Chromosome exposes no setter for it -- TODO confirm).
        FieldUtils.writeField(chrom, "sequence", chromSeq, true);
    } catch (IllegalAccessException e) {
        // Previously this was only printed to stderr, which left the chromosome silently without a sequence.
        throw new IllegalStateException("Could not set the sequence of the chromosome for genomic accession "
                + acc.getGenomicNucleotideAccession(), e);
    }
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Chromosome(ubic.gemma.model.genome.Chromosome) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry)

Example 29 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class FastaParser method parseDeflineHeader.

/**
 * Fills in the name (and, where available, the description and sequence database entry) of a sequence from a
 * defline-style FASTA header. The header is split on '|' and ';' and dispatched on its first field, with any
 * leading '&gt;' removed. The documented formats are:
 * <ul>
 * <li>GenBank: gi|gi-number|gb|accession|locus
 * <li>EMBL Data Library : gi|gi-number|emb|accession|locus
 * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus
 * <li>NBRF PIR : pir||entry
 * <li>Protein Research Foundation : prf||name
 * <li>SWISS-PROT : sp|accession|name
 * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain
 * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE
 * <li>Patents : pat|country|number
 * <li>GenInfo Backbone Id bbs|number
 * <li>General database identifier : gnl|database|identifier
 * <li>NCBI Reference Sequence : ref|accession|locus
 * <li>Local Sequence identifier : lcl|identifier
 * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: &gt;H4002F12-5 )
 * <li>Generic: probeid
 * </ul>
 * Tags without a dedicated branch (for example prf, pat and bbs) are handled by the generic case.
 *
 * @param bioSequence the sequence whose fields are populated from the header
 * @param header      the FASTA header line
 * @return the result of {@code parseNIA} for NIA-style headers; true in every other case
 */
private boolean parseDeflineHeader(BioSequence bioSequence, String header) {
    // Empty fields are preserved so that positions stay aligned with the formats listed above.
    String[] fields = StringUtils.splitPreserveAllTokens(header, "|;");
    String tag = StringUtils.removeStart(fields[0], ">");

    switch (tag) {
        case "gi": {
            // One of the GenBank-style formats: description is the fifth field, accession the fourth.
            bioSequence.setDescription(fields[4]);
            // The accession in the header may carry a version number.
            DatabaseEntry genbankEntry = ExternalDatabaseUtils.getGenbankAccession(fields[3]);
            // The name is the accession as stored on the entry (presumably without the version -- confirm).
            bioSequence.setName(genbankEntry.getAccession());
            bioSequence.setSequenceDatabaseEntry(genbankEntry);
            break;
        }
        case "pir":
        case "lcl":
        case "entry:chain":
            bioSequence.setName(fields[1]);
            break;
        case "sp":
        case "ref":
        case "pdb":
            bioSequence.setName(fields[1]);
            bioSequence.setDescription(fields[2]);
            break;
        case "gnl":
            bioSequence.setName(fields[2]);
            break;
        default:
            if (tag.matches(FastaParser.NIA_HEADER_REGEX)) {
                return this.parseNIA(bioSequence, header);
            }
            // Generic header: unrecognized formats are accepted rather than rejected. The name is the raw
            // first field, i.e. it is not stripped of a leading '>'.
            bioSequence.setName(fields[0]);
            if (fields.length > 1) {
                bioSequence.setDescription(fields[1]);
            }
            break;
    }
    return true;
}
Also used : DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry)

Example 30 with DatabaseEntry

use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.

the class FastaParser method parseAffyHeader.

/**
 * Populates a sequence from an Affymetrix-style FASTA header. The header is split on ':' and ';' and dispatched
 * on its first field, with any leading '&gt;' removed. The documented formats are:
 *
 * <pre>
 *        Affymetrix targets or collapsed sequence     target:array:probeset;
 *        Affymetrix &quot;style&quot; file            target:probename
 *        Affymetrix probe                             probe:array:probeset:xcoord:ycoord; Interrogation_Position=XXXX; Antisense;
 *        Affymetrix consensus/exemplar                exemplar:array:probeset; gb|accession; gb:accession /DEF=Homo sapiens metalloprotease-like, disintegrin-like, cysteine-rich protein 2 delta (ADAM22) mRNA, alternative splice product, complete cds.  /FEA=mRNA /GEN=ADAM22 /PROD=metalloprotease-like,
 *        Affymetrix-like format                       array:probe or other string containing ':'.
 * </pre>
 *
 * For the probe, target and exemplar formats, the first field that carries a GenBank accession (if any)
 * overrides the name and supplies the sequence database entry.
 *
 * @param bioSequence the sequence whose fields are populated from the header
 * @param header      the FASTA header line
 * @return always true
 */
// The boolean return is kept for consistency with the other header-parsing methods.
@SuppressWarnings("SameReturnValue")
private boolean parseAffyHeader(BioSequence bioSequence, String header) {
    String[] fields = StringUtils.split(header, ":;");
    String tag = StringUtils.removeStart(fields[0], ">");

    switch (tag) {
        case "probe":
            // array:probeset:xcoord:ycoord
            bioSequence.setName(fields[1] + ":" + fields[2] + ":" + fields[3] + ":" + fields[4]);
            break;
        case "target":
            // With three or more fields the third one is the probe name; otherwise the second one is used.
            bioSequence.setName(fields.length > 2 ? fields[2] : fields[1]);
            break;
        case "exemplar":
            bioSequence.setName(fields[1] + ":" + fields[2]);
            bioSequence.setDescription(fields[3]);
            break;
        default:
            // An xxxx:xxxx header used on a non-Affymetrix platform: the whole header is the name.
            bioSequence.setName(StringUtils.removeStart(header, ">"));
            return true;
    }

    // Look for the first field holding a GenBank accession and use it for the sequence database entry.
    for (int i = 0; i < fields.length; i++) {
        String token = StringUtils.strip(fields[i]);
        // NOTE(review): the fields come from a split on ':' and ';', so a token cannot contain ':' and the
        // "gb:" prefix looks unreachable -- confirm.
        if (!token.startsWith("gb|") && !token.startsWith("gb:")) {
            continue;
        }
        String genbankAcc = StringUtils.split(token, ":|")[1];
        DatabaseEntry genbankEntry = ExternalDatabaseUtils.getGenbankAccession(genbankAcc);
        bioSequence.setName(genbankEntry.getAccession());
        bioSequence.setSequenceDatabaseEntry(genbankEntry);
        if (RecordParser.log.isDebugEnabled()) {
            RecordParser.log.debug("Got genbank accession " + genbankAcc + " for " + bioSequence.getName());
        }
        break;
    }
    return true;
}
Also used : DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry)

Aggregations

DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry): 37; ExternalDatabase (ubic.gemma.model.common.description.ExternalDatabase): 11; GeneProduct (ubic.gemma.model.genome.gene.GeneProduct): 8; HashSet (java.util.HashSet): 6; Test (org.junit.Test): 6; ExpressionExperiment (ubic.gemma.model.expression.experiment.ExpressionExperiment): 6; Gene (ubic.gemma.model.genome.Gene): 6; BioSequence (ubic.gemma.model.genome.biosequence.BioSequence): 6; Taxon (ubic.gemma.model.genome.Taxon): 5; BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest): 4; BioSequence2GeneProduct (ubic.gemma.model.association.BioSequence2GeneProduct): 4; CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence): 4; HashMap (java.util.HashMap): 3; BibliographicReference (ubic.gemma.model.common.description.BibliographicReference): 3; AnnotationAssociation (ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation): 3; IOException (java.io.IOException): 2; ArrayList (java.util.ArrayList): 2; Before (org.junit.Before): 2; AlreadyExistsInSystemException (ubic.gemma.core.loader.util.AlreadyExistsInSystemException): 2; BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay): 2