use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.
the class GeoServiceImpl method checkSamplesAreNew.
/**
 * Removes from the series every sample that is already present in the system under the GEO database. This is a
 * common case, typified by samples in GSE3193: in GEO it primarily occurs in 'superseries' that combine other
 * series, so we must confirm that the samples of this data set are not included in other data sets.
 * <p>
 * Samples found to exist already are removed from the series, from its sample correspondence and from the subsets
 * of each of its data sets; a note listing them is appended to the series summaries.
 *
 * @param series the series to check; it is modified in place
 * @throws AlreadyExistsInSystemException if no samples are left after the removal
 * @throws IllegalStateException          if fewer than two samples are left after the removal
 */
private void checkSamplesAreNew(GeoSeries series) {
    Collection<GeoSample> duplicates = new HashSet<>();

    for (GeoSample candidate : series.getSamples()) {
        // Only samples appearing in several series need checking: otherwise, if this series is not loaded,
        // the sample is guaranteed to be new.
        if (candidate.appearsInMultipleSeries()) {
            String sampleId = candidate.getGeoAccession();
            for (BioAssay known : bioAssayService.findByAccession(sampleId)) {
                DatabaseEntry entry = known.getAccession();
                if (entry == null) {
                    continue;
                }
                boolean sameAccession = entry.getAccession().equals(sampleId);
                if (sameAccession && entry.getExternalDatabase().getName().equals(GeoServiceImpl.GEO_DB_NAME)) {
                    AbstractGeoService.log.debug(sampleId + " appears in an expression experiment already in the system, skipping");
                    duplicates.add(candidate);
                }
            }
        }
    }

    int removedCount = duplicates.size();
    if (removedCount > 0) {
        String example = duplicates.iterator().next().getGeoAccession();
        AbstractGeoService.log.info("Found " + removedCount + " samples that are already in the system; they will be removed from the new set (example: " + example + ")");
    }

    // First detach the duplicates from the series itself ...
    for (GeoSample duplicate : duplicates) {
        series.getSamples().remove(duplicate);
        series.getSampleCorrespondence().removeSample(duplicate.getGeoAccession());
    }

    // ... then from every subset of every data set of the series.
    for (GeoDataset dataset : series.getDatasets()) {
        for (GeoSubset subset : dataset.getSubsets()) {
            for (GeoSample duplicate : duplicates) {
                subset.getSamples().remove(duplicate);
            }
        }
    }

    // update the description, so we keep some kind of record.
    if (!duplicates.isEmpty()) {
        String note = "\nNote: " + duplicates.size()
                + " samples from this series, which appear in other Expression Experiments in Gemma, were not imported from the GEO source. The following samples were removed: "
                + StringUtils.join(duplicates, ",");
        series.setSummaries(series.getSummaries() + note);
    }

    int remaining = series.getSamples().size();
    if (remaining == 0) {
        throw new AlreadyExistsInSystemException("All the samples in " + series + " are in the system already (in other ExpressionExperiments)");
    }
    // we don't really have a lower limit set anywhere else
    if (remaining < 2) {
        throw new IllegalStateException("After removing samples already in the system, this data set is too small to load: " + remaining + " left (removed " + duplicates.size() + ")");
    }

    AbstractGeoService.log.info("Series now contains " + remaining + " (removed " + duplicates.size() + ")");
}
use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.
the class GeoServiceImpl method checkForExisting.
/**
 * Checks that none of the given accessions matches an expression experiment already known to the
 * expression experiment service.
 *
 * @param projectedAccessions the accessions to look up; may be null or empty, in which case nothing is checked
 * @throws AlreadyExistsInSystemException for the first accession that matches at least one expression experiment;
 *                                        the matching experiments are passed along with the exception
 */
private void checkForExisting(Collection<DatabaseEntry> projectedAccessions) {
    // Nothing to look up is okay, it might have been a GPL.
    boolean nothingToCheck = projectedAccessions == null || projectedAccessions.isEmpty();
    if (nothingToCheck) {
        return;
    }

    for (DatabaseEntry accession : projectedAccessions) {
        Collection<ExpressionExperiment> matches = expressionExperimentService.findByAccession(accession);
        if (matches.isEmpty()) {
            continue;
        }
        String message = "There is already an expression experiment that matches " + accession.getAccession();
        AbstractGeoService.log.info(message);
        throw new AlreadyExistsInSystemException(message, matches);
    }
}
use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.
the class NcbiGeneConverter method getChromosomeDetails.
/**
 * Builds a whole-chromosome {@link BioSequence} from the genomic nucleotide accession of the given record and
 * stores it in the "sequence" field of the chromosome of the gene's physical location.
 *
 * @param acc  provides the genomic nucleotide accession and its version
 * @param gene provides the chromosome to update (through its physical location) and the taxon of the sequence
 * @throws IllegalStateException if the "sequence" field of the chromosome cannot be written
 */
private void getChromosomeDetails(NCBIGene2Accession acc, Gene gene) {
    Chromosome chrom = gene.getPhysicalLocation().getChromosome();

    BioSequence chromSeq = BioSequence.Factory.newInstance();
    chromSeq.setName(acc.getGenomicNucleotideAccession());
    chromSeq.setType(SequenceType.WHOLE_CHROMOSOME);
    chromSeq.setTaxon(gene.getTaxon());

    DatabaseEntry dbe = DatabaseEntry.Factory.newInstance();
    dbe.setExternalDatabase(NcbiGeneConverter.genBank);
    dbe.setAccession(acc.getGenomicNucleotideAccession());
    dbe.setAccessionVersion(acc.getGenomicNucleotideAccessionVersion());
    chromSeq.setSequenceDatabaseEntry(dbe);

    try {
        // The field is written reflectively, forcing access.
        FieldUtils.writeField(chrom, "sequence", chromSeq, true);
    } catch (IllegalAccessException e) {
        // Do not swallow the failure: the chromosome would silently be left without its sequence.
        throw new IllegalStateException(
                "Could not set the sequence " + acc.getGenomicNucleotideAccession() + " on the chromosome", e);
    }
}
use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.
the class FastaParser method parseDeflineHeader.
/**
 * Populates a sequence from a defline-style FASTA header. The header is tokenized on '|' and ';', and the first
 * token (with any leading '&gt;' removed) selects how the remaining tokens are interpreted.
 * <p>
 * The following formats are supported
 * <ul>
 * <li>GenBank: gi|gi-number|gb|accession|locus
 * <li>EMBL Data Library : gi|gi-number|emb|accession|locus
 * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus
 * <li>NBRF PIR : pir||entry
 * <li>Protein Research Foundation : prf||name
 * <li>SWISS-PROT : sp|accession|name
 * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain
 * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE
 * <li>Patents : pat|country|number
 * <li>GenInfo Backbone Id bbs|number
 * <li>General database identifier : gnl|database|identifier
 * <li>NCBI Reference Sequence : ref|accession|locus
 * <li>Local Sequence identifier : lcl|identifier
 * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: >H4002F12-5 )
 * <li>Generic: probeid
 * </ul>
 * Headers whose first token is not specifically handled are treated as generic: the first token becomes the
 * name and the second token, if any, the description.
 *
 * @param bioSequence the sequence whose name, description and sequence database entry are filled in
 * @param header      the header line to parse
 * @return the result of the NIA parser for NIA-style headers, true in all other cases
 */
private boolean parseDeflineHeader(BioSequence bioSequence, String header) {
    String[] tokens = StringUtils.splitPreserveAllTokens(header, "|;");
    String tag = StringUtils.removeStart(tokens[0], ">");

    switch (tag) {
        case "gi": {
            // one of the genbank formats.
            bioSequence.setDescription(tokens[4]);
            // tokens[3] is the accession, possibly with a version number
            DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(tokens[3]);
            // the name is the accession of the entry (without version number)
            bioSequence.setName(genbank.getAccession());
            bioSequence.setSequenceDatabaseEntry(genbank);
            break;
        }
        case "pir":
        case "lcl":
        case "entry:chain":
            // only a name is taken from these formats
            bioSequence.setName(tokens[1]);
            break;
        case "sp":
        case "ref":
        case "pdb":
            // name followed by description
            bioSequence.setName(tokens[1]);
            bioSequence.setDescription(tokens[2]);
            break;
        case "gnl":
            bioSequence.setName(tokens[2]);
            break;
        default:
            if (tag.matches(FastaParser.NIA_HEADER_REGEX)) {
                return this.parseNIA(bioSequence, header);
            }
            // generic: unrecognized tags are accepted rather than rejected.
            bioSequence.setName(tokens[0]);
            if (tokens.length > 1) {
                bioSequence.setDescription(tokens[1]);
            }
            break;
    }
    return true;
}
use of ubic.gemma.model.common.description.DatabaseEntry in project Gemma by PavlidisLab.
the class FastaParser method parseAffyHeader.
/**
 * Populates a sequence from an Affymetrix-style header. The header is tokenized on ':' and ';', and the first
 * token (with any leading '&gt;' removed) selects the layout:
 *
 * <pre>
 * Affymetrix targets or collapsed sequence target:array:probeset;
 * Affymetrix "style" file target:probename
 * Affymetrix probe probe:array:probeset:xcoord:ycoord; Interrogation_Position=XXXX; Antisense;
 * Affymetrix consensus/exemplar exemplar:array:probeset; gb|accession; gb:accession /DEF=Homo sapiens metalloprotease-like, disintegrin-like, cysteine-rich protein 2 delta (ADAM22) mRNA, alternative splice product, complete cds. /FEA=mRNA /GEN=ADAM22 /PROD=metalloprotease-like,
 * Affymetrix-like format array:probe or other string containing ':'.
 * </pre>
 *
 * For the probe, target and exemplar layouts, the first token starting with "gb|" or "gb:" (if any) is used to
 * set the sequence database entry, and the name is replaced by the accession of that entry.
 *
 * @param bioSequence the sequence whose name, description and sequence database entry are filled in
 * @param header      the header line to parse
 * @return boolean always true
 */
// Consistency with other similar methods
@SuppressWarnings("SameReturnValue")
private boolean parseAffyHeader(BioSequence bioSequence, String header) {
    // affymetrix format
    String[] fields = StringUtils.split(header, ":;");
    String tag = StringUtils.removeStart(fields[0], ">");

    if ("probe".equals(tag)) {
        bioSequence.setName(fields[1] + ":" + fields[2] + ":" + fields[3] + ":" + fields[4]);
    } else if ("target".equals(tag)) {
        // fields[2] = probe name, when present
        bioSequence.setName(fields[fields.length > 2 ? 2 : 1]);
    } else if ("exemplar".equals(tag)) {
        bioSequence.setName(fields[1] + ":" + fields[2]);
        bioSequence.setDescription(fields[3]);
    } else {
        // This is the case if the xxxx:xxxx format is used on non-affy
        bioSequence.setName(StringUtils.removeStart(header, ">"));
        return true;
    }

    // fill in the sequence database entry from the first genbank token found
    for (String field : fields) {
        String token = StringUtils.strip(field);
        if (!token.startsWith("gb|") && !token.startsWith("gb:")) {
            continue;
        }
        String genbankAcc = StringUtils.split(token, ":|")[1];
        DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc);
        bioSequence.setName(genbank.getAccession());
        bioSequence.setSequenceDatabaseEntry(genbank);
        if (RecordParser.log.isDebugEnabled()) {
            RecordParser.log.debug("Got genbank accession " + genbankAcc + " for " + bioSequence.getName());
        }
        break;
    }
    return true;
}
Aggregations