use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignSequenceProcessingServiceImpl method initializeFetchList.
/**
 * Works out which biosequences of a platform still need to have their sequence retrieved.
 * <p>
 * Elements without a biosequence are tallied and handed to the missing-sequence warning helper. Elements whose
 * biosequence already has a non-blank sequence are tallied and left alone unless {@code force} is set. Elements
 * for which no accession can be determined are ignored (logged at debug level). The tallies are reported once
 * all elements have been examined.
 *
 * @param arrayDesign platform whose composite sequences are examined
 * @param force       if true, sequence will be replaced even if it is already there.
 * @return map of biosequence accessions to BioSequences (the existing ones)
 */
private Map<String, BioSequence> initializeFetchList(ArrayDesign arrayDesign, boolean force) {
    Map<String, BioSequence> toFetch = new HashMap<>();
    int alreadyHaveSequence = 0;
    int lackingBioSequence = 0;
    boolean alreadyWarned = false;

    for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
        BioSequence bioSequence = probe.getBiologicalCharacteristic();

        if (bioSequence == null) {
            // The helper receives the count as it stood before this element was tallied.
            alreadyWarned = this.warnAboutMissingSequence(lackingBioSequence, alreadyWarned, probe);
            lackingBioSequence++;
        } else if (!force && StringUtils.isNotBlank(bioSequence.getSequence())) {
            // Sequence is already present and we were not asked to replace it.
            alreadyHaveSequence++;
        } else {
            String acc = this.getAccession(probe);
            if (acc != null) {
                toFetch.put(acc, bioSequence);
            } else if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled()) {
                ArrayDesignSequenceProcessingServiceImpl.log.debug("No accession for " + probe + ": " + bioSequence);
            }
        }
    }

    this.informAboutFetchListResults(arrayDesign, toFetch, alreadyHaveSequence, lackingBioSequence);
    return toFetch;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GeoServiceImpl method matchToExistingPlatform.
/**
 * Checks whether the given GEO platform is already in Gemma and prepares the probe-name bookkeeping accordingly.
 * <p>
 * The platform is looked up by short name only. If it is not found, every converted element name is mapped to
 * itself in the GEO platform's probe-name map. If it is found, its elements and biosequences are loaded, added
 * to the supplied cache, and handed to the helper that re-matches GEO columns against the Gemma probe names.
 *
 * @param geoConverter   converter used for a throw-away partial conversion of the GEO platform
 * @param rawGEOPlatform the platform as obtained from GEO
 * @param c              cache that receives the existing platform together with its element-to-sequence map
 * @throws IllegalStateException if the conversion yields nothing although the platform's data is meant to be used
 */
private void matchToExistingPlatform(GeoConverter geoConverter, GeoPlatform rawGEOPlatform, ArrayDesignsForExperimentCache c) {
    // This map lives on the GEO platform; it is our job to populate it.
    Map<String, String> probeNameMap = rawGEOPlatform.getProbeNamesInGemma();

    // Partial conversion only - the result is thrown away once matching is done.
    ArrayDesign converted = (ArrayDesign) geoConverter.convert(rawGEOPlatform);
    if (converted == null) {
        if (rawGEOPlatform.useDataFromGeo()) {
            throw new IllegalStateException("Platform is missing");
        }
        // MPSS, exon arrays
        return;
    }

    // Look up by short name only: the full name can change in GEO, which would cause trouble.
    ArrayDesign inGemma = arrayDesignService.findByShortName(converted.getShortName());

    if (inGemma == null) {
        AbstractGeoService.log.info(rawGEOPlatform + " looks new to Gemma");
        // No mapping needed. The converter fills this in already; this is purely defensive.
        for (CompositeSequence element : converted.getCompositeSequences()) {
            String name = element.getName();
            probeNameMap.put(name, name);
        }
        return;
    }

    AbstractGeoService.log.info("Platform " + rawGEOPlatform.getGeoAccession() + " exists in Gemma, checking for correct probe names and re-matching if necessary ...");

    String geoNameColumn = this.getGEOIDColumnName(rawGEOPlatform, converted);
    if (geoNameColumn == null) {
        // Not a problem: the design has no elements, so it is actually a placeholder (e.g., MPSS).
        return;
    }

    AbstractGeoService.log.info("Loading probes ...");
    Map<CompositeSequence, BioSequence> elementToSequence = arrayDesignService.getBioSequences(inGemma);
    c.add(inGemma, elementToSequence);
    this.getGemmaIDColumnNameInGEO(rawGEOPlatform, elementToSequence, geoNameColumn);
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class SimpleExpressionDataLoaderServiceImpl method provideImageClone.
/**
 * Attaches a newly created BioSequence, named after an IMAGE clone identifier, to the given element.
 * <p>
 * The identifier is derived from the element's name: a trailing {@code ___<digit>} suffix is removed and the
 * {@code IMAGE:} prefix is added when it is not already present.
 * <p>
 * This will eventually go - no special IMAGE clone support.
 *
 * @param cs    element whose name supplies the clone identifier; receives the new BioSequence as its biological
 *              characteristic
 * @param taxon taxon set on the new BioSequence
 * @throws IllegalArgumentException if the element has no name
 */
private void provideImageClone(CompositeSequence cs, Taxon taxon) {
    // Validate the input before allocating anything.
    String imageId = cs.getName();
    if (imageId == null) {
        throw new IllegalArgumentException("CompositeSequence must have name filled in first");
    }

    imageId = imageId.replaceFirst("___\\d$", "");
    if (!imageId.startsWith("IMAGE:")) {
        imageId = "IMAGE:" + imageId;
    }
    // Sanity check only: evaluated solely when assertions are enabled (-ea), so malformed names are not
    // rejected in a normal run.
    assert imageId.matches("^IMAGE:\\d+$") : "Unexpected IMAGE clone identifier: " + imageId;

    BioSequence bs = BioSequence.Factory.newInstance();
    bs.setTaxon(taxon);
    bs.setName(imageId);
    cs.setBiologicalCharacteristic(bs);
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GeoConverterImpl method processId.
/**
 * Converts one platform row, identified by {@code id} and positioned at index {@code i} of the per-row column
 * lists, into a CompositeSequence with an associated BioSequence, or skips the row.
 * <p>
 * With {@code strictSelection} on, a row without an external accession is only kept when the platform has a
 * {@code gene_assignment} column holding a usable value for the row; otherwise the row is skipped. Every path,
 * skipped or not, returns the index of the following row so the per-row lists stay aligned with the ids.
 *
 * @param platform               GEO platform; its column names/data and probe-name map are read, and the
 *                               probe-name map is updated when the id has no entry yet
 * @param arrayDesign            platform being built; set on the new element, and its short name is the key
 *                               into {@code platformDesignElementMap}
 * @param probeOrganismColumn    if null, the primary taxon is used for the element
 * @param externalDb             passed through to the biosequence set-up helper
 * @param sequences              passed through to the biosequence set-up helper
 * @param probeOrganism          per-row organism values, or null
 * @param primaryTaxon           taxon used when there is no organism column
 * @param cloneIdentifiers       per-row clone identifiers, or null
 * @param externalRefs           external reference columns, or null
 * @param descIter               iterator supplying a description per converted row, or null
 * @param refSeqAccessionPattern passed through to the biosequence set-up helper
 * @param strictSelection        whether rows lacking an external accession are filtered as described above
 * @param skipped                receives the ids dropped because of a blank/"---" gene assignment
 * @param compositeSequences     receives the newly created element
 * @param i                      index of the current row
 * @param id                     identifier of the current row
 * @return the index of the next row ({@code i + 1})
 */
private int processId(GeoPlatform platform, ArrayDesign arrayDesign, String probeOrganismColumn, ExternalDatabase externalDb, List<String> sequences, List<String> probeOrganism, Taxon primaryTaxon, List<String> cloneIdentifiers, List<List<String>> externalRefs, Iterator<String> descIter, Pattern refSeqAccessionPattern, boolean strictSelection, List<String> skipped, Collection<CompositeSequence> compositeSequences, int i, String id) {
    String externalAccession = null;
    if (externalRefs != null) {
        externalAccession = this.getExternalAccession(externalRefs, i);
    }

    // NOTE(review): descIter is not advanced on the skip paths below. If it iterates a per-row description
    // column, descriptions drift after a skipped row - confirm against the caller.
    if (strictSelection && StringUtils.isBlank(externalAccession)) {
        // currently this is crafted to deal with affymetrix exon arrays, but could be expanded.
        // mrna_assignment is less strict than gene_assignement
        // salvage it if it has a gene assignment.
        String filteringColumn = "gene_assignment";
        if (platform.getColumnNames().contains(filteringColumn)) {
            String cd = platform.getColumnData(filteringColumn).get(i);
            if (StringUtils.isBlank(cd) || cd.equals("---")) {
                skipped.add(id);
                if (skipped.size() % 10000 == 0) {
                    GeoConverterImpl.log.info("Skipped " + skipped.size() + " elements due to strict selection; last was " + id);
                }
                return i + 1;
            }
            // keep it.
        } else {
            // we just skip ones that don't have an external accession.
            // The row index must still advance; returning i unchanged would make the next id read this
            // row's column data.
            return i + 1;
        }
        // remaining case here: externalAccession is blank, but there is another column that we think saves it.
    }

    String cloneIdentifier = cloneIdentifiers == null ? null : cloneIdentifiers.get(i);

    String description = "";
    if (externalAccession != null) {
        // Only the first of several comma-separated references is used; the full list goes in the description.
        String[] refs = externalAccession.split(",");
        if (refs.length > 1) {
            description = "Multiple external sequence references: " + externalAccession + "; ";
            externalAccession = refs[0];
        }
    }
    if (descIter != null) {
        description = description + " " + descIter.next();
    }

    CompositeSequence cs = CompositeSequence.Factory.newInstance();
    String probeName = platform.getProbeNamesInGemma().get(id);
    if (probeName == null) {
        probeName = id;
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Probe retaining original name: " + probeName);
        }
        // must make sure this is populated.
        platform.getProbeNamesInGemma().put(id, id);
    } else {
        if (GeoConverterImpl.log.isDebugEnabled()) {
            GeoConverterImpl.log.debug("Found probe: " + probeName);
        }
    }
    cs.setName(probeName);
    cs.setDescription(description);
    cs.setArrayDesign(arrayDesign);

    // LMD:1647- If There is a Organism Column given for the probe then set taxon from that overwriting platform
    // if probeOrganismColumn is set but for this probe no taxon do not set probeTaxon and thus create no
    // biosequence
    Taxon probeTaxon = Taxon.Factory.newInstance();
    if (probeOrganism != null && StringUtils.isNotBlank(probeOrganism.get(i))) {
        probeTaxon = this.convertProbeOrganism(probeOrganism.get(i));
    }
    // if there are no probe taxons then all the probes should take the taxon from the primary taxon
    if (probeOrganismColumn == null) {
        probeTaxon = primaryTaxon;
    }

    BioSequence bs = this.createMinimalBioSequence(probeTaxon);
    this.setBsProps(platform, externalDb, sequences, refSeqAccessionPattern, i, id, externalAccession, cloneIdentifier, bs);
    this.checkCs(arrayDesign, externalAccession, cloneIdentifier, cs, probeTaxon, bs);

    compositeSequences.add(cs);
    platformDesignElementMap.get(arrayDesign.getShortName()).put(probeName, cs);
    return i + 1;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class FastaParser method parseHeader.
/**
 * Recognizes Defline format as described at <a href='http://en.wikipedia.org/wiki/Fasta_format#Sequence_identifiers'>wikipedia</a>.
 * <p>
 * The header captured by the matcher is split on '&gt;'; each piece becomes one candidate BioSequence whose name
 * is initially the whole piece. The piece is then handed to a format-specific parser, chosen by whether a '|'
 * or a ':' occurs first (position 0 does not count), or whether it matches the NIA pattern.
 * <p>
 * Our amendments:
 * FIXME: recognize multi-line headers separated by ^A.(used for redundant sequences)
 * FIXME: parsing of more obscure (to us) headers might not be complete.
 *
 * @param matcher matcher positioned before the record; its first capturing group must hold the header
 * @return the BioSequences for which the format-specific parser reported success
 * @throws IllegalArgumentException if the matcher finds no record
 */
private Collection<BioSequence> parseHeader(Matcher matcher) {
    if (!matcher.find()) {
        throw new IllegalArgumentException("Invalid FASTA record");
    }

    Collection<BioSequence> parsed = new HashSet<>();

    for (String recordHeader : StringUtils.split(matcher.group(1), '>')) {
        BioSequence candidate = BioSequence.Factory.newInstance();
        candidate.setName(recordHeader);

        /*
         * Look for either a '|' or a ':'. Both may occur (':' and then '|', or the reverse); whichever comes
         * first decides the format.
         */
        int pipeAt = recordHeader.indexOf('|');
        int colonAt = recordHeader.indexOf(':');
        boolean pipeComesFirst = pipeAt > 0 && (colonAt < 0 || pipeAt < colonAt);

        boolean accepted;
        if (pipeComesFirst) {
            accepted = this.parseDeflineHeader(candidate, recordHeader);
        } else if (colonAt > 0) {
            accepted = this.parseAffyHeader(candidate, recordHeader);
        } else if (recordHeader.matches(FastaParser.NIA_HEADER_REGEX)) {
            accepted = this.parseNIA(candidate, recordHeader);
        } else {
            // just treat the whole header as the sequence name.
            accepted = this.parseDeflineHeader(candidate, recordHeader);
        }

        if (accepted) {
            parsed.add(candidate);
        }
    }

    return parsed;
}
Aggregations