use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GenomePersister method persistOrUpdateBioSequence.
private BioSequence persistOrUpdateBioSequence(BioSequence bioSequence) {
if (bioSequence == null)
return null;
/*
* Note that this method is only really used by the ArrayDesignSequencePersister: it's for filling in
* information about probes on arrays.
*/
BioSequence existingBioSequence = bioSequenceDao.find(bioSequence);
if (existingBioSequence == null) {
if (AbstractPersister.log.isDebugEnabled())
AbstractPersister.log.debug("Creating new: " + bioSequence);
return this.persistNewBioSequence(bioSequence);
}
if (AbstractPersister.log.isDebugEnabled())
AbstractPersister.log.debug("Found existing: " + existingBioSequence);
// the sequence is the main field we might update.
if (bioSequence.getSequence() != null && !bioSequence.getSequence().equals(existingBioSequence.getSequence())) {
existingBioSequence.setSequence(bioSequence.getSequence());
}
/*
* Can do for all fields that might not be the same: anything besides the name and taxon.
*/
if (bioSequence.getDescription() != null && !bioSequence.getDescription().equals(existingBioSequence.getDescription())) {
existingBioSequence.setDescription(bioSequence.getDescription());
}
if (bioSequence.getType() != null && !bioSequence.getType().equals(existingBioSequence.getType())) {
existingBioSequence.setType(bioSequence.getType());
}
if (bioSequence.getFractionRepeats() != null && !bioSequence.getFractionRepeats().equals(existingBioSequence.getFractionRepeats())) {
existingBioSequence.setFractionRepeats(bioSequence.getFractionRepeats());
}
if (bioSequence.getLength() != null && !bioSequence.getLength().equals(existingBioSequence.getLength())) {
existingBioSequence.setLength(bioSequence.getLength());
}
if (bioSequence.getIsCircular() != null && !bioSequence.getIsCircular().equals(existingBioSequence.getIsCircular())) {
existingBioSequence.setIsCircular(bioSequence.getIsCircular());
}
if (bioSequence.getPolymerType() != null && !bioSequence.getPolymerType().equals(existingBioSequence.getPolymerType())) {
existingBioSequence.setPolymerType(bioSequence.getPolymerType());
}
if (bioSequence.getSequenceDatabaseEntry() != null && !bioSequence.getSequenceDatabaseEntry().equals(existingBioSequence.getSequenceDatabaseEntry())) {
existingBioSequence.setSequenceDatabaseEntry((DatabaseEntry) this.persist(bioSequence.getSequenceDatabaseEntry()));
}
bioSequenceDao.update(existingBioSequence);
return existingBioSequence;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class MatrixWriter method writeJSON.
public void writeJSON(Writer writer, ExpressionDataMatrix<?> matrix) throws IOException {
int columns = matrix.columns();
int rows = matrix.rows();
StringBuilder buf = new StringBuilder();
buf.append("{ 'numRows' : ").append(matrix.rows()).append(", 'rows': ");
buf.append("[");
for (int j = 0; j < rows; j++) {
if (j > 0)
buf.append(",");
buf.append("{");
buf.append("'id' : \"").append(matrix.getDesignElementForRow(j).getName()).append("\"");
BioSequence biologicalCharacteristic = matrix.getDesignElementForRow(j).getBiologicalCharacteristic();
if (biologicalCharacteristic != null)
buf.append(", 'sequence' : \"").append(biologicalCharacteristic.getName()).append("\"");
buf.append(", 'data' : [");
for (int i = 0; i < columns; i++) {
Object val = matrix.get(j, i);
if (i > 0)
buf.append(",");
buf.append(val);
}
buf.append("]}\n");
}
buf.append("\n]}");
writer.write(buf.toString());
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class MatrixWriter method writeSequence.
private void writeSequence(boolean writeSequence, StringBuffer buf, CompositeSequence probeForRow) {
if (writeSequence) {
BioSequence biologicalCharacteristic = probeForRow.getBiologicalCharacteristic();
if (biologicalCharacteristic != null)
buf.append(biologicalCharacteristic.getName());
buf.append("\t");
}
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ShellDelegatingBlat method blatQuery.
@Override
public Map<BioSequence, Collection<BlatResult>> blatQuery(Collection<BioSequence> sequences, boolean sensitive, Taxon taxon) throws IOException {
Map<BioSequence, Collection<BlatResult>> results = new HashMap<>();
File querySequenceFile = File.createTempFile("sequences-for-blat", ".fa");
int count = SequenceWriter.writeSequencesToFile(sequences, querySequenceFile);
if (count == 0) {
EntityUtils.deleteFile(querySequenceFile);
throw new IllegalArgumentException("No sequences!");
}
String outputPath = this.getTmpPslFilePath("blat-output");
Integer port = this.choosePortForQuery(taxon, sensitive);
if (port == null) {
throw new IllegalStateException("Could not locate port for BLAT with settings taxon=" + taxon + ", sensitive=" + sensitive + ", check your configuration.");
}
Collection<BlatResult> rawResults = this.gfClient(querySequenceFile, outputPath, port);
ShellDelegatingBlat.log.info("Got " + rawResults.size() + " raw blat results");
ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);
for (BlatResult blatResult : rawResults) {
blatResult.setSearchedDatabase(searchedDatabase);
BioSequence query = blatResult.getQuerySequence();
if (!results.containsKey(query)) {
results.put(query, new HashSet<BlatResult>());
}
results.get(query).add(blatResult);
}
EntityUtils.deleteFile(querySequenceFile);
return results;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GenericGenelistDesignGenerator method doWork.
@Override
protected Exception doWork(String[] args) {
Exception exception = super.processCommandLine(args);
if (exception != null) {
return exception;
}
ExternalDatabase genbank = externalDatabaseService.findByName("Genbank");
ExternalDatabase ensembl = externalDatabaseService.findByName("Ensembl");
assert genbank != null;
assert ensembl != null;
/*
* Create the stub array design for the organism. The name and etc. are generated automatically. If the design
* exists, we update it.
*/
String shortName = this.generateShortName();
ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance();
arrayDesign.setShortName(shortName);
// common name
arrayDesign.setPrimaryTaxon(taxon);
String nameExt = useNCBIIds ? ", indexed by NCBI IDs" : useEnsemblIds ? ", indexed by Ensembl IDs" : "";
arrayDesign.setName("Generic platform for " + taxon.getScientificName() + nameExt);
arrayDesign.setDescription("Created by Gemma");
// this is key
arrayDesign.setTechnologyType(TechnologyType.NONE);
if (arrayDesignService.find(arrayDesign) != null) {
AbstractCLI.log.info("Platform for " + taxon + " already exists, will update");
arrayDesign = arrayDesignService.find(arrayDesign);
arrayDesignService.deleteGeneProductAssociations(arrayDesign);
arrayDesign = arrayDesignService.load(arrayDesign.getId());
} else {
AbstractCLI.log.info("Creating new 'generic' platform");
arrayDesign = arrayDesignService.create(arrayDesign);
}
arrayDesign = arrayDesignService.thaw(arrayDesign);
// temporary: making sure we set it, as it is new.
arrayDesign.setTechnologyType(TechnologyType.NONE);
/*
* Load up the genes for the organism.
*/
Collection<Gene> knownGenes = geneService.loadAll(taxon);
AbstractCLI.log.info("Taxon has " + knownGenes.size() + " genes");
// this would be good for cases where the identifier we are using has changed.
Map<Gene, CompositeSequence> existingGeneMap = new HashMap<>();
if (!useNCBIIds && !useEnsemblIds) {
// only using this for symbol changes.
existingGeneMap = this.getExistingGeneMap(arrayDesign);
}
Map<String, CompositeSequence> existingSymbolMap = this.getExistingProbeNameMap(arrayDesign);
int count = 0;
int numWithNoTranscript = 0;
// int hasGeneAlready = 0;
// int numNewGenes = 0;
int numNewElements = 0;
int numUpdatedElements = 0;
for (Gene gene : knownGenes) {
gene = geneService.thaw(gene);
Collection<GeneProduct> products = gene.getProducts();
if (products.isEmpty()) {
numWithNoTranscript++;
AbstractCLI.log.debug("No transcript for " + gene);
continue;
}
count++;
CompositeSequence csForGene = null;
if (useNCBIIds) {
if (gene.getNcbiGeneId() == null) {
AbstractCLI.log.debug("No NCBI ID for " + gene + ", skipping");
continue;
}
if (existingSymbolMap.containsKey(gene.getNcbiGeneId().toString())) {
csForGene = existingSymbolMap.get(gene.getNcbiGeneId().toString());
}
} else if (useEnsemblIds) {
if (gene.getEnsemblId() == null) {
AbstractCLI.log.debug("No Ensembl ID for " + gene + ", skipping");
continue;
}
if (existingSymbolMap.containsKey(gene.getEnsemblId())) {
csForGene = existingSymbolMap.get(gene.getEnsemblId());
}
} else {
/*
* detect when the symbol has changed
*/
if (existingSymbolMap.containsKey(gene.getOfficialSymbol())) {
csForGene = existingSymbolMap.get(gene.getOfficialSymbol());
} else if (existingGeneMap.containsKey(gene)) {
csForGene = existingGeneMap.get(gene);
AbstractCLI.log.debug("Gene symbol has changed for: " + gene + "? Current element has name=" + csForGene.getName());
csForGene.setName(gene.getOfficialSymbol());
}
}
assert csForGene == null || csForGene.getId() != null : "Null id for " + csForGene;
/*
* We arbitrarily link the "probe" to one of the gene's RNA transcripts. We could consider other strategies
* to pick the representative, but it generally doesn't matter.
*/
for (GeneProduct geneProduct : products) {
if (!GeneProductType.RNA.equals(geneProduct.getType())) {
continue;
}
/*
* Name is usually the genbank or ensembl accession
*/
String name = geneProduct.getName();
BioSequence bioSequence = BioSequence.Factory.newInstance();
Collection<DatabaseEntry> accessions = geneProduct.getAccessions();
bioSequence.setName(name);
bioSequence.setTaxon(taxon);
bioSequence.setPolymerType(PolymerType.RNA);
bioSequence.setType(SequenceType.mRNA);
BioSequence existing = null;
if (accessions.isEmpty()) {
// this should not be hit.
AbstractCLI.log.warn("No accession for " + name);
DatabaseEntry de = DatabaseEntry.Factory.newInstance();
de.setAccession(name);
if (name.startsWith("ENS") && name.length() > 10) {
de.setExternalDatabase(ensembl);
} else {
if (name.matches("^[A-Z]{1,2}(_?)[0-9]+(\\.[0-9]+)?$")) {
de.setExternalDatabase(genbank);
} else {
AbstractCLI.log.info("Name doesn't look like genbank or ensembl, skipping: " + name);
continue;
}
}
bioSequence.setSequenceDatabaseEntry(de);
} else {
bioSequence.setSequenceDatabaseEntry(accessions.iterator().next());
existing = bioSequenceService.findByAccession(accessions.iterator().next());
// FIXME It is possible that this sequence will have been aligned to the genome, which is a bit
// confusing. So it will map to a gene. Worse case: it maps to more than one gene ...
}
if (existing == null) {
bioSequence = (BioSequence) this.getPersisterHelper().persist(bioSequence);
} else {
bioSequence = existing;
}
assert bioSequence != null && bioSequence.getId() != null;
if (bioSequence.getSequenceDatabaseEntry() == null) {
AbstractCLI.log.info("No DB entry for " + bioSequence + "(" + gene + "), will look for a better sequence to use ...");
continue;
}
if (csForGene == null) {
if (AbstractCLI.log.isDebugEnabled())
AbstractCLI.log.debug("New element " + " with " + bioSequence + " for " + gene);
csForGene = CompositeSequence.Factory.newInstance();
if (useNCBIIds) {
if (gene.getNcbiGeneId() == null) {
continue;
}
csForGene.setName(gene.getNcbiGeneId().toString());
} else if (useEnsemblIds) {
if (gene.getEnsemblId() == null) {
continue;
}
csForGene.setName(gene.getEnsemblId());
} else {
csForGene.setName(gene.getOfficialSymbol());
}
csForGene.setArrayDesign(arrayDesign);
csForGene.setBiologicalCharacteristic(bioSequence);
csForGene.setDescription("Generic expression element for " + gene);
csForGene = compositeSequenceService.create(csForGene);
assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
arrayDesign.getCompositeSequences().add(csForGene);
numNewElements++;
} else {
if (AbstractCLI.log.isDebugEnabled())
AbstractCLI.log.debug("Updating existing element: " + csForGene + " with " + bioSequence + " for " + gene);
csForGene.setArrayDesign(arrayDesign);
csForGene.setBiologicalCharacteristic(bioSequence);
csForGene.setDescription("Generic expression element for " + gene);
assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
compositeSequenceService.update(csForGene);
// making sure ...
csForGene = compositeSequenceService.load(csForGene.getId());
assert csForGene.getId() != null;
arrayDesign.getCompositeSequences().add(csForGene);
numUpdatedElements++;
}
assert bioSequence.getId() != null;
assert geneProduct.getId() != null;
assert csForGene.getBiologicalCharacteristic() != null && csForGene.getBiologicalCharacteristic().getId() != null;
AnnotationAssociation aa = AnnotationAssociation.Factory.newInstance();
aa.setGeneProduct(geneProduct);
aa.setBioSequence(bioSequence);
annotationAssociationService.create(aa);
break;
}
if (count % 100 == 0)
AbstractCLI.log.info(count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
}
// is this necessary? causes an error sometimes.
// arrayDesignService.update( arrayDesign );
AbstractCLI.log.info("Array design has " + arrayDesignService.numCompositeSequenceWithGenes(arrayDesign) + " 'probes' associated with genes.");
arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
auditTrailService.addUpdateEvent(arrayDesign, AnnotationBasedGeneMappingEvent.Factory.newInstance(), count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped.");
arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);
AbstractCLI.log.info("Don't forget to update the annotation files");
return null;
}
Aggregations