use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method printResult.
/**
 * Writes one tab-delimited result row to standard output.
 * <p>
 * The columns are, in order: the probe name, the name of the association's sequence, the gene product name, the
 * official symbol of the product's gene, and the simple class name of that gene object.
 *
 * @param cs              probe whose name forms the first column
 * @param blatAssociation association supplying the sequence, gene product and gene columns
 */
private void printResult(CompositeSequence cs, BlatAssociation blatAssociation) {
    GeneProduct product = blatAssociation.getGeneProduct();
    Gene productGene = product.getGene();

    StringBuilder row = new StringBuilder();
    row.append(cs.getName()).append('\t');
    row.append(blatAssociation.getBioSequence().getName()).append('\t');
    row.append(product.getName()).append('\t');
    row.append(productGene.getOfficialSymbol()).append("\t");
    row.append(productGene.getClass().getSimpleName());

    System.out.println(row.toString());
}
use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
the class ArrayDesignProbeMapperServiceImpl method processArrayDesign.
/**
 * Replaces the gene associations of a platform's probes with those listed in a tab-delimited annotation file.
 * <p>
 * Existing gene product associations for the platform are deleted before the file is read. Blank lines and lines
 * starting with '#' are ignored. Every other line must have exactly three tab-separated columns: probe name,
 * sequence name, and gene. The gene column may list several genes separated by '|'; each is resolved either as an
 * NCBI gene id ({@code ncbiIds == true}) or as an official symbol within {@code taxon}.
 * <p>
 * A line is skipped (and counted) when the gene column is blank, the probe is not found on the platform, none of
 * the genes can be resolved, the given sequence name does not match the probe's existing sequence, or the probe has
 * no sequence and no sequence name was given. Genes without gene products are skipped as well. For a probe without
 * a sequence, a placeholder sequence is created from the given name.
 *
 * @param arrayDesign platform whose probes are being annotated
 * @param taxon       taxon used for symbol lookups and for newly created sequences; may be null only when
 *                    {@code ncbiIds} is true
 * @param source      annotation file to read
 * @param sourceDB    set as the source of every association that is created
 * @param ncbiIds     true if the gene column holds NCBI gene ids rather than official symbols
 * @throws IllegalArgumentException if no taxon is given while {@code ncbiIds} is false, or if the platform's
 *                                  technology type is {@code NONE}
 * @throws IOException              if the file cannot be read or a data line does not have three columns
 */
@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException {
    if (taxon == null && !ncbiIds) {
        throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
    }
    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException("Do not use this service to process platforms that do not use an probe-based technology.");
    }
    try (BufferedReader b = new BufferedReader(new FileReader(source))) {
        String line;
        int numSkipped = 0;
        ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);
        while ((line = b.readLine()) != null) {
            // Blank lines and '#' comment lines carry no data.
            if (StringUtils.isBlank(line) || line.startsWith("#")) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }
            String probeId = fields[0];
            String seqName = fields[1];
            // Holds official symbols, or NCBI gene ids when ncbiIds is true.
            String geneSymbol = fields[2];
            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }
            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);
            if (c == null) {
                if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled()) {
                    ArrayDesignProbeMapperServiceImpl.log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                }
                numSkipped++;
                continue;
            }
            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = this.findGenesForAnnotation(geneSymbol, taxon, ncbiIds);
            if (geneListProbe.isEmpty()) {
                ArrayDesignProbeMapperServiceImpl.log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1 && ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled()) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
                // make a lot of fuss.
                ArrayDesignProbeMapperServiceImpl.log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }
            BioSequence bs = c.getBiologicalCharacteristic();
            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }
                }
                // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }
                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                // NOTE(review): taxon may be null here when ncbiIds is true - confirm that is acceptable for a new
                // sequence.
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");
                // Placeholder.
                bs.setType(SequenceType.OTHER);
                bs = bioSequenceService.create(bs);
                c.setBiologicalCharacteristic(bs);
                compositeSequenceService.update(c);
            }
            assert bs != null;
            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().isEmpty()) {
                    ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                // One association per gene product, all pointing at the probe's sequence.
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }
            }
        }
        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        this.deleteOldFiles(arrayDesign);
        ArrayDesignProbeMapperServiceImpl.log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
    }
}

/**
 * Resolves the genes named in the gene column of one annotation line.
 * <p>
 * The field is split on '|'. Blank tokens, and tokens that are not valid integers when {@code ncbiIds} is true, are
 * ignored (the latter with a warning) instead of aborting the whole import with a {@link NumberFormatException}.
 *
 * @param geneField raw content of the gene column
 * @param taxon     taxon used for official symbol lookups
 * @param ncbiIds   true to interpret tokens as NCBI gene ids, false to interpret them as official symbols
 * @return the distinct genes that were found; empty if none could be resolved
 */
private Collection<Gene> findGenesForAnnotation(String geneField, Taxon taxon, boolean ncbiIds) {
    Collection<Gene> genes = new HashSet<>();
    StringTokenizer st = new StringTokenizer(geneField, "|");
    while (st.hasMoreTokens()) {
        String geneToken = st.nextToken().trim();
        if (geneToken.isEmpty()) {
            continue;
        }
        Gene geneDetails;
        if (ncbiIds) {
            int ncbiId;
            try {
                ncbiId = Integer.parseInt(geneToken);
            } catch (NumberFormatException e) {
                ArrayDesignProbeMapperServiceImpl.log.warn("'" + geneToken + "' is not a valid NCBI gene id, ignoring");
                continue;
            }
            geneDetails = geneService.findByNCBIId(ncbiId);
        } else {
            geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
        }
        if (geneDetails != null) {
            genes.add(geneDetails);
        }
    }
    return genes;
}
use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
the class NcbiGeneConverter method convert.
/**
 * Builds a new {@link Gene} from a parsed NCBI gene record.
 * <p>
 * Copies the NCBI id, symbol, name and Ensembl id; records previous or discontinued NCBI ids; attaches a new taxon
 * carrying the record's NCBI taxon id, a physical location holding only the chromosome, one alias per synonym, and
 * a database cross-reference for each "Ensembl" entry among the record's cross-references.
 *
 * @param info the NCBI gene record to convert
 * @return the newly created gene
 */
public Gene convert(NCBIGeneInfo info) {
    Gene converted = Gene.Factory.newInstance();

    converted.setNcbiGeneId(Integer.parseInt(info.getGeneId()));
    String symbol = info.getDefaultSymbol();
    converted.setName(symbol);
    converted.setOfficialSymbol(symbol);
    converted.setOfficialName(info.getDescription());
    converted.setEnsemblId(info.getEnsemblId());

    /*
     * NOTE we allow multiple discontinued or previous ids, separated by commas. This is a hack to account for cases
     * uncovered recently...can be minimized by running this regularly.
     */
    if (info.getHistory() == null) {
        // No history record: fall back to the discontinued id, if there is one.
        String discontinuedId = info.getDiscontinuedId();
        if (StringUtils.isNotBlank(discontinuedId)) {
            if (NcbiGeneConverter.log.isDebugEnabled()) {
                NcbiGeneConverter.log.debug("Gene matches a gene that was discontinued: " + converted + " matches gene that had id " + discontinuedId);
            }
            converted.setPreviousNcbiId(discontinuedId);
        }
    } else {
        assert info.getHistory().getCurrentId() == null || info.getGeneId().equals(info.getHistory().getCurrentId());
        assert info.getHistory().getPreviousIds() != null;
        if (!info.getHistory().getPreviousIds().isEmpty()) {
            converted.setPreviousNcbiId(StringUtils.join(info.getHistory().getPreviousIds(), ","));
        }
    }

    converted.setDescription("Imported from NCBI gene; Nomenclature status: " + info.getNomenclatureStatus());

    Taxon geneTaxon = Taxon.Factory.newInstance();
    geneTaxon.setNcbiId(info.getTaxId());
    geneTaxon.setIsGenesUsable(false);
    geneTaxon.setIsSpecies(true);
    converted.setTaxon(geneTaxon);

    /*
     * We are going to stop maintaining this information
     */
    PhysicalLocation location = PhysicalLocation.Factory.newInstance();
    location.setChromosome(new Chromosome(info.getChromosome(), geneTaxon));
    converted.setPhysicalLocation(location);

    Collection<GeneAlias> geneAliases = converted.getAliases();
    for (String synonym : info.getSynonyms()) {
        GeneAlias aliasEntity = GeneAlias.Factory.newInstance();
        aliasEntity.setAlias(synonym);
        geneAliases.add(aliasEntity);
    }

    // Only Ensembl cross-references are kept.
    for (String dbName : info.getDbXrefs().keySet()) {
        if (dbName.equalsIgnoreCase("Ensembl")) {
            String accession = info.getDbXrefs().get(dbName);
            DatabaseEntry ensemblRef = DatabaseEntry.Factory.newInstance();
            ensemblRef.setAccession(accession);
            ensemblRef.setExternalDatabase(NcbiGeneConverter.getEnsembl());
            converted.getAccessions().add(ensemblRef);
        }
    }

    return converted;
}
use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
the class NcbiGeneLoader method doLoad.
/**
 * Takes converted genes off the queue and persists them until the converter has finished and the queue is empty.
 * <p>
 * Progress is logged every 1000 genes or when more than 30 seconds have passed since the last progress message.
 * {@code loaderDone} is set to true when the method finishes, including when it fails.
 *
 * @param geneQueue queue of converted genes to persist
 * @throws RuntimeException wrapping any failure while loading a gene, or the interruption of this thread while
 *                          it waits for the queue
 */
void doLoad(final BlockingQueue<Gene> geneQueue) {
    StopWatch timer = new StopWatch();
    timer.start();
    while (!(converterDone.get() && geneQueue.isEmpty())) {
        Gene gene = null;
        try {
            // Wait briefly for the next converted gene instead of spinning on an empty queue; the timeout lets the
            // loop condition be re-checked so we still stop once the converter is done.
            gene = geneQueue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS);
            if (gene == null) {
                continue;
            }
            persisterHelper.persistOrUpdate(gene);
            if (++loadedGeneCount % 1000 == 0 || timer.getTime() > 30 * 1000) {
                NcbiGeneLoader.log.info("Processed " + loadedGeneCount + " genes. Queue has " + geneQueue.size() + " items; last gene: " + gene);
                timer.reset();
                timer.start();
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers further up the stack.
            Thread.currentThread().interrupt();
            NcbiGeneLoader.log.error("Interrupted while waiting for genes to load", e);
            loaderDone.set(true);
            throw new RuntimeException(e);
        } catch (Exception e) {
            NcbiGeneLoader.log.error("Error while loading gene: " + gene + ": " + e.getMessage(), e);
            loaderDone.set(true);
            throw new RuntimeException(e);
        }
    }
    NcbiGeneLoader.log.info("Loaded " + loadedGeneCount + " genes. ");
    loaderDone.set(true);
}
use of ubic.gemma.model.genome.Gene in project Gemma by PavlidisLab.
the class StringProteinInteractionLoader method doLoad.
/**
 * Takes Gene2GeneProteinAssociations off the queue and persists them until the converter has finished and the
 * queue is empty.
 * <p>
 * The identifiers came from biomart, so both genes of an association are first looked up by NCBI id. If either
 * is not found, a warning is logged and the association is dropped. Otherwise the found genes are written into the
 * association's {@code firstGene} and {@code secondGene} fields before it is persisted. {@code loaderDone} is set
 * to true when the method finishes, including when it fails.
 *
 * @param gene2GeneProteinAssociationQueue queue of Gene2GeneProteinAssociation to load
 * @throws RuntimeException wrapping any failure while loading an association, or the interruption of this thread
 *                          while it waits for the queue
 */
private void doLoad(final BlockingQueue<Gene2GeneProteinAssociation> gene2GeneProteinAssociationQueue) {
    StringProteinInteractionLoader.log.info("starting processing ");
    while (!(converterDone.get() && gene2GeneProteinAssociationQueue.isEmpty())) {
        try {
            // Wait briefly for the next association instead of spinning on an empty queue; the timeout lets the
            // loop condition be re-checked so we still stop once the converter is done.
            Gene2GeneProteinAssociation gene2GeneProteinAssociation = gene2GeneProteinAssociationQueue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS);
            if (gene2GeneProteinAssociation == null) {
                continue;
            }
            // check they are genes gemma knows about; the second lookup is only needed if the first succeeds
            Gene geneOne = geneService.findByNCBIId(gene2GeneProteinAssociation.getFirstGene().getNcbiGeneId());
            if (geneOne == null) {
                StringProteinInteractionLoader.log.warn("Gene with NCBI id=" + gene2GeneProteinAssociation.getFirstGene().getNcbiGeneId() + " not in Gemma");
                continue;
            }
            Gene geneTwo = geneService.findByNCBIId(gene2GeneProteinAssociation.getSecondGene().getNcbiGeneId());
            if (geneTwo == null) {
                StringProteinInteractionLoader.log.warn("Gene with NCBI id=" + gene2GeneProteinAssociation.getSecondGene().getNcbiGeneId() + " not in Gemma");
                continue;
            }
            // Replace the genes on the association with the ones found by the lookup (fields written reflectively).
            FieldUtils.writeField(gene2GeneProteinAssociation, "firstGene", geneOne, true);
            FieldUtils.writeField(gene2GeneProteinAssociation, "secondGene", geneTwo, true);
            persisterHelper.persist(gene2GeneProteinAssociation);
            if (++loadedGeneCount % 1000 == 0) {
                StringProteinInteractionLoader.log.info("Processed " + loadedGeneCount + " protein protein interactions. " + "Current queue has " + gene2GeneProteinAssociationQueue.size() + " items.");
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers further up the stack.
            Thread.currentThread().interrupt();
            StringProteinInteractionLoader.log.error(e, e);
            loaderDone.set(true);
            throw new RuntimeException(e);
        } catch (Exception e) {
            StringProteinInteractionLoader.log.error(e, e);
            loaderDone.set(true);
            throw new RuntimeException(e);
        }
    }
    StringProteinInteractionLoader.log.info("Loaded " + loadedGeneCount + " protein protein interactions. ");
    loaderDone.set(true);
}
Aggregations