use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class NCBIGene2GOAssociationLoaderCLI method doWork.
@Override
protected Exception doWork(String[] args) {
Exception e = this.processCommandLine(args);
if (e != null) {
AbstractCLI.log.error(e);
return e;
}
TaxonService taxonService = this.getBean(TaxonService.class);
NCBIGene2GOAssociationLoader gene2GOAssLoader = new NCBIGene2GOAssociationLoader();
gene2GOAssLoader.setPersisterHelper(this.getPersisterHelper());
Collection<Taxon> taxa = taxonService.loadAll();
gene2GOAssLoader.setParser(new NCBIGene2GOAssociationParser(taxa));
HttpFetcher fetcher = new HttpFetcher();
Collection<LocalFile> files;
if (filePath != null) {
File f = new File(filePath);
if (!f.canRead()) {
return new IOException("Cannot read from " + filePath);
}
files = new HashSet<>();
LocalFile lf = LocalFile.Factory.newInstance();
try {
lf.setLocalURL(f.toURI().toURL());
} catch (MalformedURLException e1) {
return e1;
}
files.add(lf);
} else {
files = fetcher.fetch("ftp://ftp.ncbi.nih.gov/gene/DATA/" + NCBIGene2GOAssociationLoaderCLI.GENE2GO_FILE);
}
assert files.size() == 1;
LocalFile gene2Gofile = files.iterator().next();
Gene2GOAssociationService ggoserv = this.getBean(Gene2GOAssociationService.class);
AbstractCLI.log.info("Removing all old GO associations");
ggoserv.removeAll();
AbstractCLI.log.info("Done, loading new ones");
gene2GOAssLoader.load(gene2Gofile);
AbstractCLI.log.info("Don't forget to update the annotation files for platforms.");
return null;
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processBlatResults.
@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
if (log.isDebugEnabled()) {
log.debug(blatResults.size() + " Blat results to map ");
}
assert goldenPathDb != null;
Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
int count = 0;
Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
assert !biosequenceToBlatResults.isEmpty();
// Do them one sequence at a time.
for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
if (log.isDebugEnabled()) {
log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
}
assert !blatResultsForSequence.isEmpty();
Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
/*
* Screen out likely repeats, where cross-hybridization is a problem.
*/
Double fractionRepeats = sequence.getFractionRepeats();
if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
* just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
* is to the site of a known gene.
*/
if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Sanity check to make sure user is paying attention to what they are putting in.
*/
if (trimmedBlatResultsForSequence.size() > 25) {
log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
}
Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
int skipped = 0;
// map each blat result.
for (BlatResult blatResult : trimmedBlatResultsForSequence) {
if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
log.warn("Blat result had no sequence: " + blatResult);
continue;
}
assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
if (log.isDebugEnabled())
log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
skipped++;
continue;
}
if (blatResult.getQuerySequence().getTaxon() == null) {
Taxon taxon = goldenPathDb.getTaxon();
assert taxon != null;
blatResult.getQuerySequence().setTaxon(taxon);
}
// here's the key line! Find gene products that map to the given blat result.
Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
// would be very unlikely.
if (resultsForOneBlatResult.size() > 100) {
log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
}
blatAssociationsForSequence.addAll(resultsForOneBlatResult);
}
// there are rarely this many, but it does happen.
if (++count % 100 == 0 && log.isDebugEnabled())
log.debug("Annotations computed for " + count + " blat results for " + sequence);
}
// Another important step: fill in the specificity, remove duplicates
if (!blatAssociationsForSequence.isEmpty()) {
BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
// Remove hits not meeting criteria
blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
}
// it might be empty now...
if (blatAssociationsForSequence.isEmpty()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
}
config.incrementWarnings();
continue;
} else if (log.isDebugEnabled()) {
log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
}
if (skipped > 0) {
if (log.isDebugEnabled())
log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
}
String queryName = sequence.getName();
assert StringUtils.isNotBlank(queryName);
if (!allRes.containsKey(queryName)) {
allRes.put(queryName, blatAssociationsForSequence);
}
allRes.get(queryName).addAll(blatAssociationsForSequence);
// if ( log.isDebugEnabled() ) {
// log.info( blatAssociationsForSequence.size() + " associations for " + sequence
// + " after redundancy reduction and score filtering; "
// + ( scored - blatAssociationsForSequence.size() ) + " not used" );
// }
}
return allRes;
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class ExpressionDataFileServiceImpl method writeCoexpressionData.
/**
* Loads the probe to probe coexpression link information for a given expression experiment and writes it to disk.
*/
private void writeCoexpressionData(File file, ExpressionExperiment ee) throws IOException {
Taxon tax = expressionExperimentService.getTaxon(ee);
assert tax != null;
Collection<CoexpressionValueObject> geneLinks = gene2geneCoexpressionService.getCoexpression(ee, true);
Date timestamp = new Date(System.currentTimeMillis());
StringBuilder buf = new StringBuilder();
// Write header information
buf.append("# Coexpression data for: ").append(ee.getShortName()).append(" : ").append(ee.getName()).append(" \n");
buf.append("# Generated On: ").append(timestamp).append(" \n");
buf.append("# Links are listed in an arbitrary order with an indication of positive or negative correlation\n");
buf.append(ExpressionDataFileService.DISCLAIMER);
buf.append("GeneSymbol1\tGeneSymbol2\tDirection\tSupport\n");
// Data
for (CoexpressionValueObject link : geneLinks) {
buf.append(link.getQueryGeneSymbol()).append("\t").append(link.getCoexGeneSymbol()).append("\t");
buf.append(link.isPositiveCorrelation() ? "+" : "-" + "\n");
}
// Write coexpression data to file (zipped of course)
try (Writer writer = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))) {
writer.write(buf.toString());
}
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class ExpressionExperimentManipulatingCLI method findExpressionExperimentsByQuery.
/**
* Use the search engine to locate expression experiments.
*/
private Set<BioAssaySet> findExpressionExperimentsByQuery(String query) {
Set<BioAssaySet> ees = new HashSet<>();
Collection<SearchResult> eeSearchResults = searchService.search(SearchSettingsImpl.expressionExperimentSearch(query)).get(ExpressionExperiment.class);
AbstractCLI.log.info(ees.size() + " Expression experiments matched '" + query + "'");
// Filter out all the ee that are not of correct taxon
for (SearchResult sr : eeSearchResults) {
ExpressionExperiment ee = (ExpressionExperiment) sr.getResultObject();
Taxon t = eeService.getTaxon(ee);
if (t != null && t.getCommonName().equalsIgnoreCase(taxon.getCommonName())) {
ees.add(ee);
}
}
return ees;
}
use of ubic.gemma.model.genome.Taxon in project Gemma by PavlidisLab.
the class ExpressionExperimentSetValueObjectHelperTest method setUp.
@Before
public void setUp() {
Taxon tax1 = this.getTaxon("human");
ee = this.getTestPersistentExpressionExperiment(tax1);
Collection<BioAssaySet> ees = new HashSet<>();
ees.add(ee);
eeSet = ExpressionExperimentSet.Factory.newInstance();
eeSet.setName("CreateTest");
eeSet.setDescription("CreateDesc");
eeSet.getExperiments().addAll(ees);
eeSet.setTaxon(tax1);
eeSet = expressionExperimentSetService.create(eeSet);
}
Aggregations