use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GeoConverterTest method testWithImages.
/*
* Has image clones.
*/
@Test
public final void testWithImages() throws Exception {
GeoFamilyParser parser = new GeoFamilyParser();
parser.setProcessPlatformsOnly(true);
try (InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/GPL890_family.soft.gz"))) {
parser.parse(is);
}
GeoPlatform platform = ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get("GPL890");
Object result = this.gc.convert(platform);
ArrayDesign ad = (ArrayDesign) result;
for (CompositeSequence cs : ad.getCompositeSequences()) {
BioSequence bs = cs.getBiologicalCharacteristic();
if (bs != null && bs.getSequence() != null) {
return;
}
}
fail("No sequences!");
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GeoConverterTest method testMultipleTaxaIdentifiedBYAbbreviationsOnArrayWithOrganismColumn.
/*
* Method to test that an array design can have multiple taxa stored against it and that if abbreviations used as
* probe names mapped to the scientific names correctly if the abbreviation is stored in DB.
*/
@Test
public void testMultipleTaxaIdentifiedBYAbbreviationsOnArrayWithOrganismColumn() throws Exception {
Taxon rainbowTroat = taxonService.findByAbbreviation("omyk");
Taxon whiteFish = taxonService.findByAbbreviation("cclu");
Taxon rainbowSmelt = taxonService.findByAbbreviation("omor");
Taxon atlanticSalm = taxonService.findByAbbreviation("ssal");
assertNotNull(atlanticSalm);
// prototype bean.
gc = this.getBean(GeoConverter.class);
InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/GPL2899_family.soft.gz"));
GeoFamilyParser parser = new GeoFamilyParser();
// parse only the plaform
parser.setProcessPlatformsOnly(true);
parser.parse(is);
GeoPlatform platform = ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get("GPL2899");
Object result = gc.convert(platform);
ArrayDesign ad = (ArrayDesign) result;
assertNotNull(ad);
Set<Taxon> taxa = new HashSet<>();
BioSequence bs;
for (CompositeSequence cs : ad.getCompositeSequences()) {
bs = cs.getBiologicalCharacteristic();
if (bs != null) {
assertNotNull(bs.getTaxon());
taxa.add(bs.getTaxon());
}
}
assertEquals(4, taxa.size());
// original file has five taxa, test file just kept four.
assertTrue(taxa.contains(atlanticSalm));
assertTrue(taxa.contains(rainbowTroat));
assertTrue(taxa.contains(whiteFish));
assertTrue(taxa.contains(rainbowSmelt));
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class GeoConverterTest method testConvertGSE4229IMAGE.
/*
* See bug 3328 - we don't want to use IMAGE clone IDs
*/
@Test
public void testConvertGSE4229IMAGE() throws Exception {
InputStream is = new GZIPInputStream(this.getClass().getResourceAsStream("/data/loader/expression/geo/gse4229Short/GSE4229.soft.gz"));
GeoFamilyParser parser = new GeoFamilyParser();
parser.parse(is);
GeoSeries series = ((GeoParseResult) parser.getResults().iterator().next()).getSeriesMap().get("GSE4229");
DatasetCombiner datasetCombiner = new DatasetCombiner();
GeoSampleCorrespondence correspondence = datasetCombiner.findGSECorrespondence(series);
series.setSampleCorrespondence(correspondence);
Object result = this.gc.convert(series);
assertNotNull(result);
@SuppressWarnings("unchecked") Collection<ExpressionExperiment> ees = (Collection<ExpressionExperiment>) result;
ExpressionExperiment ee = ees.iterator().next();
ArrayDesign platform = ee.getBioAssays().iterator().next().getArrayDesignUsed();
BioSequence seq = platform.getCompositeSequences().iterator().next().getBiologicalCharacteristic();
assertNotNull(seq.getSequenceDatabaseEntry());
String acc = seq.getSequenceDatabaseEntry().getAccession();
assertEquals("Genbank", seq.getSequenceDatabaseEntry().getExternalDatabase().getName());
assertTrue(!acc.startsWith("IMAGE"));
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processBlatResults.
@Override
public Map<String, Collection<BlatAssociation>> processBlatResults(GoldenPathSequenceAnalysis goldenPathDb, Collection<BlatResult> blatResults, ProbeMapperConfig config) {
if (log.isDebugEnabled()) {
log.debug(blatResults.size() + " Blat results to map ");
}
assert goldenPathDb != null;
Map<String, Collection<BlatAssociation>> allRes = new HashMap<>();
int count = 0;
Map<BioSequence, Collection<BlatResult>> biosequenceToBlatResults = this.groupBlatResultsByBioSequence(blatResults);
assert !biosequenceToBlatResults.isEmpty();
// Do them one sequence at a time.
for (BioSequence sequence : biosequenceToBlatResults.keySet()) {
Collection<BlatResult> blatResultsForSequence = biosequenceToBlatResults.get(sequence);
if (log.isDebugEnabled()) {
log.debug(blatResultsForSequence.size() + " Blat results for " + sequence);
}
assert !blatResultsForSequence.isEmpty();
Collection<BlatResult> trimmedBlatResultsForSequence = this.trimNonCanonicalChromosomeHits(blatResultsForSequence, config);
/*
* Screen out likely repeats, where cross-hybridization is a problem.
*/
Double fractionRepeats = sequence.getFractionRepeats();
if (fractionRepeats != null && fractionRepeats > config.getMaximumRepeatFraction() && trimmedBlatResultsForSequence.size() >= config.getNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to repeat content (" + fractionRepeats + ")");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Filter out sequences that have many high-quality alignments to the genome, even if it isn't a repeat. It
* just seems like a good idea to reject probes that have too many alignment sites, even if only one of them
* is to the site of a known gene.
*/
if (trimmedBlatResultsForSequence.size() >= config.getNonRepeatNonSpecificSiteCountThreshold()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("Skipped " + sequence + " due to non-specificity (" + trimmedBlatResultsForSequence.size() + " hits)");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
config.incrementWarnings();
}
continue;
}
/*
* Sanity check to make sure user is paying attention to what they are putting in.
*/
if (trimmedBlatResultsForSequence.size() > 25) {
log.warn(sequence + " has " + trimmedBlatResultsForSequence.size() + " blat associations (after trimming non-canonical chromosome hits)");
}
Collection<BlatAssociation> blatAssociationsForSequence = new HashSet<>();
int skipped = 0;
// map each blat result.
for (BlatResult blatResult : trimmedBlatResultsForSequence) {
if (blatResult.getQuerySequence() == null || (blatResult.getQuerySequence().getLength() == null && StringUtils.isBlank(blatResult.getQuerySequence().getSequence()))) {
log.warn("Blat result had no sequence: " + blatResult);
continue;
}
assert blatResult.score() >= 0 && blatResult.score() <= 1.0 : "Score was " + blatResult.score();
assert blatResult.identity() >= 0 && blatResult.identity() <= 1.0 : "Identity was " + blatResult.identity();
if (blatResult.score() < config.getBlatScoreThreshold() || blatResult.identity() < config.getIdentityThreshold()) {
if (log.isDebugEnabled())
log.debug("Result for " + sequence + " skipped with score=" + blatResult.score() + " identity=" + blatResult.identity());
skipped++;
continue;
}
if (blatResult.getQuerySequence().getTaxon() == null) {
Taxon taxon = goldenPathDb.getTaxon();
assert taxon != null;
blatResult.getQuerySequence().setTaxon(taxon);
}
// here's the key line! Find gene products that map to the given blat result.
Collection<BlatAssociation> resultsForOneBlatResult = this.processBlatResult(goldenPathDb, blatResult, config);
if (resultsForOneBlatResult != null && resultsForOneBlatResult.size() > 0) {
// would be very unlikely.
if (resultsForOneBlatResult.size() > 100) {
log.warn(blatResult + " for " + sequence + " has " + resultsForOneBlatResult.size() + " blat associations");
}
blatAssociationsForSequence.addAll(resultsForOneBlatResult);
}
// there are rarely this many, but it does happen.
if (++count % 100 == 0 && log.isDebugEnabled())
log.debug("Annotations computed for " + count + " blat results for " + sequence);
}
// Another important step: fill in the specificity, remove duplicates
if (!blatAssociationsForSequence.isEmpty()) {
BlatAssociationScorer.scoreResults(blatAssociationsForSequence);
// Remove hits not meeting criteria
blatAssociationsForSequence = this.filterOnScores(blatAssociationsForSequence, config);
}
// it might be empty now...
if (blatAssociationsForSequence.isEmpty()) {
if (config.getWarnings() < ProbeMapperImpl.MAX_WARNINGS) {
log.info("No mappings for " + sequence + "; " + trimmedBlatResultsForSequence.size() + " individual blat results checked; " + skipped + " had identity or score below threshold, rest had no mapping");
if (config.getWarnings() == ProbeMapperImpl.MAX_WARNINGS)
log.info("Further non-mappings will not be logged");
}
config.incrementWarnings();
continue;
} else if (log.isDebugEnabled()) {
log.debug(blatAssociationsForSequence.size() + " associations for " + sequence);
}
if (skipped > 0) {
if (log.isDebugEnabled())
log.debug("Skipped " + skipped + "/" + blatResults.size() + " individual blat results that didn't meet criteria");
}
String queryName = sequence.getName();
assert StringUtils.isNotBlank(queryName);
if (!allRes.containsKey(queryName)) {
allRes.put(queryName, blatAssociationsForSequence);
}
allRes.get(queryName).addAll(blatAssociationsForSequence);
// if ( log.isDebugEnabled() ) {
// log.info( blatAssociationsForSequence.size() + " associations for " + sequence
// + " after redundancy reduction and score filtering; "
// + ( scored - blatAssociationsForSequence.size() ) + " not used" );
// }
}
return allRes;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ProbeMapperImpl method processSequences.
@Override
public Map<String, Collection<BlatAssociation>> processSequences(GoldenPathSequenceAnalysis goldenpath, Collection<BioSequence> sequences, ProbeMapperConfig config) {
Blat b = new ShellDelegatingBlat();
b.setBlatScoreThreshold(config.getBlatScoreThreshold());
try {
Map<BioSequence, Collection<BlatResult>> results = b.blatQuery(sequences, goldenpath.getTaxon());
Collection<BlatResult> blatRes = new HashSet<>();
for (Collection<BlatResult> coll : results.values()) {
blatRes.addAll(coll);
}
return this.processBlatResults(goldenpath, blatRes);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
Aggregations