Search in sources :

Example 26 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method processArrayDesign.

@Override
public Collection<BlatResult> processArrayDesign(ArrayDesign ad, Taxon taxon, Collection<BlatResult> rawBlatResults) {
    ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");
    ad = arrayDesignService.thaw(ad);
    arrayDesignService.deleteAlignmentData(ad);
    // Blat file processing can only be run on one taxon at a time
    taxon = this.validateTaxaForBlatFile(ad, taxon);
    Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad);
    sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);
    // if the blat results were loaded from a file, we have to replace the
    // query sequences with the actual ones
    // attached to the array design. We have to do this by name because the
    // sequence name is what the files contain.
    // Note that if there is ambiguity there will be problems!
    Map<String, BioSequence> seqMap = new HashMap<>();
    for (BioSequence bioSequence : sequencesToBlat) {
        seqMap.put(bioSequence.getName(), bioSequence);
    }
    ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);
    Collection<BlatResult> toSkip = new HashSet<>();
    for (BlatResult result : rawBlatResults) {
        /*
             * If the sequences don't have ids, replace them with the actual sequences associated with the array design.
             */
        if (result.getQuerySequence().getId() == null) {
            String querySeqName = result.getQuerySequence().getName();
            BioSequence actualSequence = seqMap.get(querySeqName);
            if (actualSequence == null) {
                ArrayDesignSequenceAlignmentServiceImpl.log.debug("Array design does not contain a sequence with name " + querySeqName);
                toSkip.add(result);
                continue;
            }
            result.setQuerySequence(actualSequence);
        } else {
            result.getQuerySequence().setTaxon(taxon);
        }
        result.setSearchedDatabase(searchedDatabase);
        try {
            FieldUtils.writeField(result.getTargetChromosome(), "taxon", taxon, true);
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        result.getTargetChromosome().getSequence().setTaxon(taxon);
    }
    if (toSkip.size() > 0) {
        ArrayDesignSequenceAlignmentServiceImpl.log.warn(toSkip.size() + " blat results were for sequences not on " + ad + "; they will be ignored.");
        rawBlatResults.removeAll(toSkip);
    }
    Map<BioSequence, Collection<BlatResult>> goldenPathAlignments = new HashMap<>();
    this.getGoldenPathAlignments(sequencesToBlat, taxon, goldenPathAlignments);
    for (BioSequence sequence : goldenPathAlignments.keySet()) {
        rawBlatResults.addAll(goldenPathAlignments.get(sequence));
    }
    Collection<BlatResult> results = this.persistBlatResults(rawBlatResults);
    arrayDesignReportService.generateArrayDesignReport(ad.getId());
    return results;
}
Also used : ExternalDatabase(ubic.gemma.model.common.description.ExternalDatabase) BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) Collection(java.util.Collection) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Example 27 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method getSequences.

/**
 * @param taxon (specified in case array has multiple taxa)
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {
    Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
    Collection<BioSequence> sequencesToBlat = new HashSet<>();
    int numWithNoBioSequence = 0;
    int numWithNoSequenceData = 0;
    boolean warned = false;
    for (CompositeSequence cs : compositeSequences) {
        BioSequence bs = cs.getBiologicalCharacteristic();
        if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
        }
        if (bs == null) {
            ++numWithNoBioSequence;
            if (!warned) {
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had no associated biosequence object");
            }
            continue;
        }
        if (bs.getTaxon() == null) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("There is no taxon defined for this biosequence ");
            continue;
        }
        // if the taxon is null that means we want this run for all taxa for that array
        if (taxon != null && !bs.getTaxon().equals(taxon)) {
            continue;
        }
        // noinspection ConstantConditions // Better readability
        if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
            warned = true;
            ArrayDesignSequenceAlignmentServiceImpl.log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
        }
        if (StringUtils.isBlank(bs.getSequence())) {
            ++numWithNoSequenceData;
            if (!warned) {
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had " + bs + " but no sequence, skipping");
            }
            continue;
        }
        sequencesToBlat.add(bs);
    }
    if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
        ArrayDesignSequenceAlignmentServiceImpl.log.warn(numWithNoBioSequence + " composite sequences lacked biosequence associations; " + numWithNoSequenceData + " lacked sequence data ( out of " + compositeSequences.size() + " total).");
    }
    return sequencesToBlat;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) HashSet(java.util.HashSet)

Example 28 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method getGoldenPathAlignments.

/**
 * Check if there are alignment results in the goldenpath database, in which case we do not reanalyze the sequences.
 *
 * @param sequencesToBlat The full set of sequences that need analysis.
 * @param results         Will be stored here.
 * @return the sequences which ARE NOT found in goldenpath and which therefore DO need blat.
 */
private Collection<BioSequence> getGoldenPathAlignments(Collection<BioSequence> sequencesToBlat, Taxon taxon, Map<BioSequence, Collection<BlatResult>> results) {
    GoldenPathQuery gpq = new GoldenPathQuery(taxon);
    Collection<BioSequence> needBlat = new HashSet<>();
    int count = 0;
    int totalFound = 0;
    for (BioSequence sequence : sequencesToBlat) {
        boolean found = false;
        if (sequence.getSequenceDatabaseEntry() != null) {
            Collection<BlatResult> brs = gpq.findAlignments(sequence.getSequenceDatabaseEntry().getAccession());
            if (brs != null && brs.size() > 0) {
                for (BlatResult result : brs) {
                    this.copyLengthInformation(sequence, result);
                    result.setQuerySequence(sequence);
                }
                results.put(sequence, brs);
                found = true;
                totalFound++;
            }
        }
        if (++count % 1000 == 0 && totalFound > 0) {
            ArrayDesignSequenceAlignmentServiceImpl.log.info("Alignments in Golden Path database for " + totalFound + "/" + count + " checked so far.");
        }
        if (!found) {
            needBlat.add(sequence);
        }
    }
    if (totalFound > 0) {
        ArrayDesignSequenceAlignmentServiceImpl.log.info("Found " + totalFound + "/" + count + " alignments in Golden Path database");
    }
    return needBlat;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) GoldenPathQuery(ubic.gemma.core.externalDb.GoldenPathQuery) HashSet(java.util.HashSet) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 29 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method assignSequencesToDesignElements.

@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, Collection<BioSequence> sequences) {
    Map<String, BioSequence> nameMap = new HashMap<>();
    for (BioSequence sequence : sequences) {
        nameMap.put(this.deMangleProbeId(sequence.getName()), sequence);
    }
    int numNotFound = 0;
    for (CompositeSequence designElement : designElements) {
        if (!nameMap.containsKey(designElement.getName())) {
            ArrayDesignSequenceProcessingServiceImpl.log.debug("No sequence matches " + designElement.getName());
            numNotFound++;
            continue;
        }
        designElement.setBiologicalCharacteristic(nameMap.get(designElement.getName()));
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info(sequences.size() + " sequences processed for " + designElements.size() + " design elements");
    if (numNotFound > 0) {
        ArrayDesignSequenceProcessingServiceImpl.log.warn(numNotFound + " probes had no matching sequence");
    }
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 30 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method findLocalSequences.

private Map<String, BioSequence> findLocalSequences(Collection<String> identifiersToSearch, Taxon taxon) {
    Map<String, BioSequence> found = new HashMap<>();
    for (String id : identifiersToSearch) {
        BioSequence template = BioSequence.Factory.newInstance();
        template.setTaxon(taxon);
        template.setName(id);
        BioSequence seq = bioSequenceService.find(template);
        if (seq != null) {
            found.put(id, seq);
        }
    }
    return found;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4