Search in sources :

Example 61 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceAlignmentServiceImpl method processArrayDesign.

private Collection<BlatResult> processArrayDesign(ArrayDesign ad, boolean sensitive, Blat blat) {
    if (blat == null)
        blat = new ShellDelegatingBlat();
    Collection<BlatResult> allResults = new HashSet<>();
    if (sensitive)
        ArrayDesignSequenceAlignmentServiceImpl.log.info("Running in 'sensitive' mode if possible");
    Collection<Taxon> taxa = arrayDesignService.getTaxa(ad.getId());
    boolean first = true;
    for (Taxon taxon : taxa) {
        Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad, taxon);
        Map<BioSequence, Collection<BlatResult>> results = this.getAlignments(sequencesToBlat, sensitive, taxon, blat);
        ArrayDesignSequenceAlignmentServiceImpl.log.info("Got BLAT results for " + results.keySet().size() + " query sequences");
        Map<String, BioSequence> nameMap = new HashMap<>();
        for (BioSequence bs : results.keySet()) {
            if (nameMap.containsKey(bs.getName())) {
                throw new IllegalStateException("All distinct sequences on the array must have unique names; found " + bs.getName() + " more than once.");
            }
            nameMap.put(bs.getName(), bs);
        }
        int noResults = 0;
        int count = 0;
        // We only remove the results here, after we have at least one set of blat results.
        if (first) {
            ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");
            arrayDesignService.deleteAlignmentData(ad);
        }
        for (BioSequence sequence : sequencesToBlat) {
            if (sequence == null) {
                ArrayDesignSequenceAlignmentServiceImpl.log.warn("Null sequence!");
                continue;
            }
            Collection<BlatResult> brs = results.get(nameMap.get(sequence.getName()));
            if (brs == null) {
                ++noResults;
                continue;
            }
            for (BlatResult result : brs) {
                // must do this to replace
                result.setQuerySequence(sequence);
            // placeholder instance.
            }
            allResults.addAll(this.persistBlatResults(brs));
            if (++count % 2000 == 0) {
                ArrayDesignSequenceAlignmentServiceImpl.log.info("Checked results for " + count + " queries, " + allResults.size() + " blat results so far.");
            }
        }
        ArrayDesignSequenceAlignmentServiceImpl.log.info(noResults + "/" + sequencesToBlat.size() + " sequences had no blat results");
        first = false;
    }
    arrayDesignReportService.generateArrayDesignReport(ad.getId());
    return allResults;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) Taxon(ubic.gemma.model.genome.Taxon) ShellDelegatingBlat(ubic.gemma.core.apps.ShellDelegatingBlat) Collection(java.util.Collection) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult) HashSet(java.util.HashSet)

Example 62 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method assignSequencesToDesignElements.

/**
 * Associate sequences with an array design. It is assumed that the name of the sequences can be matched to the name
 * of a design element. Provided for testing purposes.
 */
@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile) throws IOException {
    FastaParser fp = new FastaParser();
    fp.parse(fastaFile);
    Collection<BioSequence> sequences = fp.getResults();
    ArrayDesignSequenceProcessingServiceImpl.log.debug("Parsed " + sequences.size() + " sequences");
    this.assignSequencesToDesignElements(designElements, sequences);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) FastaParser(ubic.gemma.core.loader.genome.FastaParser)

Example 63 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method findOrUpdateSequences.

/**
 * Copy sequences into the original versions, or create new sequences in the DB, as needed.
 *
 * @param accessionsToFetch  accessions that we need to fill in
 * @param retrievedSequences candidate sequence information for copying into the database.
 * @param force              If true, if an existing BioSequence that matches if found in the system, any existing sequence
 *                           information in the BioSequence will be overwritten.
 * @param taxa               Representing taxa on array
 * @return Items that were found.
 */
private Map<String, BioSequence> findOrUpdateSequences(Map<String, BioSequence> accessionsToFetch, Collection<BioSequence> retrievedSequences, Collection<Taxon> taxa, boolean force) {
    Map<String, BioSequence> found = new HashMap<>();
    for (Taxon taxon : taxa) {
        if (taxon == null) {
            ArrayDesignSequenceProcessingServiceImpl.log.warn(// probably should be an exception
            "Null taxon ...");
        }
        assert taxon != null;
        boolean warned = false;
        for (BioSequence sequence : retrievedSequences) {
            if (sequence.getTaxon() == null) {
                if (!warned) {
                    ArrayDesignSequenceProcessingServiceImpl.log.warn("Sequence taxon is null [" + sequence + "], copying array taxon " + taxon + " ; further warnings for this array taxon are suppressed.");
                }
                warned = true;
            } else if (!sequence.getTaxon().equals(taxon)) {
                continue;
            }
            sequence.setTaxon(taxon);
            if (sequence.getSequenceDatabaseEntry() == null) {
                ArrayDesignSequenceProcessingServiceImpl.log.warn("Sequence from BLAST db lacks database entry: " + sequence + "; skipping");
                continue;
            }
            sequence = this.createOrUpdateGenbankSequence(sequence, force);
            sequence = this.bioSequenceService.thaw(sequence);
            String accession = sequence.getSequenceDatabaseEntry().getAccession();
            found.put(accession, sequence);
            accessionsToFetch.remove(accession);
        }
    }
    return found;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon)

Example 64 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method processArrayDesign.

@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, String[] databaseNames, String blastDbHome, boolean force, FastaCmd fc) {
    Map<String, BioSequence> accessionsToFetch = this.initializeFetchList(arrayDesign, force);
    if (accessionsToFetch.size() == 0) {
        ArrayDesignSequenceProcessingServiceImpl.log.info("No accessions to fetch, no processing will be done");
        return null;
    }
    Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
    // not taxon found
    if (taxaOnArray.size() == 0) {
        throw new IllegalArgumentException(taxaOnArray.size() + " taxon found for " + arrayDesign + "please specify which taxon to run");
    }
    Collection<String> notFound = accessionsToFetch.keySet();
    Collection<BioSequence> finalResult = new HashSet<>();
    int versionNumber = 1;
    if (fc == null)
        fc = new SimpleFastaCmd();
    while (versionNumber < ArrayDesignSequenceProcessingService.MAX_VERSION_NUMBER) {
        Collection<BioSequence> retrievedSequences = this.searchBlastDbs(databaseNames, blastDbHome, notFound, fc);
        // we can loop through the taxa as we can ignore sequence when retrieved and arraydesign taxon not match.
        Map<String, BioSequence> found = this.findOrUpdateSequences(accessionsToFetch, retrievedSequences, taxaOnArray, force);
        finalResult.addAll(found.values());
        notFound = this.getUnFound(notFound, found);
        if (notFound.isEmpty()) {
            break;
        }
        for (String accession : notFound) {
            if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
                ArrayDesignSequenceProcessingServiceImpl.log.trace(accession + " not found, increasing version number to " + versionNumber);
            // remove the version number and increase it
            BioSequence bs = accessionsToFetch.get(accession);
            accessionsToFetch.remove(accession);
            // add or increase the version number.
            accession = accession.replaceFirst("\\.\\d+$", "");
            accession = accession + "." + Integer.toString(versionNumber);
            accessionsToFetch.put(accession, bs);
        }
        notFound = accessionsToFetch.keySet();
        ++versionNumber;
    }
    if (!notFound.isEmpty()) {
        this.logMissingSequences(arrayDesign, notFound);
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info(finalResult.size() + " sequences found");
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    return finalResult;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) Taxon(ubic.gemma.model.genome.Taxon) SimpleFastaCmd(ubic.gemma.core.loader.genome.SimpleFastaCmd)

Example 65 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method processAffymetrixDesign.

@Override
public Collection<BioSequence> processAffymetrixDesign(ArrayDesign arrayDesign, InputStream probeSequenceFile, Taxon taxon, boolean force) throws IOException {
    ArrayDesignSequenceProcessingServiceImpl.log.info("Processing Affymetrix design");
    // arrayDesignService.thawRawAndProcessed( arrayDesign );
    boolean wasOriginallyLackingCompositeSequences = arrayDesign.getCompositeSequences().size() == 0;
    taxon = this.validateTaxon(taxon, arrayDesign);
    Collection<BioSequence> bioSequences = new HashSet<>();
    int done = 0;
    int percent = 0;
    AffyProbeReader apr = new AffyProbeReader();
    apr.parse(probeSequenceFile);
    Collection<CompositeSequence> compositeSequencesFromProbes = apr.getKeySet();
    int total = compositeSequencesFromProbes.size();
    Map<String, CompositeSequence> quickFindMap = new HashMap<>();
    List<BioSequence> sequenceBuffer = new ArrayList<>();
    Map<String, CompositeSequence> csBuffer = new HashMap<>();
    for (CompositeSequence newCompositeSequence : compositeSequencesFromProbes) {
        // these composite sequences are just use
        newCompositeSequence.setArrayDesign(arrayDesign);
        BioSequence collapsed = SequenceManipulation.collapse(apr.get(newCompositeSequence));
        String sequenceName = newCompositeSequence.getName() + "_collapsed";
        collapsed.setName(sequenceName);
        collapsed.setType(SequenceType.AFFY_COLLAPSED);
        collapsed.setPolymerType(PolymerType.DNA);
        collapsed.setTaxon(taxon);
        sequenceBuffer.add(collapsed);
        if (csBuffer.containsKey(sequenceName))
            throw new IllegalArgumentException("All probes must have unique names");
        csBuffer.put(sequenceName, newCompositeSequence);
        if (sequenceBuffer.size() == ArrayDesignSequenceProcessingServiceImpl.BATCH_SIZE) {
            this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
        }
        if (wasOriginallyLackingCompositeSequences) {
            arrayDesign.getCompositeSequences().add(newCompositeSequence);
        } else {
            if (force) {
                collapsed = this.persistSequence(collapsed);
                assert collapsed.getTaxon().equals(taxon);
            }
            quickFindMap.put(newCompositeSequence.getName(), newCompositeSequence);
        }
        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }
    this.flushBuffer(bioSequences, sequenceBuffer, csBuffer);
    this.updateProgress(total, done, percent);
    if (!wasOriginallyLackingCompositeSequences) {
        percent = 0;
        done = 0;
        int numWithNoSequence = 0;
        for (CompositeSequence originalCompositeSequence : arrayDesign.getCompositeSequences()) {
            // go back and fill this information into the composite sequences, namely the database entry
            // information.
            CompositeSequence compositeSequenceFromParse = quickFindMap.get(originalCompositeSequence.getName());
            if (compositeSequenceFromParse == null) {
                numWithNoSequence++;
                this.notifyAboutMissingSequences(numWithNoSequence, originalCompositeSequence);
                continue;
            }
            ArrayDesignSequenceProcessingServiceImpl.log.debug(originalCompositeSequence + " matches " + compositeSequenceFromParse + " seq is " + compositeSequenceFromParse.getBiologicalCharacteristic());
            originalCompositeSequence.setBiologicalCharacteristic(compositeSequenceFromParse.getBiologicalCharacteristic());
            assert originalCompositeSequence.getBiologicalCharacteristic().getId() != null;
            originalCompositeSequence.setArrayDesign(compositeSequenceFromParse.getArrayDesign());
            if (++done % 1000 == 0) {
                percent = this.updateProgress(total, done, percent);
            }
        }
        ArrayDesignSequenceProcessingServiceImpl.log.info(numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size() + " probes could not be matched to a sequence");
    }
    arrayDesign.setAdvertisedNumberOfDesignElements(compositeSequencesFromProbes.size());
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating " + arrayDesign);
    arrayDesignService.update(arrayDesign);
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    ArrayDesignSequenceProcessingServiceImpl.log.info("Done adding sequence information!");
    return bioSequences;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4