Search in sources :

Example 66 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method flushBuffer.

/**
 * Persists the buffered sequences, records them in the running result collection and attaches each one to
 * the composite sequence registered under the same name. Both buffers are emptied afterwards.
 *
 * @param bioSequences   accumulator that receives every sequence returned by {@code findOrCreate}
 * @param sequenceBuffer sequences waiting to be passed to {@code bioSequenceService.findOrCreate}; cleared on
 *                       return
 * @param csBuffer       composite sequences keyed by the name of the sequence they should receive; cleared on
 *                       return
 */
private void flushBuffer(Collection<BioSequence> bioSequences, Collection<BioSequence> sequenceBuffer, Map<String, CompositeSequence> csBuffer) {
    final Collection<BioSequence> persisted = bioSequenceService.findOrCreate(sequenceBuffer);
    bioSequences.addAll(persisted);

    for (BioSequence persistedSequence : persisted) {
        // The buffer is keyed by sequence name, so the returned sequence's name locates its owner.
        final String key = persistedSequence.getName();
        final CompositeSequence owner = csBuffer.get(key);
        assert owner != null;
        owner.setBiologicalCharacteristic(persistedSequence);
    }

    csBuffer.clear();
    sequenceBuffer.clear();
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 67 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method processArrayDesign.

/**
 * Associates sequences read from {@code sequenceFile} with the composite sequences of {@code arrayDesign}.
 * <p>
 * {@code SequenceType.AFFY_PROBE} and {@code SequenceType.OLIGO} inputs are delegated to
 * {@code processAffymetrixDesign} and {@code processOligoDesign} respectively. Any other type is parsed with
 * {@code FastaParser}; every parsed sequence is given the requested type, DNA polymer type and the validated
 * taxon, passed to {@code persistSequence}, and indexed by {@code addToMaps}.
 * <p>
 * Each composite sequence is then matched, in order of preference:
 * <ol>
 * <li>by its own name in the name index;</li>
 * <li>if its name carries the duplicate-probe suffix, by the name with that suffix removed;</li>
 * <li>otherwise by the accession of the database entry of its current biological characteristic.</li>
 * </ol>
 * When the third lookup fails the existing biological characteristic (if any) is cleared and the composite
 * sequence is reported through {@code notifyAboutMissingSequences}. Finally the array design is updated and its
 * report regenerated.
 *
 * @param arrayDesign  the design whose composite sequences are to be updated
 * @param sequenceFile stream containing the sequences
 * @param sequenceType type of the sequences; also selects the parsing strategy
 * @param taxon        taxon passed through {@code validateTaxon} before being applied to the sequences
 * @return for the FASTA path, the collection produced by the parser (the loop re-binds only its local variable
 *         to the result of {@code persistSequence}, so the returned collection still holds the parsed
 *         instances); for the delegated paths, whatever the delegate returns
 * @throws IOException declared for reading/parsing the input
 */
@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile, SequenceType sequenceType, Taxon taxon) throws IOException {
    if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
        return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon, true);
    } else if (sequenceType.equals(SequenceType.OLIGO)) {
        return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
    }
    taxon = this.validateTaxon(taxon, arrayDesign);
    this.checkForCompositeSequences(arrayDesign);
    FastaParser fastaParser = new FastaParser();
    fastaParser.parse(sequenceFile);
    Collection<BioSequence> bioSequences = fastaParser.getResults();
    // make two maps: one for genbank ids, one for the sequence name.
    Map<String, BioSequence> gbIdMap = new HashMap<>();
    Map<String, BioSequence> nameMap = new HashMap<>();
    // Note: 'done' is advanced only by the composite-sequence loop below, although 'total' also counts the
    // parsed sequences.
    int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
    int done = 0;
    int percent = 0;
    for (BioSequence sequence : bioSequences) {
        sequence.setType(sequenceType);
        sequence.setPolymerType(PolymerType.DNA);
        sequence.setTaxon(taxon);
        sequence = this.persistSequence(sequence);
        this.addToMaps(gbIdMap, nameMap, sequence);
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");
    int numWithNoSequence = 0;
    int numMatchedByAccession = 0;
    int numMatchedByProbeName = 0;
    // Compile the suffix pattern once; String.matches/replaceFirst would recompile it for every probe.
    // Fully qualified so that no additional import is required.
    java.util.regex.Pattern mungePattern = java.util.regex.Pattern.compile(ArrayDesignSequenceProcessingServiceImpl.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$");
    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + compositeSequence.getName());
        }
        String probeName = compositeSequence.getName();
        BioSequence match = null;
        if (nameMap.containsKey(probeName)) {
            match = nameMap.get(probeName);
            numMatchedByProbeName++;
        } else if (mungePattern.matcher(probeName).matches()) {
            String unMungedName = mungePattern.matcher(probeName).replaceFirst("");
            if (nameMap.containsKey(unMungedName)) {
                // Previously this branch counted the probe as matched but then skipped to the next probe,
                // so the sequence was never attached and the progress tick below was missed.
                match = nameMap.get(unMungedName);
                numMatchedByProbeName++;
            }
            // A suffixed name with no hit on the un-suffixed name is left untouched (unchanged behaviour).
        } else {
            BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
            if (biologicalCharacteristic != null) {
                biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap.containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                    match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                    numMatchedByAccession++;
                } else {
                    // No sequence in the file corresponds to the existing characteristic: drop it.
                    compositeSequence.setBiologicalCharacteristic(null);
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            } else {
                numWithNoSequence++;
                this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
            }
        }
        if (match != null) {
            // overwrite the existing characteristic if necessary.
            compositeSequence.setBiologicalCharacteristic(match);
            compositeSequence.setArrayDesign(arrayDesign);
        }
        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByAccession + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by Genbank accession");
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByProbeName + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by probe name");
    if (numWithNoSequence > 0) {
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences with no associated biological characteristic");
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    return bioSequences;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) FastaParser(ubic.gemma.core.loader.genome.FastaParser) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 68 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method processOligoDesign.

/**
 * Attaches oligo sequences parsed from a probe sequence file to the composite sequences of an array design.
 * Every composite sequence is looked up in the parser by name. A hit is marked as an OLIGO/DNA sequence of the
 * validated taxon, passed to {@code persistSequence} and set as the composite sequence's biological
 * characteristic; a miss is counted and reported through {@code notifyAboutMissingSequences}. The array design
 * is updated at the end.
 *
 * @param arrayDesign  design whose composite sequences are updated
 * @param sequenceFile input in the format described in {@link ProbeSequenceParser}
 * @param taxon        taxon passed through {@code validateTaxon} before being applied to the sequences
 * @return the sequences returned by {@code persistSequence} for the composite sequences that had a match
 * @throws IOException declared for reading/parsing the input
 * @see ProbeSequenceParser
 */
private Collection<BioSequence> processOligoDesign(ArrayDesign arrayDesign, InputStream sequenceFile, Taxon taxon) throws IOException {
    this.checkForCompositeSequences(arrayDesign);

    ProbeSequenceParser probeParser = new ProbeSequenceParser();
    probeParser.parse(sequenceFile);

    final int total = arrayDesign.getCompositeSequences().size();
    int processed = 0;
    int percent = 0;

    taxon = this.validateTaxon(taxon, arrayDesign);
    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");

    int missing = 0;
    Collection<BioSequence> attached = new HashSet<>();
    for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + probe.getName());
        }

        BioSequence parsed = probeParser.get(probe.getName());
        if (parsed == null) {
            missing++;
            this.notifyAboutMissingSequences(missing, probe);
        } else {
            assert parsed.getSequence() != null;
            parsed.setType(SequenceType.OLIGO);
            parsed.setPolymerType(PolymerType.DNA);
            parsed.setTaxon(taxon);

            // Replaces whatever characteristic the composite sequence had before.
            BioSequence persisted = this.persistSequence(parsed);
            probe.setBiologicalCharacteristic(persisted);
            probe.setArrayDesign(arrayDesign);
            attached.add(persisted);
        }

        processed++;
        if (processed % 1000 == 0) {
            percent = this.updateProgress(total, processed, percent);
        }
    }

    if (missing > 0) {
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + missing + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences with no associated biological characteristic");
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    return attached;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ProbeSequenceParser(ubic.gemma.core.loader.genome.ProbeSequenceParser) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 69 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method getAccession.

/**
 * Returns the accession of the sequence database entry attached to a composite sequence's biological
 * characteristic. The characteristic is passed through {@code bioSequenceService.thaw} before the accession is
 * read.
 * <p>
 * NOTE(review): earlier documentation for this method mentioned substituting an IMAGE clone and updating the
 * composite sequence; the code here does neither and does not modify {@code cs}.
 *
 * @param cs composite sequence to inspect
 * @return the accession, or {@code null} when the composite sequence has no biological characteristic or that
 *         characteristic has no sequence database entry
 */
private String getAccession(CompositeSequence cs) {
    BioSequence bs = cs.getBiologicalCharacteristic();
    // A composite sequence may have no biological characteristic at all; treat that like a missing entry
    // instead of failing with a NullPointerException.
    if (bs == null || bs.getSequenceDatabaseEntry() == null) {
        return null;
    }
    bs = this.bioSequenceService.thaw(bs);
    return bs.getSequenceDatabaseEntry().getAccession();
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 70 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method createOrUpdateGenbankSequence.

/**
 * Resolves a Genbank-derived sequence to a stored {@code BioSequence}, creating one when no stored sequence
 * can be located, and then brings the stored record up to date.
 * <p>
 * Lookup order: by the accession of {@code found}'s database entry, then by {@code bioSequenceService.find},
 * and finally {@code bioSequenceService.create}.
 *
 * @param found a new (non-persistent) biosequence, expected to carry a database entry with an external
 *              database; used either to create a new record or to update an existing one
 * @param force if {@code true}, the located record is always passed to {@code updateExistingWithSequenceData};
 *              otherwise that happens only when the located record has a blank sequence and {@code found} has
 *              a non-blank one. In the remaining case only {@code fillInDatabaseEntry} is applied.
 * @return the located or newly created BioSequence (the result of {@code updateExistingWithSequenceData} when
 *         that path is taken)
 */
private BioSequence createOrUpdateGenbankSequence(BioSequence found, boolean force) {
    assert found != null;
    DatabaseEntry sequenceDatabaseEntry = found.getSequenceDatabaseEntry();
    // Both should always hold because the sequence comes from genbank (blastDb).
    assert sequenceDatabaseEntry != null;
    assert sequenceDatabaseEntry.getExternalDatabase() != null;

    BioSequence result = bioSequenceService.findByAccession(sequenceDatabaseEntry);
    if (result == null) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.debug("Find (or creating) new sequence " + found);
        }
        // No hit by accession, but there still might be a match by other criteria.
        result = bioSequenceService.find(found);
        if (result == null) {
            result = bioSequenceService.create(found);
        }
    }
    assert result != null;

    // Short-circuit on 'force' first, exactly as before, so the sequence getters are only consulted when needed.
    boolean overwriteSequence = force || (StringUtils.isBlank(result.getSequence()) && !StringUtils.isBlank(found.getSequence()));
    if (!overwriteSequence) {
        // Either way, make sure the database entry is filled in.
        this.fillInDatabaseEntry(found, result);
        return result;
    }
    return this.updateExistingWithSequenceData(found, result);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) DatabaseEntry(ubic.gemma.model.common.description.DatabaseEntry)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4