Search in sources :

Example 6 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class MockFastaCmd method getMultiple.

/**
 * Builds one sequence per supplied accession by delegating to {@code makeSequence}.
 * <p>
 * The {@code database} and {@code blastHome} arguments are accepted but never read by this
 * implementation.
 *
 * @param accessions objects to convert; each one is handed to {@code makeSequence} as-is
 * @param database   ignored
 * @param blastHome  ignored
 * @return a set holding the sequence created for each accession (duplicates collapse according to
 * the sequences' own equality)
 */
@SuppressWarnings("unused")
private Collection<BioSequence> getMultiple(Collection<?> accessions, String database, String blastHome) {
    Collection<BioSequence> sequences = new HashSet<>();
    for (Object accession : accessions) {
        sequences.add(this.makeSequence(accession));
    }
    return sequences;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashSet(java.util.HashSet)

Example 7 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class AffyProbeNameFilter method filter.

/**
 * Drops rows whose probe name matches any of the enabled Affymetrix naming rules.
 * <p>
 * The name tested is that of the row's biological characteristic when one is attached, otherwise the
 * name of the design element itself. A row is removed when an enabled rule's marker substring occurs
 * anywhere in that name: {@code "_st"}, {@code "AFFX"}, {@code "_f_at"}, {@code "_x_at"} or
 * {@code "_g_at"}.
 *
 * @param data matrix to filter
 * @return a new matrix built from {@code data} restricted to the retained design elements
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    int rowCount = data.rows();
    List<CompositeSequence> retained = new ArrayList<>();
    for (int row = 0; row < rowCount; row++) {
        CompositeSequence probe = data.getDesignElementForRow(row);
        assert probe != null;
        BioSequence characteristic = probe.getBiologicalCharacteristic();
        String name = (characteristic != null) ? characteristic.getName() : probe.getName();

        // Rules are evaluated in the same order as the individual flags; each flag short-circuits
        // its substring test, so a disabled rule never inspects the name.
        boolean excluded =
                // 'st' means sense strand.
                (skip_ST && name.contains("_st"))
                        // control probes.
                        || (skip_AFFX && name.contains("AFFX"))
                        // gene family.
                        || (skip_F && name.contains("_f_at"))
                        || (skip_X && name.contains("_x_at"))
                        || (skip_G && name.contains("_g_at"));

        if (!excluded) {
            retained.add(probe);
        }
    }
    AffyProbeNameFilter.log.info("There are " + retained.size() + " rows left after Affy probe name filtering.");
    return new ExpressionDataDoubleMatrix(data, retained);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) ExpressionDataDoubleMatrix(ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix) ArrayList(java.util.ArrayList) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 8 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignMapResultServiceImpl method summarizeMapResults.

/**
 * Creates one mapping summary per composite sequence.
 * <p>
 * Elements without a biological characteristic get an empty summary. For the others, the summary is
 * given the BLAT results found for the sequence, plus the gene product and gene of every BLAT
 * association found for it (associations are thawed before being read).
 * <p>
 * Only elements that have a sequence are counted in the progress and completion log messages.
 *
 * @param compositeSequences elements to summarize
 * @return the summaries, one per input element
 */
@Override
public Collection<CompositeSequenceMapSummary> summarizeMapResults(Collection<CompositeSequence> compositeSequences) {
    Collection<CompositeSequenceMapSummary> summaries = new HashSet<>();
    int processed = 0;
    for (CompositeSequence probe : compositeSequences) {
        CompositeSequenceMapSummary summary = new CompositeSequenceMapSummary(probe);
        BioSequence sequence = probe.getBiologicalCharacteristic();
        boolean hasSequence = sequence != null;

        if (hasSequence) {
            summary.setBlatResults(blatResultService.findByBioSequence(sequence));

            Collection<BlatAssociation> associations = blatAssociationService.find(sequence);
            blatAssociationService.thaw(associations);
            for (BlatAssociation mapping : associations) {
                summary.getGeneProducts().add(mapping.getGeneProduct());
                summary.getGenes().add(mapping.getGeneProduct().getGene());
            }
        }

        // The summary is stored only after it has been populated.
        summaries.add(summary);

        if (hasSequence) {
            processed++;
            if (processed % 1000 == 0) {
                ArrayDesignMapResultServiceImpl.log.info("Processed " + processed + " elements...");
            }
        }
    }
    ArrayDesignMapResultServiceImpl.log.info("Done, processed " + processed + " elements");
    return summaries;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence) BlatAssociation(ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation) BlatResult(ubic.gemma.model.genome.sequenceAnalysis.BlatResult)

Example 9 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class RepeatScan method processRepeatMaskerOutput.

/**
 * Reads masked sequences from a FASTA file and copies the masked residues onto the matching
 * original sequences.
 * <p>
 * Masked sequences are indexed by their name; if several share a name, the last one parsed wins.
 * Each original is looked up by the identifier {@code SequenceWriter.getIdentifier} yields for it.
 * Originals without a masked counterpart are logged at warn level and left untouched. Every matched
 * original has its sequence and repeat fraction overwritten, including those whose fraction is zero,
 * but only those with a fraction above zero are returned.
 *
 * @param sequences          original sequences; matched entries are modified in place
 * @param outputSequencePath path of the masked output, in FASTA format
 * @return the originals that ended up with a non-zero repeat fraction
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be parsed
 */
public Collection<BioSequence> processRepeatMaskerOutput(Collection<BioSequence> sequences, String outputSequencePath) {
    FastaParser fastaParser = new FastaParser();
    try {
        fastaParser.parse(outputSequencePath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    Collection<BioSequence> updated = new HashSet<>();

    // Index the masked sequences by name.
    Map<String, BioSequence> maskedByName = new HashMap<>();
    for (BioSequence masked : fastaParser.getResults()) {
        String name = masked.getName();
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Masked: " + name);
        }
        maskedByName.put(name, masked);
    }

    // Transfer the masked residues and the repeat fraction onto each original.
    for (BioSequence original : sequences) {
        String key = SequenceWriter.getIdentifier(original);
        BioSequence masked = maskedByName.get(key);
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Orig: " + key);
        }

        if (masked == null) {
            RepeatScan.log.warn("No masked sequence for " + key);
        } else {
            original.setSequence(masked.getSequence());
            double maskedFraction = this.computeFractionMasked(masked);
            original.setFractionRepeats(maskedFraction);
            if (maskedFraction > 0) {
                updated.add(original);
            }
        }
    }

    RepeatScan.log.info(updated.size() + " sequences had non-zero repeat fractions.");
    return updated;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) FastaParser(ubic.gemma.core.loader.genome.FastaParser) HashSet(java.util.HashSet)

Example 10 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class SequenceWriter method writeSequencesToFile.

/**
 * Write a collection of sequences in FASTA format.
 * <p>
 * Sequences whose residue string is blank are skipped with a warning. Sequences whose identifier (as
 * given by {@link SequenceWriter#getIdentifier}) has already been written are skipped as repeats.
 * Each record is written as {@code >identifier}, a newline, the upper-cased residues and a newline.
 * <p>
 * Upper-casing uses {@link java.util.Locale#ROOT} so the output does not depend on the JVM's default
 * locale (under a Turkish or Azeri locale a plain {@code toUpperCase()} maps {@code i} to
 * {@code U+0130}, which would corrupt residues).
 *
 * @param sequences  sequences
 * @param outputFile file
 * @return number of sequences written, excluding blanks and duplicates.
 * @throws IOException io problems
 */
public static int writeSequencesToFile(Collection<BioSequence> sequences, File outputFile) throws IOException {
    // NOTE(review): FileWriter(File) encodes with the platform default charset on older JDKs; output is
    // expected to be ASCII here, but confirm if identifiers can contain non-ASCII characters.
    try (BufferedWriter out = new BufferedWriter(new FileWriter(outputFile))) {
        SequenceWriter.log.debug("Processing " + sequences.size() + " sequences for blat analysis");
        int count = 0;
        int repeats = 0;
        Collection<String> identifiers = new HashSet<>();
        for (BioSequence b : sequences) {
            String residues = b.getSequence();
            if (StringUtils.isBlank(residues)) {
                SequenceWriter.log.warn("Blank sequence for " + b);
                continue;
            }
            String identifier = SequenceWriter.getIdentifier(b);
            // add() returns false when the identifier was already recorded, i.e. already written.
            if (!identifiers.add(identifier)) {
                SequenceWriter.log.debug(b + " is a repeat with identifier " + identifier);
                repeats++;
                // don't repeat sequences.
                continue;
            }
            // use upper case to ensure that sequence does not start out 'masked'; Locale.ROOT keeps the
            // mapping locale-independent.
            out.write(">" + identifier + "\n" + residues.toUpperCase(java.util.Locale.ROOT) + "\n");
            if (++count % 2000 == 0) {
                SequenceWriter.log.debug("Wrote " + count + " sequences");
            }
        }
        SequenceWriter.log.info("Wrote " + count + " sequences to " + outputFile + (repeats > 0 ? " ( " + repeats + " repeated items were skipped)." : ""));
        return count;
    }
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) FileWriter(java.io.FileWriter) BufferedWriter(java.io.BufferedWriter) HashSet(java.util.HashSet)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4