Search in sources :

Example 1 with FastaParser

use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.

the class RepeatScan method processRepeatMaskerOutput.

/**
 * Applies masked sequences from a RepeatMasker output file to the given sequences.
 * <p>
 * The output file is parsed as FASTA and the masked sequences are indexed by name. Each input sequence whose
 * identifier (as produced by {@code SequenceWriter.getIdentifier}) is present in that index has its sequence
 * replaced by the masked one and its repeat fraction set. Inputs without a masked counterpart are logged at
 * warn level and left unchanged.
 *
 * @param sequences          the sequences to update
 * @param outputSequencePath the masked output, in FASTA format
 * @return the input sequences whose computed repeat fraction is greater than zero
 */
public Collection<BioSequence> processRepeatMaskerOutput(Collection<BioSequence> sequences, String outputSequencePath) {
    FastaParser fastaParser = new FastaParser();
    try {
        fastaParser.parse(outputSequencePath);
    } catch (IOException ioe) {
        // Parsing failures are surfaced as unchecked exceptions, keeping the cause.
        throw new RuntimeException(ioe);
    }

    // Index the masked sequences by name so each original can be looked up directly.
    Map<String, BioSequence> maskedByName = new HashMap<>();
    for (BioSequence masked : fastaParser.getResults()) {
        String maskedName = masked.getName();
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Masked: " + maskedName);
        }
        maskedByName.put(maskedName, masked);
    }

    Collection<BioSequence> withRepeats = new HashSet<>();
    for (BioSequence original : sequences) {
        String originalId = SequenceWriter.getIdentifier(original);
        BioSequence masked = maskedByName.get(originalId);
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Orig: " + originalId);
        }
        if (masked != null) {
            original.setSequence(masked.getSequence());
            double maskedFraction = this.computeFractionMasked(masked);
            original.setFractionRepeats(maskedFraction);
            // Only sequences that actually contain repeats are reported back.
            if (maskedFraction > 0) {
                withRepeats.add(original);
            }
        } else {
            RepeatScan.log.warn("No masked sequence for " + originalId);
        }
    }

    RepeatScan.log.info(withRepeats.size() + " sequences had non-zero repeat fractions.");
    return withRepeats;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) FastaParser(ubic.gemma.core.loader.genome.FastaParser) HashSet(java.util.HashSet)

Example 2 with FastaParser

use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method assignSequencesToDesignElements.

/**
 * Parses sequences from a FASTA stream and associates them with the design elements of an array design by
 * delegating to the collection-based overload. The sequence names are assumed to match design element names.
 * Provided for testing purposes.
 *
 * @param designElements the design elements to associate sequences with
 * @param fastaFile      stream of FASTA-formatted sequences
 * @throws IOException declared for failures while reading or parsing the stream
 */
@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile) throws IOException {
    FastaParser parser = new FastaParser();
    parser.parse(fastaFile);

    Collection<BioSequence> parsed = parser.getResults();
    int parsedCount = parsed.size();
    ArrayDesignSequenceProcessingServiceImpl.log.debug("Parsed " + parsedCount + " sequences");

    this.assignSequencesToDesignElements(designElements, parsed);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) FastaParser(ubic.gemma.core.loader.genome.FastaParser)

Example 3 with FastaParser

use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method processArrayDesign.

/**
 * Associates the sequences in a sequence file with the composite sequences of an array design.
 * <p>
 * {@code AFFY_PROBE} and {@code OLIGO} inputs are delegated to {@code processAffymetrixDesign} and
 * {@code processOligoDesign}. Any other sequence type is parsed as FASTA: each parsed sequence is given the
 * sequence type, DNA polymer type and taxon, persisted, and indexed. Each composite sequence is then matched, in
 * order of preference:
 * <ol>
 * <li>by its exact name;</li>
 * <li>by its name with the duplicate-probe munge suffix removed;</li>
 * <li>by the accession of the database entry of its existing biological characteristic.</li>
 * </ol>
 * A composite sequence that reaches step 3 with an existing biological characteristic that cannot be matched by
 * accession has that characteristic cleared. Finally the array design is updated and its report regenerated.
 *
 * @param arrayDesign  the array design whose composite sequences are to be updated
 * @param sequenceFile stream containing the sequences
 * @param sequenceType the type of the sequences in the file; must not be null
 * @param taxon        the taxon for the sequences; passed through {@code validateTaxon} for the FASTA case
 * @return for the FASTA case, the collection of sequences returned by the parser; otherwise the result of the
 * delegate method
 * @throws IOException declared for failures while reading or parsing the sequence file
 */
@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile, SequenceType sequenceType, Taxon taxon) throws IOException {
    if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
        return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon, true);
    } else if (sequenceType.equals(SequenceType.OLIGO)) {
        return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
    }
    taxon = this.validateTaxon(taxon, arrayDesign);
    this.checkForCompositeSequences(arrayDesign);
    FastaParser fastaParser = new FastaParser();
    fastaParser.parse(sequenceFile);
    Collection<BioSequence> bioSequences = fastaParser.getResults();
    // make two maps: one for genbank ids, one for the sequence name.
    Map<String, BioSequence> gbIdMap = new HashMap<>();
    Map<String, BioSequence> nameMap = new HashMap<>();
    int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
    int done = 0;
    int percent = 0;
    for (BioSequence sequence : bioSequences) {
        sequence.setType(sequenceType);
        sequence.setPolymerType(PolymerType.DNA);
        sequence.setTaxon(taxon);
        // Index the persisted instance, not the transient one produced by the parser.
        sequence = this.persistSequence(sequence);
        this.addToMaps(gbIdMap, nameMap, sequence);
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");
    int numWithNoSequence = 0;
    int numMatchedByAccession = 0;
    int numMatchedByProbeName = 0;
    // Matches the munge separator and everything after it, i.e. the suffix added to de-duplicate probe names.
    String mungeRegex = ArrayDesignSequenceProcessingServiceImpl.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$";
    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        String probeName = compositeSequence.getName();
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + probeName);
        // String.matches() would require the WHOLE name to match the suffix pattern, so a munged name such as
        // "probe<SEP>2" would never be recognised. Stripping the suffix and checking whether anything changed
        // gives the intended "contains a munge suffix" test (the replacement is empty, so any hit shortens
        // the name).
        String unMungedName = probeName.replaceFirst(mungeRegex, "");
        boolean hasMungeSuffix = !unMungedName.equals(probeName);
        BioSequence match = null;
        if (nameMap.containsKey(probeName)) {
            match = nameMap.get(probeName);
            numMatchedByProbeName++;
        } else if (hasMungeSuffix) {
            if (nameMap.containsKey(unMungedName)) {
                // Actually use the sequence found under the un-munged name; previously the probe was counted as
                // matched but the sequence was never assigned and progress tracking was skipped.
                match = nameMap.get(unMungedName);
                numMatchedByProbeName++;
            }
            // NOTE(review): a munged name with no un-munged match is neither counted as missing nor tried by
            // accession, as in the original control flow - confirm this is intended.
        } else {
            BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
            if (biologicalCharacteristic != null) {
                biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap.containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                    match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                    numMatchedByAccession++;
                } else {
                    // The existing characteristic is not in the new sequence set, so it is dropped.
                    compositeSequence.setBiologicalCharacteristic(null);
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            } else {
                numWithNoSequence++;
                this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
            }
        }
        if (match != null) {
            // overwrite the existing characteristic if necessary.
            compositeSequence.setBiologicalCharacteristic(match);
            compositeSequence.setArrayDesign(arrayDesign);
        }
        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByAccession + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by Genbank accession");
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByProbeName + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences were matched to sequences by probe name");
    if (numWithNoSequence > 0)
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + numWithNoSequence + "/" + arrayDesign.getCompositeSequences().size() + " composite sequences with no associated biological characteristic");
    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    return bioSequences;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) FastaParser(ubic.gemma.core.loader.genome.FastaParser) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Aggregations

FastaParser (ubic.gemma.core.loader.genome.FastaParser): 3, BioSequence (ubic.gemma.model.genome.biosequence.BioSequence): 3, HashMap (java.util.HashMap): 1, HashSet (java.util.HashSet): 1, CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence): 1