Use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.
In class RepeatScan, method processRepeatMaskerOutput.
/**
 * Reads the masked sequences written to {@code outputSequencePath} and copies each one back onto
 * the original sequence that has the same identifier, recording the fraction that was masked.
 * <p>
 * Every original sequence for which a masked counterpart is found is modified in place (its
 * sequence and repeat fraction are overwritten), even when the computed fraction is zero.
 * Originals without a masked counterpart are left untouched and a warning is logged.
 *
 * @param sequences          the original sequences to update
 * @param outputSequencePath path of the masked output, in FASTA format
 * @return the updated sequences whose computed repeat fraction is greater than zero
 * @throws RuntimeException wrapping the {@link IOException} if the output file cannot be parsed
 */
public Collection<BioSequence> processRepeatMaskerOutput(Collection<BioSequence> sequences, String outputSequencePath) {
    FastaParser fastaParser = new FastaParser();
    try {
        fastaParser.parse(outputSequencePath);
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }

    // Index the masked sequences by name; a later entry with the same name replaces an earlier one.
    Map<String, BioSequence> maskedByName = new HashMap<>();
    for (BioSequence masked : fastaParser.getResults()) {
        String name = masked.getName();
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Masked: " + name);
        }
        maskedByName.put(name, masked);
    }

    Collection<BioSequence> withRepeats = new HashSet<>();
    for (BioSequence original : sequences) {
        String key = SequenceWriter.getIdentifier(original);
        BioSequence masked = maskedByName.get(key);
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Orig: " + key);
        }

        if (masked != null) {
            original.setSequence(masked.getSequence());
            double fractionMasked = this.computeFractionMasked(masked);
            original.setFractionRepeats(fractionMasked);
            // Only sequences that actually contain repeats are reported back to the caller.
            if (fractionMasked > 0) {
                withRepeats.add(original);
            }
        } else {
            RepeatScan.log.warn("No masked sequence for " + key);
        }
    }

    RepeatScan.log.info(withRepeats.size() + " sequences had non-zero repeat fractions.");
    return withRepeats;
}
Use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.
In class ArrayDesignSequenceProcessingServiceImpl, method assignSequencesToDesignElements.
/**
 * Associate sequences with an array design, reading the sequences from a FASTA stream. The stream
 * is parsed and the resulting sequences are handed to the collection-based overload of this
 * method. It is assumed that the name of each sequence can be matched to the name of a design
 * element. Provided for testing purposes.
 *
 * @param designElements the design elements to associate sequences with
 * @param fastaFile      stream of FASTA-formatted sequences
 * @throws IOException if the stream cannot be parsed
 */
@Override
public void assignSequencesToDesignElements(Collection<CompositeSequence> designElements, InputStream fastaFile) throws IOException {
    FastaParser fastaParser = new FastaParser();
    fastaParser.parse(fastaFile);

    Collection<BioSequence> parsedSequences = fastaParser.getResults();
    int parsedCount = parsedSequences.size();
    ArrayDesignSequenceProcessingServiceImpl.log.debug("Parsed " + parsedCount + " sequences");

    this.assignSequencesToDesignElements(designElements, parsedSequences);
}
Use of ubic.gemma.core.loader.genome.FastaParser in project Gemma by PavlidisLab.
In class ArrayDesignSequenceProcessingServiceImpl, method processArrayDesign.
/**
 * Associates the sequences in {@code sequenceFile} with the composite sequences of
 * {@code arrayDesign}.
 * <p>
 * {@code AFFY_PROBE} and {@code OLIGO} inputs are delegated to {@code processAffymetrixDesign} and
 * {@code processOligoDesign} respectively. Any other sequence type is parsed as FASTA; each parsed
 * sequence is given the supplied type, DNA polymer type and the validated taxon, persisted, and
 * indexed. Each composite sequence is then matched, in order of preference:
 * <ol>
 * <li>by its exact name;</li>
 * <li>by its name with the duplicate-probe munge suffix (the munge separator and everything after
 * it) removed;</li>
 * <li>by the accession of the database entry of its current biological characteristic.</li>
 * </ol>
 * A match replaces the composite sequence's biological characteristic. If nothing matches, an
 * existing biological characteristic is cleared and the composite sequence is counted as having no
 * sequence. Finally the array design is updated and its report regenerated.
 *
 * @param arrayDesign  the array design whose composite sequences are updated
 * @param sequenceFile stream containing the sequences
 * @param sequenceType the type of the sequences; also selects the processing path
 * @param taxon        taxon for the sequences, passed through {@code validateTaxon}
 * @return for FASTA input, the collection of sequences returned by the parser; otherwise whatever
 *         the delegate returns
 * @throws IOException if the sequence file cannot be read
 */
@Override
public Collection<BioSequence> processArrayDesign(ArrayDesign arrayDesign, InputStream sequenceFile, SequenceType sequenceType, Taxon taxon) throws IOException {
    if (sequenceType.equals(SequenceType.AFFY_PROBE)) {
        return this.processAffymetrixDesign(arrayDesign, sequenceFile, taxon, true);
    } else if (sequenceType.equals(SequenceType.OLIGO)) {
        return this.processOligoDesign(arrayDesign, sequenceFile, taxon);
    }

    taxon = this.validateTaxon(taxon, arrayDesign);
    this.checkForCompositeSequences(arrayDesign);

    FastaParser fastaParser = new FastaParser();
    fastaParser.parse(sequenceFile);
    Collection<BioSequence> bioSequences = fastaParser.getResults();

    // make two maps: one for genbank ids, one for the sequence name.
    Map<String, BioSequence> gbIdMap = new HashMap<>();
    Map<String, BioSequence> nameMap = new HashMap<>();

    int total = bioSequences.size() + arrayDesign.getCompositeSequences().size();
    int done = 0;
    int percent = 0;

    for (BioSequence sequence : bioSequences) {
        sequence.setType(sequenceType);
        sequence.setPolymerType(PolymerType.DNA);
        sequence.setTaxon(taxon);
        // Index the persisted instance so that it is the one attached to composite sequences.
        sequence = this.persistSequence(sequence);
        this.addToMaps(gbIdMap, nameMap, sequence);
    }

    ArrayDesignSequenceProcessingServiceImpl.log.info("Sequences done, updating composite sequences");

    int numWithNoSequence = 0;
    int numMatchedByAccession = 0;
    int numMatchedByProbeName = 0;

    // Compiled once rather than on every iteration. Fully qualified because this method is the only
    // user of java.util.regex here.
    String mungeRegex = ArrayDesignSequenceProcessingServiceImpl.DUPLICATE_PROBE_NAME_MUNGE_SEPARATOR + ".+$";
    java.util.regex.Pattern mungePattern = java.util.regex.Pattern.compile(mungeRegex);

    for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {
        String probeName = compositeSequence.getName();
        if (ArrayDesignSequenceProcessingServiceImpl.log.isTraceEnabled())
            ArrayDesignSequenceProcessingServiceImpl.log.trace("Looking for sequence for: " + probeName);

        BioSequence match = null;
        if (nameMap.containsKey(probeName)) {
            match = nameMap.get(probeName);
            numMatchedByProbeName++;
        } else {
            // Probes whose names carry a munge suffix are looked up under the name with that suffix
            // removed. The suffix is located with find(): String.matches() would require the whole
            // name to match the pattern, in which case stripping the match leaves nothing to look up.
            java.util.regex.Matcher mungeMatcher = mungePattern.matcher(probeName);
            String unMungedName = mungeMatcher.find() ? mungeMatcher.replaceFirst("") : null;

            if (unMungedName != null && nameMap.containsKey(unMungedName)) {
                // Actually attach the sequence; counting the probe as matched without assigning it
                // would leave the probe without the sequence the log claims it has.
                match = nameMap.get(unMungedName);
                numMatchedByProbeName++;
            } else {
                // Fall back to the accession of whatever sequence the probe already has.
                BioSequence biologicalCharacteristic = compositeSequence.getBiologicalCharacteristic();
                if (biologicalCharacteristic != null) {
                    biologicalCharacteristic = bioSequenceService.thaw(biologicalCharacteristic);
                    if (biologicalCharacteristic.getSequenceDatabaseEntry() != null && gbIdMap.containsKey(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession())) {
                        match = gbIdMap.get(biologicalCharacteristic.getSequenceDatabaseEntry().getAccession());
                        numMatchedByAccession++;
                    } else {
                        // The existing sequence is not among those supplied: detach it.
                        compositeSequence.setBiologicalCharacteristic(null);
                        numWithNoSequence++;
                        this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                    }
                } else {
                    numWithNoSequence++;
                    this.notifyAboutMissingSequences(numWithNoSequence, compositeSequence);
                }
            }
        }

        if (match != null) {
            // overwrite the existing characteristic if necessary.
            compositeSequence.setBiologicalCharacteristic(match);
            compositeSequence.setArrayDesign(arrayDesign);
        }

        if (++done % 1000 == 0) {
            percent = this.updateProgress(total, done, percent);
        }
    }

    int numCompositeSequences = arrayDesign.getCompositeSequences().size();
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByAccession + "/" + numCompositeSequences + " composite sequences were matched to sequences by Genbank accession");
    ArrayDesignSequenceProcessingServiceImpl.log.info(numMatchedByProbeName + "/" + numCompositeSequences + " composite sequences were matched to sequences by probe name");
    if (numWithNoSequence > 0)
        ArrayDesignSequenceProcessingServiceImpl.log.info("There were " + numWithNoSequence + "/" + numCompositeSequences + " composite sequences with no associated biological characteristic");

    ArrayDesignSequenceProcessingServiceImpl.log.info("Updating sequences on arrayDesign");
    arrayDesignService.update(arrayDesign);
    arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
    return bioSequences;
}
Aggregations