use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class MockFastaCmd method getMultiple.
/**
 * Builds one sequence per accession by delegating to {@code makeSequence}.
 *
 * @param accessions identifiers to turn into sequences; each element is passed to {@code makeSequence} as-is
 * @param database   not used by this implementation
 * @param blastHome  not used by this implementation
 * @return the generated sequences, gathered in a {@link HashSet}
 */
@SuppressWarnings("unused")
private Collection<BioSequence> getMultiple(Collection<?> accessions, String database, String blastHome) {
    Collection<BioSequence> sequences = new HashSet<>();
    for (Object accession : accessions) {
        sequences.add(this.makeSequence(accession));
    }
    return sequences;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class AffyProbeNameFilter method filter.
/**
 * Drops rows whose probe name matches any of the enabled Affymetrix naming rules.
 * <p>
 * The name tested is that of the design element's biological characteristic when one is present, otherwise the
 * name of the design element itself.
 *
 * @param data matrix to filter
 * @return a new matrix built from {@code data} and the design elements that were kept
 */
@Override
public ExpressionDataDoubleMatrix filter(ExpressionDataDoubleMatrix data) {
    int rowCount = data.rows();
    List<CompositeSequence> kept = new ArrayList<>();

    for (int row = 0; row < rowCount; row++) {
        CompositeSequence element = data.getDesignElementForRow(row);
        assert element != null;

        BioSequence sequence = element.getBiologicalCharacteristic();
        String name = sequence != null ? sequence.getName() : element.getName();

        // Each rule only inspects the name when its flag is switched on.
        boolean rejected = (skip_ST && name.contains("_st")) // 'st' means sense strand.
                || (skip_AFFX && name.contains("AFFX")) // control probes.
                || (skip_F && name.contains("_f_at")) // gene family.
                || (skip_X && name.contains("_x_at"))
                || (skip_G && name.contains("_g_at"));

        if (!rejected) {
            kept.add(element);
        }
    }

    AffyProbeNameFilter.log.info("There are " + kept.size() + " rows left after Affy probe name filtering.");
    return new ExpressionDataDoubleMatrix(data, kept);
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class ArrayDesignMapResultServiceImpl method summarizeMapResults.
/**
 * Creates a mapping summary for every composite sequence supplied.
 * <p>
 * Elements that have a biological characteristic get their BLAT results attached, plus the gene products and
 * genes of their BLAT associations. Elements without one get a bare summary and are not included in the
 * progress count that is logged.
 *
 * @param compositeSequences elements to summarize
 * @return one summary per element, gathered in a {@link HashSet}
 */
@Override
public Collection<CompositeSequenceMapSummary> summarizeMapResults(Collection<CompositeSequence> compositeSequences) {
    Collection<CompositeSequenceMapSummary> summaries = new HashSet<>();
    int processed = 0;

    for (CompositeSequence element : compositeSequences) {
        CompositeSequenceMapSummary summary = new CompositeSequenceMapSummary(element);
        BioSequence sequence = element.getBiologicalCharacteristic();
        boolean hasSequence = sequence != null;

        if (hasSequence) {
            summary.setBlatResults(blatResultService.findByBioSequence(sequence));

            Collection<BlatAssociation> associations = blatAssociationService.find(sequence);
            blatAssociationService.thaw(associations);
            for (BlatAssociation association : associations) {
                summary.getGeneProducts().add(association.getGeneProduct());
                summary.getGenes().add(association.getGeneProduct().getGene());
            }
        }

        summaries.add(summary);

        // Only elements that actually had a sequence advance the counter.
        if (hasSequence && ++processed % 1000 == 0) {
            ArrayDesignMapResultServiceImpl.log.info("Processed " + processed + " elements...");
        }
    }

    ArrayDesignMapResultServiceImpl.log.info("Done, processed " + processed + " elements");
    return summaries;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class RepeatScan method processRepeatMaskerOutput.
/**
 * Reads the masked sequences from a FASTA file and copies them onto the matching original sequences.
 * <p>
 * Masked sequences are matched to originals by comparing the masked sequence's name with
 * {@code SequenceWriter.getIdentifier(original)}. Every matched original gets its sequence text and repeat
 * fraction overwritten; originals with no match are logged and left untouched.
 *
 * @param sequences          sequences to update
 * @param outputSequencePath path of the masked output, in FASTA format
 * @return the updated sequences whose repeat fraction is greater than zero
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be parsed
 */
public Collection<BioSequence> processRepeatMaskerOutput(Collection<BioSequence> sequences, String outputSequencePath) {
    FastaParser parser = new FastaParser();
    try {
        parser.parse(outputSequencePath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // Index the masked sequences by name so originals can be looked up directly.
    Map<String, BioSequence> maskedByName = new HashMap<>();
    for (BioSequence masked : parser.getResults()) {
        String name = masked.getName();
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Masked: " + name);
        }
        maskedByName.put(name, masked);
    }

    Collection<BioSequence> withRepeats = new HashSet<>();
    for (BioSequence original : sequences) {
        String identifier = SequenceWriter.getIdentifier(original);
        BioSequence masked = maskedByName.get(identifier);
        if (RepeatScan.log.isDebugEnabled()) {
            RepeatScan.log.debug("Orig: " + identifier);
        }

        if (masked != null) {
            original.setSequence(masked.getSequence());
            double fractionMasked = this.computeFractionMasked(masked);
            original.setFractionRepeats(fractionMasked);
            if (fractionMasked > 0) {
                withRepeats.add(original);
            }
        } else {
            RepeatScan.log.warn("No masked sequence for " + identifier);
        }
    }

    RepeatScan.log.info(withRepeats.size() + " sequences had non-zero repeat fractions.");
    return withRepeats;
}
use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.
the class SequenceWriter method writeSequencesToFile.
/**
 * Write a collection of sequences in FASTA format.
 * <p>
 * Sequences whose text is blank are skipped with a warning. Sequences whose identifier (as given by
 * {@code SequenceWriter.getIdentifier}) was already written are skipped and counted as repeats. Sequence text is
 * upper-cased before writing, using locale-independent rules.
 *
 * @param sequences  sequences
 * @param outputFile file
 * @return number of sequences written, excluding blanks and duplicates.
 * @throws IOException io problems
 */
public static int writeSequencesToFile(Collection<BioSequence> sequences, File outputFile) throws IOException {
    try (BufferedWriter out = new BufferedWriter(new FileWriter(outputFile))) {
        SequenceWriter.log.debug("Processing " + sequences.size() + " sequences for blat analysis");
        int count = 0;
        int repeats = 0;
        Collection<String> identifiers = new HashSet<>();
        for (BioSequence b : sequences) {
            if (StringUtils.isBlank(b.getSequence())) {
                SequenceWriter.log.warn("Blank sequence for " + b);
                continue;
            }
            String identifier = SequenceWriter.getIdentifier(b);
            // add() returns false when the identifier is already present, i.e. this is a repeat.
            if (!identifiers.add(identifier)) {
                SequenceWriter.log.debug(b + " is a repeat with identifier " + identifier);
                repeats++;
                // don't repeat sequences.
                continue;
            }
            // Upper-case so the sequence does not start out 'masked'. Locale.ROOT keeps the mapping independent
            // of the JVM default locale (e.g. a Turkish locale would otherwise turn 'i' into a dotted capital).
            out.write(">" + identifier + "\n" + b.getSequence().toUpperCase(java.util.Locale.ROOT) + "\n");
            if (++count % 2000 == 0) {
                SequenceWriter.log.debug("Wrote " + count + " sequences");
            }
        }
        SequenceWriter.log.info("Wrote " + count + " sequences to " + outputFile + (repeats > 0 ? " ( " + repeats + " repeated items were skipped)." : ""));
        return count;
    }
}
Aggregations