use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class ShuffleListProbeGenerator method selectRandomChromosome.
private Chromosome selectRandomChromosome(long totalGenomeLength, HashMap<Chromosome, int[]> limits, int probeLength) {
int count = 0;
while (true) {
// Select a random position in the genome
long randomPos = (long) (Math.random() * totalGenomeLength);
// Figure out which chromosome this relates to
long currentPos = 0;
Chromosome selectedChr = null;
for (Chromosome c : limits.keySet()) {
if (currentPos + c.length() >= randomPos) {
// It's this chromosome
selectedChr = c;
break;
}
currentPos += c.length();
}
if ((limits.get(selectedChr)[1] - limits.get(selectedChr)[0]) + 1 >= probeLength) {
return selectedChr;
}
++count;
if (count > 1000) {
System.err.println("Couldn't randomly generate a viable position for probe of length " + probeLength);
return null;
}
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class SeqMonkParser method parseAnnotation.
/**
* Parses an external set of annotations
*
* @param sections The tab split initial annotation line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private AnnotationSet parseAnnotation(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 3) {
throw new SeqMonkException("Annotation line didn't contain 3 sections");
}
AnnotationSet set = new AnnotationSet(application.dataCollection().genome(), sections[1]);
int featureCount = Integer.parseInt(sections[2]);
for (int i = 0; i < featureCount; i++) {
if (i % 1000 == 0) {
progressUpdated("Parsing annotation in " + set.name(), i, featureCount);
}
sections = br.readLine().split("\\t");
Chromosome c;
try {
c = application.dataCollection().genome().getChromosome(sections[1]).chromosome();
} catch (Exception sme) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(new SeqMonkException("Annotation feature could not be mapped to chromosome '" + sections[1] + "'"));
}
continue;
}
Feature f = new Feature(sections[0], c.name());
// TODO: Can we improve this to not use a Split Location each time?
f.setLocation(new SplitLocation(sections[2]));
for (int a = 3; a + 1 < sections.length; a += 2) {
f.addAttribute(sections[a], sections[a + 1]);
}
set.addFeature(f);
}
set.finalise();
return set;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class SeqMonkParser method parseProbes.
/**
* Parses the list of probes.
*
* @param sections The tab split initial line from the probes section
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private void parseProbes(String[] sections) throws SeqMonkException, IOException {
if (sections.length < 2) {
throw new SeqMonkException("Probe line didn't contain at least 2 sections");
}
if (!sections[0].equals("Probes")) {
throw new SeqMonkException("Couldn't find expected probes line");
}
int n = Integer.parseInt(sections[1]);
probes = new Probe[n];
String description = "No generator description available";
if (sections.length > 2) {
description = sections[2];
}
ProbeSet probeSet = new ProbeSet(description, n);
if (sections.length > 3) {
if (sections[3].length() > 0) {
probeSet.setCurrentQuantitation(sections[3]);
}
}
if (sections.length > 4) {
probeSet.setComments(sections[4].replaceAll("`", "\n"));
}
// We need to save the probeset to the dataset at this point so we can add the probe
// lists as we get to them.
application.dataCollection().setProbeSet(probeSet);
int positionOffset;
// We used to store chr start and end
if (thisDataVersion < 8) {
positionOffset = 4;
} else // We now store chr and packed position (to give start end and strand)
{
positionOffset = 3;
}
int expectedSectionLength = 3 + dataSets.length + dataGroups.length;
String line;
for (int i = 0; i < n; i++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of probe data at line " + i + " (expected " + n + " probes)");
}
// Since the probes section can have blank trailing sections we need
// to not trim these, hence the -1 limit.
sections = line.split("\\t", -1);
if (i == 0) {
/*
* Older versions of this format put down data for just
* datasets. Newer versions include data for datagroups
* as well. We need to figure out which one we're looking
* at
*/
if (sections.length == positionOffset + dataSets.length) {
expectedSectionLength = positionOffset + dataSets.length;
} else if (sections.length == positionOffset + dataSets.length + dataGroups.length) {
expectedSectionLength = positionOffset + dataSets.length + dataGroups.length;
}
}
if (sections.length != expectedSectionLength) {
throw new SeqMonkException("Expected " + expectedSectionLength + " sections in data file for " + sections[0] + " but got " + sections.length);
}
if (i % 10000 == 0) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressUpdated("Processed data for " + i + " probes", i, n);
}
}
Chromosome c = application.dataCollection().genome().getChromosome(sections[1]).chromosome();
// Sanity check
if (c == null) {
throw new SeqMonkException("Couldn't find a chromosome called " + sections[1]);
}
Probe p;
if (thisDataVersion < 8) {
int start = Integer.parseInt(sections[2]);
int end = Integer.parseInt(sections[3]);
p = new Probe(c, start, end);
} else {
long packedValue = Long.parseLong(sections[2]);
p = new Probe(c, packedValue);
}
if (!sections[0].equals("null")) {
p.setName(sections[0]);
}
probes[i] = p;
probeSet.addProbe(probes[i], null);
for (int j = positionOffset; j < sections.length; j++) {
if (sections[j].length() == 0)
continue;
if ((j - positionOffset) >= dataSets.length) {
dataGroups[j - (positionOffset + dataSets.length)].setValueForProbe(p, Float.parseFloat(sections[j]));
} else {
dataSets[j - positionOffset].setValueForProbe(p, Float.parseFloat(sections[j]));
}
}
}
application.dataCollection().activeProbeListChanged(probeSet);
// This rename doesn't actually change the name. We put this in because
// the All Probes group is drawn in the data view before probes have been
// added to it. This means that it's name isn't updated when the probes have
// been added and it appears labelled with 0 probes. This doesn't happen if
// there are any probe lists under all probes as they cause it to be refreshed,
// but if you only have the probe set then you need this to make the display show
// the correct information.
probeSet.setName("All Probes");
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class SeqMonkParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private void parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
application.dataCollection().addDataSet(dataSets[i]);
}
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
Enumeration<ProgressListener> en = listeners.elements();
while (en.hasMoreElements()) {
en.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
sections = line.split("\t");
Chromosome c;
try {
c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
} catch (Exception sme) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
}
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand), true);
} catch (SeqMonkException ex) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
// We use this to update the progress bar every time we move to a new chromosome
int seenChromosomeCount = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
++seenChromosomeCount;
progressUpdated("Reading data for " + dataSets[i].name(), i * application.dataCollection().genome().getChromosomeCount() + seenChromosomeCount, n * application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
int tabPosition = line.indexOf('\t');
long packedPosition = quickParseLong(line, 0);
int tabPosition2 = line.indexOf('\t', tabPosition + 1);
Chromosome c2 = application.dataCollection().genome().getChromosome(line.substring(tabPosition, tabPosition2)).chromosome();
long packedPosition2 = quickParseLong(line, tabPosition2 + 1);
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
if ((readsRead % (1 + (readCount / 10))) == 0) {
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// We use some custom parsing code to efficiently extract the packed position and
// count from the line. This avoids having to use the generic parsing code or
// doing a split to determine the tab location.
long packedPosition = 0;
int count = 0;
// Check for a sign.
int sign = 1;
int len = line.length();
if (len == 0) {
System.err.println("Yuk - blank line found in sample data");
}
char ch = line.charAt(0);
if (ch == '-') {
sign = -1;
} else {
packedPosition = ch - '0';
}
// Build the numbers.
int j = 1;
int tabIndex = 0;
while (j < len) {
if (line.charAt(j) == '\t') {
tabIndex = j++;
} else if (tabIndex > 0) {
// char is ASCII so need to subtract char of '0' to get correct int value
count = (count * 10) + (line.charAt(j++) - '0');
} else {
packedPosition = (packedPosition * 10) + (line.charAt(j++) - '0');
}
}
packedPosition = sign * packedPosition;
if (count == 0) {
// There was no count so there's only one of these
count = 1;
} else {
// Update the true read count with the additional reads we get from this line
r += (count - 1);
}
// System.out.println("From line "+line+" got packed="+packedPosition+" count="+count);
dataSets[i].addData(c, packedPosition, count, true);
readsRead += count;
}
}
}
// We've now read all of the data for this sample so we can compact it
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Caching data for " + dataSets[i].name(), (i + 1) * 10, n * 10);
}
dataSets[i].finalise();
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class VisibleStoresParser method processNormalDataStore.
private DataSet processNormalDataStore(DataStore store) {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean removeStrand = prefs.removeStrandInfo();
DataSet newData = new DataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
ReadsWithCounts reads = store.getReadsForChromosome(chrs[c]);
Feature[] features = null;
if (filterByFeature) {
features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
Arrays.sort(features);
}
int currentFeaturePostion = 0;
for (int r = 0; r < reads.reads.length; r++) {
for (int ct = 0; ct < reads.counts[r]; ct++) {
long thisRead = reads.reads[r];
if (cancel) {
progressCancelled();
return null;
}
if (downsample && downsampleProbabilty < 1) {
if (Math.random() > downsampleProbabilty) {
continue;
}
}
long read;
int start = SequenceRead.start(thisRead);
int end = SequenceRead.end(thisRead);
int strand = SequenceRead.strand(thisRead);
if (filterByStrand) {
if (strand == Location.FORWARD && !keepForward)
continue;
if (strand == Location.REVERSE && !keepReverse)
continue;
if (strand == Location.UNKNOWN && !keepUnknown)
continue;
}
if (filterByLength) {
int length = SequenceRead.length(thisRead);
if (minLength != null && length < minLength)
continue;
if (maxLength != null && length > maxLength)
continue;
}
if (strand == Location.FORWARD) {
start += forwardOffset;
end += forwardOffset;
}
if (strand == Location.REVERSE) {
start -= reverseOffset;
end -= reverseOffset;
}
if (filterByFeature && features.length == 0 && !excludeFeature)
continue;
if (filterByFeature && features.length > 0) {
// See if we're comparing against the right feature
while (SequenceRead.start(thisRead) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
currentFeaturePostion++;
}
// Test to see if we overlap
if (SequenceRead.overlaps(thisRead, features[currentFeaturePostion].location().packedPosition())) {
if (excludeFeature)
continue;
} else {
if (!excludeFeature)
continue;
}
}
if (reverse) {
if (strand == Location.FORWARD) {
strand = Location.REVERSE;
} else if (strand == Location.REVERSE) {
strand = Location.FORWARD;
}
}
if (removeStrand) {
strand = Location.UNKNOWN;
}
if (extractCentres) {
int centre = start + ((end - start) / 2);
start = centre - centreExtractContext;
end = centre + centreExtractContext;
}
if (extendBy != 0) {
// We now allow negative extensions to shorten reads
if (strand == Location.FORWARD || strand == Location.UNKNOWN) {
end += extendBy;
if (end < start)
end = start;
} else if (strand == Location.REVERSE) {
start -= extendBy;
if (start > end)
start = end;
}
}
// We don't allow reads before the start of the chromosome
if (start < 1) {
int overrun = (0 - start) + 1;
progressWarningReceived(new SeqMonkException("Reading position " + start + " was " + overrun + "bp before the start of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
// We also don't allow readings which are beyond the end of the chromosome
if (end > chrs[c].length()) {
int overrun = end - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
// We can now make the new reading
try {
read = SequenceRead.packPosition(start, end, strand);
if (!prefs.isHiC()) {
// HiC additions are deferred until we know the other end is OK too.
newData.addData(chrs[c], read);
}
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
}
}
}
return newData;
}
Aggregations