use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class GffFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
int extendBy = prefs.extendReads();
try {
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]);
end = Integer.parseInt(sections[4]);
// End must always be later than start
if (start > end) {
int temp = start;
start = end;
end = temp;
}
if (sections.length >= 7) {
if (sections[6].equals("+")) {
strand = Location.FORWARD;
} else if (sections[6].equals("-")) {
strand = Location.REVERSE;
} else if (sections[6].equals(".")) {
strand = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[6] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections[0]);
} catch (IllegalArgumentException sme) {
progressWarningReceived(sme);
continue;
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reading
try {
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c.chromosome(), read);
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class SeqMonkParser method parseGroups.
/**
* Parses the list of sample groups.
*
* @param sections The tab split values from the initial groups line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private void parseGroups(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Data Groups line didn't contain 2 sections");
}
if (!sections[0].equals("Data Groups")) {
throw new SeqMonkException("Couldn't find expected data groups line");
}
int n = Integer.parseInt(sections[1]);
dataGroups = new DataGroup[n];
for (int i = 0; i < n; i++) {
String[] group = br.readLine().split("\\t");
DataSet[] groupMembers = new DataSet[group.length - 1];
if (thisDataVersion < 4) {
DataSet[] allSets = application.dataCollection().getAllDataSets();
// In the bad old days we used to refer to datasets by their name
for (int j = 1; j < group.length; j++) {
boolean seen = false;
for (int d = 0; d < allSets.length; d++) {
if (allSets[d].name().equals(group[j])) {
if (seen) {
// System.out.println("Seen this before - abort abort");
throw new SeqMonkException("Name " + group[j] + " is ambiguous in group " + group[0]);
}
// System.out.println("Seen for the first time");
groupMembers[j - 1] = allSets[d];
seen = true;
}
}
}
} else {
for (int j = 1; j < group.length; j++) {
// The more modern and safer way to make up a group is by its index
groupMembers[j - 1] = application.dataCollection().getDataSet(Integer.parseInt(group[j]));
if (groupMembers[j - 1] == null) {
throw new SeqMonkException("Couldn't find dataset at position " + group[j]);
}
}
}
DataGroup g = new DataGroup(group[0], groupMembers);
application.dataCollection().addDataGroup(g);
dataGroups[i] = g;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class SeqMonkParser method parseVisibleStores.
/**
* Parses the list of dataStores which should initially be visible
*
* @param sections The tab split initial line from the visible stores section
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private void parseVisibleStores(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Visible stores line didn't contain 2 sections");
}
int n = Integer.parseInt(sections[1]);
/*
* Collect the drawn stores in an array. We used to add them as we found
* them but this was inefficient since we had to redo a calculation for
* every one we added. This way we only need to calculate once.
*/
DataStore[] drawnStores = new DataStore[n];
if (thisDataVersion < 4) {
// In the bad old days we used to refer to datasets and stores by name
// which caused problems when names were duplicated. We do our best
// with these cases.
DataSet[] sets = application.dataCollection().getAllDataSets();
DataGroup[] groups = application.dataCollection().getAllDataGroups();
for (int i = 0; i < n; i++) {
String line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of visible store data at line " + i + " (expected " + n + " stores)");
}
String[] storeSections = line.split("\\t");
if (storeSections.length != 2) {
throw new SeqMonkException("Expected 2 sections in visible store line but got " + storeSections.length);
}
if (storeSections[1].equals("set")) {
for (int s = 0; s < sets.length; s++) {
if (sets[s].name().equals(storeSections[0])) {
drawnStores[i] = sets[s];
break;
}
}
} else if (storeSections[1].equals("group")) {
for (int g = 0; g < groups.length; g++) {
if (groups[g].name().equals(storeSections[0])) {
drawnStores[i] = groups[g];
break;
}
}
} else {
throw new SeqMonkException("Didn't recognise data type '" + storeSections[1] + "' when adding visible stores from line '" + line + "'");
}
}
} else {
for (int i = 0; i < n; i++) {
String line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of visible store data at line " + i + " (expected " + n + " stores)");
}
String[] storeSections = line.split("\\t");
if (storeSections.length != 2) {
throw new SeqMonkException("Expected 2 sections in visible store line but got " + storeSections.length);
}
if (storeSections[1].equals("set")) {
drawnStores[i] = application.dataCollection().getDataSet(Integer.parseInt(storeSections[0]));
} else if (storeSections[1].equals("group")) {
drawnStores[i] = application.dataCollection().getDataGroup(Integer.parseInt(storeSections[0]));
} else if (storeSections[1].equals("replicate")) {
drawnStores[i] = application.dataCollection().getReplicateSet(Integer.parseInt(storeSections[0]));
} else {
throw new SeqMonkException("Didn't recognise data type '" + storeSections[1] + "' when adding visible stores from line '" + line + "'");
}
}
}
application.addToDrawnDataStores(drawnStores);
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class SeqMonkParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private void parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
application.dataCollection().addDataSet(dataSets[i]);
}
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
Enumeration<ProgressListener> en = listeners.elements();
while (en.hasMoreElements()) {
en.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
sections = line.split("\t");
Chromosome c;
try {
c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
} catch (Exception sme) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
}
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand), true);
} catch (SeqMonkException ex) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
// We use this to update the progress bar every time we move to a new chromosome
int seenChromosomeCount = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
++seenChromosomeCount;
progressUpdated("Reading data for " + dataSets[i].name(), i * application.dataCollection().genome().getChromosomeCount() + seenChromosomeCount, n * application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
int tabPosition = line.indexOf('\t');
long packedPosition = quickParseLong(line, 0);
int tabPosition2 = line.indexOf('\t', tabPosition + 1);
Chromosome c2 = application.dataCollection().genome().getChromosome(line.substring(tabPosition, tabPosition2)).chromosome();
long packedPosition2 = quickParseLong(line, tabPosition2 + 1);
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
if ((readsRead % (1 + (readCount / 10))) == 0) {
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// We use some custom parsing code to efficiently extract the packed position and
// count from the line. This avoids having to use the generic parsing code or
// doing a split to determine the tab location.
long packedPosition = 0;
int count = 0;
// Check for a sign.
int sign = 1;
int len = line.length();
if (len == 0) {
System.err.println("Yuk - blank line found in sample data");
}
char ch = line.charAt(0);
if (ch == '-') {
sign = -1;
} else {
packedPosition = ch - '0';
}
// Build the numbers.
int j = 1;
int tabIndex = 0;
while (j < len) {
if (line.charAt(j) == '\t') {
tabIndex = j++;
} else if (tabIndex > 0) {
// char is ASCII so need to subtract char of '0' to get correct int value
count = (count * 10) + (line.charAt(j++) - '0');
} else {
packedPosition = (packedPosition * 10) + (line.charAt(j++) - '0');
}
}
packedPosition = sign * packedPosition;
if (count == 0) {
// There was no count so there's only one of these
count = 1;
} else {
// Update the true read count with the additional reads we get from this line
r += (count - 1);
}
// System.out.println("From line "+line+" got packed="+packedPosition+" count="+count);
dataSets[i].addData(c, packedPosition, count, true);
readsRead += count;
}
}
}
// We've now read all of the data for this sample so we can compact it
Enumeration<ProgressListener> en2 = listeners.elements();
while (en2.hasMoreElements()) {
en2.nextElement().progressUpdated("Caching data for " + dataSets[i].name(), (i + 1) * 10, n * 10);
}
dataSets[i].finalise();
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class VisibleStoresParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
filterByStrand = prefs.filterStrandCheckbox.isSelected();
if (filterByStrand) {
if (prefs.strandFilterDirectionBox.getSelectedItem().equals("Forward")) {
keepForward = true;
keepReverse = false;
keepUnknown = false;
}
if (prefs.strandFilterDirectionBox.getSelectedItem().equals("Reverse")) {
keepForward = false;
keepReverse = true;
keepUnknown = false;
}
if (prefs.strandFilterDirectionBox.getSelectedItem().equals("Forward or Reverse")) {
keepForward = true;
keepReverse = true;
keepUnknown = false;
}
if (prefs.strandFilterDirectionBox.getSelectedItem().equals("Unknown")) {
keepForward = false;
keepReverse = false;
keepUnknown = true;
}
}
forwardOffset = prefs.forwardOffset();
reverseOffset = prefs.reverseOffset();
filterByFeature = prefs.filterFeatureCheckbox.isSelected();
if (filterByFeature) {
excludeFeature = prefs.filterTypeBox.getSelectedItem().equals("Excluding");
featureType = (String) prefs.filterFeatureBox.getSelectedItem();
}
filterByLength = prefs.filterByLengthBox.isSelected();
if (filterByLength) {
if (prefs.minLengthField.getText().length() > 0) {
minLength = Integer.parseInt(prefs.minLengthField.getText());
}
if (prefs.maxLengthField.getText().length() > 0) {
maxLength = Integer.parseInt(prefs.maxLengthField.getText());
}
}
extractCentres = prefs.extractCentres();
centreExtractContext = prefs.centreExtractContext();
int downsampleTarget = 0;
if (prefs.downsampleBox.isSelected() && prefs.downsampleTargetField.getText().length() > 0) {
downsample = true;
downsampleTarget = Integer.parseInt(prefs.downsampleTargetField.getText());
}
DataSet[] newDataStores = new DataSet[visibleStores.length];
for (int s = 0; s < visibleStores.length; s++) {
if (downsample) {
// Work out the probability for this sample
int realReadCount = visibleStores[s].getTotalReadCount();
downsampleProbabilty = ((double) downsampleTarget) / realReadCount;
}
DataSet newData = null;
try {
if (prefs.isHiC()) {
newData = processHiCDataStore(visibleStores[s]);
} else {
newData = processNormalDataStore(visibleStores[s]);
}
} catch (Exception ex) {
progressExceptionReceived(ex);
}
if (newData == null) {
// They cancelled
return;
}
// Cache the data in the new dataset
progressUpdated("Caching data", 1, 1);
newData.finalise();
newDataStores[s] = newData;
}
processingFinished(newDataStores);
}
Aggregations