Use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
From the class QuasRFileParser, method run.
/**
 * Parses every file returned by {@code getFiles()} into its own
 * {@code DataSet} and hands the resulting array to
 * {@code processingFinished}.
 *
 * <p>Each non-blank, non-comment line is split on tabs and must contain at
 * least five columns, read as: chromosome name, start, end, total count and
 * methylated count. For every line, {@code methCount} forward-strand reads
 * and {@code totalCount - methCount} reverse-strand reads are added at the
 * given position. Files whose name ends in {@code .gz} are read through a
 * {@code GZIPInputStream}.
 *
 * <p>Problems with an individual line are reported through
 * {@code progressWarningReceived} and the line is skipped. Any other failure
 * is reported through {@code progressExceptionReceived} and aborts the
 * import. The reader for the current file is always closed, whether the file
 * was read completely, the import was cancelled, or an exception was thrown.
 *
 * @see java.lang.Runnable#run()
 */
public void run() {
    try {
        File[] quasrFiles = getFiles();
        DataSet[] newData = new DataSet[quasrFiles.length];

        for (int f = 0; f < quasrFiles.length; f++) {
            File file = quasrFiles[f];

            BufferedReader br;
            if (file.getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
            } else {
                br = new BufferedReader(new FileReader(file));
            }

            // Everything that touches the reader lives inside try/finally so
            // the file handle is released on every exit path (normal
            // completion, cancellation, or an exception).
            try {
                newData[f] = new DataSet(file.getName(), file.getCanonicalPath(), prefs.removeDuplicates());

                String line;
                int lineCount = 0;

                // Now process the file
                while ((line = br.readLine()) != null) {

                    if (cancel) {
                        // The finally block below closes the reader.
                        progressCancelled();
                        return;
                    }

                    // Ignore blank lines
                    if (line.trim().length() == 0) {
                        continue;
                    }

                    // In case it has comments
                    if (line.startsWith("#")) {
                        continue;
                    }

                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + file.getName(), f, quasrFiles.length);
                    }

                    String[] sections = line.split("\t");

                    // Check to see if we've got enough data to work with
                    if (sections.length < 5) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }

                    int start;
                    int end;
                    int methCount;
                    int unmethCount;

                    try {
                        start = Integer.parseInt(sections[1]);
                        end = Integer.parseInt(sections[2]);
                        int totalCount = Integer.parseInt(sections[3]);
                        methCount = Integer.parseInt(sections[4]);
                        // If methCount exceeds totalCount this goes negative
                        // and the unmethylated loop below adds nothing.
                        unmethCount = totalCount - methCount;

                        // End must always be later than start
                        if (start > end) {
                            progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                            int temp = start;
                            start = end;
                            end = temp;
                        }
                    } catch (NumberFormatException e) {
                        // Any of the four numeric columns may be the one at
                        // fault, so report the whole line.
                        progressWarningReceived(new SeqMonkException("Could not parse start/end/count values as integers in line '" + line + "'"));
                        continue;
                    }

                    try {
                        ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);

                        // Translate the file coordinates through the
                        // chromosome's position mapping before validating.
                        start = c.position(start);
                        end = c.position(end);

                        // We also don't allow readings which are beyond the end of the chromosome
                        if (end > c.chromosome().length()) {
                            int overrun = end - c.chromosome().length();
                            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                            continue;
                        }

                        // We can now make the new reads: methylated calls are
                        // stored as forward reads, unmethylated as reverse.
                        long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
                        long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);

                        for (int i = 0; i < methCount; i++) {
                            newData[f].addData(c.chromosome(), methRead);
                        }
                        for (int i = 0; i < unmethCount; i++) {
                            newData[f].addData(c.chromosome(), unmethRead);
                        }
                    } catch (IllegalArgumentException iae) {
                        progressWarningReceived(iae);
                        continue;
                    } catch (SeqMonkException sme) {
                        progressWarningReceived(sme);
                        continue;
                    }
                }
            } finally {
                // We're finished with the file, one way or another.
                br.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + file.getName(), f, quasrFiles.length);
            newData[f].finalise();
        }

        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
Aggregations