use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BAMFileParser method getSingleEndRead.
/**
 * Builds a single end read from one alignment record.
 *
 * The strand comes from the record's negative-strand flag. When extendBy
 * is positive the read is lengthened in the direction it points: forward
 * reads have their end pushed out, reverse reads have their start pulled
 * back. Both coordinates are then passed through the chromosome's
 * position() mapping and checked against the chromosome bounds.
 *
 * @param samRecord The alignment record to convert
 * @return The read together with the chromosome it lies on
 * @throws SeqMonkException if the chromosome lookup fails, or if the
 * (possibly extended) read falls outside the chromosome
 */
private SequenceReadWithChromosome getSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int start = samRecord.getAlignmentStart();
    int end = samRecord.getAlignmentEnd();
    int strand = samRecord.getReadNegativeStrandFlag() ? Location.REVERSE : Location.FORWARD;

    // Only forward and reverse are possible here, so anything which is
    // not forward must be extended backwards from its start.
    if (extendBy > 0) {
        if (strand == Location.FORWARD) {
            end += extendBy;
        } else {
            start -= extendBy;
        }
    }

    // Any failure to resolve the reference name is reported to the caller
    // as a SeqMonkException carrying the original message.
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception lookupFailure) {
        throw new SeqMonkException(lookupFailure.getLocalizedMessage());
    }

    start = c.position(start);
    end = c.position(end);

    // Reads which run off the end of the chromosome are rejected...
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun
                + "bp beyond the end of chr" + c.chromosome().name()
                + " (" + c.chromosome().length() + ")");
    }

    // ...as are reads which begin before its first base.
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start
                + " was before the start of chr" + c.chromosome().name()
                + " (" + c.chromosome().length() + ")");
    }

    return new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BowtieFileParser method getPairedEndRead.
/**
 * Gets a paired end read.
 *
 * The two mates are merged into a single read spanning from the lowest
 * start to the highest end. The strand of the merged read is taken from
 * the orientation of the mates (+/- or -/+), flipped if the first line
 * supplied is actually read two of the pair; any other combination of
 * orientations gives an unknown strand.
 *
 * Columns used from each line: 0 = read name, 1 = strand ("+" or "-"),
 * 2 = chromosome name, 3 = zero based start position, 4 = read sequence
 * (only its length is used).
 *
 * @param sections1 The tab split bowtie output sections for the first read
 * @param sections2 The tab split bowtie output sections for the second read
 * @return The paired end read which was read
 * @throws SeqMonkException if a position is not an integer, the mates are
 * on different chromosomes, the merged read is longer than the paired end
 * distance cutoff, the chromosome can't be found, or the merged read falls
 * outside the chromosome
 */
private SequenceReadWithChromosome getPairedEndRead(String[] sections1, String[] sections2) throws SeqMonkException {
    // We can get the lines with read two first, in which case we'll reverse things.
    // endsWith gives the same answer as comparing the last character, but
    // doesn't throw if the read name happens to be empty.
    boolean readsAreReversed = sections1[0].endsWith("2");

    int strand;
    int start;
    int end;
    try {
        // The read names are deliberately not compared: newer bowtie files
        // don't follow the old naming convention for pairs so we can't rely
        // on it any more.

        // Bowtie positions are zero based, ours start at one.
        int read1start = Integer.parseInt(sections1[3]) + 1;
        int read1end = read1start + (sections1[4].length() - 1);
        int read2start = Integer.parseInt(sections2[3]) + 1;
        int read2end = read2start + (sections2[4].length() - 1);

        // The merged read covers both mates
        start = Math.min(read1start, read2start);
        end = Math.max(read1end, read2end);

        if (sections1[1].equals("+") && sections2[1].equals("-")) {
            strand = readsAreReversed ? Location.REVERSE : Location.FORWARD;
        } else if (sections1[1].equals("-") && sections2[1].equals("+")) {
            strand = readsAreReversed ? Location.FORWARD : Location.REVERSE;
        } else {
            strand = Location.UNKNOWN;
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections1[3] + " or " + sections2[3] + " was not an integer");
    }

    // The chromosomes are checked before the distance, since the distance
    // between positions on two different chromosomes is meaningless and
    // would otherwise produce a misleading error.
    if (!sections1[2].equals(sections2[2])) {
        throw new SeqMonkException("Paried end read was on a different chromosome");
    }

    if ((end - start) + 1 > pairedEndDistance) {
        throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
    }

    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(sections1[2]);
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }

    start = c.position(start);
    end = c.position(end);

    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }

    // We can now make the new reading
    return new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BismarkCovFileParser method run.
/*
 * Imports every file returned by getFiles() into its own DataSet and
 * hands the finished sets to processingFinished().
 *
 * Files whose name ends in .gz are read through a GZIP stream. Each non
 * blank line must have at least 6 tab separated columns, of which
 * 0 = chromosome, 1 = start, 2 = end, 4 = methylated count and
 * 5 = unmethylated count are used. Methylated counts are stored as
 * forward strand reads and unmethylated counts as reverse strand reads
 * over the same position.
 *
 * Lines which can't be used are reported through progressWarningReceived()
 * and skipped. Any other failure aborts the import and is reported
 * through progressExceptionReceived().
 *
 * (non-Javadoc)
 * @see java.lang.Runnable#run()
 */
public void run() {
    try {
        File[] covFiles = getFiles();
        DataSet[] newData = new DataSet[covFiles.length];
        for (int f = 0; f < covFiles.length; f++) {
            BufferedReader br;
            if (covFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(covFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(covFiles[f]));
            }

            // Everything which uses the reader lives inside this try so that
            // the file handle is released however we leave it.
            try {
                String line;
                newData[f] = new DataSet(covFiles[f].getName(), covFiles[f].getCanonicalPath(), prefs.removeDuplicates());
                int lineCount = 0;

                // Now process the file
                while ((line = br.readLine()) != null) {
                    if (cancel) {
                        // Close before notifying so listeners never see a
                        // cancelled import with the file still open. The
                        // second close in the finally block is harmless.
                        br.close();
                        progressCancelled();
                        return;
                    }

                    // Ignore blank lines
                    if (line.trim().length() == 0)
                        continue;

                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + covFiles[f].getName(), f, covFiles.length);
                    }

                    String[] sections = line.split("\t");

                    // Check to see if we've got enough data to work with
                    if (sections.length < 6) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }

                    int start;
                    int end;
                    int methCount;
                    int unmethCount;
                    try {
                        start = Integer.parseInt(sections[1]);
                        end = Integer.parseInt(sections[2]);
                        methCount = Integer.parseInt(sections[4]);
                        unmethCount = Integer.parseInt(sections[5]);

                        // End must always be later than start
                        if (start > end) {
                            progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                            int temp = start;
                            start = end;
                            end = temp;
                        }
                    } catch (NumberFormatException e) {
                        // Note that any of the four parsed columns could have
                        // caused this, not just the ones shown in the message.
                        progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                        continue;
                    }

                    try {
                        ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);

                        // We also don't allow readings which are beyond the end of the chromosome
                        start = c.position(start);
                        end = c.position(end);
                        if (end > c.chromosome().length()) {
                            int overrun = end - c.chromosome().length();
                            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                            continue;
                        }

                        // We can now make the new reads
                        long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
                        long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);
                        newData[f].addData(c.chromosome(), methRead, methCount);
                        newData[f].addData(c.chromosome(), unmethRead, unmethCount);
                    } catch (IllegalArgumentException iae) {
                        progressWarningReceived(iae);
                    } catch (SeqMonkException sme) {
                        progressWarningReceived(sme);
                        continue;
                    }
                }
            } finally {
                // We're finished with the file, whether or not we got to the end of it.
                br.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + covFiles[f].getName(), f, covFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class QuasRFileParser method run.
/*
 * Imports every file returned by getFiles() into its own DataSet and
 * hands the finished sets to processingFinished().
 *
 * Files whose name ends in .gz are read through a GZIP stream. Blank
 * lines and lines starting with # are ignored. Every other line must have
 * at least 5 tab separated columns, of which 0 = chromosome, 1 = start,
 * 2 = end, 3 = total count and 4 = methylated count are used. The
 * unmethylated count is the total minus the methylated count. One forward
 * strand read is added per methylated call and one reverse strand read
 * per unmethylated call.
 *
 * Lines which can't be used are reported through progressWarningReceived()
 * and skipped. Any other failure aborts the import and is reported
 * through progressExceptionReceived().
 *
 * (non-Javadoc)
 * @see java.lang.Runnable#run()
 */
public void run() {
    try {
        File[] quasrFiles = getFiles();
        DataSet[] newData = new DataSet[quasrFiles.length];
        for (int f = 0; f < quasrFiles.length; f++) {
            BufferedReader br;
            if (quasrFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(quasrFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(quasrFiles[f]));
            }

            // Everything which uses the reader lives inside this try so that
            // the file handle is released however we leave it.
            try {
                String line;
                newData[f] = new DataSet(quasrFiles[f].getName(), quasrFiles[f].getCanonicalPath(), prefs.removeDuplicates());
                int lineCount = 0;

                // Now process the file
                while ((line = br.readLine()) != null) {
                    if (cancel) {
                        // Close before notifying so listeners never see a
                        // cancelled import with the file still open. The
                        // second close in the finally block is harmless.
                        br.close();
                        progressCancelled();
                        return;
                    }

                    // Ignore blank lines
                    if (line.trim().length() == 0)
                        continue;

                    // In case it has comments
                    if (line.startsWith("#"))
                        continue;

                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + quasrFiles[f].getName(), f, quasrFiles.length);
                    }

                    String[] sections = line.split("\t");

                    // Check to see if we've got enough data to work with
                    if (sections.length < 5) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }

                    int start;
                    int end;
                    int totalCount;
                    int methCount;
                    int unmethCount;
                    try {
                        start = Integer.parseInt(sections[1]);
                        end = Integer.parseInt(sections[2]);
                        totalCount = Integer.parseInt(sections[3]);
                        methCount = Integer.parseInt(sections[4]);
                        unmethCount = totalCount - methCount;

                        // End must always be later than start
                        if (start > end) {
                            progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                            int temp = start;
                            start = end;
                            end = temp;
                        }
                    } catch (NumberFormatException e) {
                        // Note that any of the four parsed columns could have
                        // caused this, not just the ones shown in the message.
                        progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                        continue;
                    }

                    try {
                        ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);

                        // We also don't allow readings which are beyond the end of the chromosome
                        start = c.position(start);
                        end = c.position(end);
                        if (end > c.chromosome().length()) {
                            int overrun = end - c.chromosome().length();
                            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                            continue;
                        }

                        // We can now make the new reads
                        long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
                        long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);
                        for (int i = 0; i < methCount; i++) {
                            newData[f].addData(c.chromosome(), methRead);
                        }
                        for (int i = 0; i < unmethCount; i++) {
                            newData[f].addData(c.chromosome(), unmethRead);
                        }
                    } catch (IllegalArgumentException iae) {
                        progressWarningReceived(iae);
                    } catch (SeqMonkException sme) {
                        progressWarningReceived(sme);
                        continue;
                    }
                }
            } finally {
                // We're finished with the file, whether or not we got to the end of it.
                br.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + quasrFiles[f].getName(), f, quasrFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
Aggregations