use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BAMFileParser method getSplitSingleEndRead.
/**
* Gets a split single end read. The only reason for asking about whether the
* import is single or paired end is that if we're doing a paired end import
* then we reverse the strand for all second read data. For single end import
* we leave the strand alone whichever read it originates from.
*
* @param samRecord The picard record entry for this read
* @param flag pairedEnd Is this a paired end import
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int lastEnd = -1;
start = samRecord.getAlignmentStart();
if (samRecord.getReadNegativeStrandFlag()) {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.FORWARD;
} else {
strand = Location.REVERSE;
}
} else {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (!samRecord.getCigarString().contains("N")) {
if (!importIntrons) {
int end = c.position(samRecord.getAlignmentEnd());
return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
}
return new SequenceReadWithChromosome[0];
}
String[] cigarOperations = samRecord.getCigarString().split("\\d+");
String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
if (cigarOperations.length != cigarNumbers.length + 1) {
throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
}
Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
int currentPosition = start;
for (int pos = 0; pos < cigarNumbers.length; pos++) {
if (cigarOperations[pos + 1].equals("M")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("I")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("D")) {
currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("N")) {
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
// Update the lastEnd whether we added a read or not since this
// will be the start of the next intron
lastEnd = currentPosition;
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
start = currentPosition;
}
}
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
return newReads.toArray(new SequenceReadWithChromosome[0]);
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BAMFileParser method getSingleEndRead.
/**
* Gets a single end read.
*
* @param sections The tab split sections from the SAM file
* @param flag The binary flag field from the file
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int end;
start = samRecord.getAlignmentStart();
end = samRecord.getAlignmentEnd();
if (samRecord.getReadNegativeStrandFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
ChromosomeWithOffset c;
try {
c = collection.genome().getChromosome(samRecord.getReferenceName());
} catch (Exception iae) {
throw new SeqMonkException(iae.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BAMFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
separateSplicedReads = prefs.splitSplicedReads();
importIntrons = prefs.importIntrons();
extendBy = prefs.extendReads();
minMappingQuality = prefs.minMappingQuality();
primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
File[] samFiles = getFiles();
DataSet[] newData = new DataSet[samFiles.length];
try {
for (int f = 0; f < samFiles.length; f++) {
SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
// A flag we can set to skip the next record if we're getting
// out of sync during single end HiC import.
boolean skipNext = false;
for (SAMRecord samRecord : inputSam) {
if (skipNext) {
skipNext = false;
continue;
}
if (cancel) {
inputSam.close();
progressCancelled();
return;
}
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
}
if (pairedEndImport && !samRecord.getReadPairedFlag()) {
progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
continue;
}
if (samRecord.getReadUnmappedFlag()) {
// There was no match
continue;
}
if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
// alignments
continue;
}
if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
// No match on the reverse strand. Doesn't matter if we're doing spliced reads.
continue;
}
if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
// field.
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
// progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
continue;
}
// TODO: Check what this actually stores - might be a real name rather than 0/=
if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName() == "0") {
if (samRecord.getMateReferenceName() != "=") {
inputSam.close();
throw new SeqMonkException("Unexpected mate referenece name " + samRecord.getMateReferenceName());
}
// Matches were on different chromosomes
continue;
}
try {
if (pairedEndImport && !separateSplicedReads) {
SequenceReadWithChromosome read = getPairedEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
} else if (separateSplicedReads) {
SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
for (int r = 0; r < reads.length; r++) {
newData[f].addData(reads[r].chromosome, reads[r].read);
}
} else {
SequenceReadWithChromosome read = getSingleEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
}
}
// We're finished with the file.
inputSam.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BowtieFileParser method getPairedEndRead.
/**
* Gets a paired end read.
*
* @param sections1 The tab split bowtie output sections for the first read
* @param sections2 The tab split bowtie output sections for the second read
* @return The paired end read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getPairedEndRead(String[] sections1, String[] sections2) throws SeqMonkException {
// We can get the lines with read two first, in which case we'll reverse things
boolean readsAreReversed = false;
if (sections1[0].substring(sections1[0].length() - 1).equals("2")) {
readsAreReversed = true;
}
int strand;
int start;
int end;
try {
/*
* This convention isn't true in newer bowtie files so we can't rely on this any more.
*/
// if (! sections1[0].substring(0, sections1[0].length()-2).equals(sections2[0].substring(0, sections2[0].length()-2))) {
// throw new SeqMonkException("Paired reads '"+sections1[0]+"' and '"+sections2[0]+"' did not match names");
// }
int read1start = Integer.parseInt(sections1[3]) + 1;
int read1end = read1start + (sections1[4].length() - 1);
int read2start = Integer.parseInt(sections2[3]) + 1;
int read2end = read2start + (sections2[4].length() - 1);
if (read1start < read2start) {
start = read1start;
} else {
start = read2start;
}
if (read2end > read1end) {
end = read2end;
} else {
end = read1end;
}
if (sections1[1].equals("+") && sections2[1].equals("-")) {
if (readsAreReversed) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
} else if (sections1[1].equals("-") && sections2[1].equals("+")) {
if (readsAreReversed) {
strand = Location.FORWARD;
} else {
strand = Location.REVERSE;
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
throw new SeqMonkException("Location " + sections1[3] + " or " + sections2[3] + " was not an integer");
}
if ((end - start) + 1 > pairedEndDistance) {
throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
}
if (!sections1[2].equals(sections2[2])) {
throw new SeqMonkException("Paried end read was on a different chromosome");
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections1[2]);
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BowtieFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
if (!pairedEndImport) {
extendBy = prefs.extendReads();
}
String[] sections1;
String[] sections2 = null;
File[] bowtieFiles = getFiles();
DataSet[] newData = new DataSet[bowtieFiles.length];
try {
for (int f = 0; f < bowtieFiles.length; f++) {
BufferedReader br;
if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
} else {
br = new BufferedReader(new FileReader(bowtieFiles[f]));
}
String line;
String line2 = null;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (pairedEndImport) {
line2 = br.readLine();
if (line2 == null) {
++lineCount;
progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
continue;
}
}
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
}
sections1 = line.split("\t");
if (pairedEndImport) {
sections2 = line2.split("\t");
}
// Check to see if we've got enough data to work with
if (sections1.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
if (pairedEndImport && sections2.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
try {
if (pairedEndImport) {
SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
newData[f].addData(read.chromosome, read.read);
} else {
SequenceReadWithChromosome read = getSingleEndRead(sections1);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
Aggregations