use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BAMFileParser method getPairedEndRead.
/**
* Gets a paired end read. This method assumes that it will only be passed reads which map
* to the reverse strand since these are the ones which contain enough information to
* unambiguously locate both ends of the pair.
*
* @param sections The tab split sections from the SAM file
* @param flag The binary flag field
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getPairedEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int end;
if (!samRecord.getReadNegativeStrandFlag()) {
throw new SeqMonkException("Read passed to parse pair was not on the negative strand");
}
if (samRecord.getMateNegativeStrandFlag()) {
throw new SeqMonkException("Ignored discordantly stranded read pair");
}
end = samRecord.getAlignmentEnd();
start = samRecord.getMateAlignmentStart();
if (start > end) {
throw new SeqMonkException("Ignored discordantly stranded read pair");
}
if (samRecord.getFirstOfPairFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
if ((end - start) + 1 > pairedEndDistance) {
throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BowtieFileParser method getSingleEndRead.
/**
* Gets a single single end read.
*
* @param sections The set of tab-split strings from the bowtie output
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]) + 1;
end = start + (sections[4].length() - 1);
if (sections[1].equals("+")) {
strand = Location.FORWARD;
} else if (sections[1].equals("-")) {
strand = Location.REVERSE;
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
} catch (NumberFormatException e) {
throw new SeqMonkException("Location " + sections[3] + " was not an integer");
}
ChromosomeWithOffset c;
try {
c = collection.genome().getChromosome(sections[2]);
} catch (Exception iae) {
throw new SeqMonkException(iae.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BAMFileParser method getSplitSingleEndRead.
/**
* Gets a split single end read. The only reason for asking about whether the
* import is single or paired end is that if we're doing a paired end import
* then we reverse the strand for all second read data. For single end import
* we leave the strand alone whichever read it originates from.
*
* @param samRecord The picard record entry for this read
* @param flag pairedEnd Is this a paired end import
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int lastEnd = -1;
start = samRecord.getAlignmentStart();
if (samRecord.getReadNegativeStrandFlag()) {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.FORWARD;
} else {
strand = Location.REVERSE;
}
} else {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (!samRecord.getCigarString().contains("N")) {
if (!importIntrons) {
int end = c.position(samRecord.getAlignmentEnd());
return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
}
return new SequenceReadWithChromosome[0];
}
String[] cigarOperations = samRecord.getCigarString().split("\\d+");
String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
if (cigarOperations.length != cigarNumbers.length + 1) {
throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
}
Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
int currentPosition = start;
for (int pos = 0; pos < cigarNumbers.length; pos++) {
if (cigarOperations[pos + 1].equals("M")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("I")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("D")) {
currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("N")) {
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
// Update the lastEnd whether we added a read or not since this
// will be the start of the next intron
lastEnd = currentPosition;
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
start = currentPosition;
}
}
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
return newReads.toArray(new SequenceReadWithChromosome[0]);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BAMFileParser method getSingleEndRead.
/**
* Gets a single end read.
*
* @param sections The tab split sections from the SAM file
* @param flag The binary flag field from the file
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int end;
start = samRecord.getAlignmentStart();
end = samRecord.getAlignmentEnd();
if (samRecord.getReadNegativeStrandFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
ChromosomeWithOffset c;
try {
c = collection.genome().getChromosome(samRecord.getReferenceName());
} catch (Exception iae) {
throw new SeqMonkException(iae.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BAMFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
separateSplicedReads = prefs.splitSplicedReads();
importIntrons = prefs.importIntrons();
extendBy = prefs.extendReads();
minMappingQuality = prefs.minMappingQuality();
primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
File[] samFiles = getFiles();
DataSet[] newData = new DataSet[samFiles.length];
try {
for (int f = 0; f < samFiles.length; f++) {
SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
// A flag we can set to skip the next record if we're getting
// out of sync during single end HiC import.
boolean skipNext = false;
for (SAMRecord samRecord : inputSam) {
if (skipNext) {
skipNext = false;
continue;
}
if (cancel) {
inputSam.close();
progressCancelled();
return;
}
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
}
if (pairedEndImport && !samRecord.getReadPairedFlag()) {
progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
continue;
}
if (samRecord.getReadUnmappedFlag()) {
// There was no match
continue;
}
if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
// alignments
continue;
}
if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
// No match on the reverse strand. Doesn't matter if we're doing spliced reads.
continue;
}
if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
// field.
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
// progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
continue;
}
// TODO: Check what this actually stores - might be a real name rather than 0/=
if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName() == "0") {
if (samRecord.getMateReferenceName() != "=") {
inputSam.close();
throw new SeqMonkException("Unexpected mate referenece name " + samRecord.getMateReferenceName());
}
// Matches were on different chromosomes
continue;
}
try {
if (pairedEndImport && !separateSplicedReads) {
SequenceReadWithChromosome read = getPairedEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
} else if (separateSplicedReads) {
SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
for (int r = 0; r < reads.length; r++) {
newData[f].addData(reads[r].chromosome, reads[r].read);
}
} else {
SequenceReadWithChromosome read = getSingleEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
}
}
// We're finished with the file.
inputSam.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
Aggregations