use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BowtieFileParser method getPairedEndRead.
/**
* Gets a paired end read.
*
* @param sections1 The tab split bowtie output sections for the first read
* @param sections2 The tab split bowtie output sections for the second read
* @return The paired end read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getPairedEndRead(String[] sections1, String[] sections2) throws SeqMonkException {
// We can get the lines with read two first, in which case we'll reverse things
boolean readsAreReversed = false;
if (sections1[0].substring(sections1[0].length() - 1).equals("2")) {
readsAreReversed = true;
}
int strand;
int start;
int end;
try {
/*
* This convention isn't true in newer bowtie files so we can't rely on this any more.
*/
// if (! sections1[0].substring(0, sections1[0].length()-2).equals(sections2[0].substring(0, sections2[0].length()-2))) {
// throw new SeqMonkException("Paired reads '"+sections1[0]+"' and '"+sections2[0]+"' did not match names");
// }
int read1start = Integer.parseInt(sections1[3]) + 1;
int read1end = read1start + (sections1[4].length() - 1);
int read2start = Integer.parseInt(sections2[3]) + 1;
int read2end = read2start + (sections2[4].length() - 1);
if (read1start < read2start) {
start = read1start;
} else {
start = read2start;
}
if (read2end > read1end) {
end = read2end;
} else {
end = read1end;
}
if (sections1[1].equals("+") && sections2[1].equals("-")) {
if (readsAreReversed) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
} else if (sections1[1].equals("-") && sections2[1].equals("+")) {
if (readsAreReversed) {
strand = Location.FORWARD;
} else {
strand = Location.REVERSE;
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
throw new SeqMonkException("Location " + sections1[3] + " or " + sections2[3] + " was not an integer");
}
if ((end - start) + 1 > pairedEndDistance) {
throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
}
if (!sections1[2].equals(sections2[2])) {
throw new SeqMonkException("Paried end read was on a different chromosome");
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections1[2]);
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.
the class BowtieFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
if (!pairedEndImport) {
extendBy = prefs.extendReads();
}
String[] sections1;
String[] sections2 = null;
File[] bowtieFiles = getFiles();
DataSet[] newData = new DataSet[bowtieFiles.length];
try {
for (int f = 0; f < bowtieFiles.length; f++) {
BufferedReader br;
if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
} else {
br = new BufferedReader(new FileReader(bowtieFiles[f]));
}
String line;
String line2 = null;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (pairedEndImport) {
line2 = br.readLine();
if (line2 == null) {
++lineCount;
progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
continue;
}
}
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
}
sections1 = line.split("\t");
if (pairedEndImport) {
sections2 = line2.split("\t");
}
// Check to see if we've got enough data to work with
if (sections1.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
if (pairedEndImport && sections2.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
try {
if (pairedEndImport) {
SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
newData[f].addData(read.chromosome, read.read);
} else {
SequenceReadWithChromosome read = getSingleEndRead(sections1);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
Aggregations