Usage of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in the SeqMonk project by s-andrews: the run method of the GffFileParser class.
/* (non-Javadoc)
 * @see java.lang.Runnable#run()
 *
 * Imports reads from each selected GFF file (plain or gzip-compressed),
 * building one DataSet per file (a PairedDataSet when importing HiC data).
 * Progress, warnings and errors are reported through the parser callbacks;
 * malformed lines are warned about and skipped rather than aborting the
 * whole import.
 */
public void run() {

    File[] probeFiles = getFiles();
    DataSet[] newData = new DataSet[probeFiles.length];

    // Optional 3' extension applied to every imported read.
    int extendBy = prefs.extendReads();

    try {
        for (int f = 0; f < probeFiles.length; f++) {

            BufferedReader br;

            // Transparently decompress gzipped input.
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }

            // FIX: close the reader on every exit path.  Previously it was
            // only closed on cancel or clean EOF, so any exception escaping
            // to the outer catch leaked the file handle.
            try {
                String line;

                if (prefs.isHiC()) {
                    newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
                } else {
                    newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
                }

                int lineCount = 0;

                // Now process the file
                while ((line = br.readLine()) != null) {

                    if (cancel) {
                        progressCancelled();
                        return;
                    }

                    // Ignore blank lines
                    if (line.trim().length() == 0)
                        continue;

                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                    }

                    String[] sections = line.split("\t");

                    // GFF needs at least columns 1-5 (seqname..end) to make a read.
                    if (sections.length < 5) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }

                    int strand;
                    int start;
                    int end;

                    try {
                        start = Integer.parseInt(sections[3]);
                        end = Integer.parseInt(sections[4]);

                        // End must always be later than start
                        if (start > end) {
                            int temp = start;
                            start = end;
                            end = temp;
                        }

                        // Column 7 (if present) carries the strand.  Anything other
                        // than +/-/. is warned about and treated as unknown.
                        if (sections.length >= 7) {
                            if (sections[6].equals("+")) {
                                strand = Location.FORWARD;
                            } else if (sections[6].equals("-")) {
                                strand = Location.REVERSE;
                            } else if (sections[6].equals(".")) {
                                strand = Location.UNKNOWN;
                            } else {
                                progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[6] + "' marked as unknown strand"));
                                strand = Location.UNKNOWN;
                            }
                        } else {
                            strand = Location.UNKNOWN;
                        }

                        // Extend in the read's 3' direction when requested.
                        if (extendBy > 0) {
                            if (strand == Location.FORWARD) {
                                end += extendBy;
                            } else if (strand == Location.REVERSE) {
                                start -= extendBy;
                            }
                        }
                    } catch (NumberFormatException e) {
                        progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
                        continue;
                    }

                    ChromosomeWithOffset c;
                    try {
                        c = dataCollection().genome().getChromosome(sections[0]);
                    } catch (IllegalArgumentException sme) {
                        progressWarningReceived(sme);
                        continue;
                    }

                    // Translate file coordinates into genome coordinates.
                    start = c.position(start);
                    end = c.position(end);

                    // We also don't allow readings which are beyond the end of the chromosome
                    if (end > c.chromosome().length()) {
                        int overrun = end - c.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                        continue;
                    }

                    // We can now make the new reading.  packPosition validates the
                    // coordinates and throws for anything it can't represent.
                    try {
                        long read = SequenceRead.packPosition(start, end, strand);
                        newData[f].addData(c.chromosome(), read);
                    } catch (SeqMonkException e) {
                        progressWarningReceived(e);
                        continue;
                    }
                }
            } finally {
                // We're finished with the file.
                br.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }

        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
Usage of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in the SeqMonk project by s-andrews: the parseSamples method of the SeqMonkParser class.
/**
 * Parses the list of samples and the per-sample read data which
 * immediately follows it in the project file.  The on-disk format
 * depends on the data version: before v7 every read is a full
 * chromosome/start/end/strand line; from v7 reads are grouped by
 * chromosome as packed long positions (with an optional repeat count
 * from v12); HiC samples store a packed pair per line.
 *
 * @param sections The tab split initial samples line
 * @throws SeqMonkException If the file content is not as expected
 * @throws IOException Signals that an I/O exception has occurred.
 */
private void parseSamples(String[] sections) throws SeqMonkException, IOException {
    if (sections.length != 2) {
        throw new SeqMonkException("Samples line didn't contain 2 sections");
    }
    if (!sections[0].equals("Samples")) {
        throw new SeqMonkException("Couldn't find expected samples line");
    }

    int n = Integer.parseInt(sections[1]);

    // We need to keep the Data Sets around to add data to later.
    dataSets = new DataSet[n];

    for (int i = 0; i < n; i++) {
        String sampleLine = br.readLine();
        // FIX: guard against a truncated file rather than hitting an NPE on split().
        if (sampleLine == null) {
            throw new SeqMonkException("Ran out of data whilst parsing the sample list");
        }
        sections = sampleLine.split("\\t");

        // One field is just a name, two are name + file path, and a third
        // field of "HiC" selects a paired data set; anything else is
        // assumed to be a normal dataset.
        if (sections.length == 1) {
            dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 2) {
            dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 3) {
            if (sections[2].equals("HiC")) {
                dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
            } else {
                dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
            }
        }
        application.dataCollection().addDataSet(dataSets[i]);
    }

    // Immediately after the list of samples comes the lists of reads
    String line;

    // Iterate through the number of samples
    for (int i = 0; i < n; i++) {

        Enumeration<ProgressListener> en = listeners.elements();
        while (en.hasMoreElements()) {
            en.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
        }

        // The first line carries the total read count for this sample.
        line = br.readLine();
        // FIX: guard against a truncated file rather than an NPE.
        if (line == null) {
            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
        }
        sections = line.split("\t");
        if (sections.length != 2) {
            throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
        }
        int readCount = Integer.parseInt(sections[0]);

        // In versions prior to 7 we encoded everything on every line separately
        if (thisDataVersion < 7) {

            for (int r = 0; r < readCount; r++) {

                // Notify the listeners roughly every 10% of the reads.
                if ((r % (1 + (readCount / 10))) == 0) {
                    Enumeration<ProgressListener> en2 = listeners.elements();
                    while (en2.hasMoreElements()) {
                        en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    }
                }

                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                sections = line.split("\t");

                Chromosome c;
                try {
                    c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
                } catch (Exception sme) {
                    // Unknown chromosome: warn every listener and skip the read.
                    Enumeration<ProgressListener> e = listeners.elements();
                    while (e.hasMoreElements()) {
                        e.nextElement().progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
                    }
                    continue;
                }
                int start = Integer.parseInt(sections[1]);
                int end = Integer.parseInt(sections[2]);
                int strand = Integer.parseInt(sections[3]);
                try {
                    dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand), true);
                } catch (SeqMonkException ex) {
                    Enumeration<ProgressListener> e = listeners.elements();
                    while (e.hasMoreElements()) {
                        e.nextElement().progressWarningReceived(ex);
                    }
                }
            }
        } else if (dataSets[i] instanceof PairedDataSet) {
            // Paired Data sets have a different format, with a packed position for
            // the first read, then a chromosome name and packed position for the second
            // read.

            // From v13 onwards we started entering HiC data pairs in both directions so
            // the data would be able to be read back in without sorting it.
            boolean hicDataIsPresorted = thisDataVersion >= 13;

            // We use this to update the progress bar every time we move to a new chromosome
            int seenChromosomeCount = 0;

            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }

                // A blank line indicates the end of the sample
                if (line.length() == 0)
                    break;

                sections = line.split("\\t");
                Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();

                ++seenChromosomeCount;
                progressUpdated("Reading data for " + dataSets[i].name(), i * application.dataCollection().genome().getChromosomeCount() + seenChromosomeCount, n * application.dataCollection().genome().getChromosomeCount());

                int chrReadCount = Integer.parseInt(sections[1]);

                for (int r = 0; r < chrReadCount; r++) {

                    line = br.readLine();
                    if (line == null) {
                        throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                    }

                    /*
                     * We used to have a split("\t") here, but this turned out to be the bottleneck
                     * which hugely restricted the speed at which the data could be read. Switching
                     * for a set of index calls and substring makes this *way* faster.
                     */
                    int tabPosition = line.indexOf('\t');
                    long packedPosition = quickParseLong(line, 0);
                    int tabPosition2 = line.indexOf('\t', tabPosition + 1);
                    // FIX: start the substring *after* the first tab.  Starting at
                    // tabPosition included the tab character in the chromosome name,
                    // which can never match a real chromosome name.
                    Chromosome c2 = application.dataCollection().genome().getChromosome(line.substring(tabPosition + 1, tabPosition2)).chromosome();
                    long packedPosition2 = quickParseLong(line, tabPosition2 + 1);

                    try {
                        dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
                        dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
                    } catch (SeqMonkException ex) {
                        Enumeration<ProgressListener> e = listeners.elements();
                        while (e.hasMoreElements()) {
                            e.nextElement().progressWarningReceived(ex);
                        }
                    }
                }
            }
        } else {
            // In versions after 7 we split the section up into chromosomes
            // so we don't put the chromosome on every line, and we put out
            // the packed double value rather than the individual start, end
            // and strand

            // As of version 12 we collapse repeated reads into one line with
            // a count after it, so we need to check for this.

            // We keep count of reads processed to update the progress listeners
            int readsRead = 0;

            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }

                // A blank line indicates the end of the sample
                if (line.length() == 0)
                    break;

                sections = line.split("\\t");

                // We don't try to capture this exception since we can't then process
                // any of the reads which follow.
                Chromosome c = application.dataCollection().genome().getChromosome(sections[0]).chromosome();
                int chrReadCount = Integer.parseInt(sections[1]);

                for (int r = 0; r < chrReadCount; r++) {

                    if ((readsRead % (1 + (readCount / 10))) == 0) {
                        Enumeration<ProgressListener> en2 = listeners.elements();
                        while (en2.hasMoreElements()) {
                            en2.nextElement().progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                        }
                    }

                    line = br.readLine();
                    if (line == null) {
                        throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                    }

                    // We use some custom parsing code to efficiently extract the packed
                    // position and count ("<packed>[\t<count>]") from the line.  This
                    // avoids the cost of a generic parse or split per line.
                    long packedPosition = 0;
                    int count = 0;

                    // Check for a sign.
                    int sign = 1;

                    int len = line.length();
                    if (len == 0) {
                        // FIX: warn and skip the malformed line rather than crashing
                        // on the charAt(0) below.
                        System.err.println("Yuk - blank line found in sample data");
                        continue;
                    }
                    char ch = line.charAt(0);
                    if (ch == '-') {
                        sign = -1;
                    } else {
                        packedPosition = ch - '0';
                    }

                    // Build the numbers: digits before the tab form the packed
                    // position, digits after it form the repeat count.
                    int j = 1;
                    int tabIndex = 0;
                    while (j < len) {
                        if (line.charAt(j) == '\t') {
                            tabIndex = j++;
                        } else if (tabIndex > 0) {
                            // char is ASCII so need to subtract char of '0' to get correct int value
                            count = (count * 10) + (line.charAt(j++) - '0');
                        } else {
                            packedPosition = (packedPosition * 10) + (line.charAt(j++) - '0');
                        }
                    }
                    packedPosition = sign * packedPosition;

                    if (count == 0) {
                        // There was no count so there's only one of these
                        count = 1;
                    } else {
                        // Update the true read count with the additional reads we get from this line
                        r += (count - 1);
                    }

                    dataSets[i].addData(c, packedPosition, count, true);
                    readsRead += count;
                }
            }
        }

        // We've now read all of the data for this sample so we can compact it
        Enumeration<ProgressListener> en2 = listeners.elements();
        while (en2.hasMoreElements()) {
            en2.nextElement().progressUpdated("Caching data for " + dataSets[i].name(), (i + 1) * 10, n * 10);
        }
        dataSets[i].finalise();
    }
}
Usage of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in the SeqMonk project by s-andrews: the run method of the BAMFileParser class.
/* (non-Javadoc)
 * @see java.lang.Runnable#run()
 *
 * Imports reads from BAM/SAM files, building one DataSet per file
 * (a PairedDataSet when importing HiC data).  Records are filtered
 * according to the import preferences (paired end, spliced reads,
 * mapping quality, primary alignments) and rejected records are
 * either silently skipped or reported as warnings.
 */
public void run() {

    // Snapshot the relevant import preferences for this run.
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    separateSplicedReads = prefs.splitSplicedReads();
    importIntrons = prefs.importIntrons();
    extendBy = prefs.extendReads();
    minMappingQuality = prefs.minMappingQuality();
    primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();

    File[] samFiles = getFiles();
    DataSet[] newData = new DataSet[samFiles.length];

    try {
        for (int f = 0; f < samFiles.length; f++) {

            SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
            SAMFileReader inputSam = new SAMFileReader(samFiles[f]);

            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }

            int lineCount = 0;

            // A flag we can set to skip the next record if we're getting
            // out of sync during single end HiC import.
            boolean skipNext = false;

            // FIX: close the reader on every exit path.  Previously it was
            // only closed on cancel/EOF/throw, so an unexpected exception
            // from record processing leaked the file handle.
            try {
                // Now process the file
                for (SAMRecord samRecord : inputSam) {

                    if (skipNext) {
                        skipNext = false;
                        continue;
                    }

                    if (cancel) {
                        progressCancelled();
                        return;
                    }

                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
                    }

                    if (pairedEndImport && !samRecord.getReadPairedFlag()) {
                        progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
                        continue;
                    }

                    if (samRecord.getReadUnmappedFlag()) {
                        // There was no match
                        continue;
                    }

                    if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
                        // We only want primary alignments
                        continue;
                    }

                    if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
                        // No match on the reverse strand. Doesn't matter if we're doing spliced reads.
                        continue;
                    }

                    if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
                        // Quality filter.  For HiC we may need to drop the partner
                        // record too, to keep the pairs in sync.
                        if (prefs.isHiC()) {
                            if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                                // Skip the next line
                                skipNext = true;
                            }
                        }
                        continue;
                    }

                    if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
                        // For paired-end import we only process the reverse-strand
                        // record of each pair (using its insert size field).
                        continue;
                    }

                    if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
                        // progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
                        continue;
                    }

                    // FIX: the original compared Strings with == / != (identity),
                    // which made this branch dead code.  Some legacy writers record
                    // the mate reference as "0" when the pair mapped to different
                    // chromosomes, so such pairs are skipped.
                    // NOTE(review): confirm against real data that "0" is the only
                    // legacy different-chromosome marker seen here.
                    if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName().equals("0")) {
                        // Matches were on different chromosomes
                        continue;
                    }

                    try {
                        if (pairedEndImport && !separateSplicedReads) {
                            SequenceReadWithChromosome read = getPairedEndRead(samRecord);
                            newData[f].addData(read.chromosome, read.read);
                        } else if (separateSplicedReads) {
                            SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
                            for (int r = 0; r < reads.length; r++) {
                                newData[f].addData(reads[r].chromosome, reads[r].read);
                            }
                        } else {
                            SequenceReadWithChromosome read = getSingleEndRead(samRecord);
                            newData[f].addData(read.chromosome, read.read);
                        }
                    } catch (SeqMonkException ex) {
                        progressWarningReceived(ex);
                        if (prefs.isHiC()) {
                            if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                                // Skip the next line
                                skipNext = true;
                            }
                        }
                    }
                }
            } finally {
                // We're finished with the file.
                inputSam.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }

    processingFinished(newData);
}
Usage of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in the SeqMonk project by s-andrews: the run method of the BowtieFileParser class.
/* (non-Javadoc)
 * @see java.lang.Runnable#run()
 *
 * Imports reads from bowtie output files (plain or gzip-compressed),
 * building one DataSet per file (a PairedDataSet when importing HiC
 * data).  For paired-end import, reads are consumed two lines at a
 * time.  Malformed lines are warned about and skipped.
 */
public void run() {

    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();

    // Read extension only applies to single-end import.
    if (!pairedEndImport) {
        extendBy = prefs.extendReads();
    }

    String[] sections1;
    String[] sections2 = null;

    File[] bowtieFiles = getFiles();
    DataSet[] newData = new DataSet[bowtieFiles.length];

    try {
        for (int f = 0; f < bowtieFiles.length; f++) {

            BufferedReader br;

            // Transparently decompress gzipped input.
            if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(bowtieFiles[f]));
            }

            // FIX: close the reader on every exit path.  Previously it was
            // only closed on cancel or clean EOF, so any exception escaping
            // to the outer catch leaked the file handle.
            try {
                String line;
                String line2 = null;

                if (prefs.isHiC()) {
                    newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
                } else {
                    newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
                }

                int lineCount = 0;

                // Now process the file
                while ((line = br.readLine()) != null) {

                    if (cancel) {
                        progressCancelled();
                        return;
                    }

                    // Ignore blank lines
                    if (line.trim().length() == 0)
                        continue;

                    ++lineCount;

                    // For paired-end data the partner read is on the next line.
                    if (pairedEndImport) {
                        line2 = br.readLine();
                        if (line2 == null) {
                            ++lineCount;
                            progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
                            continue;
                        }
                    }

                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
                    }

                    sections1 = line.split("\t");
                    if (pairedEndImport) {
                        sections2 = line2.split("\t");
                    }

                    // Check to see if we've got enough data to work with
                    if (sections1.length < 5) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }
                    if (pairedEndImport && sections2.length < 5) {
                        // FIX: report the second read's line (line2), not the first -
                        // the first line already passed the length check above.
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line2 + "'"));
                        // Skip this line...
                        continue;
                    }

                    try {
                        if (pairedEndImport) {
                            SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
                            newData[f].addData(read.chromosome, read.read);
                        } else {
                            SequenceReadWithChromosome read = getSingleEndRead(sections1);
                            newData[f].addData(read.chromosome, read.read);
                        }
                    } catch (SeqMonkException ex) {
                        progressWarningReceived(ex);
                    }
                }
            } finally {
                // We're finished with the file.
                br.close();
            }

            // Cache the data in the new dataset
            progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }

    processingFinished(newData);
}
End of aggregated usage examples.