use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class BedPEFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// System.err.println("Started parsing BED files");
int extendBy = prefs.extendReads();
try {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
// System.err.println(line);
// Thread.sleep(200);
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 6) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
// move on quickly.
if (sections[0].equals(".") || sections[3].equals("."))
continue;
int strand1;
int start1;
int end1;
int strand2;
int start2;
int end2;
try {
// The start is zero indexed so we need to add 1 to get genomic positions
start1 = Integer.parseInt(sections[1]) + 1;
start2 = Integer.parseInt(sections[4]) + 1;
// The end is zero indexed, but not included in the feature position so
// we need to add one to get genomic coordinates, but subtract one to not
// include the final base.
end1 = Integer.parseInt(sections[2]);
end2 = Integer.parseInt(sections[5]);
// End must always be later than start
if (start1 > end1) {
progressWarningReceived(new SeqMonkException("End position1 " + end1 + " was lower than start position " + start1));
int temp = start1;
start1 = end1;
end1 = temp;
}
if (start2 > end2) {
progressWarningReceived(new SeqMonkException("End position2 " + end2 + " was lower than start position " + start2));
int temp = start2;
start2 = end2;
end2 = temp;
}
if (sections.length >= 10) {
if (sections[8].equals("+")) {
strand1 = Location.FORWARD;
} else if (sections[8].equals("-")) {
strand1 = Location.REVERSE;
} else if (sections[8].equals(".")) {
strand1 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[8] + "' marked as unknown strand"));
strand1 = Location.UNKNOWN;
}
if (sections[9].equals("+")) {
strand2 = Location.FORWARD;
} else if (sections[9].equals("-")) {
strand2 = Location.REVERSE;
} else if (sections[9].equals(".")) {
strand2 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[9] + "' marked as unknown strand"));
strand2 = Location.UNKNOWN;
}
// if (extendBy > 0) {
// if (strand==Location.REVERSE) {
// start -=extendBy;
// }
// else if (strand==Location.FORWARD) {
// end+=extendBy;
// }
// }
} else {
strand1 = Location.UNKNOWN;
strand2 = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
continue;
}
try {
ChromosomeWithOffset c1 = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start1 = c1.position(start1);
end1 = c1.position(end1);
if (end1 > c1.chromosome().length()) {
int overrun = end1 - c1.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end1 + " was " + overrun + "bp beyond the end of chr" + c1.chromosome().name() + " (" + c1.chromosome().length() + ")"));
continue;
}
ChromosomeWithOffset c2 = dataCollection().genome().getChromosome(sections[3]);
// We also don't allow readings which are beyond the end of the chromosome
start2 = c2.position(start2);
end2 = c2.position(end2);
if (end2 > c2.chromosome().length()) {
int overrun = end2 - c2.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end2 + " was " + overrun + "bp beyond the end of chr" + c2.chromosome().name() + " (" + c2.chromosome().length() + ")"));
continue;
}
// add them. There's nothing clever to do.
if (prefs.isHiC()) {
long read1 = SequenceRead.packPosition(start1, end1, strand1);
newData[f].addData(c1.chromosome(), read1);
long read2 = SequenceRead.packPosition(start2, end2, strand2);
newData[f].addData(c2.chromosome(), read2);
} else {
// If they're on different chromosomes then we kick them out
if (!c1.chromosome().name().equals(c2.chromosome().name())) {
progressWarningReceived(new SeqMonkException("Paried reads were on different chromosomes - discarding"));
continue;
}
if (strand1 == Location.FORWARD && strand2 != Location.REVERSE) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
if (strand1 == Location.REVERSE && strand2 != Location.FORWARD) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
// If they're too far apart we kick them out
int start = 1;
int end = 0;
// We take the strand from read1
int strand = strand1;
if (strand == Location.FORWARD) {
start = start1;
end = end2;
} else if (strand == Location.REVERSE) {
start = start2;
end = end1;
} else if (strand == Location.UNKNOWN) {
start = Math.min(start1, start2);
end = Math.max(end1, end2);
}
if (end <= start) {
progressWarningReceived(new SeqMonkException("Incorrectly oriented reads - discarding"));
continue;
}
if ((end - start) + 1 > prefs.pairDistanceCutoff()) {
progressWarningReceived(new SeqMonkException("Distance between reads too great (" + (((end - start) + 1) - prefs.pairDistanceCutoff()) + ")"));
continue;
}
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c1.chromosome(), read);
}
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
File[] files = getFiles();
Vector<DataSet> allDataSets = new Vector<DataSet>();
for (int f = 0; f < files.length; f++) {
progressUpdated("Scanning File " + files[f].getName(), f, files.length);
FileInputStream fis = null;
try {
fis = new FileInputStream(files[f]);
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(fis)));
} catch (IOException ioe) {
try {
if (fis != null) {
fis.close();
}
br = new BufferedReader(new FileReader(files[f]));
} catch (IOException ex) {
progressExceptionReceived(ex);
return;
}
}
try {
String line;
String[] sections;
while ((line = br.readLine()) != null) {
sections = line.split("\\t");
// Now we look where to send this...
if (sections[0].equals("SeqMonk Data Version")) {
parseDataVersion(sections);
} else if (sections[0].equals("Samples")) {
DataSet[] dataSets = parseSamples(sections);
for (int i = 0; i < dataSets.length; i++) {
allDataSets.add(dataSets[i]);
}
// Once we've parsed the samples we don't care about anything else.
break;
} else if (sections[0].equals("Genome")) {
try {
parseGenome(sections);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
break;
}
}
}
// We're finished with the file
br.close();
} catch (Exception ex) {
progressExceptionReceived(ex);
try {
br.close();
} catch (IOException e1) {
throw new IllegalStateException(e1);
}
return;
}
}
processingFinished(allDataSets.toArray(new DataSet[0]));
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
// We're going to start all of these datasets off, but only collect data for some
// of them.
DataSet[] dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
}
// At this point we need to know which of the datasets we're actually going
// to import into the current project
DataSetSelector selector = new DataSetSelector(dataSets);
boolean[] dataSetsToParse = new boolean[dataSets.length];
int lastDataSetToParse = 0;
for (int i = 0; i < dataSetsToParse.length; i++) {
dataSetsToParse[i] = false;
}
int[] selectedIndices = selector.getSelectedIndices();
for (int i = 0; i < selectedIndices.length; i++) {
dataSetsToParse[selectedIndices[i]] = true;
if (selectedIndices[i] > lastDataSetToParse)
lastDataSetToParse = selectedIndices[i];
}
// Now we can go through the rest of the sets and add data where appropriate
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
if (i > lastDataSetToParse) {
// No sense skipping through a load of data we're not going to import.
break;
}
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
sections = line.split("\t");
Chromosome c;
try {
c = genome.getChromosome(sections[0]).chromosome();
} catch (IllegalArgumentException sme) {
progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = genome.getChromosome(sections[0]).chromosome();
// progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
if (dataSetsToParse[i]) {
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = genome.getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
int tabIndexPosition = 0;
for (int r = 0; r < chrReadCount; r++) {
readsRead++;
if ((readsRead % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
long packedPosition;
try {
if (line.indexOf("\t") != -1) {
// We have an index with a count
sections = line.split("\t");
tabIndexPosition = line.indexOf("\t");
packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
// TODO: Does this break the progress bar sometimes?
r += (count - 1);
readsRead += (count - 1);
for (int x = 1; x <= count; x++) {
dataSets[i].addData(c, packedPosition);
}
} else {
packedPosition = Long.parseLong(line);
dataSets[i].addData(c, packedPosition);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
}
}
}
if (dataSetsToParse[i]) {
dataSets[i].finalise();
}
}
// Finally we need to make up an array of just the datasets we actually read
Vector<DataSet> keepers = new Vector<DataSet>();
for (int i = 0; i < dataSets.length; i++) {
if (dataSetsToParse[i]) {
keepers.add(dataSets[i]);
}
}
return keepers.toArray(new DataSet[0]);
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class FourCEnrichmentQuantitation method quantitate.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.Quantitation.Quantitation#quantitate(uk.ac.babraham.SeqMonk.DataTypes.DataStore[])
*/
public void quantitate(DataStore[] data) {
// Find the 4C looking data stores amongst the set we've been given
Vector<DataSet> fourCStores = new Vector<DataSet>();
for (int d = 0; d < data.length; d++) {
if (data[d] instanceof HiCDataStore && ((HiCDataStore) data[d]).isValidHiC()) {
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which was HiC data"));
continue;
} else if (!(data[d] instanceof DataSet)) {
// We can only analyse direct 4C data sets
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which wasn't a DataSet"));
continue;
} else if (!((DataSet) data[d]).fileName().startsWith("HiC other end")) {
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which didn't look like a 4C data set"));
continue;
}
fourCStores.add((DataSet) data[d]);
}
this.data = fourCStores.toArray(new DataSet[0]);
Thread t = new Thread(this);
cancel = false;
t.start();
}
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class LogTransformQuantitation method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
if (!isReady()) {
progressExceptionReceived(new SeqMonkException("Options weren't set correctly"));
}
Chromosome[] chromosomes = application.dataCollection().genome().getAllChromosomes();
Vector<DataStore> quantitatedStores = new Vector<DataStore>();
DataSet[] sets = application.dataCollection().getAllDataSets();
for (int s = 0; s < sets.length; s++) {
if (sets[s].isQuantitated()) {
quantitatedStores.add(sets[s]);
}
}
DataGroup[] groups = application.dataCollection().getAllDataGroups();
for (int g = 0; g < groups.length; g++) {
if (groups[g].isQuantitated()) {
quantitatedStores.add(groups[g]);
}
}
DataStore[] data = quantitatedStores.toArray(new DataStore[0]);
for (int c = 0; c < chromosomes.length; c++) {
// See if we need to quit
if (cancel) {
progressCancelled();
return;
}
progressUpdated(c, chromosomes.length);
Probe[] allProbes = application.dataCollection().probeSet().getProbesForChromosome(chromosomes[c]);
try {
for (int p = 0; p < allProbes.length; p++) {
// See if we need to quit
if (cancel) {
progressCancelled();
return;
}
for (int d = 0; d < data.length; d++) {
data[d].setValueForProbe(allProbes[p], (float) (Math.log(Math.max(data[d].getValueForProbe(allProbes[p]), threshold)) / Math.log(logBase)));
}
}
} catch (SeqMonkException e) {
progressExceptionReceived(e);
}
}
quantitatonComplete();
}
Aggregations