use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class BowtieFileParser method getSingleEndRead.
/**
* Gets a single single end read.
*
* @param sections The set of tab-split strings from the bowtie output
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]) + 1;
end = start + (sections[4].length() - 1);
if (sections[1].equals("+")) {
strand = Location.FORWARD;
} else if (sections[1].equals("-")) {
strand = Location.REVERSE;
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
} catch (NumberFormatException e) {
throw new SeqMonkException("Location " + sections[3] + " was not an integer");
}
ChromosomeWithOffset c;
try {
c = collection.genome().getChromosome(sections[2]);
} catch (Exception iae) {
throw new SeqMonkException(iae.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
File[] files = getFiles();
Vector<DataSet> allDataSets = new Vector<DataSet>();
for (int f = 0; f < files.length; f++) {
progressUpdated("Scanning File " + files[f].getName(), f, files.length);
FileInputStream fis = null;
try {
fis = new FileInputStream(files[f]);
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(fis)));
} catch (IOException ioe) {
try {
if (fis != null) {
fis.close();
}
br = new BufferedReader(new FileReader(files[f]));
} catch (IOException ex) {
progressExceptionReceived(ex);
return;
}
}
try {
String line;
String[] sections;
while ((line = br.readLine()) != null) {
sections = line.split("\\t");
// Now we look where to send this...
if (sections[0].equals("SeqMonk Data Version")) {
parseDataVersion(sections);
} else if (sections[0].equals("Samples")) {
DataSet[] dataSets = parseSamples(sections);
for (int i = 0; i < dataSets.length; i++) {
allDataSets.add(dataSets[i]);
}
// Once we've parsed the samples we don't care about anything else.
break;
} else if (sections[0].equals("Genome")) {
try {
parseGenome(sections);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
break;
}
}
}
// We're finished with the file
br.close();
} catch (Exception ex) {
progressExceptionReceived(ex);
try {
br.close();
} catch (IOException e1) {
throw new IllegalStateException(e1);
}
return;
}
}
processingFinished(allDataSets.toArray(new DataSet[0]));
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
// We're going to start all of these datasets off, but only collect data for some
// of them.
DataSet[] dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
}
// At this point we need to know which of the datasets we're actually going
// to import into the current project
DataSetSelector selector = new DataSetSelector(dataSets);
boolean[] dataSetsToParse = new boolean[dataSets.length];
int lastDataSetToParse = 0;
for (int i = 0; i < dataSetsToParse.length; i++) {
dataSetsToParse[i] = false;
}
int[] selectedIndices = selector.getSelectedIndices();
for (int i = 0; i < selectedIndices.length; i++) {
dataSetsToParse[selectedIndices[i]] = true;
if (selectedIndices[i] > lastDataSetToParse)
lastDataSetToParse = selectedIndices[i];
}
// Now we can go through the rest of the sets and add data where appropriate
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
if (i > lastDataSetToParse) {
// No sense skipping through a load of data we're not going to import.
break;
}
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
sections = line.split("\t");
Chromosome c;
try {
c = genome.getChromosome(sections[0]).chromosome();
} catch (IllegalArgumentException sme) {
progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = genome.getChromosome(sections[0]).chromosome();
// progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
if (dataSetsToParse[i]) {
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = genome.getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
int tabIndexPosition = 0;
for (int r = 0; r < chrReadCount; r++) {
readsRead++;
if ((readsRead % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
long packedPosition;
try {
if (line.indexOf("\t") != -1) {
// We have an index with a count
sections = line.split("\t");
tabIndexPosition = line.indexOf("\t");
packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
// TODO: Does this break the progress bar sometimes?
r += (count - 1);
readsRead += (count - 1);
for (int x = 1; x <= count; x++) {
dataSets[i].addData(c, packedPosition);
}
} else {
packedPosition = Long.parseLong(line);
dataSets[i].addData(c, packedPosition);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
}
}
}
if (dataSetsToParse[i]) {
dataSets[i].finalise();
}
}
// Finally we need to make up an array of just the datasets we actually read
Vector<DataSet> keepers = new Vector<DataSet>();
for (int i = 0; i < dataSets.length; i++) {
if (dataSetsToParse[i]) {
keepers.add(dataSets[i]);
}
}
return keepers.toArray(new DataSet[0]);
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class FourCEnrichmentQuantitation method quantitate.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.Quantitation.Quantitation#quantitate(uk.ac.babraham.SeqMonk.DataTypes.DataStore[])
*/
public void quantitate(DataStore[] data) {
// Find the 4C looking data stores amongst the set we've been given
Vector<DataSet> fourCStores = new Vector<DataSet>();
for (int d = 0; d < data.length; d++) {
if (data[d] instanceof HiCDataStore && ((HiCDataStore) data[d]).isValidHiC()) {
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which was HiC data"));
continue;
} else if (!(data[d] instanceof DataSet)) {
// We can only analyse direct 4C data sets
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which wasn't a DataSet"));
continue;
} else if (!((DataSet) data[d]).fileName().startsWith("HiC other end")) {
progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which didn't look like a 4C data set"));
continue;
}
fourCStores.add((DataSet) data[d]);
}
this.data = fourCStores.toArray(new DataSet[0]);
Thread t = new Thread(this);
cancel = false;
t.start();
}
use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.
the class HiCPCADomainQuantitation method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// We're going to go through the probes one chromosome at a time so we
// can reduce the complexity we have to deal with
Chromosome[] chromosomes = application.dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chromosomes.length; c++) {
if (cancel) {
progressCancelled();
return;
}
currentChromosome = chromosomes[c];
Probe[] probes = application.dataCollection().probeSet().getProbesForChromosome(chromosomes[c]);
if (probes.length < 5) {
progressWarningReceived(new SeqMonkException("Too few probes on chromosome " + currentChromosome.name() + " - assigning zero to everything"));
// It's not worth trying to find domains
for (int d = 0; d < data.length; d++) {
for (int p = 0; p < probes.length; p++) {
((DataStore) data[d]).setValueForProbe(probes[p], 0f);
}
}
continue;
}
ProbeList thisChrProbes = new ProbeList(application.dataCollection().probeSet(), chromosomes[c].name(), "", null);
for (int p = 0; p < probes.length; p++) {
thisChrProbes.addProbe(probes[p], 0f);
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
currentStore = data[d];
current = (d * chromosomes.length) + c;
total = chromosomes.length * data.length;
progressUpdated("Processing chromosome " + chromosomes[c].name() + " for " + data[d].name(), current, total);
HeatmapMatrix matrix = new HeatmapMatrix(data[d], new ProbeList[] { thisChrProbes }, application.dataCollection().genome(), optionsPanel.minDistance(), optionsPanel.maxDistance(), optionsPanel.minStrength(), optionsPanel.maxSignificance(), optionsPanel.minAbsolute(), optionsPanel.correctLinkage());
matrix.addProgressListener(this);
wait = true;
matrix.startCalculating();
while (wait) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
}
}
if (cancel) {
progressCancelled();
return;
}
if (matrix.filteredInteractions().length < 10) {
progressWarningReceived(new SeqMonkException("Too few interactions on chromosome " + currentChromosome.name() + " for " + data[d].name() + " - assigning zero to everything"));
// not going to get a sensible answer anyway.
for (int p = 0; p < probes.length; p++) {
((DataStore) data[d]).setValueForProbe(probes[p], 0f);
}
continue;
}
InteractionClusterMatrix clusterMatrix = new InteractionClusterMatrix(matrix.filteredInteractions(), probes.length);
clusterMatrix.addListener(this);
wait = true;
clusterMatrix.startCorrelating();
while (wait) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
}
}
float[][] correlationMatrix = clusterMatrix.correlationMatix();
// Annoyingly the PCA needs a double [][]
double[][] correlationMatrixDouble = new double[correlationMatrix.length][];
for (int i = 0; i < correlationMatrix.length; i++) {
double[] db = new double[correlationMatrix[i].length];
for (int j = 0; j < db.length; j++) {
db[j] = correlationMatrix[i][j];
}
correlationMatrixDouble[i] = db;
}
// Now we can calculate the PCA values from the correlation matrix
PCA pca = new PCA(correlationMatrixDouble);
pca.addProgressListener(this);
wait = true;
pca.startCalculating();
while (wait) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
}
}
double[] extractedEigenValues = pca.extractedEigenValues();
// for these probes
for (int p = 0; p < probes.length; p++) {
((DataStore) data[d]).setValueForProbe(probes[p], (float) extractedEigenValues[p]);
}
}
thisChrProbes.delete();
}
quantitatonComplete();
}
Aggregations