Search in sources :

Example 16 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BowtieFileParser method getSingleEndRead.

/**
 * Gets a single single end read.
 *
 * @param sections The set of tab-split strings from the bowtie output
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
    int strand;
    int start;
    int end;
    try {
        start = Integer.parseInt(sections[3]) + 1;
        end = start + (sections[4].length() - 1);
        if (sections[1].equals("+")) {
            strand = Location.FORWARD;
        } else if (sections[1].equals("-")) {
            strand = Location.REVERSE;
        } else {
            strand = Location.UNKNOWN;
        }
        if (extendBy > 0) {
            if (strand == Location.FORWARD) {
                end += extendBy;
            } else if (strand == Location.REVERSE) {
                start -= extendBy;
            }
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections[3] + " was not an integer");
    }
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(sections[2]);
    } catch (Exception iae) {
        throw new SeqMonkException(iae.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 17 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class SeqMonkReimportParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    File[] files = getFiles();
    Vector<DataSet> allDataSets = new Vector<DataSet>();
    for (int f = 0; f < files.length; f++) {
        progressUpdated("Scanning File " + files[f].getName(), f, files.length);
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(files[f]);
            br = new BufferedReader(new InputStreamReader(new GZIPInputStream(fis)));
        } catch (IOException ioe) {
            try {
                if (fis != null) {
                    fis.close();
                }
                br = new BufferedReader(new FileReader(files[f]));
            } catch (IOException ex) {
                progressExceptionReceived(ex);
                return;
            }
        }
        try {
            String line;
            String[] sections;
            while ((line = br.readLine()) != null) {
                sections = line.split("\\t");
                // Now we look where to send this...
                if (sections[0].equals("SeqMonk Data Version")) {
                    parseDataVersion(sections);
                } else if (sections[0].equals("Samples")) {
                    DataSet[] dataSets = parseSamples(sections);
                    for (int i = 0; i < dataSets.length; i++) {
                        allDataSets.add(dataSets[i]);
                    }
                    // Once we've parsed the samples we don't care about anything else.
                    break;
                } else if (sections[0].equals("Genome")) {
                    try {
                        parseGenome(sections);
                    } catch (SeqMonkException sme) {
                        progressWarningReceived(sme);
                        break;
                    }
                }
            }
            // We're finished with the file
            br.close();
        } catch (Exception ex) {
            progressExceptionReceived(ex);
            try {
                br.close();
            } catch (IOException e1) {
                throw new IllegalStateException(e1);
            }
            return;
        }
    }
    processingFinished(allDataSets.toArray(new DataSet[0]));
}
Also used : InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File) Vector(java.util.Vector)

Example 18 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class SeqMonkReimportParser method parseSamples.

/**
 * Parses the list of samples.
 *
 * @param sections The tab split initial samples line
 * @throws SeqMonkException
 * @throws IOException Signals that an I/O exception has occurred.
 */
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
    if (sections.length != 2) {
        throw new SeqMonkException("Samples line didn't contain 2 sections");
    }
    if (!sections[0].equals("Samples")) {
        throw new SeqMonkException("Couldn't find expected samples line");
    }
    int n = Integer.parseInt(sections[1]);
    // We need to keep the Data Sets around to add data to later.
    // We're going to start all of these datasets off, but only collect data for some
    // of them.
    DataSet[] dataSets = new DataSet[n];
    for (int i = 0; i < n; i++) {
        sections = br.readLine().split("\\t");
        // anything else is assumed to be a normal dataset.
        if (sections.length == 1) {
            dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 2) {
            dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 3) {
            if (sections[2].equals("HiC")) {
                dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
            } else {
                dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
            }
        }
    }
    // At this point we need to know which of the datasets we're actually going
    // to import into the current project
    DataSetSelector selector = new DataSetSelector(dataSets);
    boolean[] dataSetsToParse = new boolean[dataSets.length];
    int lastDataSetToParse = 0;
    for (int i = 0; i < dataSetsToParse.length; i++) {
        dataSetsToParse[i] = false;
    }
    int[] selectedIndices = selector.getSelectedIndices();
    for (int i = 0; i < selectedIndices.length; i++) {
        dataSetsToParse[selectedIndices[i]] = true;
        if (selectedIndices[i] > lastDataSetToParse)
            lastDataSetToParse = selectedIndices[i];
    }
    // Now we can go through the rest of the sets and add data where appropriate
    // Immediately after the list of samples comes the lists of reads
    String line;
    // Iterate through the number of samples
    for (int i = 0; i < n; i++) {
        if (i > lastDataSetToParse) {
            // No sense skipping through a load of data we're not going to import.
            break;
        }
        if (dataSetsToParse[i]) {
            progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
        } else {
            progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
        }
        // The first line is
        line = br.readLine();
        sections = line.split("\t");
        if (sections.length != 2) {
            throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
        }
        int readCount = Integer.parseInt(sections[0]);
        // In versions prior to 7 we encoded everything on every line separately
        if (thisDataVersion < 7) {
            for (int r = 0; r < readCount; r++) {
                if ((r % (1 + (readCount / 10))) == 0) {
                    if (dataSetsToParse[i]) {
                        progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    } else {
                        progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    }
                }
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    sections = line.split("\t");
                    Chromosome c;
                    try {
                        c = genome.getChromosome(sections[0]).chromosome();
                    } catch (IllegalArgumentException sme) {
                        progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
                        continue;
                    }
                    int start = Integer.parseInt(sections[1]);
                    int end = Integer.parseInt(sections[2]);
                    int strand = Integer.parseInt(sections[3]);
                    try {
                        dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
                    } catch (SeqMonkException ex) {
                        progressWarningReceived(ex);
                        continue;
                    }
                }
            }
        } else if (dataSets[i] instanceof PairedDataSet) {
            // Paired Data sets have a different format, with a packed position for
            // the first read, then a chromosome name and packed position for the second
            // read.
            // From v13 onwards we started entering HiC data pairs in both directions so
            // the data would be able to be read back in without sorting it.
            boolean hicDataIsPresorted = thisDataVersion >= 13;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    // A blank line indicates the end of the sample
                    if (line.length() == 0)
                        break;
                    sections = line.split("\\t");
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    // progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
                    int chrReadCount = Integer.parseInt(sections[1]);
                    for (int r = 0; r < chrReadCount; r++) {
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        /*
							 * We used to have a split("\t") here, but this turned out to be the bottleneck
							 * which hugely restricted the speed at which the data could be read.  Switching
							 * for a set of index calls and substring makes this *way* faster.
							 */
                        long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
                        Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
                        long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
                        try {
                            dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
                            dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                            continue;
                        }
                    }
                }
            }
        } else {
            // In versions after 7 we split the section up into chromosomes
            // so we don't put the chromosome on every line, and we put out
            // the packed double value rather than the individual start, end
            // and strand
            // As of version 12 we collapse repeated reads into one line with
            // a count after it, so we need to check for this.
            // We keep count of reads processed to update the progress listeners
            int readsRead = 0;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                // A blank line indicates the end of the sample
                if (line.length() == 0)
                    break;
                if (dataSetsToParse[i]) {
                    sections = line.split("\\t");
                    // We don't try to capture this exception since we can't then process
                    // any of the reads which follow.
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    int chrReadCount = Integer.parseInt(sections[1]);
                    int tabIndexPosition = 0;
                    for (int r = 0; r < chrReadCount; r++) {
                        readsRead++;
                        if ((readsRead % (1 + (readCount / 10))) == 0) {
                            if (dataSetsToParse[i]) {
                                progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            } else {
                                progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            }
                        }
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        long packedPosition;
                        try {
                            if (line.indexOf("\t") != -1) {
                                // We have an index with a count
                                sections = line.split("\t");
                                tabIndexPosition = line.indexOf("\t");
                                packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
                                int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
                                // TODO: Does this break the progress bar sometimes?
                                r += (count - 1);
                                readsRead += (count - 1);
                                for (int x = 1; x <= count; x++) {
                                    dataSets[i].addData(c, packedPosition);
                                }
                            } else {
                                packedPosition = Long.parseLong(line);
                                dataSets[i].addData(c, packedPosition);
                            }
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                        }
                    }
                }
            }
        }
        if (dataSetsToParse[i]) {
            dataSets[i].finalise();
        }
    }
    // Finally we need to make up an array of just the datasets we actually read
    Vector<DataSet> keepers = new Vector<DataSet>();
    for (int i = 0; i < dataSets.length; i++) {
        if (dataSetsToParse[i]) {
            keepers.add(dataSets[i]);
        }
    }
    return keepers.toArray(new DataSet[0]);
}
Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector)

Example 19 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class FourCEnrichmentQuantitation method quantitate.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Quantitation.Quantitation#quantitate(uk.ac.babraham.SeqMonk.DataTypes.DataStore[])
	 */
public void quantitate(DataStore[] data) {
    // Find the 4C looking data stores amongst the set we've been given
    Vector<DataSet> fourCStores = new Vector<DataSet>();
    for (int d = 0; d < data.length; d++) {
        if (data[d] instanceof HiCDataStore && ((HiCDataStore) data[d]).isValidHiC()) {
            progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which was HiC data"));
            continue;
        } else if (!(data[d] instanceof DataSet)) {
            // We can only analyse direct 4C data sets
            progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which wasn't a DataSet"));
            continue;
        } else if (!((DataSet) data[d]).fileName().startsWith("HiC other end")) {
            progressWarningReceived(new SeqMonkException("Skipping " + data[d] + " which didn't look like a 4C data set"));
            continue;
        }
        fourCStores.add((DataSet) data[d]);
    }
    this.data = fourCStores.toArray(new DataSet[0]);
    Thread t = new Thread(this);
    cancel = false;
    t.start();
}
Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector)

Example 20 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class HiCPCADomainQuantitation method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // We're going to go through the probes one chromosome at a time so we
    // can reduce the complexity we have to deal with
    Chromosome[] chromosomes = application.dataCollection().genome().getAllChromosomes();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        currentChromosome = chromosomes[c];
        Probe[] probes = application.dataCollection().probeSet().getProbesForChromosome(chromosomes[c]);
        if (probes.length < 5) {
            progressWarningReceived(new SeqMonkException("Too few probes on chromosome " + currentChromosome.name() + " - assigning zero to everything"));
            // It's not worth trying to find domains
            for (int d = 0; d < data.length; d++) {
                for (int p = 0; p < probes.length; p++) {
                    ((DataStore) data[d]).setValueForProbe(probes[p], 0f);
                }
            }
            continue;
        }
        ProbeList thisChrProbes = new ProbeList(application.dataCollection().probeSet(), chromosomes[c].name(), "", null);
        for (int p = 0; p < probes.length; p++) {
            thisChrProbes.addProbe(probes[p], 0f);
        }
        for (int d = 0; d < data.length; d++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            currentStore = data[d];
            current = (d * chromosomes.length) + c;
            total = chromosomes.length * data.length;
            progressUpdated("Processing chromosome " + chromosomes[c].name() + " for " + data[d].name(), current, total);
            HeatmapMatrix matrix = new HeatmapMatrix(data[d], new ProbeList[] { thisChrProbes }, application.dataCollection().genome(), optionsPanel.minDistance(), optionsPanel.maxDistance(), optionsPanel.minStrength(), optionsPanel.maxSignificance(), optionsPanel.minAbsolute(), optionsPanel.correctLinkage());
            matrix.addProgressListener(this);
            wait = true;
            matrix.startCalculating();
            while (wait) {
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                }
            }
            if (cancel) {
                progressCancelled();
                return;
            }
            if (matrix.filteredInteractions().length < 10) {
                progressWarningReceived(new SeqMonkException("Too few interactions on chromosome " + currentChromosome.name() + " for " + data[d].name() + " - assigning zero to everything"));
                // not going to get a sensible answer anyway.
                for (int p = 0; p < probes.length; p++) {
                    ((DataStore) data[d]).setValueForProbe(probes[p], 0f);
                }
                continue;
            }
            InteractionClusterMatrix clusterMatrix = new InteractionClusterMatrix(matrix.filteredInteractions(), probes.length);
            clusterMatrix.addListener(this);
            wait = true;
            clusterMatrix.startCorrelating();
            while (wait) {
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                }
            }
            float[][] correlationMatrix = clusterMatrix.correlationMatix();
            // Annoyingly the PCA needs a double [][]
            double[][] correlationMatrixDouble = new double[correlationMatrix.length][];
            for (int i = 0; i < correlationMatrix.length; i++) {
                double[] db = new double[correlationMatrix[i].length];
                for (int j = 0; j < db.length; j++) {
                    db[j] = correlationMatrix[i][j];
                }
                correlationMatrixDouble[i] = db;
            }
            // Now we can calculate the PCA values from the correlation matrix
            PCA pca = new PCA(correlationMatrixDouble);
            pca.addProgressListener(this);
            wait = true;
            pca.startCalculating();
            while (wait) {
                try {
                    Thread.sleep(100);
                } catch (InterruptedException e) {
                }
            }
            double[] extractedEigenValues = pca.extractedEigenValues();
            // for these probes
            for (int p = 0; p < probes.length; p++) {
                ((DataStore) data[d]).setValueForProbe(probes[p], (float) extractedEigenValues[p]);
            }
        }
        thisChrProbes.delete();
    }
    quantitatonComplete();
}
Also used : ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) InteractionClusterMatrix(uk.ac.babraham.SeqMonk.DataTypes.Interaction.InteractionClusterMatrix) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) HeatmapMatrix(uk.ac.babraham.SeqMonk.DataTypes.Interaction.HeatmapMatrix) PCA(uk.ac.babraham.SeqMonk.Analysis.Statistics.PCA) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException)

Aggregations

SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)91 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)49 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)30 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)22 Vector (java.util.Vector)21 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)20 File (java.io.File)19 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)17 BufferedReader (java.io.BufferedReader)16 FileReader (java.io.FileReader)16 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)14 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)13 FileInputStream (java.io.FileInputStream)11 IOException (java.io.IOException)11 InputStreamReader (java.io.InputStreamReader)11 GZIPInputStream (java.util.zip.GZIPInputStream)11 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)8 ProgressListener (uk.ac.babraham.SeqMonk.DataTypes.ProgressListener)8 FileNotFoundException (java.io.FileNotFoundException)7 SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)7