Search in sources :

Example 36 with DataSet

Use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

The run method of the class QuasRFileParser.

/**
 * Parses every file returned by {@code getFiles()} into its own
 * {@code DataSet} and passes the resulting array to
 * {@code processingFinished}.
 *
 * <p>Files whose name ends in ".gz" (case-insensitive) are read through a
 * {@code GZIPInputStream}; all others are read directly. Each non-blank line
 * that does not start with '#' is split on tabs and must have at least five
 * fields, used as: [0] chromosome name, [1] start, [2] end, [3] total count,
 * [4] methylated count. For every such line, {@code methCount} reads are
 * added with {@code Location.FORWARD} and {@code totalCount - methCount}
 * reads with {@code Location.REVERSE}.
 *
 * <p>Malformed lines are reported through {@code progressWarningReceived}
 * and skipped. Any other exception is reported through
 * {@code progressExceptionReceived} and aborts the whole import. If
 * {@code cancel} becomes true while a file is being read, the method calls
 * {@code progressCancelled} and returns without finishing.
 *
 * <p>The reader for each file is always closed, including when parsing
 * fails part-way through.
 *
 * @see java.lang.Runnable#run()
 */
public void run() {
    try {
        File[] quasrFiles = getFiles();
        DataSet[] newData = new DataSet[quasrFiles.length];
        for (int f = 0; f < quasrFiles.length; f++) {
            BufferedReader br;
            if (quasrFiles[f].getName().toLowerCase().endsWith(".gz")) {
                // The GZIPInputStream constructor reads the gzip header and can
                // throw; make sure the underlying file stream is not leaked then.
                FileInputStream rawStream = new FileInputStream(quasrFiles[f]);
                boolean wrapped = false;
                try {
                    br = new BufferedReader(new InputStreamReader(new GZIPInputStream(rawStream)));
                    wrapped = true;
                } finally {
                    if (!wrapped) {
                        rawStream.close();
                    }
                }
            } else {
                br = new BufferedReader(new FileReader(quasrFiles[f]));
            }
            // Cancellation is recorded and acted on after the finally block so the
            // reader is closed before progressCancelled() is called.
            boolean cancelled = false;
            try {
                String line;
                newData[f] = new DataSet(quasrFiles[f].getName(), quasrFiles[f].getCanonicalPath(), prefs.removeDuplicates());
                int lineCount = 0;
                // Now process the file
                while ((line = br.readLine()) != null) {
                    if (cancel) {
                        cancelled = true;
                        break;
                    }
                    // Ignore blank lines
                    if (line.trim().length() == 0)
                        continue;
                    // In case it has comments
                    if (line.startsWith("#"))
                        continue;
                    ++lineCount;
                    if (lineCount % 100000 == 0) {
                        progressUpdated("Read " + lineCount + " lines from " + quasrFiles[f].getName(), f, quasrFiles.length);
                    }
                    String[] sections = line.split("\t");
                    // Check to see if we've got enough data to work with
                    if (sections.length < 5) {
                        progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                        // Skip this line...
                        continue;
                    }
                    int start;
                    int end;
                    int totalCount;
                    int methCount;
                    int unmethCount;
                    try {
                        start = Integer.parseInt(sections[1]);
                        end = Integer.parseInt(sections[2]);
                        totalCount = Integer.parseInt(sections[3]);
                        methCount = Integer.parseInt(sections[4]);
                        // NOTE(review): if methCount exceeds totalCount this is negative
                        // and the unmethylated loop below simply adds nothing.
                        unmethCount = totalCount - methCount;
                        // End must always be later than start
                        if (start > end) {
                            progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                            int temp = start;
                            start = end;
                            end = temp;
                        }
                    } catch (NumberFormatException e) {
                        progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                        continue;
                    }
                    try {
                        ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
                        // We also don't allow readings which are beyond the end of the chromosome
                        start = c.position(start);
                        end = c.position(end);
                        if (end > c.chromosome().length()) {
                            int overrun = end - c.chromosome().length();
                            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                            continue;
                        }
                        // We can now make the new reads: one packed position per
                        // strand, added once per counted observation.
                        long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
                        long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);
                        for (int i = 0; i < methCount; i++) {
                            newData[f].addData(c.chromosome(), methRead);
                        }
                        for (int i = 0; i < unmethCount; i++) {
                            newData[f].addData(c.chromosome(), unmethRead);
                        }
                    } catch (IllegalArgumentException iae) {
                        progressWarningReceived(iae);
                    } catch (SeqMonkException sme) {
                        progressWarningReceived(sme);
                        continue;
                    }
                }
            } finally {
                // We're finished with the file, whether parsing succeeded,
                // was cancelled or threw.
                br.close();
            }
            if (cancelled) {
                progressCancelled();
                return;
            }
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + quasrFiles[f].getName(), f, quasrFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Aggregations

DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)36 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)22 DataGroup (uk.ac.babraham.SeqMonk.DataTypes.DataGroup)16 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)14 File (java.io.File)12 Vector (java.util.Vector)11 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)11 BufferedReader (java.io.BufferedReader)10 FileReader (java.io.FileReader)10 FileInputStream (java.io.FileInputStream)9 InputStreamReader (java.io.InputStreamReader)9 GZIPInputStream (java.util.zip.GZIPInputStream)9 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)8 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)7 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)6 JLabel (javax.swing.JLabel)5 ReplicateSet (uk.ac.babraham.SeqMonk.DataTypes.ReplicateSet)5 IOException (java.io.IOException)4 GridBagConstraints (java.awt.GridBagConstraints)3 GridBagLayout (java.awt.GridBagLayout)3