Search in sources :

Example 1 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class GFF3AnnotationParser method parseAnnotation.

public AnnotationSet[] parseAnnotation(File file, Genome genome, String prefix) throws Exception {
    System.err.println("Parsing " + file);
    if (prefix == null) {
        featurePrefix = JOptionPane.showInputDialog(SeqMonkApplication.getInstance(), "Feature prefix", "GFFv3/GTP Options", JOptionPane.QUESTION_MESSAGE);
    } else {
        featurePrefix = prefix;
    }
    if (featurePrefix == null)
        featurePrefix = "";
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
    annotationSets.add(currentAnnotation);
    Hashtable<String, FeatureGroup> groupedFeatures = new Hashtable<String, FeatureGroup>();
    BufferedReader br;
    if (file.getName().toLowerCase().endsWith(".gz")) {
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
    } else {
        br = new BufferedReader(new FileReader(file));
    }
    String line;
    int count = 0;
    while ((line = br.readLine()) != null) {
        if (cancel) {
            progressCancelled();
            br.close();
            return null;
        }
        if (count % 1000 == 0) {
            progressUpdated("Read " + count + " lines from " + file.getName(), 0, 1);
        }
        if (count > 1000000 && count % 1000000 == 0) {
            progressUpdated("Caching...", 0, 1);
            currentAnnotation.finalise();
            currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
            annotationSets.add(currentAnnotation);
        }
        ++count;
        // Ignore blank lines
        if (line.trim().length() == 0)
            continue;
        // Skip comments
        if (line.startsWith("#"))
            continue;
        String[] sections = line.split("\t");
        // Check to see if we've got enough data to work with
        if (sections.length < 7) {
            progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
            continue;
        }
        int strand;
        int start;
        int end;
        try {
            start = Integer.parseInt(sections[3]);
            end = Integer.parseInt(sections[4]);
            // End must always be later than start
            if (end < start) {
                int temp = start;
                start = end;
                end = temp;
            }
            if (sections.length >= 7) {
                if (sections[6].equals("+")) {
                    strand = Location.FORWARD;
                } else if (sections[6].equals("-")) {
                    strand = Location.REVERSE;
                } else {
                    strand = Location.UNKNOWN;
                }
            } else {
                strand = Location.UNKNOWN;
            }
        } catch (NumberFormatException e) {
            progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
            continue;
        }
        ChromosomeWithOffset c;
        try {
            c = genome.getChromosome(sections[0]);
        } catch (IllegalArgumentException e) {
            progressWarningReceived(new SeqMonkException("Couldn't find a chromosome called " + sections[0]));
            continue;
        }
        start = c.position(start);
        end = c.position(end);
        // We also don't allow readings which are beyond the end of the chromosome
        if (end > c.chromosome().length()) {
            int overrun = end - c.chromosome().length();
            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
            continue;
        }
        if (sections.length > 8 && sections[8].trim().length() > 0) {
            // Should check for escaped colons
            String[] attributes = sections[8].split(" *; *");
            // Make up a data structure of the attributes we have
            Hashtable<String, Vector<String>> keyValuePairs = new Hashtable<String, Vector<String>>();
            for (int a = 0; a < attributes.length; a++) {
                // Should check for escaped equals
                String[] keyValue = attributes[a].split("=", 2);
                // See if we didn't get split
                if (keyValue.length == 1) {
                    // This could be a GTF file which uses quoted values in space delimited fields
                    keyValue = attributes[a].split(" \"");
                    if (keyValue.length == 2) {
                        // We need to remove the quote from the end of the value
                        keyValue[1] = keyValue[1].substring(0, keyValue[1].length() - 1);
                    // System.out.println("Key='"+keyValue[0]+"' value='"+keyValue[1]+"'");
                    }
                }
                if (keyValue.length == 2) {
                    if (keyValuePairs.containsKey(keyValue[0])) {
                        keyValuePairs.get(keyValue[0]).add(keyValue[1]);
                    } else {
                        Vector<String> newVector = new Vector<String>();
                        newVector.add(keyValue[1]);
                        keyValuePairs.put(keyValue[0], newVector);
                    }
                } else {
                    progressWarningReceived(new SeqMonkException("No key value delimiter in " + attributes[a]));
                }
            }
            if (keyValuePairs.containsKey("Parent") && !sections[2].equals("mRNA")) {
                // We change exons to mRNA so we don't end up with spliced exon objects
                if (sections[2].equals("exon"))
                    sections[2] = "mRNA";
                String[] parents = keyValuePairs.get("Parent").elementAt(0).split(",");
                for (int p = 0; p < parents.length; p++) {
                    if (!groupedFeatures.containsKey(sections[2] + "_" + parents[p])) {
                        // Make a new feature to which we can add this
                        Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                        groupedFeatures.put(sections[2] + "_" + parents[p], new FeatureGroup(feature, strand, feature.location()));
                        Enumeration<String> en = keyValuePairs.keys();
                        while (en.hasMoreElements()) {
                            String key = en.nextElement();
                            String[] values = keyValuePairs.get(key).toArray(new String[0]);
                            for (int v = 0; v < values.length; v++) {
                                feature.addAttribute(key, values[v]);
                            }
                        }
                    }
                    groupedFeatures.get(sections[2] + "_" + parents[p]).addSublocation(new Location(start, end, strand));
                }
            } else // parent feature
            if (keyValuePairs.containsKey("transcript_id")) {
                if (sections[2].equals("exon"))
                    sections[2] = "mRNA";
                if (!groupedFeatures.containsKey(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0))) {
                    Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                    Enumeration<String> en = keyValuePairs.keys();
                    while (en.hasMoreElements()) {
                        String key = en.nextElement();
                        String[] values = keyValuePairs.get(key).toArray(new String[0]);
                        for (int v = 0; v < values.length; v++) {
                            feature.addAttribute(key, values[v]);
                        }
                    }
                    groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
                }
                groupedFeatures.get(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0)).addSublocation(new Location(start, end, strand));
            } else {
                // If we get here we're making a feature with attributes
                Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                feature.setLocation(new Location(start, end, strand));
                Enumeration<String> en = keyValuePairs.keys();
                while (en.hasMoreElements()) {
                    String key = en.nextElement();
                    String[] values = keyValuePairs.get(key).toArray(new String[0]);
                    for (int v = 0; v < values.length; v++) {
                        feature.addAttribute(key, values[v]);
                    }
                }
                if (keyValuePairs.containsKey("ID")) {
                    // This is a feature which may end up having subfeatures
                    groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("ID").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
                // System.out.println("Making new entry for "+keyValuePairs.get("ID").elementAt(0));
                } else {
                    // We can just add this to the annotation collection
                    currentAnnotation.addFeature(feature);
                }
            }
        } else {
            // No group parameter to worry about
            Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
            feature.setLocation(new Location(start, end, strand));
            currentAnnotation.addFeature(feature);
        }
    }
    br.close();
    // Now go through the grouped features adding them to the annotation set
    Iterator<FeatureGroup> i = groupedFeatures.values().iterator();
    while (i.hasNext()) {
        Feature f = i.next().feature();
        currentAnnotation.addFeature(f);
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) GZIPInputStream(java.util.zip.GZIPInputStream) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) Enumeration(java.util.Enumeration) InputStreamReader(java.io.InputStreamReader) Hashtable(java.util.Hashtable) FileInputStream(java.io.FileInputStream) BufferedReader(java.io.BufferedReader) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 2 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class BedFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // System.err.println("Started parsing BED files");
    int extendBy = prefs.extendReads();
    try {
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 3) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int strand;
                int start;
                int end;
                try {
                    // The start is zero indexed so we need to add 1 to get genomic positions
                    start = Integer.parseInt(sections[1]) + 1;
                    // The end is zero indexed, but not included in the feature position so
                    // we need to add one to get genomic coordinates, but subtract one to not
                    // include the final base.
                    end = Integer.parseInt(sections[2]);
                    // End must always be later than start
                    if (start > end) {
                        progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                    if (sections.length >= 6) {
                        if (sections[5].equals("+")) {
                            strand = Location.FORWARD;
                        } else if (sections[5].equals("-")) {
                            strand = Location.REVERSE;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[5] + "' marked as unknown strand"));
                            strand = Location.UNKNOWN;
                        }
                        if (extendBy > 0) {
                            if (strand == Location.REVERSE) {
                                start -= extendBy;
                            } else if (strand == Location.FORWARD) {
                                end += extendBy;
                            }
                        }
                    } else {
                        strand = Location.UNKNOWN;
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                    continue;
                }
                try {
                    ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start = c.position(start);
                    end = c.position(end);
                    if (end > c.chromosome().length()) {
                        int overrun = end - c.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                        continue;
                    }
                    // We can now make the new reading
                    long read = SequenceRead.packPosition(start, end, strand);
                    newData[f].addData(c.chromosome(), read);
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 3 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class GenericSeqReadParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    try {
        // This call just makes sure that the options panel exists if
        // it's never been called for before.
        getOptionsPanel();
        int removeDuplicates = optionsPanel.removeDuplicates();
        int extendBy = optionsPanel.extendBy();
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            // First skip the header lines
            for (int i = 0; i < startRowValue; i++) {
                line = br.readLine();
                if (line == null) {
                    br.close();
                    throw new Exception("Ran out of file before skipping all of the header lines");
                }
            }
            int maxIndexValue = 0;
            if (chrColValue > maxIndexValue)
                maxIndexValue = chrColValue;
            if (startColValue > maxIndexValue)
                maxIndexValue = startColValue;
            if (endColValue > maxIndexValue)
                maxIndexValue = endColValue;
            if (strandColValue > maxIndexValue)
                maxIndexValue = strandColValue;
            if (countColValue > maxIndexValue)
                maxIndexValue = countColValue;
            if (optionsPanel.isHiC()) {
                int distance = 0;
                if (optionsPanel.hiCDistance.getText().length() > 0) {
                    distance = Integer.parseInt(optionsPanel.hiCDistance.getText());
                }
                // TODO: Add an option to remove trans hits when importing from the generic parser.
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates, distance, false);
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates);
            }
            int lineCount = 0;
            // Now process the rest of the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split(delimitersValue);
                // Check to see if we've got enough data to work with
                if (maxIndexValue >= sections.length) {
                    progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int strand;
                int start;
                int end;
                int count = 1;
                try {
                    start = Integer.parseInt(sections[startColValue].replaceAll(" ", ""));
                    end = Integer.parseInt(sections[endColValue].replaceAll(" ", ""));
                    // End must always be later than start
                    if (end < start) {
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                    if (countColValue != -1 && sections[countColValue].length() > 0) {
                        try {
                            count = Integer.parseInt(sections[countColValue].replaceAll(" ", ""));
                        } catch (NumberFormatException e) {
                            progressWarningReceived(new SeqMonkException("Count value " + sections[countColValue] + " was not an integer"));
                            continue;
                        }
                    }
                    if (useStrand) {
                        sections[strandColValue] = sections[strandColValue].replaceAll(" ", "");
                        if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
                            strand = Location.FORWARD;
                        } else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
                            strand = Location.REVERSE;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
                            strand = Location.UNKNOWN;
                        }
                    } else {
                        strand = Location.UNKNOWN;
                    }
                    if (extendBy > 0) {
                        if (strand == Location.REVERSE) {
                            start -= extendBy;
                        } else {
                            end += extendBy;
                        }
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location '" + sections[startColValue] + "'-'" + sections[endColValue] + "' was not an integer"));
                    continue;
                }
                ChromosomeWithOffset c;
                try {
                    c = dataCollection().genome().getChromosome(sections[chrColValue]);
                } catch (IllegalArgumentException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
                start = c.position(start);
                end = c.position(end);
                // We also don't allow readings which are beyond the end of the chromosome
                if (end > c.chromosome().length()) {
                    int overrun = end - c.chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                    continue;
                }
                if (start < 1) {
                    progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
                    continue;
                }
                // We can now make the new reading
                try {
                    long read = SequenceRead.packPosition(start, end, strand);
                    for (int i = 0; i < count; i++) {
                        newData[f].addData(c.chromosome(), read);
                    }
                } catch (SeqMonkException e) {
                    progressWarningReceived(e);
                    continue;
                }
            // System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 4 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class MethylKitFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    try {
        File[] methylKitFiles = getFiles();
        DataSet[] newData = new DataSet[methylKitFiles.length];
        for (int f = 0; f < methylKitFiles.length; f++) {
            BufferedReader br;
            if (methylKitFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(methylKitFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(methylKitFiles[f]));
            }
            String line;
            newData[f] = new DataSet(methylKitFiles[f].getName(), methylKitFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                // In case it has comments
                if (line.startsWith("#"))
                    continue;
                // This is the start of the header
                if (line.startsWith("chrBase"))
                    continue;
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + methylKitFiles[f].getName(), f, methylKitFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 6) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int position;
                int totalCount;
                int methCount;
                int unmethCount;
                try {
                    position = Integer.parseInt(sections[2]);
                    totalCount = Integer.parseInt(sections[4]);
                    methCount = Math.round((Float.parseFloat(sections[5]) / 100) * totalCount);
                    unmethCount = Math.round((Float.parseFloat(sections[6]) / 100) * totalCount);
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Failed to parse position and counts from " + line));
                    continue;
                }
                try {
                    ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[1]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    if (position > c.chromosome().length()) {
                        int overrun = position - c.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + position + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                        continue;
                    }
                    // We can now make the new reads
                    long methRead = SequenceRead.packPosition(position, position, Location.FORWARD);
                    long unmethRead = SequenceRead.packPosition(position, position, Location.REVERSE);
                    for (int i = 0; i < methCount; i++) {
                        newData[f].addData(c.chromosome(), methRead);
                    }
                    for (int i = 0; i < unmethCount; i++) {
                        newData[f].addData(c.chromosome(), unmethRead);
                    }
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + methylKitFiles[f].getName(), f, methylKitFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 5 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class BAMFileParser method getPairedEndRead.

/**
 * Gets a paired end read.  This method assumes that it will only be passed reads which map
 * to the reverse strand since these are the ones which contain enough information to
 * unambiguously locate both ends of the pair.
 *
 * @param sections The tab split sections from the SAM file
 * @param flag The binary flag field
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getPairedEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int end;
    if (!samRecord.getReadNegativeStrandFlag()) {
        throw new SeqMonkException("Read passed to parse pair was not on the negative strand");
    }
    if (samRecord.getMateNegativeStrandFlag()) {
        throw new SeqMonkException("Ignored discordantly stranded read pair");
    }
    end = samRecord.getAlignmentEnd();
    start = samRecord.getMateAlignmentStart();
    if (start > end) {
        throw new SeqMonkException("Ignored discordantly stranded read pair");
    }
    if (samRecord.getFirstOfPairFlag()) {
        strand = Location.REVERSE;
    } else {
        strand = Location.FORWARD;
    }
    if ((end - start) + 1 > pairedEndDistance) {
        throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Aggregations

SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)14 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)14 BufferedReader (java.io.BufferedReader)9 FileInputStream (java.io.FileInputStream)9 FileReader (java.io.FileReader)9 InputStreamReader (java.io.InputStreamReader)9 GZIPInputStream (java.util.zip.GZIPInputStream)9 File (java.io.File)7 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)7 SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)5 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)4 Vector (java.util.Vector)3 AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)2 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)2 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)2 Enumeration (java.util.Enumeration)1 HashSet (java.util.HashSet)1 Hashtable (java.util.Hashtable)1 JDialog (javax.swing.JDialog)1 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)1