Search in sources :

Example 6 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class BedPEFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // System.err.println("Started parsing BED files");
    int extendBy = prefs.extendReads();
    try {
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                // System.err.println(line);
                // Thread.sleep(200);
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 6) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                // move on quickly.
                if (sections[0].equals(".") || sections[3].equals("."))
                    continue;
                int strand1;
                int start1;
                int end1;
                int strand2;
                int start2;
                int end2;
                try {
                    // The start is zero indexed so we need to add 1 to get genomic positions
                    start1 = Integer.parseInt(sections[1]) + 1;
                    start2 = Integer.parseInt(sections[4]) + 1;
                    // The end is zero indexed, but not included in the feature position so
                    // we need to add one to get genomic coordinates, but subtract one to not
                    // include the final base.
                    end1 = Integer.parseInt(sections[2]);
                    end2 = Integer.parseInt(sections[5]);
                    // End must always be later than start
                    if (start1 > end1) {
                        progressWarningReceived(new SeqMonkException("End position1 " + end1 + " was lower than start position " + start1));
                        int temp = start1;
                        start1 = end1;
                        end1 = temp;
                    }
                    if (start2 > end2) {
                        progressWarningReceived(new SeqMonkException("End position2 " + end2 + " was lower than start position " + start2));
                        int temp = start2;
                        start2 = end2;
                        end2 = temp;
                    }
                    if (sections.length >= 10) {
                        if (sections[8].equals("+")) {
                            strand1 = Location.FORWARD;
                        } else if (sections[8].equals("-")) {
                            strand1 = Location.REVERSE;
                        } else if (sections[8].equals(".")) {
                            strand1 = Location.UNKNOWN;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[8] + "' marked as unknown strand"));
                            strand1 = Location.UNKNOWN;
                        }
                        if (sections[9].equals("+")) {
                            strand2 = Location.FORWARD;
                        } else if (sections[9].equals("-")) {
                            strand2 = Location.REVERSE;
                        } else if (sections[9].equals(".")) {
                            strand2 = Location.UNKNOWN;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[9] + "' marked as unknown strand"));
                            strand2 = Location.UNKNOWN;
                        }
                    // if (extendBy > 0) {
                    // if (strand==Location.REVERSE) {
                    // start -=extendBy;
                    // }
                    // else if (strand==Location.FORWARD) {
                    // end+=extendBy;
                    // }
                    // }
                    } else {
                        strand1 = Location.UNKNOWN;
                        strand2 = Location.UNKNOWN;
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                    continue;
                }
                try {
                    ChromosomeWithOffset c1 = dataCollection().genome().getChromosome(sections[0]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start1 = c1.position(start1);
                    end1 = c1.position(end1);
                    if (end1 > c1.chromosome().length()) {
                        int overrun = end1 - c1.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end1 + " was " + overrun + "bp beyond the end of chr" + c1.chromosome().name() + " (" + c1.chromosome().length() + ")"));
                        continue;
                    }
                    ChromosomeWithOffset c2 = dataCollection().genome().getChromosome(sections[3]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start2 = c2.position(start2);
                    end2 = c2.position(end2);
                    if (end2 > c2.chromosome().length()) {
                        int overrun = end2 - c2.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end2 + " was " + overrun + "bp beyond the end of chr" + c2.chromosome().name() + " (" + c2.chromosome().length() + ")"));
                        continue;
                    }
                    // add them.  There's nothing clever to do.
                    if (prefs.isHiC()) {
                        long read1 = SequenceRead.packPosition(start1, end1, strand1);
                        newData[f].addData(c1.chromosome(), read1);
                        long read2 = SequenceRead.packPosition(start2, end2, strand2);
                        newData[f].addData(c2.chromosome(), read2);
                    } else {
                        // If they're on different chromosomes then we kick them out
                        if (!c1.chromosome().name().equals(c2.chromosome().name())) {
                            progressWarningReceived(new SeqMonkException("Paried reads were on different chromosomes - discarding"));
                            continue;
                        }
                        if (strand1 == Location.FORWARD && strand2 != Location.REVERSE) {
                            progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
                            continue;
                        }
                        if (strand1 == Location.REVERSE && strand2 != Location.FORWARD) {
                            progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
                            continue;
                        }
                        // If they're too far apart we kick them out
                        int start = 1;
                        int end = 0;
                        // We take the strand from read1
                        int strand = strand1;
                        if (strand == Location.FORWARD) {
                            start = start1;
                            end = end2;
                        } else if (strand == Location.REVERSE) {
                            start = start2;
                            end = end1;
                        } else if (strand == Location.UNKNOWN) {
                            start = Math.min(start1, start2);
                            end = Math.max(end1, end2);
                        }
                        if (end <= start) {
                            progressWarningReceived(new SeqMonkException("Incorrectly oriented reads - discarding"));
                            continue;
                        }
                        if ((end - start) + 1 > prefs.pairDistanceCutoff()) {
                            progressWarningReceived(new SeqMonkException("Distance between reads too great (" + (((end - start) + 1) - prefs.pairDistanceCutoff()) + ")"));
                            continue;
                        }
                        long read = SequenceRead.packPosition(start, end, strand);
                        newData[f].addData(c1.chromosome(), read);
                    }
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 7 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class BowtieFileParser method getSingleEndRead.

/**
 * Gets a single single end read.
 *
 * @param sections The set of tab-split strings from the bowtie output
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
    int strand;
    int start;
    int end;
    try {
        start = Integer.parseInt(sections[3]) + 1;
        end = start + (sections[4].length() - 1);
        if (sections[1].equals("+")) {
            strand = Location.FORWARD;
        } else if (sections[1].equals("-")) {
            strand = Location.REVERSE;
        } else {
            strand = Location.UNKNOWN;
        }
        if (extendBy > 0) {
            if (strand == Location.FORWARD) {
                end += extendBy;
            } else if (strand == Location.REVERSE) {
                start -= extendBy;
            }
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections[3] + " was not an integer");
    }
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(sections[2]);
    } catch (Exception iae) {
        throw new SeqMonkException(iae.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 8 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class GffFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    File[] probeFiles = getFiles();
    DataSet[] newData = new DataSet[probeFiles.length];
    int extendBy = prefs.extendReads();
    try {
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int strand;
                int start;
                int end;
                try {
                    start = Integer.parseInt(sections[3]);
                    end = Integer.parseInt(sections[4]);
                    // End must always be later than start
                    if (start > end) {
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                    if (sections.length >= 7) {
                        if (sections[6].equals("+")) {
                            strand = Location.FORWARD;
                        } else if (sections[6].equals("-")) {
                            strand = Location.REVERSE;
                        } else if (sections[6].equals(".")) {
                            strand = Location.UNKNOWN;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[6] + "' marked as unknown strand"));
                            strand = Location.UNKNOWN;
                        }
                    } else {
                        strand = Location.UNKNOWN;
                    }
                    if (extendBy > 0) {
                        if (strand == Location.FORWARD) {
                            end += extendBy;
                        } else if (strand == Location.REVERSE) {
                            start -= extendBy;
                        }
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
                    continue;
                }
                ChromosomeWithOffset c;
                try {
                    c = dataCollection().genome().getChromosome(sections[0]);
                } catch (IllegalArgumentException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
                start = c.position(start);
                end = c.position(end);
                // We also don't allow readings which are beyond the end of the chromosome
                if (end > c.chromosome().length()) {
                    int overrun = end - c.chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                    continue;
                }
                // We can now make the new reading
                try {
                    long read = SequenceRead.packPosition(start, end, strand);
                    newData[f].addData(c.chromosome(), read);
                } catch (SeqMonkException e) {
                    progressWarningReceived(e);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 9 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class GenericAnnotationParser method parseAnnotation.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
	 */
public AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
    this.file = file;
    if (options == null) {
        options = new JDialog(SeqMonkApplication.getInstance());
        options.setModal(true);
        options.setContentPane(new GenericAnnotationParserOptions(options));
        options.setSize(700, 400);
        options.setLocationRelativeTo(null);
    }
    // We have to set cancel to true as a default so we don't try to
    // proceed with processing if the user closes the options using
    // the X on the window.
    options.setTitle("Format for " + file.getName() + "...");
    cancel = true;
    options.setVisible(true);
    if (cancel) {
        progressCancelled();
        return null;
    }
    // We keep track of how many types have been added
    // to catch if someone sets the wrong field and makes
    // loads of different feature types.
    HashSet<String> addedTypes = new HashSet<String>();
    BufferedReader br;
    if (file.getName().toLowerCase().endsWith(".gz")) {
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
    } else {
        br = new BufferedReader(new FileReader(file));
    }
    String line;
    // First skip the header lines
    for (int i = 0; i < startRowValue; i++) {
        line = br.readLine();
        if (line == null) {
            throw new Exception("Ran out of file before skipping all of the header lines");
        }
    }
    int maxIndexValue = 0;
    if (chrColValue > maxIndexValue)
        maxIndexValue = chrColValue;
    if (startColValue > maxIndexValue)
        maxIndexValue = startColValue;
    if (endColValue > maxIndexValue)
        maxIndexValue = endColValue;
    if (strandColValue > maxIndexValue)
        maxIndexValue = strandColValue;
    if (typeColValue > maxIndexValue)
        maxIndexValue = typeColValue;
    if (nameColValue > maxIndexValue)
        maxIndexValue = nameColValue;
    if (descriptionColValue > maxIndexValue)
        maxIndexValue = descriptionColValue;
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
    annotationSets.add(currentAnnotation);
    int lineCount = 0;
    // Now process the rest of the file
    while ((line = br.readLine()) != null) {
        ++lineCount;
        if (cancel) {
            progressCancelled();
            return null;
        }
        if (lineCount % 1000 == 0) {
            progressUpdated("Read " + lineCount + " lines from " + file.getName(), 0, 1);
        }
        if (lineCount > 1000000 && lineCount % 1000000 == 0) {
            progressUpdated("Caching...", 0, 1);
            currentAnnotation.finalise();
            currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
            annotationSets.add(currentAnnotation);
        }
        String[] sections = line.split(delimitersValue, -1);
        // Check to see if we've got enough data to work with
        if (maxIndexValue >= sections.length) {
            progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
            // Skip this line...
            continue;
        }
        int strand;
        int start;
        int end;
        try {
            start = Integer.parseInt(sections[startColValue]);
            end = Integer.parseInt(sections[endColValue]);
            // End must always be later than start
            if (end < start) {
                int temp = start;
                start = end;
                end = temp;
            }
            if (useStrand) {
                if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
                    strand = Location.FORWARD;
                } else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
                    strand = Location.REVERSE;
                } else {
                    progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
                    strand = Location.UNKNOWN;
                }
            } else {
                strand = Location.UNKNOWN;
            }
        } catch (NumberFormatException e) {
            progressWarningReceived(new SeqMonkException("Location " + sections[startColValue] + "-" + sections[endColValue] + " was not an integer"));
            continue;
        }
        ChromosomeWithOffset c;
        try {
            c = genome.getChromosome(sections[chrColValue]);
        } catch (IllegalArgumentException sme) {
            progressWarningReceived(sme);
            continue;
        }
        end = c.position(end);
        start = c.position(start);
        // We also don't allow readings which are beyond the end of the chromosome
        if (end > c.chromosome().length()) {
            int overrun = end - c.chromosome().length();
            progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
            continue;
        }
        if (start < 1) {
            progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
            continue;
        }
        // Now we can add the type, name and description
        String type;
        // If there's a column containing the type then use that
        if (typeColValue >= 0) {
            type = sections[typeColValue];
        } else // If not then use the manually specified type if they set it
        if (featureType != null && featureType.length() > 0) {
            type = featureType;
        } else // If all else fails use the file name as the type
        {
            type = file.getName();
        }
        if (!addedTypes.contains(type)) {
            addedTypes.add(type);
            if (addedTypes.size() > 100) {
                throw new SeqMonkException("More than 100 different types of feature added - you don't want to do this!");
            }
        }
        String name = null;
        if (nameColValue >= 0) {
            name = sections[nameColValue];
        }
        String description = null;
        if (descriptionColValue >= 0) {
            description = sections[descriptionColValue];
        }
        // We can now make the new annotation
        Feature feature = new Feature(type, c.chromosome().name());
        if (name != null) {
            feature.addAttribute("name", name);
        }
        if (description != null)
            feature.addAttribute("description", description);
        feature.setLocation(new Location(start, end, strand));
        currentAnnotation.addFeature(feature);
    // System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
    }
    // We're finished with the file.
    br.close();
    options.dispose();
    options = null;
    return annotationSets.toArray(new AnnotationSet[0]);
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) JDialog(javax.swing.JDialog) HashSet(java.util.HashSet) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 10 with ChromosomeWithOffset

use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.

the class BAMFileParser method getSplitSingleEndRead.

/**
 * Gets a split single end read.  The only reason for asking about whether the
 * import is single or paired end is that if we're doing a paired end import
 * then we reverse the strand for all second read data.  For single end import
 * we leave the strand alone whichever read it originates from.
 *
 * @param samRecord The picard record entry for this read
 * @param flag pairedEnd Is this a paired end import
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int lastEnd = -1;
    start = samRecord.getAlignmentStart();
    if (samRecord.getReadNegativeStrandFlag()) {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.FORWARD;
        } else {
            strand = Location.REVERSE;
        }
    } else {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.REVERSE;
        } else {
            strand = Location.FORWARD;
        }
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (!samRecord.getCigarString().contains("N")) {
        if (!importIntrons) {
            int end = c.position(samRecord.getAlignmentEnd());
            return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
        }
        return new SequenceReadWithChromosome[0];
    }
    String[] cigarOperations = samRecord.getCigarString().split("\\d+");
    String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
    if (cigarOperations.length != cigarNumbers.length + 1) {
        throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
    }
    Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
    int currentPosition = start;
    for (int pos = 0; pos < cigarNumbers.length; pos++) {
        if (cigarOperations[pos + 1].equals("M")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("I")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("D")) {
            currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("N")) {
            if (importIntrons) {
                if (lastEnd > 0) {
                    if (start > c.chromosome().length()) {
                        int overrun = (start - 1) - c.chromosome().length();
                        throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                    }
                    newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
                }
                // Update the lastEnd whether we added a read or not since this
                // will be the start of the next intron
                lastEnd = currentPosition;
            } else {
                // We also don't allow readings which are beyond the end of the chromosome
                if (currentPosition > c.chromosome().length()) {
                    int overrun = currentPosition - c.chromosome().length();
                    throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                }
                newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
            }
            currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
            start = currentPosition;
        }
    }
    if (importIntrons) {
        if (lastEnd > 0) {
            if (start > c.chromosome().length()) {
                int overrun = (start - 1) - c.chromosome().length();
                throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
            }
            newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
        }
    } else {
        // We also don't allow readings which are beyond the end of the chromosome
        if (currentPosition > c.chromosome().length()) {
            int overrun = currentPosition - c.chromosome().length();
            throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
        }
        newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
    }
    return newReads.toArray(new SequenceReadWithChromosome[0]);
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Aggregations

SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)14 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)14 BufferedReader (java.io.BufferedReader)9 FileInputStream (java.io.FileInputStream)9 FileReader (java.io.FileReader)9 InputStreamReader (java.io.InputStreamReader)9 GZIPInputStream (java.util.zip.GZIPInputStream)9 File (java.io.File)7 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)7 SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)5 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)4 Vector (java.util.Vector)3 AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)2 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)2 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)2 Enumeration (java.util.Enumeration)1 HashSet (java.util.HashSet)1 Hashtable (java.util.Hashtable)1 JDialog (javax.swing.JDialog)1 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)1