Search in sources :

Example 1 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BAMFileParser method getPairedEndRead.

/**
 * Gets a paired end read.  This method assumes that it will only be passed reads which map
 * to the reverse strand since these are the ones which contain enough information to
 * unambiguously locate both ends of the pair.
 *
 * @param sections The tab split sections from the SAM file
 * @param flag The binary flag field
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getPairedEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int end;
    if (!samRecord.getReadNegativeStrandFlag()) {
        throw new SeqMonkException("Read passed to parse pair was not on the negative strand");
    }
    if (samRecord.getMateNegativeStrandFlag()) {
        throw new SeqMonkException("Ignored discordantly stranded read pair");
    }
    end = samRecord.getAlignmentEnd();
    start = samRecord.getMateAlignmentStart();
    if (start > end) {
        throw new SeqMonkException("Ignored discordantly stranded read pair");
    }
    if (samRecord.getFirstOfPairFlag()) {
        strand = Location.REVERSE;
    } else {
        strand = Location.FORWARD;
    }
    if ((end - start) + 1 > pairedEndDistance) {
        throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 2 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BowtieFileParser method getSingleEndRead.

/**
 * Gets a single single end read.
 *
 * @param sections The set of tab-split strings from the bowtie output
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
    int strand;
    int start;
    int end;
    try {
        start = Integer.parseInt(sections[3]) + 1;
        end = start + (sections[4].length() - 1);
        if (sections[1].equals("+")) {
            strand = Location.FORWARD;
        } else if (sections[1].equals("-")) {
            strand = Location.REVERSE;
        } else {
            strand = Location.UNKNOWN;
        }
        if (extendBy > 0) {
            if (strand == Location.FORWARD) {
                end += extendBy;
            } else if (strand == Location.REVERSE) {
                start -= extendBy;
            }
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections[3] + " was not an integer");
    }
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(sections[2]);
    } catch (Exception iae) {
        throw new SeqMonkException(iae.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 3 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BAMFileParser method getSplitSingleEndRead.

/**
 * Gets a split single end read.  The only reason for asking about whether the
 * import is single or paired end is that if we're doing a paired end import
 * then we reverse the strand for all second read data.  For single end import
 * we leave the strand alone whichever read it originates from.
 *
 * @param samRecord The picard record entry for this read
 * @param flag pairedEnd Is this a paired end import
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int lastEnd = -1;
    start = samRecord.getAlignmentStart();
    if (samRecord.getReadNegativeStrandFlag()) {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.FORWARD;
        } else {
            strand = Location.REVERSE;
        }
    } else {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.REVERSE;
        } else {
            strand = Location.FORWARD;
        }
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (!samRecord.getCigarString().contains("N")) {
        if (!importIntrons) {
            int end = c.position(samRecord.getAlignmentEnd());
            return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
        }
        return new SequenceReadWithChromosome[0];
    }
    String[] cigarOperations = samRecord.getCigarString().split("\\d+");
    String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
    if (cigarOperations.length != cigarNumbers.length + 1) {
        throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
    }
    Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
    int currentPosition = start;
    for (int pos = 0; pos < cigarNumbers.length; pos++) {
        if (cigarOperations[pos + 1].equals("M")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("I")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("D")) {
            currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("N")) {
            if (importIntrons) {
                if (lastEnd > 0) {
                    if (start > c.chromosome().length()) {
                        int overrun = (start - 1) - c.chromosome().length();
                        throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                    }
                    newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
                }
                // Update the lastEnd whether we added a read or not since this
                // will be the start of the next intron
                lastEnd = currentPosition;
            } else {
                // We also don't allow readings which are beyond the end of the chromosome
                if (currentPosition > c.chromosome().length()) {
                    int overrun = currentPosition - c.chromosome().length();
                    throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                }
                newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
            }
            currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
            start = currentPosition;
        }
    }
    if (importIntrons) {
        if (lastEnd > 0) {
            if (start > c.chromosome().length()) {
                int overrun = (start - 1) - c.chromosome().length();
                throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
            }
            newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
        }
    } else {
        // We also don't allow readings which are beyond the end of the chromosome
        if (currentPosition > c.chromosome().length()) {
            int overrun = currentPosition - c.chromosome().length();
            throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
        }
        newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
    }
    return newReads.toArray(new SequenceReadWithChromosome[0]);
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 4 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BAMFileParser method getSingleEndRead.

/**
 * Gets a single end read.
 *
 * @param sections The tab split sections from the SAM file
 * @param flag The binary flag field from the file
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int end;
    start = samRecord.getAlignmentStart();
    end = samRecord.getAlignmentEnd();
    if (samRecord.getReadNegativeStrandFlag()) {
        strand = Location.REVERSE;
    } else {
        strand = Location.FORWARD;
    }
    if (extendBy > 0) {
        if (strand == Location.FORWARD) {
            end += extendBy;
        } else if (strand == Location.REVERSE) {
            start -= extendBy;
        }
    }
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception iae) {
        throw new SeqMonkException(iae.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 5 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BAMFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    separateSplicedReads = prefs.splitSplicedReads();
    importIntrons = prefs.importIntrons();
    extendBy = prefs.extendReads();
    minMappingQuality = prefs.minMappingQuality();
    primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
    File[] samFiles = getFiles();
    DataSet[] newData = new DataSet[samFiles.length];
    try {
        for (int f = 0; f < samFiles.length; f++) {
            SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
            SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            // A flag we can set to skip the next record if we're getting
            // out of sync during single end HiC import.
            boolean skipNext = false;
            for (SAMRecord samRecord : inputSam) {
                if (skipNext) {
                    skipNext = false;
                    continue;
                }
                if (cancel) {
                    inputSam.close();
                    progressCancelled();
                    return;
                }
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
                }
                if (pairedEndImport && !samRecord.getReadPairedFlag()) {
                    progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
                    continue;
                }
                if (samRecord.getReadUnmappedFlag()) {
                    // There was no match
                    continue;
                }
                if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
                    // alignments
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
                    // No match on the reverse strand.  Doesn't matter if we're doing spliced reads.
                    continue;
                }
                if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
                    // field.
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
                    // progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
                    continue;
                }
                // TODO: Check what this actually stores - might be a real name rather than 0/=
                if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName() == "0") {
                    if (samRecord.getMateReferenceName() != "=") {
                        inputSam.close();
                        throw new SeqMonkException("Unexpected mate referenece name " + samRecord.getMateReferenceName());
                    }
                    // Matches were on different chromosomes
                    continue;
                }
                try {
                    if (pairedEndImport && !separateSplicedReads) {
                        SequenceReadWithChromosome read = getPairedEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    } else if (separateSplicedReads) {
                        SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
                        for (int r = 0; r < reads.length; r++) {
                            newData[f].addData(reads[r].chromosome, reads[r].read);
                        }
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                }
            }
            // We're finished with the file.
            inputSam.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}
Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMRecord(net.sf.samtools.SAMRecord) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMFileReader(net.sf.samtools.SAMFileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Aggregations

SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)7 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)7 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)5 File (java.io.File)2 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)2 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)2 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 FileReader (java.io.FileReader)1 InputStreamReader (java.io.InputStreamReader)1 Vector (java.util.Vector)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 SAMFileReader (net.sf.samtools.SAMFileReader)1 SAMRecord (net.sf.samtools.SAMRecord)1