Search in sources :

Example 81 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BAMFileParser method getSplitSingleEndRead.

/**
 * Gets a split single end read.  The only reason for asking about whether the
 * import is single or paired end is that if we're doing a paired end import
 * then we reverse the strand for all second read data.  For single end import
 * we leave the strand alone whichever read it originates from.
 *
 * @param samRecord The picard record entry for this read
 * @param flag pairedEnd Is this a paired end import
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int lastEnd = -1;
    start = samRecord.getAlignmentStart();
    if (samRecord.getReadNegativeStrandFlag()) {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.FORWARD;
        } else {
            strand = Location.REVERSE;
        }
    } else {
        if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
            strand = Location.REVERSE;
        } else {
            strand = Location.FORWARD;
        }
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (!samRecord.getCigarString().contains("N")) {
        if (!importIntrons) {
            int end = c.position(samRecord.getAlignmentEnd());
            return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
        }
        return new SequenceReadWithChromosome[0];
    }
    String[] cigarOperations = samRecord.getCigarString().split("\\d+");
    String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
    if (cigarOperations.length != cigarNumbers.length + 1) {
        throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
    }
    Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
    int currentPosition = start;
    for (int pos = 0; pos < cigarNumbers.length; pos++) {
        if (cigarOperations[pos + 1].equals("M")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("I")) {
            currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("D")) {
            currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
        } else if (cigarOperations[pos + 1].equals("N")) {
            if (importIntrons) {
                if (lastEnd > 0) {
                    if (start > c.chromosome().length()) {
                        int overrun = (start - 1) - c.chromosome().length();
                        throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                    }
                    newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
                }
                // Update the lastEnd whether we added a read or not since this
                // will be the start of the next intron
                lastEnd = currentPosition;
            } else {
                // We also don't allow readings which are beyond the end of the chromosome
                if (currentPosition > c.chromosome().length()) {
                    int overrun = currentPosition - c.chromosome().length();
                    throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
                }
                newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
            }
            currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
            start = currentPosition;
        }
    }
    if (importIntrons) {
        if (lastEnd > 0) {
            if (start > c.chromosome().length()) {
                int overrun = (start - 1) - c.chromosome().length();
                throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
            }
            newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
        }
    } else {
        // We also don't allow readings which are beyond the end of the chromosome
        if (currentPosition > c.chromosome().length()) {
            int overrun = currentPosition - c.chromosome().length();
            throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
        }
        newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
    }
    return newReads.toArray(new SequenceReadWithChromosome[0]);
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 82 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BAMFileParser method getSingleEndRead.

/**
 * Gets a single end read.
 *
 * @param sections The tab split sections from the SAM file
 * @param flag The binary flag field from the file
 * @return The read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
    int strand;
    int start;
    int end;
    start = samRecord.getAlignmentStart();
    end = samRecord.getAlignmentEnd();
    if (samRecord.getReadNegativeStrandFlag()) {
        strand = Location.REVERSE;
    } else {
        strand = Location.FORWARD;
    }
    if (extendBy > 0) {
        if (strand == Location.FORWARD) {
            end += extendBy;
        } else if (strand == Location.REVERSE) {
            start -= extendBy;
        }
    }
    ChromosomeWithOffset c;
    try {
        c = collection.genome().getChromosome(samRecord.getReferenceName());
    } catch (Exception iae) {
        throw new SeqMonkException(iae.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 83 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BAMFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    separateSplicedReads = prefs.splitSplicedReads();
    importIntrons = prefs.importIntrons();
    extendBy = prefs.extendReads();
    minMappingQuality = prefs.minMappingQuality();
    primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
    File[] samFiles = getFiles();
    DataSet[] newData = new DataSet[samFiles.length];
    try {
        for (int f = 0; f < samFiles.length; f++) {
            SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
            SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            // A flag we can set to skip the next record if we're getting
            // out of sync during single end HiC import.
            boolean skipNext = false;
            for (SAMRecord samRecord : inputSam) {
                if (skipNext) {
                    skipNext = false;
                    continue;
                }
                if (cancel) {
                    inputSam.close();
                    progressCancelled();
                    return;
                }
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
                }
                if (pairedEndImport && !samRecord.getReadPairedFlag()) {
                    progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
                    continue;
                }
                if (samRecord.getReadUnmappedFlag()) {
                    // There was no match
                    continue;
                }
                if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
                    // alignments
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
                    // No match on the reverse strand.  Doesn't matter if we're doing spliced reads.
                    continue;
                }
                if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
                    // field.
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
                    // progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
                    continue;
                }
                // TODO: Check what this actually stores - might be a real name rather than 0/=
                if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName() == "0") {
                    if (samRecord.getMateReferenceName() != "=") {
                        inputSam.close();
                        throw new SeqMonkException("Unexpected mate referenece name " + samRecord.getMateReferenceName());
                    }
                    // Matches were on different chromosomes
                    continue;
                }
                try {
                    if (pairedEndImport && !separateSplicedReads) {
                        SequenceReadWithChromosome read = getPairedEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    } else if (separateSplicedReads) {
                        SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
                        for (int r = 0; r < reads.length; r++) {
                            newData[f].addData(reads[r].chromosome, reads[r].read);
                        }
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                }
            }
            // We're finished with the file.
            inputSam.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}
Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMRecord(net.sf.samtools.SAMRecord) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMFileReader(net.sf.samtools.SAMFileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 84 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BowtieFileParser method getPairedEndRead.

/**
 * Gets a paired end read.
 *
 * @param sections1 The tab split bowtie output sections for the first read
 * @param sections2 The tab split bowtie output sections for the second read
 * @return The paired end read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getPairedEndRead(String[] sections1, String[] sections2) throws SeqMonkException {
    // We can get the lines with read two first, in which case we'll reverse things
    boolean readsAreReversed = false;
    if (sections1[0].substring(sections1[0].length() - 1).equals("2")) {
        readsAreReversed = true;
    }
    int strand;
    int start;
    int end;
    try {
        /*
			 * This convention isn't true in newer bowtie files so we can't rely on this any more.
			 */
        // if (! sections1[0].substring(0, sections1[0].length()-2).equals(sections2[0].substring(0, sections2[0].length()-2))) {
        // throw new SeqMonkException("Paired reads '"+sections1[0]+"' and '"+sections2[0]+"' did not match names");
        // }
        int read1start = Integer.parseInt(sections1[3]) + 1;
        int read1end = read1start + (sections1[4].length() - 1);
        int read2start = Integer.parseInt(sections2[3]) + 1;
        int read2end = read2start + (sections2[4].length() - 1);
        if (read1start < read2start) {
            start = read1start;
        } else {
            start = read2start;
        }
        if (read2end > read1end) {
            end = read2end;
        } else {
            end = read1end;
        }
        if (sections1[1].equals("+") && sections2[1].equals("-")) {
            if (readsAreReversed) {
                strand = Location.REVERSE;
            } else {
                strand = Location.FORWARD;
            }
        } else if (sections1[1].equals("-") && sections2[1].equals("+")) {
            if (readsAreReversed) {
                strand = Location.FORWARD;
            } else {
                strand = Location.REVERSE;
            }
        } else {
            strand = Location.UNKNOWN;
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections1[3] + " or " + sections2[3] + " was not an integer");
    }
    if ((end - start) + 1 > pairedEndDistance) {
        throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
    }
    if (!sections1[2].equals(sections2[2])) {
        throw new SeqMonkException("Paried end read was on a different chromosome");
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(sections1[2]);
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 85 with SeqMonkException

use of uk.ac.babraham.SeqMonk.SeqMonkException in project SeqMonk by s-andrews.

the class BowtieFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    if (!pairedEndImport) {
        extendBy = prefs.extendReads();
    }
    String[] sections1;
    String[] sections2 = null;
    File[] bowtieFiles = getFiles();
    DataSet[] newData = new DataSet[bowtieFiles.length];
    try {
        for (int f = 0; f < bowtieFiles.length; f++) {
            BufferedReader br;
            if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(bowtieFiles[f]));
            }
            String line;
            String line2 = null;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (pairedEndImport) {
                    line2 = br.readLine();
                    if (line2 == null) {
                        ++lineCount;
                        progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
                        continue;
                    }
                }
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
                }
                sections1 = line.split("\t");
                if (pairedEndImport) {
                    sections2 = line2.split("\t");
                }
                // Check to see if we've got enough data to work with
                if (sections1.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                if (pairedEndImport && sections2.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                try {
                    if (pairedEndImport) {
                        SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
                        newData[f].addData(read.chromosome, read.read);
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(sections1);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}
Also used : InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Aggregations

SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)91 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)49 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)30 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)22 Vector (java.util.Vector)21 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)20 File (java.io.File)19 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)17 BufferedReader (java.io.BufferedReader)16 FileReader (java.io.FileReader)16 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)14 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)13 FileInputStream (java.io.FileInputStream)11 IOException (java.io.IOException)11 InputStreamReader (java.io.InputStreamReader)11 GZIPInputStream (java.util.zip.GZIPInputStream)11 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)8 ProgressListener (uk.ac.babraham.SeqMonk.DataTypes.ProgressListener)8 FileNotFoundException (java.io.FileNotFoundException)7 SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)7