Search in sources :

Example 6 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BowtieFileParser method getPairedEndRead.

/**
 * Gets a paired end read.
 *
 * @param sections1 The tab split bowtie output sections for the first read
 * @param sections2 The tab split bowtie output sections for the second read
 * @return The paired end read which was read
 * @throws SeqMonkException
 */
private SequenceReadWithChromosome getPairedEndRead(String[] sections1, String[] sections2) throws SeqMonkException {
    // We can get the lines with read two first, in which case we'll reverse things
    boolean readsAreReversed = false;
    if (sections1[0].substring(sections1[0].length() - 1).equals("2")) {
        readsAreReversed = true;
    }
    int strand;
    int start;
    int end;
    try {
        /*
			 * This convention isn't true in newer bowtie files so we can't rely on this any more.
			 */
        // if (! sections1[0].substring(0, sections1[0].length()-2).equals(sections2[0].substring(0, sections2[0].length()-2))) {
        // throw new SeqMonkException("Paired reads '"+sections1[0]+"' and '"+sections2[0]+"' did not match names");
        // }
        int read1start = Integer.parseInt(sections1[3]) + 1;
        int read1end = read1start + (sections1[4].length() - 1);
        int read2start = Integer.parseInt(sections2[3]) + 1;
        int read2end = read2start + (sections2[4].length() - 1);
        if (read1start < read2start) {
            start = read1start;
        } else {
            start = read2start;
        }
        if (read2end > read1end) {
            end = read2end;
        } else {
            end = read1end;
        }
        if (sections1[1].equals("+") && sections2[1].equals("-")) {
            if (readsAreReversed) {
                strand = Location.REVERSE;
            } else {
                strand = Location.FORWARD;
            }
        } else if (sections1[1].equals("-") && sections2[1].equals("+")) {
            if (readsAreReversed) {
                strand = Location.FORWARD;
            } else {
                strand = Location.REVERSE;
            }
        } else {
            strand = Location.UNKNOWN;
        }
    } catch (NumberFormatException e) {
        throw new SeqMonkException("Location " + sections1[3] + " or " + sections2[3] + " was not an integer");
    }
    if ((end - start) + 1 > pairedEndDistance) {
        throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
    }
    if (!sections1[2].equals(sections2[2])) {
        throw new SeqMonkException("Paried end read was on a different chromosome");
    }
    ChromosomeWithOffset c;
    try {
        c = dataCollection().genome().getChromosome(sections1[2]);
    } catch (Exception e) {
        throw new SeqMonkException(e.getLocalizedMessage());
    }
    start = c.position(start);
    end = c.position(end);
    // We also don't allow readings which are beyond the end of the chromosome
    if (end > c.chromosome().length()) {
        int overrun = end - c.chromosome().length();
        throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    if (start < 1) {
        throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
    }
    // We can now make the new reading
    SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
    return read;
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 7 with SequenceReadWithChromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome in project SeqMonk by s-andrews.

the class BowtieFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    if (!pairedEndImport) {
        extendBy = prefs.extendReads();
    }
    String[] sections1;
    String[] sections2 = null;
    File[] bowtieFiles = getFiles();
    DataSet[] newData = new DataSet[bowtieFiles.length];
    try {
        for (int f = 0; f < bowtieFiles.length; f++) {
            BufferedReader br;
            if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(bowtieFiles[f]));
            }
            String line;
            String line2 = null;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (pairedEndImport) {
                    line2 = br.readLine();
                    if (line2 == null) {
                        ++lineCount;
                        progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
                        continue;
                    }
                }
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
                }
                sections1 = line.split("\t");
                if (pairedEndImport) {
                    sections2 = line2.split("\t");
                }
                // Check to see if we've got enough data to work with
                if (sections1.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                if (pairedEndImport && sections2.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                try {
                    if (pairedEndImport) {
                        SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
                        newData[f].addData(read.chromosome, read.read);
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(sections1);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}
Also used : InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Aggregations

SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)7 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)7 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)5 File (java.io.File)2 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)2 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)2 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 FileReader (java.io.FileReader)1 InputStreamReader (java.io.InputStreamReader)1 Vector (java.util.Vector)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 SAMFileReader (net.sf.samtools.SAMFileReader)1 SAMRecord (net.sf.samtools.SAMRecord)1