Search in sources :

Example 1 with PairedDataSet

use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.

the class BedFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // System.err.println("Started parsing BED files");
    int extendBy = prefs.extendReads();
    try {
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 3) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int strand;
                int start;
                int end;
                try {
                    // The start is zero indexed so we need to add 1 to get genomic positions
                    start = Integer.parseInt(sections[1]) + 1;
                    // The end is zero indexed, but not included in the feature position so
                    // we need to add one to get genomic coordinates, but subtract one to not
                    // include the final base.
                    end = Integer.parseInt(sections[2]);
                    // End must always be later than start
                    if (start > end) {
                        progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                    if (sections.length >= 6) {
                        if (sections[5].equals("+")) {
                            strand = Location.FORWARD;
                        } else if (sections[5].equals("-")) {
                            strand = Location.REVERSE;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[5] + "' marked as unknown strand"));
                            strand = Location.UNKNOWN;
                        }
                        if (extendBy > 0) {
                            if (strand == Location.REVERSE) {
                                start -= extendBy;
                            } else if (strand == Location.FORWARD) {
                                end += extendBy;
                            }
                        }
                    } else {
                        strand = Location.UNKNOWN;
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                    continue;
                }
                try {
                    ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start = c.position(start);
                    end = c.position(end);
                    if (end > c.chromosome().length()) {
                        int overrun = end - c.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                        continue;
                    }
                    // We can now make the new reading
                    long read = SequenceRead.packPosition(start, end, strand);
                    newData[f].addData(c.chromosome(), read);
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 2 with PairedDataSet

use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.

the class GenericSeqReadParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    try {
        // This call just makes sure that the options panel exists if
        // it's never been called for before.
        getOptionsPanel();
        int removeDuplicates = optionsPanel.removeDuplicates();
        int extendBy = optionsPanel.extendBy();
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            // First skip the header lines
            for (int i = 0; i < startRowValue; i++) {
                line = br.readLine();
                if (line == null) {
                    br.close();
                    throw new Exception("Ran out of file before skipping all of the header lines");
                }
            }
            int maxIndexValue = 0;
            if (chrColValue > maxIndexValue)
                maxIndexValue = chrColValue;
            if (startColValue > maxIndexValue)
                maxIndexValue = startColValue;
            if (endColValue > maxIndexValue)
                maxIndexValue = endColValue;
            if (strandColValue > maxIndexValue)
                maxIndexValue = strandColValue;
            if (countColValue > maxIndexValue)
                maxIndexValue = countColValue;
            if (optionsPanel.isHiC()) {
                int distance = 0;
                if (optionsPanel.hiCDistance.getText().length() > 0) {
                    distance = Integer.parseInt(optionsPanel.hiCDistance.getText());
                }
                // TODO: Add an option to remove trans hits when importing from the generic parser.
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates, distance, false);
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates);
            }
            int lineCount = 0;
            // Now process the rest of the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split(delimitersValue);
                // Check to see if we've got enough data to work with
                if (maxIndexValue >= sections.length) {
                    progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int strand;
                int start;
                int end;
                int count = 1;
                try {
                    start = Integer.parseInt(sections[startColValue].replaceAll(" ", ""));
                    end = Integer.parseInt(sections[endColValue].replaceAll(" ", ""));
                    // End must always be later than start
                    if (end < start) {
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                    if (countColValue != -1 && sections[countColValue].length() > 0) {
                        try {
                            count = Integer.parseInt(sections[countColValue].replaceAll(" ", ""));
                        } catch (NumberFormatException e) {
                            progressWarningReceived(new SeqMonkException("Count value " + sections[countColValue] + " was not an integer"));
                            continue;
                        }
                    }
                    if (useStrand) {
                        sections[strandColValue] = sections[strandColValue].replaceAll(" ", "");
                        if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
                            strand = Location.FORWARD;
                        } else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
                            strand = Location.REVERSE;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
                            strand = Location.UNKNOWN;
                        }
                    } else {
                        strand = Location.UNKNOWN;
                    }
                    if (extendBy > 0) {
                        if (strand == Location.REVERSE) {
                            start -= extendBy;
                        } else {
                            end += extendBy;
                        }
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location '" + sections[startColValue] + "'-'" + sections[endColValue] + "' was not an integer"));
                    continue;
                }
                ChromosomeWithOffset c;
                try {
                    c = dataCollection().genome().getChromosome(sections[chrColValue]);
                } catch (IllegalArgumentException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
                start = c.position(start);
                end = c.position(end);
                // We also don't allow readings which are beyond the end of the chromosome
                if (end > c.chromosome().length()) {
                    int overrun = end - c.chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                    continue;
                }
                if (start < 1) {
                    progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
                    continue;
                }
                // We can now make the new reading
                try {
                    long read = SequenceRead.packPosition(start, end, strand);
                    for (int i = 0; i < count; i++) {
                        newData[f].addData(c.chromosome(), read);
                    }
                } catch (SeqMonkException e) {
                    progressWarningReceived(e);
                    continue;
                }
            // System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 3 with PairedDataSet

use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.

the class VisibleStoresParser method processHiCDataStore.

private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
    int extendBy = prefs.extendReads();
    boolean reverse = prefs.reverseReads();
    boolean removeStrand = prefs.removeStrandInfo();
    PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
    // Now process the data
    Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
        // We make the call to get exportable reads so we don't duplicate reads
        // when we export things
        HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
        String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
        for (int c2 = 0; c2 < localChromosomes.length; c2++) {
            Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
            long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
            long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
            for (int r = 0; r < sourceReads.length; r++) {
                if (cancel) {
                    progressCancelled();
                    return null;
                }
                if (downsample && downsampleProbabilty < 1) {
                    if (Math.random() > downsampleProbabilty) {
                        continue;
                    }
                }
                if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
                    // Just add them as they are
                    newData.addData(chrs[c], sourceReads[r]);
                    newData.addData(localChromosome, hitReads[r]);
                }
                Feature[] features = null;
                if (filterByFeature) {
                    features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
                    Arrays.sort(features);
                }
                int currentFeaturePostion = 0;
                if (filterByFeature) {
                    // See if we're comparing against the right feature
                    while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
                        currentFeaturePostion++;
                    }
                    // Test to see if we overlap
                    if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
                        if (excludeFeature)
                            continue;
                    } else {
                        if (!excludeFeature)
                            continue;
                    }
                }
                int sourceStart = SequenceRead.start(sourceReads[r]);
                int sourceEend = SequenceRead.end(sourceReads[r]);
                int sourceStrand = SequenceRead.strand(sourceReads[r]);
                int hitStart = SequenceRead.start(sourceReads[r]);
                int hitEend = SequenceRead.end(hitReads[r]);
                int hitStrand = SequenceRead.strand(hitReads[r]);
                if (reverse) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceStrand = Location.REVERSE;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStrand = Location.FORWARD;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitStrand = Location.REVERSE;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStrand = Location.FORWARD;
                    }
                }
                if (removeStrand) {
                    sourceStrand = Location.UNKNOWN;
                    hitStrand = Location.UNKNOWN;
                }
                if (extendBy > 0) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceEend += extendBy;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStart -= extendBy;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitEend += extendBy;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStart -= extendBy;
                    }
                }
                // We also don't allow readings which are beyond the end of the chromosome
                if (sourceEend > chrs[c].length()) {
                    int overrun = sourceEend - chrs[c].length();
                    progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                if (hitEend > localChromosome.length()) {
                    int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                // We can now make the new readings
                long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
                long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
                if (!prefs.isHiC()) {
                    // HiC additions are deferred until we know the other end is OK too.
                    newData.addData(chrs[c], sourceRead);
                    newData.addData(localChromosome, hitRead);
                }
            }
        }
    }
    return newData;
}
Also used : HiCHitCollection(uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException)

Example 4 with PairedDataSet

use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.

the class BedPEFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // System.err.println("Started parsing BED files");
    int extendBy = prefs.extendReads();
    try {
        File[] probeFiles = getFiles();
        DataSet[] newData = new DataSet[probeFiles.length];
        for (int f = 0; f < probeFiles.length; f++) {
            BufferedReader br;
            if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(probeFiles[f]));
            }
            String line;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                // System.err.println(line);
                // Thread.sleep(200);
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 6) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                // move on quickly.
                if (sections[0].equals(".") || sections[3].equals("."))
                    continue;
                int strand1;
                int start1;
                int end1;
                int strand2;
                int start2;
                int end2;
                try {
                    // The start is zero indexed so we need to add 1 to get genomic positions
                    start1 = Integer.parseInt(sections[1]) + 1;
                    start2 = Integer.parseInt(sections[4]) + 1;
                    // The end is zero indexed, but not included in the feature position so
                    // we need to add one to get genomic coordinates, but subtract one to not
                    // include the final base.
                    end1 = Integer.parseInt(sections[2]);
                    end2 = Integer.parseInt(sections[5]);
                    // End must always be later than start
                    if (start1 > end1) {
                        progressWarningReceived(new SeqMonkException("End position1 " + end1 + " was lower than start position " + start1));
                        int temp = start1;
                        start1 = end1;
                        end1 = temp;
                    }
                    if (start2 > end2) {
                        progressWarningReceived(new SeqMonkException("End position2 " + end2 + " was lower than start position " + start2));
                        int temp = start2;
                        start2 = end2;
                        end2 = temp;
                    }
                    if (sections.length >= 10) {
                        if (sections[8].equals("+")) {
                            strand1 = Location.FORWARD;
                        } else if (sections[8].equals("-")) {
                            strand1 = Location.REVERSE;
                        } else if (sections[8].equals(".")) {
                            strand1 = Location.UNKNOWN;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[8] + "' marked as unknown strand"));
                            strand1 = Location.UNKNOWN;
                        }
                        if (sections[9].equals("+")) {
                            strand2 = Location.FORWARD;
                        } else if (sections[9].equals("-")) {
                            strand2 = Location.REVERSE;
                        } else if (sections[9].equals(".")) {
                            strand2 = Location.UNKNOWN;
                        } else {
                            progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[9] + "' marked as unknown strand"));
                            strand2 = Location.UNKNOWN;
                        }
                    // if (extendBy > 0) {
                    // if (strand==Location.REVERSE) {
                    // start -=extendBy;
                    // }
                    // else if (strand==Location.FORWARD) {
                    // end+=extendBy;
                    // }
                    // }
                    } else {
                        strand1 = Location.UNKNOWN;
                        strand2 = Location.UNKNOWN;
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                    continue;
                }
                try {
                    ChromosomeWithOffset c1 = dataCollection().genome().getChromosome(sections[0]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start1 = c1.position(start1);
                    end1 = c1.position(end1);
                    if (end1 > c1.chromosome().length()) {
                        int overrun = end1 - c1.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end1 + " was " + overrun + "bp beyond the end of chr" + c1.chromosome().name() + " (" + c1.chromosome().length() + ")"));
                        continue;
                    }
                    ChromosomeWithOffset c2 = dataCollection().genome().getChromosome(sections[3]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start2 = c2.position(start2);
                    end2 = c2.position(end2);
                    if (end2 > c2.chromosome().length()) {
                        int overrun = end2 - c2.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end2 + " was " + overrun + "bp beyond the end of chr" + c2.chromosome().name() + " (" + c2.chromosome().length() + ")"));
                        continue;
                    }
                    // add them.  There's nothing clever to do.
                    if (prefs.isHiC()) {
                        long read1 = SequenceRead.packPosition(start1, end1, strand1);
                        newData[f].addData(c1.chromosome(), read1);
                        long read2 = SequenceRead.packPosition(start2, end2, strand2);
                        newData[f].addData(c2.chromosome(), read2);
                    } else {
                        // If they're on different chromosomes then we kick them out
                        if (!c1.chromosome().name().equals(c2.chromosome().name())) {
                            progressWarningReceived(new SeqMonkException("Paried reads were on different chromosomes - discarding"));
                            continue;
                        }
                        if (strand1 == Location.FORWARD && strand2 != Location.REVERSE) {
                            progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
                            continue;
                        }
                        if (strand1 == Location.REVERSE && strand2 != Location.FORWARD) {
                            progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
                            continue;
                        }
                        // If they're too far apart we kick them out
                        int start = 1;
                        int end = 0;
                        // We take the strand from read1
                        int strand = strand1;
                        if (strand == Location.FORWARD) {
                            start = start1;
                            end = end2;
                        } else if (strand == Location.REVERSE) {
                            start = start2;
                            end = end1;
                        } else if (strand == Location.UNKNOWN) {
                            start = Math.min(start1, start2);
                            end = Math.max(end1, end2);
                        }
                        if (end <= start) {
                            progressWarningReceived(new SeqMonkException("Incorrectly oriented reads - discarding"));
                            continue;
                        }
                        if ((end - start) + 1 > prefs.pairDistanceCutoff()) {
                            progressWarningReceived(new SeqMonkException("Distance between reads too great (" + (((end - start) + 1) - prefs.pairDistanceCutoff()) + ")"));
                            continue;
                        }
                        long read = SequenceRead.packPosition(start, end, strand);
                        newData[f].addData(c1.chromosome(), read);
                    }
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 5 with PairedDataSet

use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.

the class SeqMonkReimportParser method parseSamples.

/**
 * Parses the list of samples.
 *
 * @param sections The tab split initial samples line
 * @throws SeqMonkException
 * @throws IOException Signals that an I/O exception has occurred.
 */
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
    if (sections.length != 2) {
        throw new SeqMonkException("Samples line didn't contain 2 sections");
    }
    if (!sections[0].equals("Samples")) {
        throw new SeqMonkException("Couldn't find expected samples line");
    }
    int n = Integer.parseInt(sections[1]);
    // We need to keep the Data Sets around to add data to later.
    // We're going to start all of these datasets off, but only collect data for some
    // of them.
    DataSet[] dataSets = new DataSet[n];
    for (int i = 0; i < n; i++) {
        sections = br.readLine().split("\\t");
        // anything else is assumed to be a normal dataset.
        if (sections.length == 1) {
            dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 2) {
            dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 3) {
            if (sections[2].equals("HiC")) {
                dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
            } else {
                dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
            }
        }
    }
    // At this point we need to know which of the datasets we're actually going
    // to import into the current project
    DataSetSelector selector = new DataSetSelector(dataSets);
    boolean[] dataSetsToParse = new boolean[dataSets.length];
    int lastDataSetToParse = 0;
    for (int i = 0; i < dataSetsToParse.length; i++) {
        dataSetsToParse[i] = false;
    }
    int[] selectedIndices = selector.getSelectedIndices();
    for (int i = 0; i < selectedIndices.length; i++) {
        dataSetsToParse[selectedIndices[i]] = true;
        if (selectedIndices[i] > lastDataSetToParse)
            lastDataSetToParse = selectedIndices[i];
    }
    // Now we can go through the rest of the sets and add data where appropriate
    // Immediately after the list of samples comes the lists of reads
    String line;
    // Iterate through the number of samples
    for (int i = 0; i < n; i++) {
        if (i > lastDataSetToParse) {
            // No sense skipping through a load of data we're not going to import.
            break;
        }
        if (dataSetsToParse[i]) {
            progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
        } else {
            progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
        }
        // The first line is
        line = br.readLine();
        sections = line.split("\t");
        if (sections.length != 2) {
            throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
        }
        int readCount = Integer.parseInt(sections[0]);
        // In versions prior to 7 we encoded everything on every line separately
        if (thisDataVersion < 7) {
            for (int r = 0; r < readCount; r++) {
                if ((r % (1 + (readCount / 10))) == 0) {
                    if (dataSetsToParse[i]) {
                        progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    } else {
                        progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    }
                }
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    sections = line.split("\t");
                    Chromosome c;
                    try {
                        c = genome.getChromosome(sections[0]).chromosome();
                    } catch (IllegalArgumentException sme) {
                        progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
                        continue;
                    }
                    int start = Integer.parseInt(sections[1]);
                    int end = Integer.parseInt(sections[2]);
                    int strand = Integer.parseInt(sections[3]);
                    try {
                        dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
                    } catch (SeqMonkException ex) {
                        progressWarningReceived(ex);
                        continue;
                    }
                }
            }
        } else if (dataSets[i] instanceof PairedDataSet) {
            // Paired Data sets have a different format, with a packed position for
            // the first read, then a chromosome name and packed position for the second
            // read.
            // From v13 onwards we started entering HiC data pairs in both directions so
            // the data would be able to be read back in without sorting it.
            boolean hicDataIsPresorted = thisDataVersion >= 13;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    // A blank line indicates the end of the sample
                    if (line.length() == 0)
                        break;
                    sections = line.split("\\t");
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    // progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
                    int chrReadCount = Integer.parseInt(sections[1]);
                    for (int r = 0; r < chrReadCount; r++) {
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        /*
							 * We used to have a split("\t") here, but this turned out to be the bottleneck
							 * which hugely restricted the speed at which the data could be read.  Switching
							 * for a set of index calls and substring makes this *way* faster.
							 */
                        long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
                        Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
                        long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
                        try {
                            dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
                            dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                            continue;
                        }
                    }
                }
            }
        } else {
            // In versions after 7 we split the section up into chromosomes
            // so we don't put the chromosome on every line, and we put out
            // the packed double value rather than the individual start, end
            // and strand
            // As of version 12 we collapse repeated reads into one line with
            // a count after it, so we need to check for this.
            // We keep count of reads processed to update the progress listeners
            int readsRead = 0;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                // A blank line indicates the end of the sample
                if (line.length() == 0)
                    break;
                if (dataSetsToParse[i]) {
                    sections = line.split("\\t");
                    // We don't try to capture this exception since we can't then process
                    // any of the reads which follow.
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    int chrReadCount = Integer.parseInt(sections[1]);
                    int tabIndexPosition = 0;
                    for (int r = 0; r < chrReadCount; r++) {
                        readsRead++;
                        if ((readsRead % (1 + (readCount / 10))) == 0) {
                            if (dataSetsToParse[i]) {
                                progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            } else {
                                progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            }
                        }
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        long packedPosition;
                        try {
                            if (line.indexOf("\t") != -1) {
                                // We have an index with a count
                                sections = line.split("\t");
                                tabIndexPosition = line.indexOf("\t");
                                packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
                                int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
                                // TODO: Does this break the progress bar sometimes?
                                r += (count - 1);
                                readsRead += (count - 1);
                                for (int x = 1; x <= count; x++) {
                                    dataSets[i].addData(c, packedPosition);
                                }
                            } else {
                                packedPosition = Long.parseLong(line);
                                dataSets[i].addData(c, packedPosition);
                            }
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                        }
                    }
                }
            }
        }
        if (dataSetsToParse[i]) {
            dataSets[i].finalise();
        }
    }
    // Finally we need to make up an array of just the datasets we actually read
    Vector<DataSet> keepers = new Vector<DataSet>();
    for (int i = 0; i < dataSets.length; i++) {
        if (dataSetsToParse[i]) {
            keepers.add(dataSets[i]);
        }
    }
    return keepers.toArray(new DataSet[0]);
}
Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector)

Aggregations

PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)9 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)9 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)8 File (java.io.File)6 BufferedReader (java.io.BufferedReader)5 FileInputStream (java.io.FileInputStream)5 FileReader (java.io.FileReader)5 InputStreamReader (java.io.InputStreamReader)5 GZIPInputStream (java.util.zip.GZIPInputStream)5 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)4 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)3 SequenceReadWithChromosome (uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)2 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 UnknownHostException (java.net.UnknownHostException)1 Enumeration (java.util.Enumeration)1 Vector (java.util.Vector)1 SAMFileReader (net.sf.samtools.SAMFileReader)1 SAMRecord (net.sf.samtools.SAMRecord)1 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)1