Examples with DataSet - uk.ac.babraham.SeqMonk.DataTypes.DataSet

Example 31 with DataSet

use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

the class BAMFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    separateSplicedReads = prefs.splitSplicedReads();
    importIntrons = prefs.importIntrons();
    extendBy = prefs.extendReads();
    minMappingQuality = prefs.minMappingQuality();
    primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
    File[] samFiles = getFiles();
    DataSet[] newData = new DataSet[samFiles.length];
    try {
        for (int f = 0; f < samFiles.length; f++) {
            SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
            SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            // Now process the file
            // A flag we can set to skip the next record if we're getting
            // out of sync during single end HiC import.
            boolean skipNext = false;
            for (SAMRecord samRecord : inputSam) {
                if (skipNext) {
                    skipNext = false;
                    continue;
                }
                if (cancel) {
                    inputSam.close();
                    progressCancelled();
                    return;
                }
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
                }
                if (pairedEndImport && !samRecord.getReadPairedFlag()) {
                    progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
                    continue;
                }
                if (samRecord.getReadUnmappedFlag()) {
                    // There was no match
                    continue;
                }
                if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
                    // alignments
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
                    // No match on the reverse strand.  Doesn't matter if we're doing spliced reads.
                    continue;
                }
                if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
                    // field.
                    continue;
                }
                if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
                    // progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
                    continue;
                }
                // TODO: Check what this actually stores - might be a real name rather than 0/=
                if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName() == "0") {
                    if (samRecord.getMateReferenceName() != "=") {
                        inputSam.close();
                        throw new SeqMonkException("Unexpected mate referenece name " + samRecord.getMateReferenceName());
                    }
                    // Matches were on different chromosomes
                    continue;
                }
                try {
                    if (pairedEndImport && !separateSplicedReads) {
                        SequenceReadWithChromosome read = getPairedEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    } else if (separateSplicedReads) {
                        SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
                        for (int r = 0; r < reads.length; r++) {
                            newData[f].addData(reads[r].chromosome, reads[r].read);
                        }
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(samRecord);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                    if (prefs.isHiC()) {
                        if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
                            // Skip the next line
                            skipNext = true;
                        }
                    }
                }
            }
            // We're finished with the file.
            inputSam.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}

Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMRecord(net.sf.samtools.SAMRecord) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) SAMFileReader(net.sf.samtools.SAMFileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome)

Example 32 with DataSet

use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

the class BowtieFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    pairedEndImport = prefs.pairedEnd();
    pairedEndDistance = prefs.pairDistanceCutoff();
    if (!pairedEndImport) {
        extendBy = prefs.extendReads();
    }
    String[] sections1;
    String[] sections2 = null;
    File[] bowtieFiles = getFiles();
    DataSet[] newData = new DataSet[bowtieFiles.length];
    try {
        for (int f = 0; f < bowtieFiles.length; f++) {
            BufferedReader br;
            if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(bowtieFiles[f]));
            }
            String line;
            String line2 = null;
            if (prefs.isHiC()) {
                newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
            } else {
                newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            }
            int lineCount = 0;
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (pairedEndImport) {
                    line2 = br.readLine();
                    if (line2 == null) {
                        ++lineCount;
                        progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
                        continue;
                    }
                }
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
                }
                sections1 = line.split("\t");
                if (pairedEndImport) {
                    sections2 = line2.split("\t");
                }
                // Check to see if we've got enough data to work with
                if (sections1.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                if (pairedEndImport && sections2.length < 5) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                try {
                    if (pairedEndImport) {
                        SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
                        newData[f].addData(read.chromosome, read.read);
                    } else {
                        SequenceReadWithChromosome read = getSingleEndRead(sections1);
                        newData[f].addData(read.chromosome, read.read);
                    }
                } catch (SeqMonkException ex) {
                    progressWarningReceived(ex);
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
            newData[f].finalise();
        }
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
    processingFinished(newData);
}

Also used : InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SequenceReadWithChromosome(uk.ac.babraham.SeqMonk.DataTypes.Sequence.SequenceReadWithChromosome) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 33 with DataSet

use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

the class ActiveProbeListParser method processNormalDataStore.

private DataSet processNormalDataStore(ProbeList activeList) {
    int extendBy = prefs.extendReads();
    boolean reverse = prefs.reverseReads();
    boolean modifyStrand = false;
    int forcedStrand = 0;
    if (!prefs.strandOptionBox.getSelectedItem().equals("From probes")) {
        modifyStrand = true;
        if (prefs.strandOptionBox.getSelectedItem().equals("Forward")) {
            forcedStrand = Location.FORWARD;
        } else if (prefs.strandOptionBox.getSelectedItem().equals("Reverse")) {
            forcedStrand = Location.REVERSE;
        } else if (prefs.strandOptionBox.getSelectedItem().equals("Unknown")) {
            forcedStrand = Location.UNKNOWN;
        } else {
            throw new IllegalArgumentException("Unknown forced strand option " + prefs.strandOptionBox.getSelectedItem());
        }
    }
    DataSet newData = new DataSet(activeList.name(), "Reimported from " + activeList.name(), prefs.removeDuplicates());
    // Now process the data
    Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        progressUpdated("Processing " + activeList.name() + " chr " + chrs[c].name(), c, chrs.length);
        Probe[] probes = activeList.getProbesForChromosome(chrs[c]);
        for (int r = 0; r < probes.length; r++) {
            if (cancel) {
                progressCancelled();
                return null;
            }
            long read;
            int start = probes[r].start();
            int end = probes[r].end();
            int strand = probes[r].strand();
            if (reverse) {
                if (strand == Location.FORWARD) {
                    strand = Location.REVERSE;
                } else if (strand == Location.REVERSE) {
                    strand = Location.FORWARD;
                }
            }
            if (extendBy != 0) {
                // We now allow negative extensions to shorten reads
                if (strand == Location.FORWARD || strand == Location.UNKNOWN) {
                    end += extendBy;
                    if (end < start)
                        end = start;
                } else if (strand == Location.REVERSE) {
                    start -= extendBy;
                    if (start > end)
                        start = end;
                }
            }
            // We don't allow reads before the start of the chromosome
            if (start < 1) {
                int overrun = (0 - start) + 1;
                progressWarningReceived(new SeqMonkException("Reading position " + start + " was " + overrun + "bp before the start of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
                continue;
            }
            // We also don't allow readings which are beyond the end of the chromosome
            if (end > chrs[c].length()) {
                int overrun = end - chrs[c].length();
                progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
                continue;
            }
            // Force the strand to what they specified if they want this.
            if (modifyStrand) {
                strand = forcedStrand;
            }
            // We can now make the new reading
            try {
                read = SequenceRead.packPosition(start, end, strand);
                newData.addData(chrs[c], read);
            } catch (SeqMonkException e) {
                progressWarningReceived(e);
                continue;
            }
        }
    }
    return newData;
}

Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)

Example 34 with DataSet

use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

the class BismarkCovFileParser method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    try {
        File[] covFiles = getFiles();
        DataSet[] newData = new DataSet[covFiles.length];
        for (int f = 0; f < covFiles.length; f++) {
            BufferedReader br;
            if (covFiles[f].getName().toLowerCase().endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(covFiles[f]))));
            } else {
                br = new BufferedReader(new FileReader(covFiles[f]));
            }
            String line;
            newData[f] = new DataSet(covFiles[f].getName(), covFiles[f].getCanonicalPath(), prefs.removeDuplicates());
            int lineCount = 0;
            // Now process the file
            while ((line = br.readLine()) != null) {
                if (cancel) {
                    br.close();
                    progressCancelled();
                    return;
                }
                // Ignore blank lines
                if (line.trim().length() == 0)
                    continue;
                ++lineCount;
                if (lineCount % 100000 == 0) {
                    progressUpdated("Read " + lineCount + " lines from " + covFiles[f].getName(), f, covFiles.length);
                }
                String[] sections = line.split("\t");
                // Check to see if we've got enough data to work with
                if (sections.length < 6) {
                    progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                    // Skip this line...
                    continue;
                }
                int start;
                int end;
                int methCount;
                int unmethCount;
                try {
                    start = Integer.parseInt(sections[1]);
                    end = Integer.parseInt(sections[2]);
                    methCount = Integer.parseInt(sections[4]);
                    unmethCount = Integer.parseInt(sections[5]);
                    // End must always be later than start
                    if (start > end) {
                        progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
                        int temp = start;
                        start = end;
                        end = temp;
                    }
                } catch (NumberFormatException e) {
                    progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
                    continue;
                }
                try {
                    ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
                    // We also don't allow readings which are beyond the end of the chromosome
                    start = c.position(start);
                    end = c.position(end);
                    if (end > c.chromosome().length()) {
                        int overrun = end - c.chromosome().length();
                        progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                        continue;
                    }
                    // We can now make the new reads
                    long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
                    long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);
                    newData[f].addData(c.chromosome(), methRead, methCount);
                    newData[f].addData(c.chromosome(), unmethRead, unmethCount);
                } catch (IllegalArgumentException iae) {
                    progressWarningReceived(iae);
                } catch (SeqMonkException sme) {
                    progressWarningReceived(sme);
                    continue;
                }
            }
            // We're finished with the file.
            br.close();
            // Cache the data in the new dataset
            progressUpdated("Caching data from " + covFiles[f].getName(), f, covFiles.length);
            newData[f].finalise();
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
        return;
    }
}

Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) InputStreamReader(java.io.InputStreamReader) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) FileInputStream(java.io.FileInputStream) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File)

Example 35 with DataSet

use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.

the class HiCOtherEndExtractor method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // We need to get the baits and other options from the preferences
    Probe[] baits = prefs.getBaits();
    boolean mergeIntoOne = prefs.mergeIntoOne();
    boolean excludeSelf = prefs.excludeSelf();
    DataSet[] newData;
    if (mergeIntoOne) {
        newData = new DataSet[visibleHiCDataSets.length];
    } else {
        newData = new DataSet[visibleHiCDataSets.length * baits.length];
    }
    try {
        for (int d = 0; d < visibleHiCDataSets.length; d++) {
            if (baits.length == 1) {
                System.err.println("Only one bait");
                newData[d] = new DataSet(prefs.prefix.getText() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for region " + baits[0].chromosome().name() + ":" + baits[0].start() + "-" + baits[0].end(), DataSet.DUPLICATES_REMOVE_NO);
            } else if (mergeIntoOne) {
                System.err.println("Multiple baits, but merging");
                newData[d] = new DataSet(prefs.prefix.getText() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for " + baits.length + " regions", DataSet.DUPLICATES_REMOVE_NO);
            }
            for (int b = 0; b < baits.length; b++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                progressUpdated("Getting other ends from " + visibleHiCDataSets[d].name() + " and " + baits[b].toString(), (d * baits.length) + b, visibleHiCDataSets.length * baits.length);
                if (!(baits.length == 1 || mergeIntoOne)) {
                    newData[(d * baits.length) + b] = new DataSet(baits[b].toString() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for region " + baits[b].chromosome().name() + ":" + baits[b].start() + "-" + baits[b].end(), DataSet.DUPLICATES_REMOVE_NO);
                }
                HiCHitCollection hiCCollection = visibleHiCDataSets[d].getHiCReadsForProbe(baits[b]);
                String[] chromosomes = hiCCollection.getChromosomeNamesWithHits();
                for (int c = 0; c < chromosomes.length; c++) {
                    Chromosome chromosome = collection.genome().getChromosome(chromosomes[c]).chromosome();
                    long[] reads = hiCCollection.getHitPositionsForChromosome(chromosomes[c]);
                    for (int r = 0; r < reads.length; r++) {
                        if (cancel) {
                            progressCancelled();
                            return;
                        }
                        if (excludeSelf && baits[b].chromosome().name().equals(chromosomes[c]) && SequenceRead.overlaps(reads[r], baits[b].packedPosition())) {
                            continue;
                        }
                        if (mergeIntoOne) {
                            newData[d].addData(chromosome, reads[r]);
                        } else {
                            newData[(d * baits.length) + b].addData(chromosome, reads[r]);
                        }
                    }
                }
                if (!mergeIntoOne) {
                    // Cache the data in the new dataset
                    progressUpdated("Caching data", (d * baits.length) + b, visibleHiCDataSets.length * baits.length);
                    newData[(d * baits.length) + b].finalise();
                }
            }
            if (mergeIntoOne) {
                // Cache the data in the new dataset
                progressUpdated("Caching data", d, visibleHiCDataSets.length);
                newData[d].finalise();
            }
        }
        processingFinished(newData);
    } catch (Exception ex) {
        progressExceptionReceived(ex);
    }
}

Also used : HiCHitCollection(uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)

Aggregations

DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)36 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)22 DataGroup (uk.ac.babraham.SeqMonk.DataTypes.DataGroup)16 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)14 File (java.io.File)12 Vector (java.util.Vector)11 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)11 BufferedReader (java.io.BufferedReader)10 FileReader (java.io.FileReader)10 FileInputStream (java.io.FileInputStream)9 InputStreamReader (java.io.InputStreamReader)9 GZIPInputStream (java.util.zip.GZIPInputStream)9 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)8 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)7 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)6 JLabel (javax.swing.JLabel)5 ReplicateSet (uk.ac.babraham.SeqMonk.DataTypes.ReplicateSet)5 IOException (java.io.IOException)4 GridBagConstraints (java.awt.GridBagConstraints)3 GridBagLayout (java.awt.GridBagLayout)3