Example 56 with Chromosome

Use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

From the class FeatureFilter, method generateProbeList:

protected void generateProbeList() {
    // We'll start by getting the complete set of probes from the position
    // filter.  We'll split these by chromosome further down, but we
    // have to get them as a set to start with.
    Probe[] probesToMatch = options.featurePositions.getProbes();
    // This is the set of passing probes we're going to build up.
    ProbeList passedProbes = new ProbeList(startingList, "", "", null);
    // We need to know how far beyond the feature we might need to look
    int annotationLimit = options.closenessLimit();
    // Since we're going to be making the annotations on the
    // basis of position we should go through all probes one
    // chromosome at a time.
    Chromosome[] chrs = collection.genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        // For the not-overlapping option it's easiest for us to keep a list
        // of probes to reject (those that do overlap) and then make the negated
        // list at the end.
        HashSet<Probe> failedProbes = new HashSet<Probe>();
        progressUpdated("Processing features on Chr " + chrs[c].name(), c, chrs.length);
        Probe[] probes = startingList.getProbesForChromosome(chrs[c]);
        Vector<Probe> featuresForThisChromosome = new Vector<Probe>();
        for (int f = 0; f < probesToMatch.length; f++) {
            if (probesToMatch[f].chromosome().equals(chrs[c])) {
                featuresForThisChromosome.add(probesToMatch[f]);
            }
        }
        Probe[] features = featuresForThisChromosome.toArray(new Probe[0]);
        Arrays.sort(probes);
        Arrays.sort(features);
        int lastFoundIndex = 0;
        // We can now step through the probes looking for the best feature match
        for (int p = 0; p < probes.length; p++) {
            boolean foundFirst = false;
            for (int f = lastFoundIndex; f < features.length; f++) {
                if (cancel) {
                    cancel = false;
                    progressCancelled();
                    return;
                }
                if (!foundFirst) {
                    if (features[f].end() + annotationLimit >= probes[p].start()) {
                        lastFoundIndex = f;
                        foundFirst = true;
                    }
                }
                // See if we're skipping this feature for this probe based on its strand
                if (strand != ANY_STRAND) {
                    switch(strand) {
                        case FORWARD_ONLY:
                            {
                                if (features[f].strand() != Location.FORWARD)
                                    continue;
                                break;
                            }
                        case REVERSE_ONLY:
                            {
                                if (features[f].strand() != Location.REVERSE)
                                    continue;
                                break;
                            }
                        case SAME_STRAND:
                            {
                                if (features[f].strand() != probes[p].strand())
                                    continue;
                                break;
                            }
                        case OPPOSING_STRAND:
                            {
                                if (!((features[f].strand() == Location.FORWARD && probes[p].strand() == Location.REVERSE) || (features[f].strand() == Location.REVERSE && probes[p].strand() == Location.FORWARD)))
                                    continue;
                                break;
                            }
                    }
                }
                if (relationship == EXACTLY_MATCHING) {
                    if (probes[p].start() == features[f].start() && probes[p].end() == features[f].end()) {
                        passedProbes.addProbe(probes[p], null);
                        break;
                    }
                } else if (relationship == OVERLAPPING || relationship == NOT_OVERLAPPING) {
                    if (probes[p].start() < features[f].end() && probes[p].end() > features[f].start()) {
                        if (relationship == OVERLAPPING) {
                            passedProbes.addProbe(probes[p], null);
                        } else {
                            // This is going to be a rejected probe for not-overlapping
                            failedProbes.add(probes[p]);
                        }
                        break;
                    }
                } else if (relationship == CONTAINED_WITHIN) {
                    if (probes[p].start() >= features[f].start() && probes[p].end() <= features[f].end()) {
                        passedProbes.addProbe(probes[p], null);
                        break;
                    }
                } else if (relationship == SURROUNDING) {
                    if (probes[p].start() <= features[f].start() && probes[p].end() >= features[f].end()) {
                        passedProbes.addProbe(probes[p], null);
                        break;
                    }
                } else if (relationship == CLOSE_TO) {
                    if (probes[p].start() < features[f].end() + annotationLimit && probes[p].end() > features[f].start() - annotationLimit) {
                        passedProbes.addProbe(probes[p], null);
                        break;
                    }
                }
            }
        }
        // For the not-overlapping option we now go back through the
        // data to get the probes which weren't rejected
        if (relationship == NOT_OVERLAPPING) {
            for (int p = 0; p < probes.length; p++) {
                if (!failedProbes.contains(probes[p])) {
                    passedProbes.addProbe(probes[p], null);
                }
            }
        }
    }
    filterFinished(passedProbes);
}
Also used: ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList), Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome), Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe), Vector (java.util.Vector), HashSet (java.util.HashSet)
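
The five positional relationships tested above all reduce to comparisons on start and end coordinates. Below is a minimal standalone sketch of those predicates, assuming 1-based inclusive coordinates; the Interval record is a hypothetical stand-in for SeqMonk's Probe, not part of its API.

// Minimal sketch of the positional tests used above.  Interval is a
// hypothetical stand-in for SeqMonk's Probe; coordinates are assumed to
// be 1-based and inclusive, matching the comparisons in generateProbeList.
record Interval(int start, int end) {

    boolean exactlyMatches(Interval f) {
        return start == f.start() && end == f.end();
    }

    // Strict inequalities mirror the code above: a probe which merely
    // touches a feature's end coordinate does not count as overlapping.
    boolean overlaps(Interval f) {
        return start < f.end() && end > f.start();
    }

    boolean containedWithin(Interval f) {
        return start >= f.start() && end <= f.end();
    }

    boolean surrounds(Interval f) {
        return start <= f.start() && end >= f.end();
    }

    boolean closeTo(Interval f, int limit) {
        return start < f.end() + limit && end > f.start() - limit;
    }
}

Note that NOT_OVERLAPPING never appears as a predicate of its own: the filter collects overlapping probes into failedProbes and emits the complement once each chromosome has been processed.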

Example 57 with Chromosome

Use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

From the class ContigProbeGenerator, method run:

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> newProbes = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
        // We'll merge together the reads for all of the selected DataStores and
        // compute a single set of probes which covers all of them.
        ReadsWithCounts[] v = new ReadsWithCounts[selectedStores.length];
        for (int s = 0; s < selectedStores.length; s++) {
            v[s] = selectedStores[s].getReadsForChromosome(chromosomes[c]);
        }
        ReadsWithCounts rawReads = new ReadsWithCounts(v);
        v = null;
        // We now want to convert this list into a non-redundant set of
        // read positions with counts.  If we don't do this then we get
        // appalling performance where we have many reads mapped at the
        // same position
        // Our default is to do all strands at once
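        // (100 is assumed here to be a sentinel that matches none of the
        // Location strand constants, so getNonRedundantReads keeps reads
        // from all strands together.)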
        int[] strandsToTry = new int[] { 100 };
        if (separateStrands.isSelected()) {
            strandsToTry = new int[] { Location.FORWARD, Location.REVERSE, Location.UNKNOWN };
        }
        for (int strand = 0; strand < strandsToTry.length; strand++) {
            ReadsWithCounts reads = getNonRedundantReads(rawReads, strandsToTry[strand]);
            if (reads.totalCount() == 0) {
                // System.err.println("Skipping strand "+strandsToTry[strand]+" on chr "+chromosomes[c]);
                continue;
            }
            int strandForNewProbes = Location.UNKNOWN;
            if (strandsToTry.length > 1) {
                strandForNewProbes = strandsToTry[strand];
            }
            int start = -1;
            // We now start a process where we work out at what point we cross the
            // threshold of having more than depthCutoff reads overlapping at any
            // point
            LinkedList<SequenceReadWithCount> currentSet = new LinkedList<SequenceReadWithCount>();
            int currentSetSize = 0;
            for (int r = 0; r < reads.reads.length; r++) {
                // See if we need to quit
                if (cancel) {
                    generationCancelled();
                    return;
                }
                while (currentSetSize > 0 && SequenceRead.end(currentSet.getFirst().read) < SequenceRead.start(reads.reads[r])) {
                    SequenceReadWithCount lastRead = currentSet.removeFirst();
                    currentSetSize -= lastRead.count;
                    if (start > 0 && currentSetSize < depthCutoff) {
                        // We just got to the end of a probe
                        Probe p = new Probe(chromosomes[c], start, SequenceRead.end(lastRead.read), strandForNewProbes);
                        // Check to see if we have a previous probe against which we can check
                        Probe lastProbe = null;
                        if (!newProbes.isEmpty())
                            lastProbe = newProbes.lastElement();
                        // Can we merge?
                        if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.strand() == lastProbe.strand() && p.start() - lastProbe.end() <= distance) {
                            // Remove the last probe from the stored set
                            newProbes.remove(newProbes.size() - 1);
                            // Expand this probe to cover the last one and add it to the stored set
                            newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
                        } else if (lastProbe != null) {
                            // We might still remove this if it's too small
                            if (lastProbe.length() < minSize) {
                                newProbes.remove(newProbes.size() - 1);
                            }
                            // We still need to add the new probe
                            newProbes.add(p);
                        } else {
                            newProbes.add(p);
                        }
                        start = -1;
                    }
                }
                // If there's nothing there already then just add it
                if (currentSetSize == 0) {
                    currentSet.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                    currentSetSize += reads.counts[r];
                } else {
                    // There are reads in the current set, so we need to add this
                    // read such that the set stays ordered by read end position,
                    // with the earliest end first.  We therefore start from the
                    // back and work towards the front; as soon as we see an entry
                    // whose end is lower than ours we add ourselves after it.
                    ListIterator<SequenceReadWithCount> it = currentSet.listIterator(currentSet.size());
                    while (true) {
                        // If we reach the front of the set then we add ourselves to the front
                        if (!it.hasPrevious()) {
                            currentSet.addFirst(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                            currentSetSize += reads.counts[r];
                            break;
                        } else {
                            SequenceReadWithCount previousRead = it.previous();
                            if (SequenceRead.end(previousRead.read) < SequenceRead.end(reads.reads[r])) {
                                // We want to add ourselves after this element so backtrack
                                // by one position (which must exist because we just went
                                // past it)
                                it.next();
                                it.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                                currentSetSize += reads.counts[r];
                                break;
                            }
                        }
                    }
                }
                // See if we crossed the threshold for starting a new probe
                if (start < 0 && currentSetSize >= depthCutoff) {
                    start = SequenceRead.start(reads.reads[r]);
                }
            }
            // See if we need to make a final probe out of the so far
            // unprocessed reads on this chromosome
            if (start > 0) {
                Probe p = new Probe(chromosomes[c], start, SequenceRead.end(currentSet.getFirst().read), strandForNewProbes);
                // Check to see if we can merge with the last probe made
                Probe lastProbe = null;
                if (!newProbes.isEmpty())
                    lastProbe = newProbes.lastElement();
                // Can we merge?
                if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.start() - lastProbe.end() <= distance) {
                    newProbes.remove(newProbes.size() - 1);
                    newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
                } else if (lastProbe != null) {
                    // We might still remove this if it's too small
                    if (lastProbe.length() < minSize) {
                        newProbes.remove(newProbes.size() - 1);
                    }
                    // Give the new probe a final chance: only add it if it's big enough
                    if (p.length() > minSize) {
                        newProbes.add(p);
                    }
                } else {
                    // Add the remaining probe if it's big enough.
                    if (p.length() > minSize) {
                        newProbes.add(p);
                    }
                }
            }
        }
    }
    Probe[] finalList = newProbes.toArray(new Probe[0]);
    newProbes.clear();
    ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
    generationComplete(finalSet);
}
Also used: Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome), ReadsWithCounts (uk.ac.babraham.SeqMonk.DataTypes.Sequence.ReadsWithCounts), Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe), LinkedList (java.util.LinkedList), ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet), Vector (java.util.Vector), IntVector (uk.ac.babraham.SeqMonk.Utilities.IntVector), LongVector (uk.ac.babraham.SeqMonk.Utilities.LongVector)
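
Stripped to its essentials, the generator above is a coverage sweep: reads arrive sorted by start position, a collection ordered by end position tracks the reads still overlapping the current point, and a probe opens when the summed counts reach depthCutoff and closes when they fall back below it. Below is a simplified sketch of that idea, assuming plain (start, end, count) triples in place of SeqMonk's packed reads and using a min-heap where the original hand-maintains an ordered LinkedList; the probe-merging and minimum-size handling are omitted.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

// Simplified coverage sweep: reads are (start, end, count) triples sorted
// by start position; emit the regions where summed depth >= depthCutoff.
final class DepthSweep {

    record Read(int start, int end, int count) {}

    record Region(int start, int end) {}

    static List<Region> contigs(List<Read> readsSortedByStart, int depthCutoff) {
        // Min-heap keyed on end position: hands us the earliest-ending
        // open read, like the head of the ordered LinkedList above.
        PriorityQueue<Read> open = new PriorityQueue<>(Comparator.comparingInt(Read::end));
        List<Region> regions = new ArrayList<>();
        int depth = 0;
        // Start of the region currently being built, or -1 if none.
        int regionStart = -1;

        for (Read r : readsSortedByStart) {
            // Retire open reads which end before this read starts.
            while (!open.isEmpty() && open.peek().end() < r.start()) {
                Read done = open.poll();
                depth -= done.count();
                if (regionStart >= 0 && depth < depthCutoff) {
                    // Depth just dropped below the cutoff: close the region
                    // at the end of the read we removed.
                    regions.add(new Region(regionStart, done.end()));
                    regionStart = -1;
                }
            }
            open.add(r);
            depth += r.count();
            // See if we just crossed the threshold for starting a region.
            if (regionStart < 0 && depth >= depthCutoff) {
                regionStart = r.start();
            }
        }
        // Close any region still open once the reads run out.
        if (regionStart >= 0 && !open.isEmpty()) {
            regions.add(new Region(regionStart, open.peek().end()));
        }
        return regions;
    }
}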

Example 58 with Chromosome

Use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

From the class DeduplicationProbeGenerator, method run:

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    boolean separateStrands = separateStrandsBox.isSelected();
    int maxDistance = 0;
    if (maxDistanceField.getText().length() > 0) {
        maxDistance = Integer.parseInt(maxDistanceField.getText());
    }
    Vector<Probe> newProbes = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
        Probe[] startingProbes = initialList.getProbesForChromosome(chromosomes[c]);
        // For directional merging we use 3 probes
        Probe currentForward = null;
        Probe currentReverse = null;
        Probe currentUnknown = null;
        // For non-directional merging we use only one
        Probe currentProbe = null;
        // Now we can make the actual probes
        for (int i = 0; i < startingProbes.length; i++) {
            if (cancel) {
                generationCancelled();
                return;
            }
            if (separateStrands) {
                switch(startingProbes[i].strand()) {
                    case (Probe.FORWARD):
                        if (currentForward == null) {
                            currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentForward.end() + maxDistance) {
                            if (startingProbes[i].end() > currentForward.end()) {
                                // Extend the current probe
                                currentForward = new Probe(chromosomes[c], currentForward.start(), startingProbes[i].end(), currentForward.strand(), currentForward.name());
                            }
                        } else {
                            newProbes.add(currentForward);
                            currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                    case (Probe.REVERSE):
                        if (currentReverse == null) {
                            currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentReverse.end() + maxDistance) {
                            if (startingProbes[i].end() > currentReverse.end()) {
                                // Extend the current probe
                                currentReverse = new Probe(chromosomes[c], currentReverse.start(), startingProbes[i].end(), currentReverse.strand(), currentReverse.name());
                            }
                        } else {
                            newProbes.add(currentReverse);
                            currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                    case (Probe.UNKNOWN):
                        if (currentUnknown == null) {
                            currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentUnknown.end() + maxDistance) {
                            if (startingProbes[i].end() > currentUnknown.end()) {
                                // Extend the current probe
                                currentUnknown = new Probe(chromosomes[c], currentUnknown.start(), startingProbes[i].end(), currentUnknown.strand(), currentUnknown.name());
                            }
                        } else {
                            newProbes.add(currentUnknown);
                            currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                }
            } else {
                if (currentProbe == null) {
                    currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                } else if (startingProbes[i].start() <= currentProbe.end() + maxDistance) {
                    if (startingProbes[i].end() > currentProbe.end() || startingProbes[i].strand() != currentProbe.strand()) {
                        // Update the current probe
                        int usedStrand = currentProbe.strand();
                        if (startingProbes[i].strand() != currentProbe.strand()) {
                            usedStrand = Probe.UNKNOWN;
                        }
                        currentProbe = new Probe(chromosomes[c], currentProbe.start(), Math.max(startingProbes[i].end(), currentProbe.end()), usedStrand, currentProbe.name());
                    }
                } else {
                    newProbes.add(currentProbe);
                    currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                }
            }
        }
        // At the end of each chromosome we add any probes we have remaining
        if (currentProbe != null)
            newProbes.add(currentProbe);
        if (currentForward != null)
            newProbes.add(currentForward);
        if (currentReverse != null)
            newProbes.add(currentReverse);
        if (currentUnknown != null)
            newProbes.add(currentUnknown);
    }
    Probe[] finalList = newProbes.toArray(new Probe[0]);
    ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
    generationComplete(finalSet);
}
Also used: ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet), Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome), Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe), Vector (java.util.Vector)
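
Each strand branch above performs the same gap-tolerant merge of position-sorted intervals: extend the current interval while the next one starts within maxDistance of its end, otherwise emit it and start afresh. A condensed sketch of just that merge, assuming plain start/end pairs and omitting the strand bookkeeping:

import java.util.ArrayList;
import java.util.List;

// Gap-tolerant merge of intervals sorted by start position: an interval
// whose start lies within maxDistance of the current end is merged in.
final class GapMerge {

    record Interval(int start, int end) {}

    static List<Interval> merge(List<Interval> sortedByStart, int maxDistance) {
        List<Interval> merged = new ArrayList<>();
        Interval current = null;
        for (Interval i : sortedByStart) {
            if (current == null) {
                current = i;
            } else if (i.start() <= current.end() + maxDistance) {
                // Extend only if this interval reaches further: an interval
                // contained within the current one changes nothing.
                if (i.end() > current.end()) {
                    current = new Interval(current.start(), i.end());
                }
            } else {
                merged.add(current);
                current = i;
            }
        }
        if (current != null) {
            merged.add(current); // flush the final interval
        }
        return merged;
    }
}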

Example 59 with Chromosome

Use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

From the class MacsPeakCaller, method run:

public void run() {
    // for (int i=0;i<selectedChIPStores.length;i++) {
    // System.err.println("Selcted ChIP="+selectedChIPStores[i]);
    // }
    // for (int i=0;i<selectedInputStores.length;i++) {
    // System.err.println("Selcted Input="+selectedInputStores[i]);
    // }
    // First find the tag offsets between the watson and crick strands
    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }
    if (cancel) {
        generationCancelled();
        return;
    }
    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            generationCancelled();
            return;
        }
        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }
                    if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }
    if (potentialHighConfidencePeaks.size() == 0) {
        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }
    // System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
    // Now we select 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
    Collections.shuffle(Arrays.asList(highConfidencePeaks));
    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }
    // Now find the average distance between forward / reverse reads in the candidate peaks
    // Sized to the random subset we actually measure, so that unfilled
    // entries don't drag the median down.
    int[] distances = new int[randomHighConfidenceProbes.length];
    // Sort the candidates so we don't do stupid stuff with the cache
    Arrays.sort(randomHighConfidenceProbes);
    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }
    int medianInterStrandDistance = (int) SimpleStats.median(distances);
    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;
    // System.err.println("Median inter strand difference = "+medianInterStrandDistance);
    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }
    BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
    // We want to know what depth has a chance of less than 10^-5
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
    if (redundantThreshold < 1)
        redundantThreshold = 1;
    // System.err.println("Redundancy threshold is "+redundantThreshold);
    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.
    updateGenerationProgress("Counting non-redundant reads", 0, 1);
    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
    // System.err.println("Total non-redundant sequences is "+totalNonRedCount);
    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;
    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
    // System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
    // Now we go back through the whole dataset to do a search for all possible candidate probes
    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            // We expand the region we're looking at by the inter-strand distance as we're going to
            // be adjusting the read positions
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
            SequenceRead.sort(mergedProbeReads);
            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }
            if (thisProbeOverlapCount > readCountCutoff) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }
    // Finally we re-filter the peaks we have using local Poisson distributions with densities
    // taken either from the input samples (if there are any) or from the local region.  The
    // densities are estimated over 1, 5 and 10kb around the peak and genome-wide, and the
    // maximum of these is taken.  If there is no input then the 1kb region is not used.
    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
    // Work out which stores we're using to validate against.
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;
    int validationNonRedCount;
    if (selectedInputStores != null && selectedInputStores.length > 0) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        validationStores = selectedInputStores;
        useInput = true;
        // We also need to work out the total number of non-redundant sequences
        // in the input so we can work out a scaling factor so that the densities
        // for input and ChIP are comparable.
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
        inputCorrection = totalNonRedCount / (double) validationNonRedCount;
        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }
    Vector<Probe> finalValidatedProbes = new Vector<Probe>();
    for (int p = 0; p < allCandidateProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
        }
        // System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak
        // We're going to get all reads within 10kb of the peak, and then we can subselect from there
        int midPoint = allCandidateProbes[p].middle();
        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        // Get the probes for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
        // Deduplicate so it's a fair comparison
        // Should we recalculate the redundant threshold based on the input coverage?
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;
        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }
        // System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
        // Convert to densities per window and adjust for global coverage
        double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;
        // System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
        // System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
        SequenceRead.sort(mergedProbeReads);
        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }
        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
        // System.err.println("Adding probe to final set");
        }
    }
    // System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
    generationComplete(finalSet);
}
Also used: PoissonDistribution (org.apache.commons.math3.distribution.PoissonDistribution), Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome), Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe), ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet), DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore), BinomialDistribution (org.apache.commons.math3.distribution.BinomialDistribution), Vector (java.util.Vector), LongVector (uk.ac.babraham.SeqMonk.Utilities.LongVector)
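
The two statistical cutoffs in the caller come straight from commons-math3, whose BinomialDistribution and PoissonDistribution both provide inverseCumulativeProbability. Below is a small isolated example of how the redundancy threshold and the per-window enrichment cutoff are derived; all input figures are invented purely for illustration.

import org.apache.commons.math3.distribution.BinomialDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;

public class MacsCutoffDemo {

    public static void main(String[] args) {
        // All figures below are invented for illustration only.
        int totalReadCount = 20_000_000;
        long genomeLength = 3_000_000_000L;
        double expectedCountsPerWindow = 0.05;
        double pValue = 0.0001;

        // The smallest depth d such that, under a binomial null, the chance
        // of a single position receiving more than d reads is below 10^-5;
        // anything deeper is treated as a duplication artefact.
        BinomialDistribution bin =
                new BinomialDistribution(totalReadCount, 1d / genomeLength);
        int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);

        // Read count a window must exceed to be called enriched at the
        // requested p-value under a Poisson null.
        PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
        int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);

        System.out.println("redundantThreshold = " + redundantThreshold);
        System.out.println("readCountCutoff    = " + readCountCutoff);
    }
}

With figures on this scale the binomial threshold comes out as a very small integer, which is why the code above clamps it to a minimum of 1.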

Example 60 with Chromosome

Use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

From the class RNASeqPipeline, method startPipeline:

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    boolean mergeTranscripts = optionsPanel.mergeTranscripts();
    boolean pairedEnd = optionsPanel.pairedEnd();
    boolean logTransform = optionsPanel.logTransform();
    boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
    boolean rawCounts = optionsPanel.rawCounts();
    boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
    boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
    boolean correctDuplication = optionsPanel.correctForDNADuplication();
    if (rawCounts) {
        logTransform = false;
        applyTranscriptLengthCorrection = false;
        noValueForZeroCounts = false;
    }
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        // System.err.println("Processing chr "+chrs[c].name());
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        for (int f = 0; f < mergedTranscripts.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    Arrays.sort(allProbes);
    if (collection().probeSet() == null) {
        collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    } else {
        Probe[] existingProbes = collection().probeSet().getAllProbes();
        Arrays.sort(existingProbes);
        if (allProbes.length != existingProbes.length) {
            collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
        } else {
            // Check the positions against the new ones
            boolean areTheyTheSame = true;
            for (int p = 0; p < allProbes.length; p++) {
                if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
                    areTheyTheSame = false;
                    break;
                }
            }
            if (areTheyTheSame) {
                allProbes = existingProbes;
            } else {
                collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
            }
        }
    }
    // If we're correcting for DNA contamination we need to work out the average density of
    // reads in intergenic regions
    float[] dnaDensityPerKb = new float[data.length];
    int[] correctedTotalCounts = new int[data.length];
    if (correctDNAContamination) {
        // We need to make interstitial probes between the probes we already have, ignoring gaps at the ends of chromosomes
        Vector<Probe> intergenicProbes = new Vector<Probe>();
        Chromosome lastChr = allProbes[0].chromosome();
        for (int p = 1; p < allProbes.length; p++) {
            if (allProbes[p].chromosome() != lastChr) {
                lastChr = allProbes[p].chromosome();
                continue;
            }
            // See if there's a gap back to the last probe
            if (allProbes[p].start() > allProbes[p - 1].end()) {
                if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
                    // Don't bother with really short probes
                    continue;
                }
                intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
            }
        }
        Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA contamination", 1, 2);
            float[] densities = new float[allIntergenicProbes.length];
            for (int p = 0; p < allIntergenicProbes.length; p++) {
                densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
            }
            dnaDensityPerKb[d] = SimpleStats.median(densities);
        }
        // Work out adjusted total counts having subtracted the DNA contamination
        for (int d = 0; d < data.length; d++) {
            int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
            int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
            correctedTotalCounts[d] = correctedTotalReadCount;
        }
        // Halve the density if they're doing a directional quantitation
        if (optionsPanel.isDirectional()) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
        // Halve the density if the libraries are paired end
        if (pairedEnd) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
    }
    // If we're correcting for duplication we need to work out the modal count depth in
    // intergenic regions
    int[] modalDuplicationLevels = new int[data.length];
    if (correctDuplication) {
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA duplication", 1, 2);
            // We're not going to look at depths which are > 200.  If it's that duplicated
            // then there's no point continuing anyway.
            int[] depthCount = new int[200];
            for (int p = 0; p < allProbes.length; p++) {
                long[] reads = data[d].getReadsForProbe(allProbes[p]);
                int currentCount = 0;
                for (int r = 1; r < reads.length; r++) {
                    if (reads[r] == reads[r - 1]) {
                        ++currentCount;
                    } else {
                        if (currentCount > 0 && currentCount < 200) {
                            ++depthCount[currentCount];
                        }
                        currentCount = 1;
                    }
                }
            }
            // Find the modal depth over these probes.  This is the best estimate
            // of the duplication level.
            // Since unique reads turn up all over the place, even in duplicated
            // data, we say that if unique reads outnumber the sum of depths 2-10
            // then there is no duplication.
            int twoTenSum = 0;
            for (int i = 2; i <= 10; i++) {
                twoTenSum += depthCount[i];
            }
            if (depthCount[1] > twoTenSum) {
                modalDuplicationLevels[d] = 1;
            } else {
                int highestDepth = 0;
                int bestDupGuess = 1;
                for (int i = 2; i < depthCount.length; i++) {
                    // System.err.println("For depth "+i+" count was "+depthCount[i]);
                    if (depthCount[i] > highestDepth) {
                        bestDupGuess = i;
                        highestDepth = depthCount[i];
                    }
                }
                modalDuplicationLevels[d] = bestDupGuess;
            }
        }
    }
    // Having made probes we now need to quantitate them.  We'll fetch the
    // probes overlapping each sub-feature and then aggregate these together
    // to get the final quantitation.
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    int currentIndex = 0;
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        int[] readLengths = new int[data.length];
        for (int d = 0; d < data.length; d++) {
            readLengths[d] = data[d].getMaxReadLength();
            // Each paired end fragment contributes two reads, so double the
            // read length to approximate the fragment's actual length.
            if (pairedEnd) {
                readLengths[d] *= 2;
            }
        }
        for (int f = 0; f < mergedTranscripts.length; f++) {
            Location[] subLocations = mergedTranscripts[f].getSubLocations();
            int totalLength = 0;
            // Find the total length of all of the exons
            for (int s = 0; s < subLocations.length; s++) {
                totalLength += subLocations[s].length();
            }
            for (int d = 0; d < data.length; d++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                long totalCount = 0;
                for (int s = 0; s < subLocations.length; s++) {
                    long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
                    for (int r = 0; r < reads.length; r++) {
                        if (!readFilter.useRead(subLocations[s], reads[r])) {
                            continue;
                        }
                        int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
                        totalCount += overlap;
                    }
                }
                // Now we correct the count by the total length of reads in the data and by
                // the length of the split parts of the probe, and assign this to the probe.
                // Since we're correcting for read length we work out the whole number of
                // reads which this count could comprise, rounding down.
                totalCount /= readLengths[d];
                // We can now subtract the DNA contamination prediction.
                if (correctDNAContamination) {
                    int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
                    totalCount -= predictedContamination;
                    // Makes no sense to have negative counts
                    if (totalCount < 0)
                        totalCount = 0;
                }
                // ..and we can divide by the duplication level if we know it.
                if (correctDuplication) {
                    totalCount /= modalDuplicationLevels[d];
                }
                // System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
                float value = totalCount;
                if (value == 0 && noValueForZeroCounts) {
                    value = Float.NaN;
                }
                // If we're log transforming then we need to set zero values to 0.9
                if (logTransform && value == 0 && !noValueForZeroCounts) {
                    value = 0.9f;
                }
                // Correct for the length of the transcript if we've been asked to.
                if (applyTranscriptLengthCorrection) {
                    value /= (totalLength / 1000f);
                }
                // We also correct by the total read count
                if (!rawCounts) {
                    // System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
                    // If these libraries are paired end then the total number of
                    // reads is also effectively halved.
                    float totalReadCount;
                    // If we corrected for DNA contamination then we've already
                    // calculated this; otherwise we take the total count (total length / read length)
                    if (correctDNAContamination) {
                        totalReadCount = correctedTotalCounts[d];
                    } else {
                        totalReadCount = data[d].getTotalReadLength() / readLengths[d];
                    }
                    // If we're correcting for duplication we divide by the duplication level.
                    if (correctDuplication) {
                        totalReadCount /= modalDuplicationLevels[d];
                    }
                    // Finally we work out millions of reads (single end) or fragments (paired end)
                    if (pairedEnd) {
                        totalReadCount /= 2000000f;
                    } else {
                        totalReadCount /= 1000000f;
                    }
                    // Lastly we divide the value by the total millions of reads to get the globally corrected count.
                    value /= totalReadCount;
                }
                // Finally we do the log transform if we've been asked to
                if (logTransform) {
                    value = (float) Math.log(value) / log2;
                }
                data[d].setValueForProbe(allProbes[currentIndex], value);
            }
            currentIndex++;
        }
    }
    collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
    // If we estimated any parameters let's report them.
    if (correctDNAContamination || correctDuplication) {
        float[] dna = null;
        if (correctDNAContamination) {
            dna = dnaDensityPerKb;
        }
        int[] dup = null;
        if (correctDuplication) {
            dup = modalDuplicationLevels;
        }
        RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
        ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {

            @Override
            public void run() {
            }

            @Override
            public String name() {
                return "RNA-Seq parameter";
            }

            @Override
            public boolean isReady() {
                return true;
            }

            @Override
            public JPanel getOptionsPanel() {
                return null;
            }

            @Override
            public void generateReport() {
            }
        }, model);
    }
    quantitatonComplete();
}
Also used: JPanel (javax.swing.JPanel), Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe), Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature), ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet), ReportTableDialog (uk.ac.babraham.SeqMonk.Displays.Report.ReportTableDialog), QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType), Vector (java.util.Vector), Report (uk.ac.babraham.SeqMonk.Reports.Report), Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome), Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)
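
The per-probe arithmetic in the quantitation loop is essentially an RPKM-style calculation: reads per kilobase of transcript per million reads, with an optional log2 transform and a 0.9 floor on zero counts. Below is a compact restatement of just the numeric steps; the class, method, and parameter names are illustrative, and the contamination and duplication corrections are omitted.

// Condensed restatement of the per-probe arithmetic above.  Names are
// illustrative only; contamination and duplication corrections omitted.
final class RnaSeqQuant {

    static float quantitate(long overlappingBases, int readLength, int exonLengthBp,
                            long totalReadsInStore, boolean pairedEnd,
                            boolean lengthCorrect, boolean logTransform) {
        // Convert summed overlapping bases to a whole number of reads,
        // rounding down (integer division, as in the pipeline above).
        float value = overlappingBases / readLength;
        // Zero counts become 0.9 so the log transform stays finite.
        if (logTransform && value == 0) {
            value = 0.9f;
        }
        // Per-kilobase-of-transcript correction (the "K" in RPKM).
        if (lengthCorrect) {
            value /= exonLengthBp / 1000f;
        }
        // Per million reads, or per million fragments for paired end data
        // where each fragment contributes two reads (the "M" in RPKM).
        value /= totalReadsInStore / (pairedEnd ? 2_000_000f : 1_000_000f);
        // Finally the log2 transform if requested.
        if (logTransform) {
            value = (float) (Math.log(value) / Math.log(2));
        }
        return value;
    }
}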

Aggregations

Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome): 78 uses
Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe): 47 uses
Vector (java.util.Vector): 36 uses
Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature): 23 uses
SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException): 23 uses
ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet): 22 uses
ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList): 12 uses
DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore): 11 uses
DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet): 8 uses
ReadsWithCounts (uk.ac.babraham.SeqMonk.DataTypes.Sequence.ReadsWithCounts): 8 uses
Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location): 7 uses
SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation): 7 uses
ProgressListener (uk.ac.babraham.SeqMonk.DataTypes.ProgressListener): 7 uses
HiCHitCollection (uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection): 7 uses
IOException (java.io.IOException): 6 uses
File (java.io.File): 5 uses
Hashtable (java.util.Hashtable): 5 uses
HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore): 5 uses
QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType): 5 uses
PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet): 4 uses