use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class FeatureFilter method generateProbeList.
protected void generateProbeList() {
// We'll start by getting the complete set of probes from the position
// filter. We'll split these by chromosome at a later date but we
// have to get them as a set to start with.
Probe[] probesToMatch = options.featurePositions.getProbes();
// This is the set of passing probes we're going to build up.
ProbeList passedProbes = new ProbeList(startingList, "", "", null);
// We need to know how far beyond the feature we might need to look
int annotationLimit = options.closenessLimit();
// Since we're going to be making the annotations on the
// basis of position we should go through all probes one
// chromosome at a time.
Chromosome[] chrs = collection.genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
// For the not-overlapping option it's easiest for us to keep a list
// of probes to reject (those that do overlap) and then make the negated
// list at the end.
HashSet<Probe> failedProbes = new HashSet<Probe>();
progressUpdated("Processing features on Chr " + chrs[c].name(), c, chrs.length);
Probe[] probes = startingList.getProbesForChromosome(chrs[c]);
Vector<Probe> featuresForThisChromosome = new Vector<Probe>();
for (int f = 0; f < probesToMatch.length; f++) {
if (probesToMatch[f].chromosome().equals(chrs[c])) {
featuresForThisChromosome.add(probesToMatch[f]);
}
}
Probe[] features = featuresForThisChromosome.toArray(new Probe[0]);
Arrays.sort(probes);
Arrays.sort(features);
int lastFoundIndex = 0;
// We can now step through the probes looking for the best feature match
for (int p = 0; p < probes.length; p++) {
boolean foundFirst = false;
for (int f = lastFoundIndex; f < features.length; f++) {
if (cancel) {
cancel = false;
progressCancelled();
return;
}
if (!foundFirst) {
if (features[f].end() + annotationLimit >= probes[p].start()) {
lastFoundIndex = f;
foundFirst = true;
}
}
// See if we're skipping this feature for this probe based on its strand
if (strand != ANY_STRAND) {
switch(strand) {
case FORWARD_ONLY:
{
if (features[f].strand() != Location.FORWARD)
continue;
break;
}
case REVERSE_ONLY:
{
if (features[f].strand() != Location.REVERSE)
continue;
break;
}
case SAME_STRAND:
{
if (features[f].strand() != probes[p].strand())
continue;
break;
}
case OPPOSING_STRAND:
{
if (!((features[f].strand() == Location.FORWARD && probes[p].strand() == Location.REVERSE) || (features[f].strand() == Location.REVERSE && probes[p].strand() == Location.FORWARD)))
continue;
break;
}
}
}
if (relationship == EXACTLY_MATCHING) {
if (probes[p].start() == features[f].start() && probes[p].end() == features[f].end()) {
passedProbes.addProbe(probes[p], null);
break;
}
} else if (relationship == OVERLAPPING || relationship == NOT_OVERLAPPING) {
if (probes[p].start() < features[f].end() && probes[p].end() > features[f].start()) {
if (relationship == OVERLAPPING) {
passedProbes.addProbe(probes[p], null);
} else {
// This is going to be a rejected probe for not-overlapping
failedProbes.add(probes[p]);
}
break;
}
} else if (relationship == CONTAINED_WITHIN) {
if (probes[p].start() >= features[f].start() && probes[p].end() <= features[f].end()) {
passedProbes.addProbe(probes[p], null);
break;
}
} else if (relationship == SURROUNDING) {
if (probes[p].start() <= features[f].start() && probes[p].end() >= features[f].end()) {
passedProbes.addProbe(probes[p], null);
break;
}
} else if (relationship == CLOSE_TO) {
if (probes[p].start() < features[f].end() + annotationLimit && probes[p].end() > features[f].start() - annotationLimit) {
passedProbes.addProbe(probes[p], null);
break;
}
}
}
}
// data to get the probes which weren't rejected
if (relationship == NOT_OVERLAPPING) {
for (int p = 0; p < probes.length; p++) {
if (!failedProbes.contains(probes[p])) {
passedProbes.addProbe(probes[p], null);
}
}
}
}
filterFinished(passedProbes);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class ContigProbeGenerator method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> newProbes = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
// We'll merge together the reads for all of the selected DataStores and
// compute a single set of probes which covers all of them.
ReadsWithCounts[] v = new ReadsWithCounts[selectedStores.length];
for (int s = 0; s < selectedStores.length; s++) {
v[s] = selectedStores[s].getReadsForChromosome(chromosomes[c]);
}
ReadsWithCounts rawReads = new ReadsWithCounts(v);
v = null;
// We now want to convert this list into a non-redundant set of
// read positions with counts. If we don't do this then we get
// appalling performance where we have many reads mapped at the
// same position
// Our default is to do all strands at once
int[] strandsToTry = new int[] { 100 };
if (separateStrands.isSelected()) {
strandsToTry = new int[] { Location.FORWARD, Location.REVERSE, Location.UNKNOWN };
}
for (int strand = 0; strand < strandsToTry.length; strand++) {
ReadsWithCounts reads = getNonRedundantReads(rawReads, strandsToTry[strand]);
if (reads.totalCount() == 0) {
// System.err.println("Skipping strand "+strandsToTry[strand]+" on chr "+chromosomes[c]);
continue;
}
int strandForNewProbes = Location.UNKNOWN;
if (strandsToTry.length > 1) {
strandForNewProbes = strandsToTry[strand];
}
int start = -1;
// We now start a process where we work out at what point we cross the
// threshold of having more than depthCutoff reads overlapping at any
// point
LinkedList<SequenceReadWithCount> currentSet = new LinkedList<SequenceReadWithCount>();
int currentSetSize = 0;
for (int r = 0; r < reads.reads.length; r++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
while (currentSetSize > 0 && SequenceRead.end(currentSet.getFirst().read) < SequenceRead.start(reads.reads[r])) {
SequenceReadWithCount lastRead = currentSet.removeFirst();
currentSetSize -= lastRead.count;
if (start > 0 && currentSetSize < depthCutoff) {
// We just got to the end of a probe
Probe p = new Probe(chromosomes[c], start, SequenceRead.end(lastRead.read), strandForNewProbes);
// Check to see if we have a previous probe against which we can check
Probe lastProbe = null;
if (!newProbes.isEmpty())
lastProbe = newProbes.lastElement();
// Can we merge?
if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.strand() == lastProbe.strand() && p.start() - lastProbe.end() <= distance) {
// Remove the last probe from the stored set
newProbes.remove(newProbes.size() - 1);
// Expand this probe to cover the last one and add it to the stored set
newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
} else if (lastProbe != null) {
// We might still remove this if it's too small
if (lastProbe.length() < minSize) {
newProbes.remove(newProbes.size() - 1);
}
// We still need to add the new probe
newProbes.add(p);
} else {
newProbes.add(p);
}
start = -1;
}
}
// If there's nothing there already then just add it
if (currentSetSize == 0) {
currentSet.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
} else // If there are reads in the current set then we need to add this read
// so that the current set is ordered by the end positions of the
// reads, with the earliest end first. We therefore start from the back
// and work our way to the front, as soon as we see an entry whose end
// is lower than ours we add ourselves after that
{
// Now we add this read at a position based on its end position
ListIterator<SequenceReadWithCount> it = currentSet.listIterator(currentSet.size());
while (true) {
// If we reach the front of the set then we add ourselves to the front
if (!it.hasPrevious()) {
currentSet.addFirst(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
break;
} else {
SequenceReadWithCount previousRead = it.previous();
if (SequenceRead.end(previousRead.read) < SequenceRead.end(reads.reads[r])) {
// We want to add ourselves after this element so backtrack
// by one position (which must exist because we just went
// past it
it.next();
it.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
break;
}
}
}
}
// See if we crossed the threshold for starting a new probe
if (start < 0 && currentSetSize >= depthCutoff) {
start = SequenceRead.start(reads.reads[r]);
}
}
// out of the so far unprocessed reads on this chromosome
if (start > 0) {
Probe p = new Probe(chromosomes[c], start, SequenceRead.end(currentSet.getFirst().read), strandForNewProbes);
// Check to see if we can merge with the last probe made
Probe lastProbe = null;
if (!newProbes.isEmpty())
lastProbe = newProbes.lastElement();
// Can we merge?
if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.start() - lastProbe.end() <= distance) {
newProbes.remove(newProbes.size() - 1);
newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
} else if (lastProbe != null) {
// We might still remove this if it's too small
if (lastProbe.length() < minSize) {
newProbes.remove(newProbes.size() - 1);
}
// final chance
if (p.length() > minSize) {
newProbes.add(p);
}
} else {
// Add the remaining probe if it's big enough.
if (p.length() > minSize) {
newProbes.add(p);
}
}
}
}
}
Probe[] finalList = newProbes.toArray(new Probe[0]);
newProbes.clear();
ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class DeduplicationProbeGenerator method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
boolean separateStrands = separateStrandsBox.isSelected();
int maxDistance = 0;
if (maxDistanceField.getText().length() > 0) {
maxDistance = Integer.parseInt(maxDistanceField.getText());
}
Vector<Probe> newProbes = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
Probe[] startingProbes = initialList.getProbesForChromosome(chromosomes[c]);
// For directional merging we use 3 probes
Probe currentForward = null;
Probe currentReverse = null;
Probe currentUnknown = null;
// For non-directional merging we use only one
Probe currentProbe = null;
// Now we can make the actual probes
for (int i = 0; i < startingProbes.length; i++) {
if (cancel) {
generationCancelled();
return;
}
if (separateStrands) {
switch(startingProbes[i].strand()) {
case (Probe.FORWARD):
if (currentForward == null) {
currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentForward.end() + maxDistance) {
if (startingProbes[i].end() > currentForward.end()) {
// Extend the current probe
currentForward = new Probe(chromosomes[c], currentForward.start(), startingProbes[i].end(), currentForward.strand(), currentForward.name());
}
} else {
newProbes.add(currentForward);
currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
case (Probe.REVERSE):
if (currentReverse == null) {
currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentReverse.end() + maxDistance) {
if (startingProbes[i].end() > currentReverse.end()) {
// Extend the current probe
currentReverse = new Probe(chromosomes[c], currentReverse.start(), startingProbes[i].end(), currentReverse.strand(), currentReverse.name());
}
} else {
newProbes.add(currentReverse);
currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
case (Probe.UNKNOWN):
if (currentUnknown == null) {
currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentUnknown.end() + maxDistance) {
if (startingProbes[i].end() > currentUnknown.end()) {
// Extend the current probe
currentUnknown = new Probe(chromosomes[c], currentUnknown.start(), startingProbes[i].end(), currentUnknown.strand(), currentUnknown.name());
}
} else {
newProbes.add(currentUnknown);
currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
}
} else {
if (currentProbe == null) {
currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentProbe.end() + maxDistance) {
if (startingProbes[i].end() > currentProbe.end() || startingProbes[i].strand() != currentProbe.strand()) {
// Update the current probe
int usedStrand = currentProbe.strand();
if (startingProbes[i].strand() != currentProbe.strand()) {
usedStrand = Probe.UNKNOWN;
}
currentProbe = new Probe(chromosomes[c], currentProbe.start(), Math.max(startingProbes[i].end(), currentProbe.end()), usedStrand, currentProbe.name());
}
} else {
newProbes.add(currentProbe);
currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
}
}
// At the end of each chromosome we add any probes we have remaining
if (currentProbe != null)
newProbes.add(currentProbe);
if (currentForward != null)
newProbes.add(currentForward);
if (currentReverse != null)
newProbes.add(currentReverse);
if (currentUnknown != null)
newProbes.add(currentUnknown);
}
Probe[] finalList = newProbes.toArray(new Probe[0]);
ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class MacsPeakCaller method run.
public void run() {
// for (int i=0;i<selectedChIPStores.length;i++) {
// System.err.println("Selcted ChIP="+selectedChIPStores[i]);
// }
// for (int i=0;i<selectedInputStores.length;i++) {
// System.err.println("Selcted Input="+selectedInputStores[i]);
// }
// First find the tag offsets between the watson and crick strands
// Work out the total average coverage for all of the combined ChIP samples
long totalChIPCoverage = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
}
if (cancel) {
generationCancelled();
return;
}
double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
// Now we go through the data to find locations for our high confidence peaks so we can
// randomly select 1000 of these to use to find the offset between the two strands
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
if (cancel) {
generationCancelled();
return;
}
// Time for an update
updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
long totalLength = 0;
Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
// Check that the overall density over the region falls within our limits
totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
long totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
}
if (potentialHighConfidencePeaks.size() == 0) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
generationCancelled();
return;
}
// System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
// Now we select 1000 random probes from this set
Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
Collections.shuffle(Arrays.asList(highConfidencePeaks));
Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
randomHighConfidenceProbes[i] = highConfidencePeaks[i];
}
// Now find the average distance between forward / reverse reads in the candidate peaks
int[] distances = new int[highConfidencePeaks.length];
// Sort the candidates so we don't do stupid stuff with the cache
Arrays.sort(randomHighConfidenceProbes);
for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
}
int medianInterStrandDistance = (int) SimpleStats.median(distances);
if (medianInterStrandDistance < 0)
medianInterStrandDistance = 0;
// System.err.println("Median inter strand difference = "+medianInterStrandDistance);
// Now we find the depth cutoff for overrepresented single tags using a binomial distribution
int totalReadCount = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalReadCount += selectedChIPStores[i].getTotalReadCount();
}
BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
// We want to know what depth has a chance of less than 1^-5
int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
if (redundantThreshold < 1)
redundantThreshold = 1;
// System.err.println("Redundancy threshold is "+redundantThreshold);
// Now we construct a poisson distribution to work out the threshold to use for
// constructing a full candidate peak set.
updateGenerationProgress("Counting non-redundant reads", 0, 1);
// To do this we need to get the full non-redundant length from the whole set
int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
// System.err.println("Total non-redundant sequences is "+totalNonRedCount);
// We need to know the median read length for the data
int readLength = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
}
readLength /= selectedChIPStores.length;
double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
// System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
// Now we go back through the whole dataset to do a search for all possible candidate probes
// We re-use the peak vector we came up with before.
potentialHighConfidencePeaks.clear();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
// We expand the region we're looking at by the inter-strand distance as we're going to
// be adjusting the read positions
Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > readCountCutoff) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
// Finally we re-filter the peaks we have using local poisson distributions with densities taken
// from either the input samples (if there are any), or the local region. The densities are
// estimated over 1,5 and 10kb around the peak and genome wide and the max of these is taken.
// If there is no input then the 1kb region is not used.
Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
// Work out which stores we're using to validate against.
DataStore[] validationStores;
boolean useInput = false;
double inputCorrection = 1;
int validationNonRedCount;
if (selectedInputStores != null && selectedInputStores.length > 0) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
validationStores = selectedInputStores;
useInput = true;
// We also need to work out the total number of nonredundant seqences
// in the input so we can work out a scaling factor so that the densities
// for input and chip are comparable.
validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
inputCorrection = totalNonRedCount / (double) validationNonRedCount;
System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
} else {
validationStores = selectedChIPStores;
validationNonRedCount = totalNonRedCount;
}
Vector<Probe> finalValidatedProbes = new Vector<Probe>();
for (int p = 0; p < allCandidateProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
if (p % 100 == 0) {
updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
}
// System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
// We now need to find the maximum read density per 2*bandwidth against which
// we're going to validate this peak
// We're going to get all reads within 10kb of the peak, and then we can subselect from there
int midPoint = allCandidateProbes[p].middle();
Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
// Get the probes for the largest region
long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
// Deduplicate so it's a fair comparison
// Should we recalculate the redundant threshold based on the input coverage?
thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
int region10kbcount = thisRegionReads.length;
int region5kbcount = 0;
int region1kbcount = 0;
// Go through the reads seeing if they fit into the 5 or 1kb regions
for (int r = 0; r < thisRegionReads.length; r++) {
if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
++region5kbcount;
if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
++region1kbcount;
}
// System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
// Convert to densities per window and ajdust for global coverage
double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
// Find the highest density to use for the validation
double highestDensity = globalDensity;
if (density10kb > highestDensity)
highestDensity = density10kb;
if (density5kb > highestDensity)
highestDensity = density5kb;
if (useInput && density1kb > highestDensity)
highestDensity = density1kb;
// System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
// Construct a poisson distribution with this density
PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
// System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
// Now check to see if the actual count from this peak is enough to still pass
long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
finalValidatedProbes.add(allCandidateProbes[p]);
// System.err.println("Adding probe to final set");
}
}
// System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class RNASeqPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
boolean mergeTranscripts = optionsPanel.mergeTranscripts();
boolean pairedEnd = optionsPanel.pairedEnd();
boolean logTransform = optionsPanel.logTransform();
boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
boolean rawCounts = optionsPanel.rawCounts();
boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
boolean correctDuplication = optionsPanel.correctForDNADuplication();
if (rawCounts) {
logTransform = false;
applyTranscriptLengthCorrection = false;
noValueForZeroCounts = false;
}
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
// System.err.println("Processing chr "+chrs[c].name());
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
for (int f = 0; f < mergedTranscripts.length; f++) {
if (cancel) {
progressCancelled();
return;
}
probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
Arrays.sort(allProbes);
if (collection().probeSet() == null) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
Probe[] existingProbes = collection().probeSet().getAllProbes();
Arrays.sort(existingProbes);
if (allProbes.length != existingProbes.length) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
// Check the positions against the new ones
boolean areTheyTheSame = true;
for (int p = 0; p < allProbes.length; p++) {
if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
areTheyTheSame = false;
break;
}
}
if (areTheyTheSame) {
allProbes = existingProbes;
} else {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
}
}
}
// If we're correcting for DNA contamination we need to work out the average density of
// reads in intergenic regions
float[] dnaDensityPerKb = new float[data.length];
int[] correctedTotalCounts = new int[data.length];
if (correctDNAContamination) {
// We need to make interstitial probes to the set we already have, ignoring those at the end of chromosomes
Vector<Probe> intergenicProbes = new Vector<Probe>();
Chromosome lastChr = allProbes[0].chromosome();
for (int p = 1; p < allProbes.length; p++) {
if (allProbes[p].chromosome() != lastChr) {
lastChr = allProbes[p].chromosome();
continue;
}
// See if there's a gap back to the last probe
if (allProbes[p].start() > allProbes[p - 1].end()) {
if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
// Don't bother with really short probes
continue;
}
intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
}
}
Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA contamination", 1, 2);
float[] densities = new float[allIntergenicProbes.length];
for (int p = 0; p < allIntergenicProbes.length; p++) {
densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
}
dnaDensityPerKb[d] = SimpleStats.median(densities);
}
// Work out adjusted total counts having subtracted the DNA contamination
for (int d = 0; d < data.length; d++) {
int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
correctedTotalCounts[d] = correctedTotalReadCount;
}
// Halve the density if they're doing a directional quantitation
if (optionsPanel.isDirectional()) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
// Halve the density if the libraries are paired end
if (pairedEnd) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
}
// If we're correcting for duplication we need to work out the modal count depth in
// intergenic regions
int[] modalDuplicationLevels = new int[data.length];
if (correctDuplication) {
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA duplication", 1, 2);
// We're not going to look at depths which are > 200. If it's that duplicated
// then there's no point continuing anyway.
int[] depthCount = new int[200];
for (int p = 0; p < allProbes.length; p++) {
long[] reads = data[d].getReadsForProbe(allProbes[p]);
int currentCount = 0;
for (int r = 1; r < reads.length; r++) {
if (reads[r] == reads[r - 1]) {
++currentCount;
} else {
if (currentCount > 0 && currentCount < 200) {
++depthCount[currentCount];
}
currentCount = 1;
}
}
}
// Find the modal depth in intergenic regions. This is the best estimate
// of duplication
// Since unique reads turn up all over the place even in duplicated
// data we say that if unique reads are higher than the sum of 2-10 there
// is no duplication
int twoTenSum = 0;
for (int i = 2; i <= 10; i++) {
twoTenSum += depthCount[i];
}
if (depthCount[1] > twoTenSum) {
modalDuplicationLevels[d] = 1;
} else {
int highestDepth = 0;
int bestDupGuess = 1;
for (int i = 2; i < depthCount.length; i++) {
// System.err.println("For depth "+i+" count was "+depthCount[i]);
if (depthCount[i] > highestDepth) {
bestDupGuess = i;
highestDepth = depthCount[i];
}
}
modalDuplicationLevels[d] = bestDupGuess;
}
}
}
// Having made probes we now need to quantitate them. We'll fetch the
// probes overlapping each sub-feature and then aggregate these together
// to get the final quantitation.
QuantitationStrandType readFilter = optionsPanel.readFilter();
int currentIndex = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
int[] readLengths = new int[data.length];
for (int d = 0; d < data.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
// actual length.
if (pairedEnd) {
readLengths[d] *= 2;
}
}
for (int f = 0; f < mergedTranscripts.length; f++) {
Location[] subLocations = mergedTranscripts[f].getSubLocations();
int totalLength = 0;
// Find the total length of all of the exons
for (int s = 0; s < subLocations.length; s++) {
totalLength += subLocations[s].length();
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
long totalCount = 0;
for (int s = 0; s < subLocations.length; s++) {
long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
for (int r = 0; r < reads.length; r++) {
if (!readFilter.useRead(subLocations[s], reads[r])) {
continue;
}
int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
totalCount += overlap;
}
}
// Now we correct the count by the total length of reads in the data and by
// the length of the split parts of the probe, and assign this to the probe.
// As we're correcting for read length then we work out the whole number of
// reads which this count could comprise, rounding down to a whole number.
totalCount /= readLengths[d];
// We can now subtract the DNA contamination prediction.
if (correctDNAContamination) {
int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
totalCount -= predictedContamination;
// Makes no sense to have negative counts
if (totalCount < 0)
totalCount = 0;
}
// ..and we can divide by the duplication level if we know it.
if (correctDuplication) {
totalCount /= modalDuplicationLevels[d];
}
// System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
float value = totalCount;
if (value == 0 && noValueForZeroCounts) {
value = Float.NaN;
}
// If we're log transforming then we need to set zero values to 0.9
if (logTransform && value == 0 && !noValueForZeroCounts) {
value = 0.9f;
}
// been asked to.
if (applyTranscriptLengthCorrection) {
value /= (totalLength / 1000f);
}
// We also correct by the total read count
if (!rawCounts) {
// System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
// If these libraries are paired end then the total number of
// reads is also effectively halved.
float totalReadCount;
// calculated this already, but otherwise we'll take the total count (total length/read length)
if (correctDNAContamination) {
totalReadCount = correctedTotalCounts[d];
} else {
totalReadCount = data[d].getTotalReadLength() / readLengths[d];
}
// If we're correcting for duplication we divide by the duplication level.
if (correctDuplication) {
totalReadCount /= modalDuplicationLevels[d];
}
// Finally we work out millions of reads (single end) or fragments (paired end)
if (pairedEnd) {
totalReadCount /= 2000000f;
} else {
totalReadCount /= 1000000f;
}
// Lastly we divide the value by the total millions of reads to get the globally corrected count.
value /= totalReadCount;
}
// Finally we do the log transform if we've been asked to
if (logTransform) {
value = (float) Math.log(value) / log2;
}
data[d].setValueForProbe(allProbes[currentIndex], value);
}
currentIndex++;
}
}
collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
// If we estimated any parameters let's report them.
if (correctDNAContamination || correctDuplication) {
float[] dna = null;
if (correctDNAContamination) {
dna = dnaDensityPerKb;
}
int[] dup = null;
if (correctDuplication) {
dup = modalDuplicationLevels;
}
RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {
@Override
public void run() {
}
@Override
public String name() {
return "RNA-Seq parameter";
}
@Override
public boolean isReady() {
return true;
}
@Override
public JPanel getOptionsPanel() {
return null;
}
@Override
public void generateReport() {
}
}, model);
}
quantitatonComplete();
}
Aggregations